# kenan976431 / bite

 
			
import os
import re
from pathlib import Path
from urllib.parse import urljoin

import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess


class HWZSpider(scrapy.spiders.CrawlSpider):
    """Crawl HardwareZone forum sections and append post text to local files.

    The crawl starts at every path in ``main_subforums``. Each start page is
    expected to list subforums; every subforum is then walked page by page,
    every thread in it is walked page by page, and the text of each post is
    appended to ``<main subforum>/<subforum>`` relative to the current
    working directory (one post per line).
    """

    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
    }

    name = "hwz_spider"
    # Crawl links limited to this domain
    domain_name = "https://forums.hardwarezone.com.sg"
    #start_urls = ["https://forums.hardwarezone.com.sg/infotech-clinics-1/"]
    # Paths (relative to domain_name) of the top-level sections to crawl.
    main_subforums = ["/infotech-clinics-1/"]
    #main_subforums = ["/digital-entertainment-lifestyle-hub-225/",]
    # Query string appended to subforum URLs (thread listings).
    show_all_threads = '?pp=200&daysprune=-1'
    # Query string appended to thread URLs (post listings).
    show_all_pages = '?pp=50'
    # Thread-list hrefs containing any of these substrings are not followed.
    filter_tokens = ['/misc.php?', '/external.php?', 'www.hardwarezone.com.sg']
    # Maps a subforum slug to the slug of the main subforum it was found
    # under. Kept as a class attribute for backward compatibility; every
    # instance replaces it with its own dict in __init__.
    parents = {}

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and derive ``start_urls``.

        Args:
            *args: Forwarded unchanged to the Scrapy base class.
            **kwargs: Forwarded unchanged to the Scrapy base class.
        """
        # The base initialiser must run so that Scrapy's own spider state is
        # set up before the first response is handled.
        super().__init__(*args, **kwargs)
        self.start_urls = [self.domain_name + sf for sf in self.main_subforums]
        # Per-instance mapping so separate spider instances never share state.
        self.parents = {}

    def has_filter_tokens(self, url):
        """Return True when *url* contains any entry of ``filter_tokens``."""
        return any(token in url for token in self.filter_tokens)

    def parse_start_url(self, response, **kwargs):
        """Hand responses for the start URLs over to :meth:`parse`.

        NOTE(review): recent Scrapy releases appear to route callback-less
        CrawlSpider responses through this hook instead of ``parse`` --
        confirm against the installed Scrapy version. Delegating here keeps
        the start pages flowing into ``parse`` either way.
        """
        return self.parse(response)

    def parse(self, response):
        """Dispatch a response by page type and yield follow-up requests.

        Three page types are distinguished:

        * thread page -- the URL contains ``domain_name`` and has ``.html``
          immediately before a suffix as long as ``show_all_pages``; its
          posts are scraped and its pagination links are followed;
        * main subforum page -- the URL path is listed in ``main_subforums``
          and the subforum table is present; every listed subforum is
          registered in ``parents`` and requested;
        * anything else is treated as a subforum thread listing -- thread
          links and listing pagination links are followed.

        Duplicate request URLs are left to Scrapy's duplicate filtering.

        Args:
            response: The downloaded page.

        Yields:
            ``Request`` objects whose callback is this method.
        """
        subforum_list = response.xpath('//table[@class="tborder" and @width="100%"]/tbody//tr/td/div')

        suffix_len = len(self.show_all_pages)
        is_thread_page = (
            self.domain_name in response.url
            and response.url[-5 - suffix_len:-suffix_len] == '.html'
        )

        if is_thread_page:
            self.scrape(response)

            # Thread page navigation
            thread_nav = response.xpath(
                '//div[contains(@id, "posts")]/following-sibling::table[1]')
            for thread_page_a in thread_nav.xpath('.//li'):
                href = thread_page_a.xpath('./a/@href').get()
                if href:
                    # Drop any existing query string before forcing the
                    # posts-per-page parameter.
                    url = urljoin(self.domain_name, href.split('?')[0]+self.show_all_pages)
                    print('55',url)
                    # Explicit callback: do not depend on which method the
                    # base class uses for callback-less requests.
                    yield Request(url, callback=self.parse)

        elif subforum_list and response.url[len(self.domain_name):] in self.main_subforums:
            # list of subforums
            # XPath 1.0 has no ends-with(); emulate it with substring().
            ends_with = '"/" = substring(., string-length(.) - string-length("/") +1)'
            parent = response.url[len(self.domain_name)+1:-1]
            # Output directory for every subforum found under this section.
            Path(parent).mkdir(parents=True, exist_ok=True)
            for subforum in subforum_list.xpath('./a/@href[starts-with(.,"/") and' + ends_with + ']').getall():
                self.parents[subforum.strip('/')] = parent
                url = urljoin(self.domain_name, subforum+self.show_all_threads)
                print('67',url)
                yield Request(url, callback=self.parse)
        else:
            # subforum page with list of threads
            threads_table = response.xpath('//table[contains(@id, "threadslist")]')  # List of threads
            threads_table_body = threads_table.xpath(
                './tbody[contains(@id, "threadbits_forum")]')
            for trow in threads_table_body.xpath('./tr//td[contains(@class,"alt1")]'):
                href = trow.xpath('.//a/@href').get()
                if href and not self.has_filter_tokens(href):
                    url = urljoin(self.domain_name, href+self.show_all_pages)
                    print('79',url)
                    yield Request(url, callback=self.parse)

            # Forum thread navigation
            forum_nav = response.xpath(
                '//table[contains(@id, "threadslist")]/following-sibling::table[1]')
            # Relative path: only the <li> elements inside the navigation
            # table. An absolute '//li' would match every <li> in the page.
            for forum_page_a in forum_nav.xpath('.//li'):
                href = forum_page_a.xpath('./a/@href').get()
                if href:
                    url = urljoin(self.domain_name, href.split('?')[0]+self.show_all_threads)
                    print('89',url)
                    yield Request(url, callback=self.parse)

    def scrape(self, response):
        """Append the text of every usable post on a thread page to a file.

        The subforum slug is the first path segment of the URL after
        ``domain_name``; the output file is ``<parents[slug]>/<slug>``. A
        post is written (stripped, one per line) only when its first text
        node is non-blank, has more than two whitespace-separated words and
        does not contain the two-character sequence ``^M``.

        Pages whose URL has no such path segment, or whose subforum was
        never registered in ``parents``, are skipped with a warning instead
        of raising.

        Args:
            response: The downloaded thread page.
        """
        print("Scraping from {}".format(response.url))
        slug_match = re.search('/(.+?)/', response.url[len(self.domain_name):])
        if slug_match is None:
            self.logger.warning("No subforum segment in %s; page skipped", response.url)
            return
        subforum = slug_match.group(1)

        parent = self.parents.get(subforum)
        if parent is None:
            self.logger.warning(
                "Subforum %r is not registered; page %s skipped", subforum, response.url)
            return

        posts_table = response.xpath('//div[contains(@id, "posts")]')
        # Explicit UTF-8 so non-ASCII post text does not depend on the
        # platform's default encoding.
        with open(os.path.join(parent, subforum), 'a+', encoding='utf-8') as out:
            for post in posts_table.xpath('./div[contains(@class, "post-wrapper")]'):
                post_message = post.xpath('.//div[contains(@class, "post_message")]/text()').get()
                # NOTE(review): '^M' is the literal caret + 'M'; possibly a
                # carriage return was intended -- confirm.
                if post_message and post_message.strip() and '^M' not in post_message and len(post_message.split()) > 2:
                    out.write(post_message.strip()+'\n')

# Script entry: run HWZSpider in-process.
# An empty settings dict is passed here; the only setting this file overrides
# is the per-spider DOWNLOAD_DELAY in HWZSpider.custom_settings.
# NOTE(review): these statements are at module level, so they also execute
# when the file is imported, not only when it is run as a script.
process = CrawlerProcess(settings={})
# Register the spider class with the crawler process.
process.crawl(HWZSpider)
# Start the crawl.
process.start()