Python scrapy.linkextractors.LinkExtractor() Examples

The following are 23 code examples of scrapy.linkextractors.LinkExtractor(), collected from open-source projects. Each example notes the project and source file it comes from. You may also want to check out the other functions and classes available in the scrapy.linkextractors module.
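
Before diving into the examples, here is a minimal, self-contained sketch of the typical workflow: instantiate a LinkExtractor, then call extract_links() on a response to get Link objects. The URL and HTML below are placeholders used only for illustration.

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# Placeholder HTML and URL, for illustration only.
html = b'<html><body><a href="/docs/">Docs</a> <a href="http://other.example/">Other</a></body></html>'
response = HtmlResponse(url='http://example.com/', body=html, encoding='utf-8')

# Restrict extraction to the response's own domain.
extractor = LinkExtractor(allow_domains=['example.com'])
for link in extractor.extract_links(response):
    print(link.url, link.text)  # e.g. http://example.com/docs/ Docs
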
Example #1
Source File: generic.py    From invana-bot with MIT License
def __init__(self,
                 restrict_xpaths=(),
                 restrict_css=(),
                 restrict_regex=(),
                 allow_domains=(),
                 link_extractor_cls=LinkExtractor, **kwargs):
        """

        :param restrict_xpaths: list of XPath expressions for link extraction.
        :param restrict_css: list of CSS selectors for link extraction.
        :param restrict_regex: list of regex patterns.
        :param link_extractor_cls: defaults to the Scrapy LinkExtractor.
        :param allow_domains: defaults to the spider's allowed domains.
        """
        self.restrict_xpaths = restrict_xpaths
        self.restrict_css = restrict_css
        self.restrict_regex = restrict_regex
        self.allow_domains = allow_domains
        self.link_extractor_cls = link_extractor_cls 
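
As context for the constructor above, here is a hedged sketch of how these parameters typically feed a LinkExtractor call. The XPath, CSS selector, and domain values are made-up placeholders, and restrict_regex is a wrapper-level option rather than a standard LinkExtractor argument, so it is omitted.

from scrapy.linkextractors import LinkExtractor

# Hypothetical values, for illustration only.
extractor = LinkExtractor(
    restrict_xpaths=['//div[@id="content"]'],  # only extract links inside this element
    restrict_css=['nav.pagination'],           # also look inside this CSS selection
    allow_domains=['example.com'],             # stay on the spider's domain
)
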
Example #2
Source File: single.py    From invana-bot with MIT License
def generate_spider_kwargs(self):
        extractor = LinkExtractor()
        rules = [
            Rule(extractor, follow=True)  # TODO - add regex types if needed.
        ]
        print(self.manifest)
        spider_kwargs = {
            "start_urls": self.spider_config['start_urls'],
            "allowed_domains": [],
            "rules": rules,
            "spider_config": self.spider_config,
            "manifest": self.manifest,
            "context": self.context,
            # "default_storage":
        }
        spider_kwargs.update(self.extra_arguments)
        return spider_kwargs 
Example #3
Source File: url.py    From hoaxy-backend with GNU General Public License v3.0
def __init__(self, domains, urls, *args, **kwargs):
        """Constructor for PageSpider.

        Parameters
        ----------
        domains : list
            A list of domains for the site.
        urls : list
            A list of URLs of the site.
        href_xpaths : list
            A list of XPath expressions indicating the ancestors of `<a>`
            elements.
        url_regex : string
            URL pattern regular expression.

        If you use this spider to store items into a database, additional
        keyword arguments are required:

        platform_id : int
            The id of a platform instance.
        session : object
            An instance of SQLAlchemy session.
        """
        self.session = kwargs.pop('session', None)
        self.platform_id = kwargs.pop('platform_id', None)
        self.href_xpaths = kwargs.pop('href_xpaths', ())
        self.url_regex = kwargs.pop('url_regex', None)
        self.start_urls = urls
        self.allowed_domains = domains
        self.link_extractor = LinkExtractor(
            allow_domains=self.allowed_domains,
            restrict_xpaths=self.href_xpaths,
            unique=True)
        super(PageSpider, self).__init__(*args, **kwargs) 
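
For context, a minimal hedged sketch of how a link extractor stored on the spider (as above) is typically consumed in a parse callback; the spider name, domain, and start URL are illustrative placeholders, not taken from hoaxy-backend.

import scrapy
from scrapy.linkextractors import LinkExtractor

class FollowLinksSpider(scrapy.Spider):
    # Illustrative values only.
    name = 'follow_links'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.link_extractor = LinkExtractor(
            allow_domains=self.allowed_domains, unique=True)

    def parse(self, response):
        # Follow every in-domain link the extractor finds on the page.
        for link in self.link_extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)
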
Example #4
Source File: samakal.py    From corpus-builder with MIT License
def request_index(self, response):
        categories = list(set(response.css('#topMenuItem a::attr("href")').re('/([^\/]+)/$')))

        if self.category is not None:
            if self.category in categories:
                categories = [self.category]
            else:
                raise ValueError('invalid category slug. available slugs: %s' % ", ".join(categories))

        date_processing = self.start_date
        while date_processing <= self.end_date:
            for category in categories:
                # redefining the rule according to the specific date URL
                SamakalSpider.rules = (Rule(LinkExtractor(allow=('/' + date_processing.strftime('%Y/%m/%d') + '/\d+$',),
                                                          restrict_xpaths=('//div[@class="main-body"]')),
                                            callback="parse_content", follow=True),)
                super(SamakalSpider, self)._compile_rules()
                # http://bangla.samakal.net/-education/2016/06/01 
                url = 'http://bangla.samakal.net/{0}/{1}'.format(
                    category,
                    date_processing.strftime('%Y/%m/%d')
                )
                yield self.make_requests_from_url(url)
            date_processing += datetime.timedelta(days=1) 
Example #5
Source File: mirror_spider.py    From wayback-machine-scraper with ISC License
def __init__(self, domains, directory, allow=(), deny=(), unix=False):
        self.directory = directory
        self.unix = unix
        self.rules = (
            Rule(LinkExtractor(allow=allow, deny=deny), callback='save_page'),
        )

        # parse the allowed domains and start urls
        self.allowed_domains = []
        self.start_urls = []
        for domain in domains:
            url_parts = domain.split('://')
            unqualified_url = url_parts[-1]
            url_scheme = url_parts[0] if len(url_parts) > 1 else 'http'
            full_url = '{0}://{1}'.format(url_scheme, unqualified_url)
            bare_domain = unqualified_url.split('/')[0]
            self.allowed_domains.append(bare_domain)
            self.start_urls.append(full_url)

        super().__init__() 
Example #6
Source File: link.py    From scrapy-bench with MIT License
def main():
    url = 'http://scrapinghub.com/'
    link_extractor = LinkExtractor()
    total = 0
    time = 0
    tar = tarfile.open("sites.tar.gz")
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()

        start = timer()

        response = HtmlResponse(url=url, body=html, encoding='utf8')
        links = link_extractor.extract_links(response)

        end = timer()

        total = total + len(links)
        time = time + end - start

    print("\nTotal number of links extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} links/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time)))) 
Example #7
Source File: spider.py    From collectors with MIT License
def __init__(self, conf=None, conn=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Make urls
        self.start_urls = [
            'http://www.pfizer.com/research/clinical_trials/find_a_trial?recr=0',
        ]

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'find_a_trial/NCT\d+',
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'page=\d+',
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__() 
Example #8
Source File: spider.py    From collectors with MIT License
def __init__(self, conf=None, conn=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Make urls
        self.start_urls = [
            'http://www.takedaclinicaltrials.com/browse/?protocol_id=',
        ]

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'browse/summary/',
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'browse',
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__() 
Example #9
Source File: ip66.py    From aox_proxy_pool with Apache License 2.0
def parse(self, response):
        link = LinkExtractor(restrict_css='ul.textlarge22', allow='areaindex')
        links = link.extract_links(response)
        for _link in links:
            # yield scrapy.Request('http://www.66ip.cn/areaindex_1/1.html', callback=self.parse_list)
            yield scrapy.Request(_link.url, callback=self.parse_list) 
Example #10
Source File: spider.py    From collectors with MIT License
def __init__(self, conf=None, conn=None, page_from=None, page_to=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Default values
        if page_from is None:
            page_from = '1'
        if page_to is None:
            page_to = '1'

        # Make start urls
        self.start_urls = _make_start_urls(
                prefix='https://upload.umin.ac.jp/cgi-open-bin/ctr_e/index.cgi',
                page_from=page_from)

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'cgi-open-bin/ctr_e/ctr_view.cgi',
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'page=\d+',
                process_value=partial(_process_url, page_from, page_to),
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__()


# Internal 
Example #11
Source File: spider.py    From collectors with MIT License
def __init__(self, conf=None, conn=None, http_user=None, http_pass=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Save credentials
        self.http_user = http_user
        self.http_pass = http_pass

        # Make urls
        self.start_urls = [
            'http://apps.who.int/trialsearch/crawl/crawl0.aspx',
        ]

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'trialsearch/Trial\d+\.aspx\?trialid=.+',
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'trialsearch/crawl/crawl\d+\.aspx',
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__() 
Example #12
Source File: spider.py    From collectors with MIT License
def __init__(self, conf=None, conn=None, date_from=None, date_to=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Make start urls
        self.start_urls = _make_start_urls(
            prefix='http://www.anzctr.org.au/TrialSearch.aspx',
            date_from=date_from, date_to=date_to)

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'Trial/Registration/TrialReview.aspx',
                process_value=lambda value: value.replace('http', 'https', 1),
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'page=\d+',
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__()


# Internal 
Example #13
Source File: followall.py    From scrapy-bench with MIT License
def __init__(self, book_url=None, **kw):
        super(FollowAllSpider, self).__init__(**kw)

        url = book_url
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()
        self.previtem = 0
        self.items = 0
        self.timesec = datetime.datetime.utcnow() 
Example #14
Source File: spider.py    From collectors with MIT License
def __init__(self, conf=None, conn=None, date_from=None, date_to=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Make start urls
        self.start_urls = _make_start_urls(
                prefix='http://www.isrctn.com/search',
                date_from=date_from, date_to=date_to)

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'ISRCTN\d+',
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'page=\d+',
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__()


# Internal 
Example #15
Source File: broadspider.py    From scrapy-bench with MIT License
def __init__(self, **kw):
        super(BroadBenchSpider, self).__init__(**kw)

        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()
        self.previtem = 0
        self.items = 0
        self.timesec = datetime.datetime.utcnow()
        self.start_urls = [
            'http://domain{}:{}/index.html'.format(i, self.port) for i in range(1, self.n_domains + 1)] 
Example #16
Source File: spiders.py    From autologin with Apache License 2.0
def __init__(self, url, credentials, *args, **kwargs):
        self.credentials = credentials
        self.start_urls = [url]
        self.link_extractor = LinkExtractor(allow_domains=[get_domain(url)])
        self.found_login = False
        self.found_registration = False
        super(FormSpider, self).__init__(*args, **kwargs) 
Example #17
Source File: main.py    From python-examples with MIT License
def parse_item(self, response):
        print('parse item url:', response.url)

        self.test_status('parse_item()', response)

        # The list of items that are found on the particular page
        items = []
        res = Selector(response)
        self.append(self.resp_log_file, str(response))
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response) 
Example #18
Source File: gov.py    From openslack-crawler with Apache License 2.0
def Layer01_Parse(self, response):

        item = Layer01_Item()
        for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d\.html')).extract_links(response):
            url = i.url
            text = i.text
            item['year'] = url[-12:-8]
            item['name'] = text
            item['code'] = url[-7:-5]
            yield item
            yield Request(url, callback=self.Layer02_Parse) 
Example #19
Source File: gov.py    From openslack-crawler with Apache License 2.0
def Layer02_Parse(self, response):
        text = response.xpath('/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table')\
               [0].extract()
        item = Layer02_Item()
        item['year'] = re.findall(r'dm/20\d\d', response.url)[0][3:]
        for code, name in re.findall(r'href="\d\d/(\d{4})\.html">([^\d]+?)</a>', text):
            item['name'] = name
            item['code'] = code
            yield item
        for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d{4}\.html')).extract_links(response):
            url = i.url
            text = i.text
            yield Request(url, callback=self.Layer03_Parse) 
Example #20
Source File: gov.py    From openslack-crawler with Apache License 2.0
def Layer03_Parse(self, response):
        text = response.xpath('/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table')\
               [0].extract()
        item = Layer03_Item()
        item['year'] = re.findall(r'dm/20\d\d', response.url)[0][3:]
        for code, name in re.findall(r'href="\d\d/(\d{6})\.html">([^\d]+?)</a>', text):
            item['name'] = name
            item['code'] = code
            yield item
        for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d\d/\d{6}\.html')).extract_links(response):
            url = i.url
            text = i.text
            yield Request(url, callback=self.Layer04_Parse) 
Example #21
Source File: gov.py    From openslack-crawler with Apache License 2.0
def Layer04_Parse(self, response):
        text = response.xpath('/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table')\
               [0].extract()
        item = Layer04_Item()
        item['year'] = re.findall(r'dm/20\d\d', response.url)[0][3:]
        for code, name in re.findall(r'href="\d\d/(\d{9})\.html">([^\d]+?)</a>', text):
            item['name'] = name
            item['code'] = code
            yield item
        for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d\d/\d\d/\d{9}\.html')).extract_links(response):
            url = i.url
            text = i.text
            yield Request(url, callback=self.Layer05_Parse) 
Example #22
Source File: url.py    From hoaxy-backend with GNU General Public License v3.0
def __init__(self, domains, urls, *args, **kwargs):
        """Constructor for SiteSpider.

        Parameters
        ----------
        domains : list
            A list of domains for the site.
        urls : list
            A list of sitemap URLs of the site.
        href_xpaths : list
            A list of XPath expressions indicating the ancestors of `<a>`
            elements.
        url_regex : string
            URL pattern regular expression.

        If you use this spider to store items into a database, additional
        keyword arguments are required:

        platform_id : int
            The id of a platform instance.
        session : object
            An instance of SQLAlchemy session.
        """
        self.session = kwargs.pop('session', None)
        self.platform_id = kwargs.pop('platform_id', None)
        self.url_regex = kwargs.pop('url_regex', None)
        self.href_xpaths = kwargs.pop('href_xpaths', ())
        self.start_urls = urls
        self.allowed_domains = domains
        self.rules = (Rule(
            LinkExtractor(
                allow_domains=self.allowed_domains,
                restrict_xpaths=self.href_xpaths,
                unique=True),
            callback="parse_item",
            follow=True),)

        super(SiteSpider, self).__init__(*args, **kwargs) 
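
The rules built in the constructor above follow the standard CrawlSpider pattern. A minimal hedged sketch of that pattern, using illustrative class, domain, and callback names rather than the real SiteSpider values, looks like this:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ExampleSiteSpider(CrawlSpider):
    # Illustrative values only; the real SiteSpider receives these via its constructor.
    name = 'example_site'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']
    rules = (
        Rule(LinkExtractor(allow_domains=allowed_domains, unique=True),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {'url': response.url}
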
Example #23
Source File: crawlpy_spider.py    From crawlpy with MIT License
def parse(self, response):
        """
        Scrapy parse callback
        """

        # Get current nesting level
        curr_depth = response.meta.get('depth', 1)
        if self.config['login']['enabled']:
            curr_depth = curr_depth - 1 # Do not count the login page as nesting depth

        # Store to disk?
        if self.config['store']['enabled']:
            path = response.url.replace(os.sep, '--')   # Replace directory separator
            path = self.config['store']['path'] + os.sep + path
            with open(path, 'wb') as fpointer:
                fpointer.write(response.body)

        # Yield current url item
        item = CrawlpyItem()
        item['url'] = response.url
        item['status'] = response.status
        item['depth'] = curr_depth
        item['referer'] = response.meta.get('referer', '')
        yield item



        # Get all links from the current page
        links = LinkExtractor().extract_links(response)

        # Iterate all found links and crawl them
        for link in links:
            deny = False

            # Check requests to be ignored
            for ignore in self.config['ignores']:
                if  (ignore in link.url) or (ignore.lower() in link.url.lower()):
                    # Ignore pattern found, stop looking into other patterns
                    deny = True
                    break


            # [NO] Max depth exceeded
            if curr_depth >= self.max_depth:
                logging.info('[Not Crawling] Current depth (' + str(curr_depth) + ') exceeds max depth (' + str(self.max_depth) + ')')
                pass
            # [NO] Duplicate URL
            elif link.url in self.duplicates:
                logging.info('[Not Crawling] Url already crawled: ' + link.url)
                pass
            # [NO] URL denied
            elif deny:
                logging.info('[Not Crawling] Url denied (pattern: "' + ignore + '"): ' + link.url)
                pass
            # [OK] Crawl!
            else:
                self.duplicates.append(link.url)
                yield Request(link.url, meta={'depth': curr_depth+1, 'referer': response.url})