Python scrapy.linkextractors.LinkExtractor() Examples
The following are 23 code examples of scrapy.linkextractors.LinkExtractor().
Each example links back to its original project and source file.
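Before the project examples, here is a minimal, self-contained sketch of the API they all rely on: build a LinkExtractor, call extract_links() on a response, and get back Link objects with .url and .text attributes. The HTML snippet and URL below are made up for illustration.

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# Build a response from an inline HTML snippet (normally Scrapy hands this to a callback).
html = b'<html><body><a href="/blog/post-1">Post 1</a><a href="/about">About</a></body></html>'
response = HtmlResponse(url='http://example.com/', body=html, encoding='utf8')

# Keep only links whose URL matches the allow pattern.
extractor = LinkExtractor(allow=r'/blog/')
for link in extractor.extract_links(response):
    print(link.url, link.text)  # e.g. http://example.com/blog/post-1 Post 1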
Example #1
Source File: generic.py From invana-bot with MIT License | 7 votes |
def __init__(self, restrict_xpaths=(), restrict_css=(), restrict_regex=(), allow_domains=(),
             link_extractor_cls=LinkExtractor, **kwargs):
    """
    :param restrict_xpaths: list of XPath expressions for link extraction
    :param restrict_css: list of CSS selectors for link extraction
    :param restrict_regex: list of regex patterns
    :param link_extractor_cls: defaults to the Scrapy link extractor
    :param allow_domains: defaults to the allowed domains of the spider
    """
    self.restrict_xpaths = restrict_xpaths
    self.restrict_css = restrict_css
    self.restrict_regex = restrict_regex
    self.allow_domains = allow_domains
    self.link_extractor_cls = link_extractor_cls
Example #2
Source File: single.py From invana-bot with MIT License | 6 votes |
def generate_spider_kwargs(self):
    extractor = LinkExtractor()
    rules = [
        Rule(extractor, follow=True)  # TODO - add regex types if needed.
    ]
    print(self.manifest)
    spider_kwargs = {
        "start_urls": self.spider_config['start_urls'],
        "allowed_domains": [],
        "rules": rules,
        "spider_config": self.spider_config,
        "manifest": self.manifest,
        "context": self.context,
        # "default_storage":
    }
    spider_kwargs.update(self.extra_arguments)
    return spider_kwargs
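Rules built this way are what a Scrapy CrawlSpider consumes. For reference, a minimal generic sketch of such a spider (the spider name, start URL, and callback are placeholders, not invana-bot code):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ExampleCrawlSpider(CrawlSpider):
    name = 'example'                      # placeholder spider name
    start_urls = ['http://example.com/']  # placeholder start URL

    # Follow every extracted link and hand each downloaded page to parse_page().
    rules = (
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}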
Example #3
Source File: url.py From hoaxy-backend with GNU General Public License v3.0 | 6 votes |
def __init__(self, domains, urls, *args, **kwargs):
    """Constructor for PageSpider.

    Parameters
    ----------
    domains : list
        A list of domains for the site.
    urls : list
        A list of URLs of the site.
    href_xpaths : list
        A list of XPath expressions indicating the ancestors of the `<a>` element.
    url_regex : string
        URL pattern regular expression.

    If you use this spider to store items into a database, additional
    keywords are required:

    platform_id : int
        The id of a platform instance.
    session : object
        An instance of a SQLAlchemy session.
    """
    self.session = kwargs.pop('session', None)
    self.platform_id = kwargs.pop('platform_id', None)
    self.href_xpaths = kwargs.pop('href_xpaths', ())
    self.url_regex = kwargs.pop('url_regex', None)
    self.start_urls = urls
    self.allowed_domains = domains
    self.link_extractor = LinkExtractor(
        allow_domains=self.allowed_domains,
        restrict_xpaths=self.href_xpaths,
        unique=True)
    super(PageSpider, self).__init__(*args, **kwargs)
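This constructor only builds the extractor; the callback that consumes it is not included on this page. Purely as a hedged sketch (the method below is an assumption for illustration, not hoaxy-backend code), a parse callback built around such an extractor usually looks like this, assuming `import re` and `import scrapy` at module level:

def parse(self, response):
    # Hypothetical callback, not the real PageSpider logic.
    # Follow every link found under the configured href_xpaths regions,
    # optionally filtering by the spider's url_regex.
    for link in self.link_extractor.extract_links(response):
        if self.url_regex is None or re.search(self.url_regex, link.url):
            yield scrapy.Request(link.url, callback=self.parse)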
Example #4
Source File: samakal.py From corpus-builder with MIT License | 6 votes |
def request_index(self, response):
    categories = list(set(response.css('#topMenuItem a::attr("href")').re('/([^\/]+)/$')))

    if self.category is not None:
        if self.category in categories:
            categories = [self.category]
        else:
            raise ValueError('invalid category slug. available slugs: %s' % ", ".join(categories))

    date_processing = self.start_date
    while date_processing <= self.end_date:
        for category in categories:
            # redefining the rule again according to the specific date url
            SamakalSpider.rules = (Rule(LinkExtractor(allow=('/' + date_processing.strftime('%Y/%m/%d') + '/\d+$',),
                                                      restrict_xpaths=('//div[@class="main-body"]')),
                                        callback="parse_content", follow=True),)
            super(SamakalSpider, self)._compile_rules()

            # http://bangla.samakal.net/-education/2016/06/01
            url = 'http://bangla.samakal.net/{0}/{1}'.format(
                category,
                date_processing.strftime('%Y/%m/%d')
            )
            yield self.make_requests_from_url(url)
        date_processing += datetime.timedelta(days=1)
Example #5
Source File: mirror_spider.py From wayback-machine-scraper with ISC License | 6 votes |
def __init__(self, domains, directory, allow=(), deny=(), unix=False):
    self.directory = directory
    self.unix = unix
    self.rules = (
        Rule(LinkExtractor(allow=allow, deny=deny), callback='save_page'),
    )

    # parse the allowed domains and start urls
    self.allowed_domains = []
    self.start_urls = []
    for domain in domains:
        url_parts = domain.split('://')
        unqualified_url = url_parts[-1]
        url_scheme = url_parts[0] if len(url_parts) > 1 else 'http'
        full_url = '{0}://{1}'.format(url_scheme, unqualified_url)
        bare_domain = unqualified_url.split('/')[0]
        self.allowed_domains.append(bare_domain)
        self.start_urls.append(full_url)

    super().__init__()
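The loop accepts domains both with and without a scheme. A small illustration of one iteration, with the loop body repeated as a standalone helper (the helper name and URLs are made up for this page):

def split_domain(domain):
    """Mirror of the parsing loop above, shown standalone for illustration."""
    url_parts = domain.split('://')
    unqualified_url = url_parts[-1]
    url_scheme = url_parts[0] if len(url_parts) > 1 else 'http'
    full_url = '{0}://{1}'.format(url_scheme, unqualified_url)
    bare_domain = unqualified_url.split('/')[0]
    return bare_domain, full_url

print(split_domain('example.com/archive'))       # ('example.com', 'http://example.com/archive')
print(split_domain('https://blog.example.org'))  # ('blog.example.org', 'https://blog.example.org')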
Example #6
Source File: link.py From scrapy-bench with MIT License | 6 votes |
def main():
    url = 'http://scrapinghub.com/'
    link_extractor = LinkExtractor()
    total = 0
    time = 0
    tar = tarfile.open("sites.tar.gz")
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()

        start = timer()
        response = HtmlResponse(url=url, body=html, encoding='utf8')
        links = link_extractor.extract_links(response)
        end = timer()

        total = total + len(links)
        time = time + end - start

    print("\nTotal number of links extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} links/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time))))
Example #7
Source File: spider.py From collectors with MIT License | 6 votes |
def __init__(self, conf=None, conn=None):

    # Save conf/conn
    self.conf = conf
    self.conn = conn

    # Make urls
    self.start_urls = [
        'http://www.pfizer.com/research/clinical_trials/find_a_trial?recr=0',
    ]

    # Make rules
    self.rules = [
        Rule(LinkExtractor(
            allow=r'find_a_trial/NCT\d+',
        ), callback=parse_record),
        Rule(LinkExtractor(
            allow=r'page=\d+',
        )),
    ]

    # Inherit parent
    super(Spider, self).__init__()
Example #8
Source File: spider.py From collectors with MIT License | 6 votes |
def __init__(self, conf=None, conn=None):

    # Save conf/conn
    self.conf = conf
    self.conn = conn

    # Make urls
    self.start_urls = [
        'http://www.takedaclinicaltrials.com/browse/?protocol_id=',
    ]

    # Make rules
    self.rules = [
        Rule(LinkExtractor(
            allow=r'browse/summary/',
        ), callback=parse_record),
        Rule(LinkExtractor(
            allow=r'browse',
        )),
    ]

    # Inherit parent
    super(Spider, self).__init__()
Example #9
Source File: ip66.py From aox_proxy_pool with Apache License 2.0 | 6 votes |
def parse(self, response):
    link = LinkExtractor(restrict_css='ul.textlarge22', allow='areaindex')
    links = link.extract_links(response)
    for _link in links:
        # yield scrapy.Request('http://www.66ip.cn/areaindex_1/1.html', callback=self.parse_list)
        yield scrapy.Request(_link.url, callback=self.parse_list)
Example #10
Source File: spider.py From collectors with MIT License | 5 votes |
def __init__(self, conf=None, conn=None, page_from=None, page_to=None):

    # Save conf/conn
    self.conf = conf
    self.conn = conn

    # Default values
    if page_from is None:
        page_from = '1'
    if page_to is None:
        page_to = '1'

    # Make start urls
    self.start_urls = _make_start_urls(
        prefix='https://upload.umin.ac.jp/cgi-open-bin/ctr_e/index.cgi',
        page_from=page_from)

    # Make rules
    self.rules = [
        Rule(LinkExtractor(
            allow=r'cgi-open-bin/ctr_e/ctr_view.cgi',
        ), callback=parse_record),
        Rule(LinkExtractor(
            allow=r'page=\d+',
            process_value=partial(_process_url, page_from, page_to),
        )),
    ]

    # Inherit parent
    super(Spider, self).__init__()


# Internal
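The second rule passes a process_value callback, bound with functools.partial to the project's _process_url helper (whose body is not shown on this page). The LinkExtractor calls this hook with every extracted URL; returning a string keeps (or rewrites) the link, returning None drops it. A hypothetical callable with the same shape, assuming `import re` and `from functools import partial`:

def _keep_pages_in_range(page_from, page_to, value):
    # Hypothetical stand-in for _process_url -- illustrates the process_value contract only.
    match = re.search(r'page=(\d+)', value)
    if match and int(page_from) <= int(match.group(1)) <= int(page_to):
        return value   # keep the pagination link as-is
    return None        # None tells the LinkExtractor to discard the link

# Bound the same way the rule above binds its helper:
# process_value = partial(_keep_pages_in_range, page_from, page_to)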
Example #11
Source File: spider.py From collectors with MIT License | 5 votes |
def __init__(self, conf=None, conn=None, http_user=None, http_pass=None):

    # Save conf/conn
    self.conf = conf
    self.conn = conn

    # Save credentials
    self.http_user = http_user
    self.http_pass = http_pass

    # Make urls
    self.start_urls = [
        'http://apps.who.int/trialsearch/crawl/crawl0.aspx',
    ]

    # Make rules
    self.rules = [
        Rule(LinkExtractor(
            allow=r'trialsearch/Trial\d+\.aspx\?trialid=.+',
        ), callback=parse_record),
        Rule(LinkExtractor(
            allow=r'trialsearch/crawl/crawl\d+\.aspx',
        )),
    ]

    # Inherit parent
    super(Spider, self).__init__()
Example #12
Source File: spider.py From collectors with MIT License | 5 votes |
def __init__(self, conf=None, conn=None, date_from=None, date_to=None):

    # Save conf/conn
    self.conf = conf
    self.conn = conn

    # Make start urls
    self.start_urls = _make_start_urls(
        prefix='http://www.anzctr.org.au/TrialSearch.aspx',
        date_from=date_from,
        date_to=date_to)

    # Make rules
    self.rules = [
        Rule(LinkExtractor(
            allow=r'Trial/Registration/TrialReview.aspx',
            process_value=lambda value: value.replace('http', 'https', 1),
        ), callback=parse_record),
        Rule(LinkExtractor(
            allow=r'page=\d+',
        )),
    ]

    # Inherit parent
    super(Spider, self).__init__()


# Internal
Example #13
Source File: followall.py From scrapy-bench with MIT License | 5 votes |
def __init__(self, book_url=None, **kw):
    super(FollowAllSpider, self).__init__(**kw)
    url = book_url
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'http://%s/' % url
    self.url = url
    self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
    self.link_extractor = LinkExtractor()
    self.cookies_seen = set()
    self.previtem = 0
    self.items = 0
    self.timesec = datetime.datetime.utcnow()
Example #14
Source File: spider.py From collectors with MIT License | 5 votes |
def __init__(self, conf=None, conn=None, date_from=None, date_to=None):

    # Save conf/conn
    self.conf = conf
    self.conn = conn

    # Make start urls
    self.start_urls = _make_start_urls(
        prefix='http://www.isrctn.com/search',
        date_from=date_from,
        date_to=date_to)

    # Make rules
    self.rules = [
        Rule(LinkExtractor(
            allow=r'ISRCTN\d+',
        ), callback=parse_record),
        Rule(LinkExtractor(
            allow=r'page=\d+',
        )),
    ]

    # Inherit parent
    super(Spider, self).__init__()


# Internal
Example #15
Source File: broadspider.py From scrapy-bench with MIT License | 5 votes |
def __init__(self, **kw):
    super(BroadBenchSpider, self).__init__(**kw)

    self.link_extractor = LinkExtractor()
    self.cookies_seen = set()
    self.previtem = 0
    self.items = 0
    self.timesec = datetime.datetime.utcnow()
    self.start_urls = [
        'http://domain{}:{}/index.html'.format(i, self.port)
        for i in range(1, self.n_domains + 1)]
Example #16
Source File: spiders.py From autologin with Apache License 2.0 | 5 votes |
def __init__(self, url, credentials, *args, **kwargs):
    self.credentials = credentials
    self.start_urls = [url]
    self.link_extractor = LinkExtractor(allow_domains=[get_domain(url)])
    self.found_login = False
    self.found_registration = False
    super(FormSpider, self).__init__(*args, **kwargs)
Example #17
Source File: main.py From python-examples with MIT License | 5 votes |
def parse_item(self, response):
    print('parse item url:', response.url)

    self.test_status('parse_item()', response)

    # The list of items that are found on the particular page
    items = []

    res = Selector(response)

    self.append(self.resp_log_file, str(response))

    # Only extract canonicalized and unique links (with respect to the current page)
    links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
Example #18
Source File: gov.py From openslack-crawler with Apache License 2.0 | 5 votes |
def Layer01_Parse(self, response):
    item = Layer01_Item()
    for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d\.html')).extract_links(response):
        url = i.url
        text = i.text
        item['year'] = url[-12:-8]
        item['name'] = text
        item['code'] = url[-7:-5]
        yield item
        yield Request(url, callback=self.Layer02_Parse)
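The hard-coded slices pull the year and region code straight out of the URL's tail. Any link matching the allow pattern ends in 20\d\d/\d\d.html, so for a hypothetical URL with that tail the slices resolve as follows (the prefix is made up; only the last twelve characters matter):

url = 'http://example.stats.site/tjyqhdmhcxhfdm/2016/65.html'  # hypothetical URL matching the allow pattern
print(url[-12:-8])  # '2016' -> item['year']
print(url[-7:-5])   # '65'   -> item['code']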
Example #19
Source File: gov.py From openslack-crawler with Apache License 2.0 | 5 votes |
def Layer02_Parse(self, response):
    text = response.xpath('/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table')\
        [0].extract()
    item = Layer02_Item()
    item['year'] = re.findall(r'dm/20\d\d', response.url)[0][3:]
    for code, name in re.findall(r'href="\d\d/(\d{4})\.html">([^\d]+?)</a>', text):
        item['name'] = name
        item['code'] = code
        yield item
    for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d{4}\.html')).extract_links(response):
        url = i.url
        text = i.text
        yield Request(url, callback=self.Layer03_Parse)
Example #20
Source File: gov.py From openslack-crawler with Apache License 2.0 | 5 votes |
def Layer03_Parse(self, response):
    text = response.xpath('/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table')\
        [0].extract()
    item = Layer03_Item()
    item['year'] = re.findall(r'dm/20\d\d', response.url)[0][3:]
    for code, name in re.findall(r'href="\d\d/(\d{6})\.html">([^\d]+?)</a>', text):
        item['name'] = name
        item['code'] = code
        yield item
    for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d\d/\d{6}\.html')).extract_links(response):
        url = i.url
        text = i.text
        yield Request(url, callback=self.Layer04_Parse)
Example #21
Source File: gov.py From openslack-crawler with Apache License 2.0 | 5 votes |
def Layer04_Parse(self, response):
    text = response.xpath('/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table')\
        [0].extract()
    item = Layer04_Item()
    item['year'] = re.findall(r'dm/20\d\d', response.url)[0][3:]
    for code, name in re.findall(r'href="\d\d/(\d{9}).html">([^\d]+?)</a>', text):
        item['name'] = name
        item['code'] = code
        yield item
    for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d\d/\d\d/\d{9}\.html')).extract_links(response):
        url = i.url
        text = i.text
        yield Request(url, callback=self.Layer05_Parse)
Example #22
Source File: url.py From hoaxy-backend with GNU General Public License v3.0 | 5 votes |
def __init__(self, domains, urls, *args, **kwargs):
    """Constructor for SiteSpider.

    Parameters
    ----------
    domains : list
        A list of domains for the site.
    urls : list
        A list of sitemap URLs of the site.
    href_xpaths : list
        A list of XPath expressions indicating the ancestors of the `<a>` element.
    url_regex : string
        URL pattern regular expression.

    If you use this spider to store items into a database, additional
    keywords are required:

    platform_id : int
        The id of a platform instance.
    session : object
        An instance of a SQLAlchemy session.
    """
    self.session = kwargs.pop('session', None)
    self.platform_id = kwargs.pop('platform_id', None)
    self.url_regex = kwargs.pop('url_regex', None)
    self.href_xpaths = kwargs.pop('href_xpaths', ())
    self.start_urls = urls
    self.allowed_domains = domains
    self.rules = (Rule(
        LinkExtractor(
            allow_domains=self.allowed_domains,
            restrict_xpaths=self.href_xpaths,
            unique=True),
        callback="parse_item",
        follow=True),)
    super(SiteSpider, self).__init__(*args, **kwargs)
Example #23
Source File: crawlpy_spider.py From crawlpy with MIT License | 4 votes |
def parse(self, response):
    """
    Scrapy parse callback
    """

    # Get current nesting level
    curr_depth = response.meta.get('depth', 1)
    if self.config['login']['enabled']:
        curr_depth = curr_depth - 1  # Do not count the login page as nesting depth

    # Store to disk?
    if self.config['store']['enabled']:
        path = response.url.replace(os.sep, '--')  # Replace directory separator
        path = self.config['store']['path'] + os.sep + path
        with open(path, 'wb') as fpointer:
            fpointer.write(response.body)

    # Yield current url item
    item = CrawlpyItem()
    item['url'] = response.url
    item['status'] = response.status
    item['depth'] = curr_depth
    item['referer'] = response.meta.get('referer', '')
    yield item

    # Get all links from the current page
    links = LinkExtractor().extract_links(response)

    # Iterate all found links and crawl them
    for link in links:
        deny = False

        # Check requests to be ignored
        for ignore in self.config['ignores']:
            if (ignore in link.url) or (ignore.lower() in link.url.lower()):
                # Ignore pattern found, stop looking into other patterns
                deny = True
                break

        # [NO] Max depth exceeded
        if curr_depth >= self.max_depth:
            # str() added here: the depths are integers and cannot be concatenated to strings directly
            logging.info('[Not Crawling] Current depth (' + str(curr_depth) + ') exceeds max depth (' + str(self.max_depth) + ')')
            pass
        # [NO] Duplicate URL
        elif link.url in self.duplicates:
            logging.info('[Not Crawling] Url already crawled: ' + link.url)
            pass
        # [NO] URL denied
        elif deny:
            logging.info('[Not Crawling] Url denied (pattern: "' + ignore + '"): ' + link.url)
            pass
        # [OK] Crawl!
        else:
            self.duplicates.append(link.url)
            yield Request(link.url, meta={'depth': curr_depth + 1, 'referer': response.url})