Python scrapy.spiders.Rule() Examples
The following are 3 code examples of scrapy.spiders.Rule(), drawn from open-source projects. The source file and license for each example are noted above the snippet.
You may also want to check out all available functions and classes of the module scrapy.spiders.
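
As background for the examples below: Rule is used together with CrawlSpider and LinkExtractor to declare which links a spider follows and which callback parses the matched pages. Here is a minimal sketch of that pattern (the spider name, domain, and URL pattern are placeholder assumptions, not taken from the projects below):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ExampleSpider(CrawlSpider):
    # Hypothetical spider: name, domain, and start URL are placeholders.
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    # Each Rule pairs a LinkExtractor with a callback; follow=True keeps
    # extracting links from every page the rule matches.
    rules = (
        Rule(LinkExtractor(allow=(r'/articles/\d+$',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Yield the page title for each matched URL.
        yield {'url': response.url,
               'title': response.css('title::text').get()}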
Example #1
Source File: RedScanner.py From hack4career with Apache License 2.0
def parse_body(self, response):
    global crawl_counter
    try:
        crawl_counter += 1
        if crawl_counter > max_crawl_limit:
            os._exit(0)  # hard-stop the process once the crawl budget is spent
        # `rules` is a compiled YARA ruleset; scan the raw response body.
        matches = rules.match(data=response.body)
        if matches:
            for match in matches:
                print("[+] URL:", response.request.url, "Matched YARA Rule:", match)
                if debug:
                    print("[*] Matched body response:", response.body.decode("utf-8"))
                txt = "URL: " + response.request.url + " Matched YARA Rule: " + str(match)
                log(txt)
    except Exception as e:
        if debug:
            print(str(e))
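
The snippet relies on module-level globals defined elsewhere in RedScanner.py. Roughly, the setup would look like this (an assumption for illustration, not the project's exact code):

import os
import yara

debug = False
crawl_counter = 0
max_crawl_limit = 100  # assumed crawl budget
# Compiled YARA ruleset; the rule file path is a placeholder.
rules = yara.compile(filepath='rules.yar')

def log(text):
    # Assumed helper: append one line to a plain-text log.
    with open('scan.log', 'a') as f:
        f.write(text + '\n')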
Example #2
Source File: samakal.py From corpus-builder with MIT License
def request_index(self, response):
    categories = list(set(response.css('#topMenuItem a::attr("href")').re(r'/([^\/]+)/$')))

    if self.category is not None:
        if self.category in categories:
            categories = [self.category]
        else:
            raise ValueError('invalid category slug. available slugs: %s' % ", ".join(categories))

    date_processing = self.start_date
    while date_processing <= self.end_date:
        for category in categories:
            # Redefining the rule again according to the specific date URL;
            # CrawlSpider compiles its rules only once, at init time.
            SamakalSpider.rules = (Rule(
                LinkExtractor(
                    allow=('/' + date_processing.strftime('%Y/%m/%d') + r'/\d+$',),
                    restrict_xpaths=('//div[@class="main-body"]')),
                callback="parse_content", follow=True),)
            super(SamakalSpider, self)._compile_rules()
            # http://bangla.samakal.net/-education/2016/06/01
            url = 'http://bangla.samakal.net/{0}/{1}'.format(
                category,
                date_processing.strftime('%Y/%m/%d')
            )
            yield self.make_requests_from_url(url)
        date_processing += datetime.timedelta(days=1)
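
One caveat worth noting: make_requests_from_url() was deprecated in Scrapy 1.4 and removed in later versions, so on a current Scrapy release the loop would instead yield scrapy.Request(url) with the desired callback.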
Example #3
Source File: url.py From hoaxy-backend with GNU General Public License v3.0
def __init__(self, domains, urls, *args, **kwargs):
    """Constructor for SiteSpider.

    Parameters
    ----------
    domains : list
        A list of domains for the site.
    urls : list
        A list of sitemap URLs of the site.
    href_xpaths : list
        A list of XPath expressions indicating the ancestors of the
        `<a>` element.
    url_regex : string
        URL pattern regular expression.

    If you use this spider to store items into a database, additional
    keywords are required:

    platform_id : int
        The id of a platform instance.
    session : object
        An instance of SQLAlchemy session.
    """
    self.session = kwargs.pop('session', None)
    self.platform_id = kwargs.pop('platform_id', None)
    self.url_regex = kwargs.pop('url_regex', None)
    self.href_xpaths = kwargs.pop('href_xpaths', ())
    self.start_urls = urls
    self.allowed_domains = domains
    # Follow every in-domain link under the given XPath containers and
    # hand each matched page to parse_item.
    self.rules = (Rule(
        LinkExtractor(
            allow_domains=self.allowed_domains,
            restrict_xpaths=self.href_xpaths,
            unique=True),
        callback="parse_item",
        follow=True),)
    super(SiteSpider, self).__init__(*args, **kwargs)
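
Because the rules are built from constructor arguments, such a spider is naturally launched with per-site keywords. A sketch of how that might look (the domain, URL, and XPath values are placeholder assumptions, not from hoaxy-backend):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
# Keyword arguments are forwarded to SiteSpider.__init__.
process.crawl(SiteSpider,
              domains=['example.com'],
              urls=['http://example.com/'],
              href_xpaths=('//div[@id="content"]',),
              url_regex=r'/story/\d+')
process.start()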