Python scrapy.utils.project.get_project_settings() Examples
The following are 29 code examples of scrapy.utils.project.get_project_settings(), drawn from open-source projects. The source file, originating project, and license are noted above each example.
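The examples below come from a variety of projects, but most follow the same basic pattern: load the project's settings object and hand it to a crawler process or runner. The sketch below shows that common pattern in isolation; the spider name 'myspider' is a placeholder, and the script is assumed to run from inside a Scrapy project so that get_project_settings() can locate scrapy.cfg (or the SCRAPY_SETTINGS_MODULE environment variable is set).

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project's settings.py into a Settings object.
settings = get_project_settings()

# Individual settings can still be overridden before the crawl starts.
settings.set('LOG_LEVEL', 'INFO')

# CrawlerProcess resolves spider names through the project's spider loader.
process = CrawlerProcess(settings)
process.crawl('myspider')  # placeholder spider name
process.start()            # blocks until the crawl finishes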
Example #1
Source File: run_spider.py From IPProxyTool with MIT License | 8 votes |
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )

    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)
Example #2
Source File: runner.py From In2ItChicago with GNU General Public License v3.0 | 7 votes |
def run():
    config.connect_to_client()
    print('Running event processor...')

    crawlerProcess = CrawlerProcess(get_project_settings())
    settings = project.get_project_settings()
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    spiders = spider_loader.list()
    classes = [s for s in (spider_loader.load(name) for name in spiders
                           if config.spider_name == None or name == config.spider_name)
               if s.enabled]

    crawlerProcess = CrawlerProcess(get_project_settings())
    for spider_class in classes:
        crawlerProcess.crawl(spider_class)

    crawlerProcess.start()
    crawlerProcess.join()

    print('Event processor completed')

    session = HttpUtils.get_session()
    events = session.get(config.get_events, params={})

    if len(events.json()) > 0:
        print('Data retrieved successfully')
    else:
        print('No data retrieved')
Example #3
Source File: spider.py From stock with Apache License 2.0 | 6 votes |
def __init__(self):
    self.spider = HqSpider()
    self.crawler = crawler = Crawler(get_project_settings())
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(self.spider)
    dispatcher.connect(self._dont_close_me, signals.spider_idle)
    self.thread = None
    self._started = False
    self._stopped = False
Example #4
Source File: real_time_analysis.py From jd_analysis with GNU Lesser General Public License v3.0 | 6 votes |
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler = False)
    logging.basicConfig(
        filename = 'log/%s.log' % name,
        format = '%(levelname)s %(asctime)s: %(message)s',
        level = logging.ERROR
    )
    print "get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES']

    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('进入爬虫')
        process.crawl(name, **spargs)
        process.start()
    except Exception, e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e.message))
Example #5
Source File: run_spider.py From jd_analysis with GNU Lesser General Public License v3.0 | 6 votes |
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')
    guid = spargs.get('guid')
    product_id = spargs.get('product_id')

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler = False)
    logging.basicConfig(
        filename = 'log/%s.log' % name,
        format = '%(levelname)s %(asctime)s: %(message)s',
        level = logging.ERROR
    )
    print "get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES']

    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('进入爬虫')
        process.crawl(name, **spargs)
        process.start()
    except Exception, e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e.message))
Example #6
Source File: __init__.py From fooltrader with MIT License | 6 votes |
def crawl(spider, setting):
    process = CrawlerProcess({**get_project_settings(), **setting})
    process.crawl(spider)
    process.start()
Example #7
Source File: parser.py From Gerapy with MIT License | 6 votes |
def get_follow_requests_and_items(project_path, spider_name, args):
    """
    get follows
    :param project_path:
    :param spider_name:
    :param args:
    :return:
    """
    work_cwd = os.getcwd()
    try:
        os.chdir(project_path)
        settings = get_project_settings()
        check_deprecated_settings(settings)
        sp = SpiderParser(settings, spider_name, args)
        results = sp.run()
        return results
    finally:
        os.chdir(work_cwd)
Example #8
Source File: youtube_history_spider.py From Youtube-Watch-History-Scraper with The Unlicense | 6 votes |
def __init__(self, *args, **kwargs):
    super(YoutubeHistorySpider, self).__init__(*args, **kwargs)
    settings = get_project_settings()
    hf = settings.get("CHROME_HEADERS_FILE")
    cj = settings.get("COOKIES_JSON")
    if hf:
        ch = ChromeRequest.from_file(hf)
        self.init_cookies = ch.cookies
    elif cj:
        with open(cj, 'r') as fh:
            cookies = parse_cookies(fh.read())
        self.init_cookies = cookies
    if not hasattr(self, "init_cookies"):
        raise ValueError("Need to specify 'CHROME_HEADERS_FILE' " +
                         "or 'COOKIES_JSON' in settings.")
Example #9
Source File: utils.py From legco-watch with MIT License | 5 votes |
def list_spiders():
    settings = get_project_settings()
    crawler = Crawler(settings)
    return crawler.spiders.list()
Example #10
Source File: asus_spider.py From uefi-spider with MIT License | 5 votes |
def _get_uas(self):
    ### Edit user agent
    settings = get_project_settings()
    return " ".join([
        settings.get("USER_AGENT"),
        ### The ASP.NET application is checking for async-compatible browsers.
        "Mozilla/5.0 (Windows NT 6.1; WOW64)"
        #"AppleWebKit/537.36 (KHTML, like Gecko)",
        #"Chrome/34.0.1847.116",
        #"Safari/537.36",
    ])
    pass
Example #11
Source File: scrapyctl.py From kmanga with GNU General Public License v3.0 | 5 votes |
def _get_settings(self):
    """Return the current scrapy settings."""
    if 'SCRAPY_SETTINGS_MODULE' not in os.environ:
        _s = settings.SCRAPY_SETTINGS_MODULE
        os.environ['SCRAPY_SETTINGS_MODULE'] = _s
    return get_project_settings()
Example #12
Source File: vmgirl.py From capturer with MIT License | 5 votes |
def __init__(self):
    settings = get_project_settings()
    self.user_data_dir = settings.get('USER_DATA_DIR')
Example #13
Source File: UserBoardsSpider.py From capturer with MIT License | 5 votes |
def __init__(self):
    settings = get_project_settings()
    self.username = settings.get('USERNAME')
    self.hostname = 'http://huaban.com'
    self.start_urls = ['{0}/{1}/'.format(self.hostname, self.username)]
Example #14
Source File: cli.py From StrepHit with GNU General Public License v3.0 | 5 votes |
def crawl(spider_name, results_dir):
    """ Run one or more spiders """
    settings = get_project_settings()
    # prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)

    process = CrawlerProcess(settings)
    for s in spider_name:
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')

        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start()
Example #15
Source File: models.py From scrapy-tutorial with MIT License | 5 votes |
def db_connect():
    """
    Performs database connection using database settings from settings.py.
    Returns sqlalchemy engine instance
    """
    return create_engine(get_project_settings().get("CONNECTION_STRING"))
Example #16
Source File: cli.py From PyFeeds with GNU Affero General Public License v3.0 | 5 votes |
def list():
    """List all available spiders."""
    settings = get_project_settings()
    settings["LOG_ENABLED"] = False
    process = CrawlerProcess(settings)
    for s in sorted(process.spider_loader.list()):
        print(s)
Example #17
Source File: parser.py From Gerapy with MIT License | 5 votes |
def get_start_requests(project_path, spider_name):
    """
    get start requests
    :param project_path: project path
    :param spider_name: spider name
    :return:
    """
    work_cwd = os.getcwd()
    try:
        # change work dir
        os.chdir(project_path)
        # load settings
        settings = get_project_settings()
        check_deprecated_settings(settings)
        runner = CrawlerRunner(settings=settings)
        # add crawler
        spider_cls = runner.spider_loader.load(spider_name)
        runner.crawl(spider_cls)
        # get crawler
        crawler = list(runner.crawlers)[0]
        # get spider by crawler
        spider = crawler.spider
        # get start requests
        requests = list(spider.start_requests())
        if not requests and hasattr(spider, 'start'):
            requests = list(spider.start())
        requests = list(map(lambda r: process_request(r), requests))
        return {'finished': True, 'requests': requests}
    finally:
        os.chdir(work_cwd)
Example #18
Source File: crawler.py From fp-server with MIT License | 5 votes |
def init_crawler_runner():
    crochet.setup()
    init_scrapy_env()
    settings = get_project_settings()
    global CRAWLER_RUNNER
    CRAWLER_RUNNER = CrawlerRunner(settings)
    logger.info('Initialized crawler runner: %s' % CRAWLER_RUNNER)


# TODO: move these to config file?
Example #19
Source File: run.py From openslack-crawler with Apache License 2.0 | 5 votes |
def run_spider(spider, *args):
    print spider
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl(spider, 0, 0, 0)
    process.start()
Example #20
Source File: run.py From openslack-crawler with Apache License 2.0 | 5 votes |
def run_spider2(spider, *args):
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    runner.crawl(spider, *args)
    runner.crawl(spider, *args)
    d = runner.join()
    # d = runner.crawl(spider, *args)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until all crawling jobs are finished
Example #21
Source File: feedexport.py From learn_python3_spider with MIT License | 5 votes |
def __init__(self, uri, access_key=None, secret_key=None, acl=None):
    # BEGIN Backward compatibility for initialising without keys (and
    # without using from_crawler)
    no_defaults = access_key is None and secret_key is None
    if no_defaults:
        from scrapy.utils.project import get_project_settings
        settings = get_project_settings()
        if 'AWS_ACCESS_KEY_ID' in settings or 'AWS_SECRET_ACCESS_KEY' in settings:
            import warnings
            from scrapy.exceptions import ScrapyDeprecationWarning
            warnings.warn(
                "Initialising `scrapy.extensions.feedexport.S3FeedStorage` "
                "without AWS keys is deprecated. Please supply credentials or "
                "use the `from_crawler()` constructor.",
                category=ScrapyDeprecationWarning,
                stacklevel=2
            )
            access_key = settings['AWS_ACCESS_KEY_ID']
            secret_key = settings['AWS_SECRET_ACCESS_KEY']
    # END Backward compatibility
    u = urlparse(uri)
    self.bucketname = u.hostname
    self.access_key = u.username or access_key
    self.secret_key = u.password or secret_key
    self.is_botocore = is_botocore()
    self.keyname = u.path[1:]  # remove first "/"
    self.acl = acl
    if self.is_botocore:
        import botocore.session
        session = botocore.session.get_session()
        self.s3_client = session.create_client(
            's3', aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key)
    else:
        import boto
        self.connect_s3 = boto.connect_s3
Example #22
Source File: online.py From scrapy-cluster with MIT License | 5 votes |
def setUp(self):
    self.settings = get_project_settings()
    self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")
    # set up redis
    self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                  port=self.settings['REDIS_PORT'],
                                  db=self.settings['REDIS_DB'])
    try:
        self.redis_conn.info()
    except ConnectionError:
        print("Could not connect to Redis")
        # plugin is essential to functionality
        sys.exit(1)

    # clear out older test keys if any
    keys = self.redis_conn.keys("test-spider:*")
    for key in keys:
        self.redis_conn.delete(key)

    # set up kafka to consume potential results
    self.consumer = KafkaConsumer(
        "demo_test.crawled_firehose",
        bootstrap_servers=self.settings['KAFKA_HOSTS'],
        group_id="demo-id",
        auto_commit_interval_ms=10,
        consumer_timeout_ms=5000,
        auto_offset_reset='earliest'
    )
    time.sleep(1)
Example #23
Source File: full_analysis.py From jd_analysis with GNU Lesser General Public License v3.0 | 5 votes |
def runspider(self):
    configure_logging(install_root_handler = False)
    s = get_project_settings()
    runner = CrawlerRunner(settings = s)

    @defer.inlineCallbacks
    def crawl(**spargs):
        yield runner.crawl(JDItemInfoSpider, **spargs)
        yield runner.crawl(JDCommentSpider, **spargs)
        reactor.stop()

    crawl(**self.spargs)
    reactor.run()  # the script will block here until the last crawl call is finished

# schedule the analysis
Example #24
Source File: crawler.py From Sitadel with GNU General Public License v3.0 | 5 votes |
def crawl(url, user_agent):
    try:
        output = Services.get("output")

        # Settings for the crawler
        settings = get_project_settings()
        settings.set("USER_AGENT", user_agent)
        settings.set("LOG_LEVEL", "CRITICAL")
        settings.set("RETRY_ENABLED", False)
        settings.set("CONCURRENT_REQUESTS", 15)

        # Create the process that will perform the crawl
        output.info("Start crawling the target website")
        process = CrawlerProcess(settings)
        allowed_domains.append(str(urlparse(url).hostname))
        process.crawl(
            SitadelSpider, start_urls=[str(url)], allowed_domains=allowed_domains
        )
        process.start()

        # Clean the results
        clean_urls = []
        for u in urls:
            try:
                new_url = urlparse(u).geturl()
                clean_urls.append(new_url)
            except ValueError:
                continue
        return clean_urls

    except KeyboardInterrupt:
        process.stop()
        raise
Example #25
Source File: run.py From PythonScrapyBasicSetup with MIT License | 5 votes |
def run():
    configure_logging()
    # importing project settings for further usage
    # mainly because of the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # running spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run()  # block until the last call
Example #26
Source File: proxy.py From PythonScrapyBasicSetup with MIT License | 5 votes |
def import_settings(self):
    settings = get_project_settings()
    self.password = settings['AUTH_PASSWORD']
    self.http_proxy = settings['HTTP_PROXY']
    self.control_port = settings['CONTROL_PORT']
    self.max_req_per_ip = settings['MAX_REQ_PER_IP']
    self.exit_nodes = settings['EXIT_NODES']
    if self.exit_nodes:
        with Controller.from_port(port=self.control_port) as controller:
            controller.authenticate(self.password)
            controller.set_conf('ExitNodes', self.exit_nodes)
            controller.close()
Example #27
Source File: spiders.py From daywatch with MIT License | 5 votes |
def run_spider_instance(spider_class, site_id, main_url):
    """Run a spider given its spider class. For example, importing the
    TestSpider and passing it to this function will run it."""
    spider = spider_class(site_id=site_id, main_url=main_url)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()

    # Scrapy uses a deprecated Twisted interface. Until the fix makes it to a
    # new version (>0.24.4), we'll use this so deprecation warnings don't
    # clutter the output
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    crawler.crawl(spider)
    crawler.start()
    reactor.run()
Example #28
Source File: cmdline.py From learn_python3_spider with MIT License | 5 votes |
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Example #29
Source File: utils.py From scrapy-autounit with BSD 3-Clause "New" or "Revised" License | 5 votes |
def prepare_callback_replay(fixture_path, encoding="utf-8"):
    with open(str(fixture_path), 'rb') as f:
        raw_data = f.read()

    fixture_info = unpickle_data(decompress_data(raw_data), encoding)
    if 'fixture_version' in fixture_info:
        encoding = fixture_info['encoding']
        data = unpickle_data(fixture_info['data'], encoding)
    else:
        data = fixture_info  # legacy tests

    settings = get_project_settings()

    spider_name = data.get('spider_name')
    if not spider_name:
        # legacy tests
        spider_name = os.path.basename(
            os.path.dirname(
                os.path.dirname(fixture_path)
            )
        )

    spider_cls = get_spider_class(spider_name, settings)
    spider_cls.update_settings(settings)

    for k, v in data.get('settings', {}).items():
        settings.set(k, v, 50)

    crawler = Crawler(spider_cls, settings)

    spider_args_in = data.get('spider_args', data.get('spider_args_in', {}))
    spider = spider_cls.from_crawler(crawler, **spider_args_in)
    crawler.spider = spider

    return data, crawler, spider, settings