Python scrapy.utils.project.get_project_settings() Examples

The following are code examples of scrapy.utils.project.get_project_settings(), collected from open-source projects. Each example notes the source file and the project it was taken from. You may also want to check out the other functions and classes available in the scrapy.utils.project module.
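In short, get_project_settings() builds a Settings object from the active project's settings module (located through the SCRAPY_SETTINGS_MODULE environment variable, which Scrapy normally derives from scrapy.cfg), and that object is typically handed to CrawlerProcess or CrawlerRunner when spiders are run from a script. A minimal sketch of that pattern follows; the spider name 'example' is a placeholder, not part of any project listed below.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the settings of the enclosing Scrapy project
settings = get_project_settings()

# Settings can be read or overridden before the crawl starts
settings.set('LOG_LEVEL', 'INFO')

# Run a project spider by name ('example' is a placeholder)
process = CrawlerProcess(settings)
process.crawl('example')
process.start()  # blocks until the crawl is finished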
Example #1
Source File: run_spider.py    From IPProxyTool with MIT License
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name) 
Example #2
Source File: runner.py    From In2ItChicago with GNU General Public License v3.0
def run():
    config.connect_to_client()

    print('Running event processor...')

    settings = get_project_settings()
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    spiders = spider_loader.list()
    classes = [
        s for s in (spider_loader.load(name) for name in spiders
                    if config.spider_name is None or name == config.spider_name)
        if s.enabled
    ]

    crawlerProcess = CrawlerProcess(settings)

    for spider_class in classes:
        crawlerProcess.crawl(spider_class)

    crawlerProcess.start()
    crawlerProcess.join()

    print('Event processor completed')

    session = HttpUtils.get_session()
    events = session.get(config.get_events, params = {})

    if len(events.json()) > 0:
        print('Data retrieved successfully')
    else:
        print('No data retrieved') 
Example #3
Source File: spider.py    From stock with Apache License 2.0
def __init__(self):
    self.spider = HqSpider()
    self.crawler = crawler = Crawler(get_project_settings())
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(self.spider)
    dispatcher.connect(self._dont_close_me, signals.spider_idle)
    self.thread = None
    self._started = False
    self._stopped = False 
Example #4
Source File: real_time_analysis.py    From jd_analysis with GNU Lesser General Public License v3.0
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler = False)
    logging.basicConfig(
            filename = 'log/%s.log' % name,
            format = '%(levelname)s %(asctime)s: %(message)s',
            level = logging.ERROR
    )
    print("get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES'])
    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('entering spider')
        process.crawl(name, **spargs)
        process.start()
    except Exception as e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e))
Example #5
Source File: run_spider.py    From jd_analysis with GNU Lesser General Public License v3.0
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')
    guid = spargs.get('guid')
    product_id = spargs.get('product_id')

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler = False)
    logging.basicConfig(
            filename = 'log/%s.log' % name,
            format = '%(levelname)s %(asctime)s: %(message)s',
            level = logging.ERROR
    )
    print("get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES'])
    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('entering spider')
        process.crawl(name, **spargs)
        process.start()
    except Exception as e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e))
Example #6
Source File: __init__.py    From fooltrader with MIT License
def crawl(spider, setting):
    process = CrawlerProcess({**get_project_settings(), **setting})
    process.crawl(spider)
    process.start() 
Example #7
Source File: parser.py    From Gerapy with MIT License
def get_follow_requests_and_items(project_path, spider_name, args):
    """
    get follows
    :param project_path:
    :param spider_name:
    :param args:
    :return:
    """
    work_cwd = os.getcwd()
    try:
        os.chdir(project_path)
        settings = get_project_settings()
        check_deprecated_settings(settings)
        sp = SpiderParser(settings, spider_name, args)
        results = sp.run()
        return results
    finally:
        os.chdir(work_cwd) 
Example #8
Source File: youtube_history_spider.py    From Youtube-Watch-History-Scraper with The Unlicense
def __init__(self, *args, **kwargs):
        super(YoutubeHistorySpider, self).__init__(*args, **kwargs)
        settings = get_project_settings()
        hf = settings.get("CHROME_HEADERS_FILE")
        cj = settings.get("COOKIES_JSON")
        if hf:
            ch = ChromeRequest.from_file(hf)
            self.init_cookies = ch.cookies
        elif cj:
            with open(cj, 'r') as fh:
                cookies = parse_cookies(fh.read())
                self.init_cookies = cookies

        if not hasattr(self, "init_cookies"):
            raise ValueError("Need to specify 'CHROME_HEADERS_FILE' "+
                             "or 'COOKIES_JSON' in settings.") 
Example #9
Source File: utils.py    From legco-watch with MIT License
def list_spiders():
    settings = get_project_settings()
    crawler = Crawler(settings)
    return crawler.spiders.list() 
Example #10
Source File: asus_spider.py    From uefi-spider with MIT License
def _get_uas(self):
        ### Edit user agent
        settings = get_project_settings()
        return " ".join([
            settings.get("USER_AGENT"),
            ### The ASP.NET application is checking for async-compatible browsers.
            "Mozilla/5.0 (Windows NT 6.1; WOW64)"
            #"AppleWebKit/537.36 (KHTML, like Gecko)",
            #"Chrome/34.0.1847.116",
            #"Safari/537.36",
        ])
Example #11
Source File: scrapyctl.py    From kmanga with GNU General Public License v3.0
def _get_settings(self):
        """Return the current scrapy settings."""
        if 'SCRAPY_SETTINGS_MODULE' not in os.environ:
            _s = settings.SCRAPY_SETTINGS_MODULE
            os.environ['SCRAPY_SETTINGS_MODULE'] = _s
        return get_project_settings() 
Example #12
Source File: vmgirl.py    From capturer with MIT License
def __init__(self):
        settings = get_project_settings()
        self.user_data_dir = settings.get('USER_DATA_DIR') 
Example #13
Source File: UserBoardsSpider.py    From capturer with MIT License
def __init__(self):
        settings = get_project_settings()
        self.username = settings.get('USERNAME')
        self.hostname = 'http://huaban.com'
        self.start_urls = ['{0}/{1}/'.format(self.hostname, self.username)] 
Example #14
Source File: cli.py    From StrepHit with GNU General Public License v3.0
def crawl(spider_name, results_dir):
    """ Run one or more spiders """
    settings = get_project_settings()
    # prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)

    process = CrawlerProcess(settings)
    for s in spider_name:
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')
        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start() 
Example #15
Source File: models.py    From scrapy-tutorial with MIT License
def db_connect():
    """
    Performs database connection using database settings from settings.py.
    Returns sqlalchemy engine instance
    """
    return create_engine(get_project_settings().get("CONNECTION_STRING")) 
Example #16
Source File: cli.py    From PyFeeds with GNU Affero General Public License v3.0
def list():
    """List all available spiders."""
    settings = get_project_settings()
    settings["LOG_ENABLED"] = False
    process = CrawlerProcess(settings)
    for s in sorted(process.spider_loader.list()):
        print(s) 
Example #17
Source File: parser.py    From Gerapy with MIT License
def get_start_requests(project_path, spider_name):
    """
    get start requests
    :param project_path: project path
    :param spider_name: spider name
    :return:
    """
    work_cwd = os.getcwd()
    try:
        # change work dir
        os.chdir(project_path)
        # load settings
        settings = get_project_settings()
        check_deprecated_settings(settings)
        runner = CrawlerRunner(settings=settings)
        # add crawler
        spider_cls = runner.spider_loader.load(spider_name)
        runner.crawl(spider_cls)
        # get crawler
        crawler = list(runner.crawlers)[0]
        # get spider by crawler
        spider = crawler.spider
        # get start requests
        requests = list(spider.start_requests())
        if not requests and hasattr(spider, 'start'):
            requests = list(spider.start())
        requests = list(map(lambda r: process_request(r), requests))
        return {'finished': True, 'requests': requests}
    finally:
        os.chdir(work_cwd) 
Example #18
Source File: crawler.py    From fp-server with MIT License
def init_crawler_runner():
    crochet.setup()
    init_scrapy_env()
    settings = get_project_settings()
    global CRAWLER_RUNNER
    CRAWLER_RUNNER = CrawlerRunner(settings)
    logger.info('Initialized crawler runner: %s' % CRAWLER_RUNNER)


# TODO: move these to config file? 
Example #19
Source File: run.py    From openslack-crawler with Apache License 2.0
def run_spider(spider, *args):
    print(spider)
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl(spider, 0, 0, 0)
    process.start() 
Example #20
Source File: run.py    From openslack-crawler with Apache License 2.0
def run_spider2(spider, *args):
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    runner.crawl(spider, *args)
    runner.crawl(spider, *args)
    d = runner.join()
    # d = runner.crawl(spider, *args)
    d.addBoth(lambda _: reactor.stop())

    reactor.run()  # the script will block here until all crawling jobs are finished 
Example #21
Source File: feedexport.py    From learn_python3_spider with MIT License
def __init__(self, uri, access_key=None, secret_key=None, acl=None):
        # BEGIN Backward compatibility for initialising without keys (and
        # without using from_crawler)
        no_defaults = access_key is None and secret_key is None
        if no_defaults:
            from scrapy.utils.project import get_project_settings
            settings = get_project_settings()
            if 'AWS_ACCESS_KEY_ID' in settings or 'AWS_SECRET_ACCESS_KEY' in settings:
                import warnings
                from scrapy.exceptions import ScrapyDeprecationWarning
                warnings.warn(
                    "Initialising `scrapy.extensions.feedexport.S3FeedStorage` "
                    "without AWS keys is deprecated. Please supply credentials or "
                    "use the `from_crawler()` constructor.",
                    category=ScrapyDeprecationWarning,
                    stacklevel=2
                )
                access_key = settings['AWS_ACCESS_KEY_ID']
                secret_key = settings['AWS_SECRET_ACCESS_KEY']
        # END Backward compatibility
        u = urlparse(uri)
        self.bucketname = u.hostname
        self.access_key = u.username or access_key
        self.secret_key = u.password or secret_key
        self.is_botocore = is_botocore()
        self.keyname = u.path[1:]  # remove first "/"
        self.acl = acl
        if self.is_botocore:
            import botocore.session
            session = botocore.session.get_session()
            self.s3_client = session.create_client(
                's3', aws_access_key_id=self.access_key,
                aws_secret_access_key=self.secret_key)
        else:
            import boto
            self.connect_s3 = boto.connect_s3 
Example #22
Source File: online.py    From scrapy-cluster with MIT License
def setUp(self):
        self.settings = get_project_settings()
        self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")
        # set up redis
        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'],
                                      db=self.settings['REDIS_DB'])
        try:
            self.redis_conn.info()
        except ConnectionError:
            print("Could not connect to Redis")
            # plugin is essential to functionality
            sys.exit(1)

        # clear out older test keys if any
        keys = self.redis_conn.keys("test-spider:*")
        for key in keys:
            self.redis_conn.delete(key)

        # set up kafka consumer to read potential results
        self.consumer = KafkaConsumer(
            "demo_test.crawled_firehose",
            bootstrap_servers=self.settings['KAFKA_HOSTS'],
            group_id="demo-id",
            auto_commit_interval_ms=10,
            consumer_timeout_ms=5000,
            auto_offset_reset='earliest'
        )
        time.sleep(1) 
Example #23
Source File: full_analysis.py    From jd_analysis with GNU Lesser General Public License v3.0
def runspider(self):
        configure_logging(install_root_handler = False)
        s = get_project_settings()
        runner = CrawlerRunner(settings = s)

        @defer.inlineCallbacks
        def crawl(**spargs):
            yield runner.crawl(JDItemInfoSpider, **spargs)
            yield runner.crawl(JDCommentSpider, **spargs)
            reactor.stop()

        crawl(**self.spargs)
        reactor.run()  # the script will block here until the last crawl call is finished

    # schedule the analysis
Example #24
Source File: crawler.py    From Sitadel with GNU General Public License v3.0
def crawl(url, user_agent):
    try:
        output = Services.get("output")

        # Settings for the crawler
        settings = get_project_settings()
        settings.set("USER_AGENT", user_agent)
        settings.set("LOG_LEVEL", "CRITICAL")
        settings.set("RETRY_ENABLED", False)
        settings.set("CONCURRENT_REQUESTS", 15)

        # Create the process that will perform the crawl
        output.info("Start crawling the target website")
        process = CrawlerProcess(settings)
        allowed_domains.append(str(urlparse(url).hostname))
        process.crawl(
            SitadelSpider, start_urls=[str(url)], allowed_domains=allowed_domains
        )
        process.start()

        # Clean the results
        clean_urls = []
        for u in urls:
            try:
                new_url = urlparse(u).geturl()
                clean_urls.append(new_url)
            except ValueError:
                continue
        return clean_urls

    except KeyboardInterrupt:
        process.stop()
        raise 
Example #25
Source File: run.py    From PythonScrapyBasicSetup with MIT License
def run():
    configure_logging()
    # importing project settings for further usage
    # mainly because of the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # running spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run() # block until the last call 
Example #26
Source File: proxy.py    From PythonScrapyBasicSetup with MIT License
def import_settings(self):
        settings = get_project_settings()
        self.password = settings['AUTH_PASSWORD']
        self.http_proxy = settings['HTTP_PROXY']
        self.control_port = settings['CONTROL_PORT']
        self.max_req_per_ip = settings['MAX_REQ_PER_IP']

        self.exit_nodes = settings['EXIT_NODES']
        if self.exit_nodes:
            with Controller.from_port(port=self.control_port) as controller:
                controller.authenticate(self.password)
                controller.set_conf('ExitNodes', self.exit_nodes)
                controller.close() 
Example #27
Source File: spiders.py    From daywatch with MIT License
def run_spider_instance(spider_class, site_id, main_url):
    """Run a spider given its spider class. For example, importing the TestSpider
and passing it to this function will run it."""
    spider = spider_class(site_id=site_id, main_url=main_url)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # Scrapy uses a deprecated Twisted interface. Until the fix makes it to a
    # new version (>0.24.4), we'll use this so deprecation warnings don't
    # clutter the output
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    crawler.crawl(spider)
    crawler.start()
    reactor.run() 
Example #28
Source File: cmdline.py    From learn_python3_spider with MIT License
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode) 
Example #29
Source File: utils.py    From scrapy-autounit with BSD 3-Clause "New" or "Revised" License
def prepare_callback_replay(fixture_path, encoding="utf-8"):
    with open(str(fixture_path), 'rb') as f:
        raw_data = f.read()

    fixture_info = unpickle_data(decompress_data(raw_data), encoding)
    if 'fixture_version' in fixture_info:
        encoding = fixture_info['encoding']
        data = unpickle_data(fixture_info['data'], encoding)
    else:
        data = fixture_info  # legacy tests

    settings = get_project_settings()

    spider_name = data.get('spider_name')
    if not spider_name:  # legacy tests
        spider_name = os.path.basename(
            os.path.dirname(
                os.path.dirname(fixture_path)
            )
        )

    spider_cls = get_spider_class(spider_name, settings)
    spider_cls.update_settings(settings)

    for k, v in data.get('settings', {}).items():
        settings.set(k, v, 50)

    crawler = Crawler(spider_cls, settings)
    spider_args_in = data.get('spider_args', data.get('spider_args_in', {}))
    spider = spider_cls.from_crawler(crawler, **spider_args_in)
    crawler.spider = spider

    return data, crawler, spider, settings