Python scrapy.conf.settings() Examples
The following are 14 code examples of scrapy.conf.settings(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.conf, or try the search function.
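Every snippet below reads the module-level settings object, usually imported as from scrapy.conf import settings; that import is a legacy path, and current Scrapy code receives settings through the crawler instead (as the commented-out block in Example #14 also shows). A minimal sketch of both styles, with the class and setting names chosen only for illustration:

# Legacy style used throughout this page (works on old Scrapy versions
# that still ship the scrapy.conf module):
from scrapy.conf import settings

class LegacyMongoPipeline(object):          # illustrative name
    def __init__(self):
        self.host = settings['MONGODB_HOST']

# Current style: receive the settings through from_crawler instead.
class MongoPipeline(object):                # illustrative name
    def __init__(self, host):
        self.host = host

    @classmethod
    def from_crawler(cls, crawler):
        return cls(host=crawler.settings.get('MONGODB_HOST'))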
Example #1
Source File: pipelines.py From crawler_examples with Apache License 2.0 | 5 votes |
def __init__(self):
    host = settings['MONGODB_HOST']
    port = settings['MONGODB_PORT']
    name = settings['MONGODB_DBNAME']
    client = MongoClient(host=host, port=port)
    db = client[name]
    self.col = db[settings['MONGODB_DOCNAME']]
Example #2
Source File: pipelines.py From SourceCodeOfBook with MIT License | 5 votes |
def __init__(self):
    host = settings['MONGODB_HOST']
    port = settings['MONGODB_PORT']
    db_name = settings['MONGODB_DBNAME']
    client = pymongo.MongoClient(host=host, port=port)
    db = client[db_name]
    self.post = db[settings['MONGODB_DOCNAME']]
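The pipelines in Examples #1 and #2 only work if the project's settings.py defines the keys they look up. A sketch of such a configuration, with placeholder values that are not taken from either project:

# settings.py -- values are illustrative placeholders
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'scrapy_db'
MONGODB_DOCNAME = 'items'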
Example #3
Source File: pipelines.py From SourceCodeOfBook with MIT License | 5 votes |
def __init__(self):
    self.db = pymongo.MongoClient()[settings['MONGODB_DB']]
    self.handler = None
Example #4
Source File: pipelines.py From SourceCodeOfBook with MIT License | 5 votes |
def process_error(self, item):
    if not self.handler:
        self.handler = self.db[settings['MONGODB_ERROR']]
    self.handler.insert_one(dict(item))
Example #5
Source File: middlewares.py From SourceCodeOfBook with MIT License | 5 votes |
def process_request(self, request, spider):
    proxy = random.choice(settings['PROXIES'])
    request.meta['proxy'] = proxy
Example #6
Source File: middlewares.py From SourceCodeOfBook with MIT License | 5 votes |
def process_request(self, request, spider):
    ua = random.choice(settings['USER_AGENT_LIST'])
    request.headers['User-Agent'] = ua
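Downloader middleware like Examples #5 and #6 only runs after it is enabled in the project settings. A sketch, assuming the classes live in middlewares.py of a project package called myproject and are named ProxyMiddleware and UAMiddleware (all of these names are assumptions, not taken from the original repository):

# settings.py -- module path, class names and priorities are illustrative
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyMiddleware': 543,
    'myproject.middlewares.UAMiddleware': 544,
}
PROXIES = ['http://127.0.0.1:8888']           # read by Example #5
USER_AGENT_LIST = ['Mozilla/5.0 (example)']   # read by Example #6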
Example #7
Source File: pipelines.py From LotteryTicket with MIT License | 5 votes |
def __init__(self):
    self.server = settings['MONGODB_SERVER']
    self.port = settings['MONGODB_PORT']
    self.db = settings['MONGODB_DB']
    self.col = settings['MONGODB_COLLECTION']
    connection = pymongo.Connection(self.server, self.port)
    db = connection[self.db]
    self.collection = db[self.col]
Example #8
Source File: pipelines.py From NewsCrawler with MIT License | 5 votes |
def __init__(self):
    conn = pymongo.Connection(
        settings['MONGO_CONF']['host'],
        settings['MONGO_CONF']['port']
    )
    db = conn[settings['MONGO_CONF']['db']]
    self.news_collection = db[settings['MONGO_CONF']['collection']]
Example #9
Source File: pipelines.py From NewsCrawler with MIT License | 5 votes |
def __init__(self):
    conn = pymongo.Connection(
        settings['MONGO_CONF']['host'],
        settings['MONGO_CONF']['port']
    )
    db = conn[settings['MONGO_CONF']['db']]
    self.subscription_collection = db[settings['MONGO_CONF']['subscription_collection']]
Example #10
Source File: pipelines.py From Wenshu_Spider with MIT License | 5 votes |
def __init__(self):
    host = settings['MONGODB_HOST']
    port = settings['MONGODB_PORT']
    dbname = settings['MONGODB_DBNAME']
    docname = settings['MONGODB_DOCNAME']
    self.client = pymongo.MongoClient(host=host, port=port)
    db = self.client[dbname]
    db[docname].ensure_index('casedocid', unique=True)  # Make the document ID a unique index to avoid inserting duplicate data
    self.post = db[docname]
Example #11
Source File: pipelines.py From openslack-crawler with Apache License 2.0 | 5 votes |
def __init__(self):
    import pymongo
    connection = pymongo.Connection(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
    self.db = connection[settings['MONGODB_DB']]
    self.collection = self.db[settings['MONGODB_COLLECTION']]
    if self.__get_uniq_key() is not None:
        self.collection.create_index(self.__get_uniq_key(), unique=True)
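A compatibility note: pymongo.Connection, used in Examples #7, #8, #9 and #11, was removed in PyMongo 3.0. On a current PyMongo the same setup would use MongoClient, roughly as in this sketch (not part of the original project):

import pymongo
from scrapy.conf import settings  # legacy import, matching the examples above

# MongoClient replaces the removed Connection class in PyMongo 3+
connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
db = connection[settings['MONGODB_DB']]
collection = db[settings['MONGODB_COLLECTION']]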
Example #12
Source File: pipelines.py From openslack-crawler with Apache License 2.0 | 5 votes |
def process_item(self, item, spider):
    if self.__get_uniq_key() is None:
        self.collection.insert(dict(item))
    else:
        self.collection.update(
            {self.__get_uniq_key(): item[self.__get_uniq_key()]},
            dict(item),
            upsert=True)
    log.msg("Item wrote to MongoDB database %s/%s" %
            (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
            level=log.DEBUG, spider=spider)
    return item
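Example #12 relies on Scrapy's old scrapy.log.msg helper, which has since been removed in favor of standard Python logging. An equivalent debug line using the spider's own logger would look roughly like this sketch:

spider.logger.debug(
    "Item wrote to MongoDB database %s/%s",
    settings['MONGODB_DB'], settings['MONGODB_COLLECTION'])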
Example #13
Source File: pipelines.py From openslack-crawler with Apache License 2.0 | 5 votes |
def __get_uniq_key(self):
    if not settings['MONGODB_UNIQ_KEY'] or settings['MONGODB_UNIQ_KEY'] == "":
        return None
    return settings['MONGODB_UNIQ_KEY']
Example #14
Source File: pipelines.py From Wenshu_Spider with MIT License | 4 votes |
def process_item(self, item, spider):
    '''Insert data.'''
    try:
        data = dict(item)
        self.post.insert_one(data)
        return item
    except DuplicateKeyError:
        # Same index value means duplicate data; catch the error
        spider.logger.debug('Duplicate key error collection')
        return item

# 2. Store items asynchronously - doesn't work, the data never gets inserted!
#    (Reference: https://zhuanlan.zhihu.com/p/44003499)
# from twisted.internet import defer, reactor
# class WenshuPipeline(object):
#     def __init__(self, mongo_host, mongo_port, mongo_db, mongo_doc):
#         self.mongo_host = mongo_host
#         self.mongo_port = mongo_port
#         self.mongo_db = mongo_db
#         self.mongo_doc = mongo_doc
#
#     @classmethod
#     def from_crawler(cls, crawler):
#         return cls(
#             mongo_host=crawler.settings.get('MONGODB_HOST'),
#             mongo_port=crawler.settings.get('MONGODB_PORT'),
#             mongo_db=crawler.settings.get('MONGODB_DBNAME'),
#             mongo_doc=crawler.settings.get('MONGODB_DOCNAME'),
#         )
#
#     def open_spider(self, spider):
#         self.client = pymongo.MongoClient(host=self.mongo_host, port=self.mongo_port)
#         self.mongodb = self.client[self.mongo_db]
#         self.mongodb[self.mongo_doc].create_index('id', unique=True)  # Create an index to avoid inserting duplicate data
#
#     def close_spider(self, spider):
#         self.client.close()
#
#     # The part below is the key step
#     @defer.inlineCallbacks
#     def process_item(self, item, spider):
#         out = defer.Deferred()
#         reactor.callInThread(self._insert, item, out, spider)
#         yield out
#         defer.returnValue(item)
#         return item
#
#     def _insert(self, item, out, spider):
#         time.sleep(10)
#         try:
#             self.mongodb[self.mongo_doc].insert_one(dict(item))
#             reactor.callFromThread(out.callback, item)
#         except DuplicateKeyError:
#             # Same index value means duplicate data; catch the error
#             spider.logger.debug('duplicate key error collection')
#             reactor.callFromThread(out.callback, item)