Python tldextract.TLDExtract() Examples
The following are 13 code examples of tldextract.TLDExtract(), collected from open-source projects. The source file, project, and license are noted above each example.
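For context, here is a minimal sketch of what a default TLDExtract() call returns (the URL is illustrative):

import tldextract

# A reusable extractor; calling it splits a URL or hostname into its
# subdomain, registered domain, and public suffix.
extract = tldextract.TLDExtract()
result = extract('http://forums.news.cnn.com/')
print(result)  # subdomain='forums.news', domain='cnn', suffix='com'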
Example #1
Source File: action_handler.py From scrapy-cluster with MIT License

def setup(self, settings):
    '''
    Setup redis and tldextract
    '''
    self.extract = tldextract.TLDExtract()
    self.redis_conn = redis.Redis(host=settings['REDIS_HOST'],
                                  port=settings['REDIS_PORT'],
                                  db=settings.get('REDIS_DB'))

    try:
        self.redis_conn.info()
        self.logger.debug("Connected to Redis in ActionHandler")
    except ConnectionError:
        self.logger.error("Failed to connect to Redis in ActionHandler")
        # plugin is essential to functionality
        sys.exit(1)
Example #2
Source File: zookeeper_handler.py From scrapy-cluster with MIT License

def setup(self, settings):
    '''
    Setup redis and tldextract
    '''
    self.extract = tldextract.TLDExtract()
    self.redis_conn = redis.Redis(host=settings['REDIS_HOST'],
                                  port=settings['REDIS_PORT'],
                                  db=settings.get('REDIS_DB'))

    try:
        self.redis_conn.info()
        self.logger.debug("Connected to Redis in ZookeeperHandler")
    except ConnectionError:
        self.logger.error("Failed to connect to Redis in ZookeeperHandler")
        # plugin is essential to functionality
        sys.exit(1)
Example #3
Source File: scraper_handler.py From scrapy-cluster with MIT License

def setup(self, settings):
    '''
    Setup redis and tldextract
    '''
    self.extract = tldextract.TLDExtract()
    self.redis_conn = redis.Redis(host=settings['REDIS_HOST'],
                                  port=settings['REDIS_PORT'],
                                  db=settings.get('REDIS_DB'))

    try:
        self.redis_conn.info()
        self.logger.debug("Connected to Redis in ScraperHandler")
    except ConnectionError:
        self.logger.error("Failed to connect to Redis in ScraperHandler")
        # plugin is essential to functionality
        sys.exit(1)
Example #4
Source File: domain.py From OneForAll with GNU General Public License v3.0

def extract(self):
    """
    extract domain

    >>> d = Domain('www.example.com')
    <domain.Domain object>
    >>> d.extract()
    ExtractResult(subdomain='www', domain='example', suffix='com')

    :return: extracted domain results
    """
    data_storage_dir = setting.data_storage_dir
    extract_cache_file = data_storage_dir.joinpath('public_suffix_list.dat')
    tldext = tldextract.TLDExtract(extract_cache_file)
    result = self.match()
    if result:
        return tldext(result)
    else:
        return None
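In the tldextract release this project targets, the first positional argument is the cache location (cache_file in tldextract 2.x, renamed to cache_dir in 3.x), so the extractor reuses a locally stored copy of the Public Suffix List instead of downloading it on every run. A minimal sketch, assuming tldextract 2.x and an illustrative path:

import tldextract

# Assumption: tldextract 2.x, where the first parameter is cache_file.
# The Public Suffix List is cached at this path and reused on later runs.
tldext = tldextract.TLDExtract(cache_file='data/public_suffix_list.dat')
print(tldext('www.example.com'))
# ExtractResult(subdomain='www', domain='example', suffix='com')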
Example #5
Source File: url_checker.py From YaYaGen with BSD 2-Clause "Simplified" License

def __init__(self, jconfig, vtapikey):
    """
    Load the DOMAINS_WHITELIST and setup the tld-extractor
    """
    try:
        with open(jconfig['DOMAINS_WHITELIST'], 'rb') as f_in:
            UrlChecker.whitelist = pickle.load(f_in)
    except:
        log.error("URL whitelist loading error")
        UrlChecker.whitelist = list()
    cache_file = jconfig['TOP_DOMAINS_CACHE']
    UrlChecker.__tld = tldextract.TLDExtract(cache_file=cache_file)
    UrlChecker.__vtapikey = vtapikey
Example #6
Source File: test_plugins.py From scrapy-cluster with MIT License

def test_scrape_handler(self):
    valid = {
        "url": "www.stuff.com",
        "crawlid": "abc124",
        "appid": "testapp",
        "spiderid": "link",
        "priority": 5,
    }
    handler = ScraperHandler()
    handler.extract = tldextract.TLDExtract()
    handler.redis_conn = MagicMock()

    # check it is added to redis
    handler.redis_conn.zadd = MagicMock(side_effect=AssertionError("added"))
    try:
        handler.handle(valid)
        self.fail("Action not called")
    except AssertionError as e:
        self.assertEquals("added", str(e))

    # check timeout is added
    handler.redis_conn.zadd = MagicMock()
    handler.redis_conn.set = MagicMock(side_effect=AssertionError("expires"))
    valid['expires'] = 124242
    try:
        handler.handle(valid)
        self.fail("Expires not called")
    except AssertionError as e:
        self.assertEquals("expires", str(e))
Example #7
Source File: distributed_scheduler.py From scrapy-cluster with MIT License

def __init__(self, server, persist, update_int, timeout, retries, logger,
             hits, window, mod, ip_refresh, add_type, add_ip, ip_regex,
             backlog_blacklist, queue_timeout):
    '''
    Initialize the scheduler
    '''
    self.redis_conn = server
    self.persist = persist
    self.queue_dict = {}
    self.update_interval = update_int
    self.hits = hits
    self.window = window
    self.moderated = mod
    self.rfp_timeout = timeout
    self.ip_update_interval = ip_refresh
    self.add_type = add_type
    self.add_ip = add_ip
    self.item_retires = retries
    self.logger = logger
    self.ip_regex = re.compile(ip_regex)
    self.backlog_blacklist = backlog_blacklist
    self.queue_timeout = queue_timeout

    # set up tldextract
    self.extract = tldextract.TLDExtract()

    self.update_ipaddress()

    # if we need better uuid's mod this line
    self.my_uuid = str(uuid.uuid4()).split('-')[4]
Example #8
Source File: utils.py From yeti with Apache License 2.0

def tldextract_parser(url):
    parts = None

    try:
        parts = TLDExtract(**tld_extract_dict)(url)
    except Exception as e:
        logging.error(e)

    return parts
Example #9
Source File: web_utils.py From luscan-devel with GNU General Public License v2.0

def split_hostname(hostname):
    """
    Splits a hostname into its subdomain, domain and TLD parts.
    For example:

    >>> from golismero.api.net.web_utils import ParsedURL
    >>> d = ParsedURL("http://www.example.com/")
    >>> d.split_hostname()
    ('www', 'example', 'com')

    >>> d = ParsedURL("http://some.subdomain.of.example.co.uk/")
    >>> d.split_hostname()
    ('some.subdomain.of', 'example', 'co.uk')

    >>> '.'.join(d.split_hostname())
    'some.subdomain.of.example.co.uk'

    :param hostname: Hostname to split.
    :type hostname: str

    :returns: Subdomain, domain and TLD.
    :rtype: tuple(str, str, str)
    """
    extract = TLDExtract(fetch=False)
    result = extract(to_utf8(hostname))
    return result.subdomain, result.domain, result.suffix
Example #10
Source File: __init__.py From recipe-scrapers with MIT License

def get_domain(url):
    tldextract = TLDExtract(suffix_list_urls=None)
    url_info = tldextract(url)
    return "{}.{}".format(url_info.domain, url_info.suffix)
Example #11
Source File: domaintools.py From metadoc with MIT License

def get_domain(self, url):
    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
    tld = no_fetch_extract(url)
    self.domain = "{}.{}".format(tld.domain, tld.suffix)
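Examples #9 through #11 all build an extractor that never touches the network: fetch=False was the tldextract 1.x spelling, and suffix_list_urls=None is the later equivalent; both skip downloading the Public Suffix List and fall back to the snapshot bundled with the package. A minimal sketch, assuming a release that accepts suffix_list_urls:

import tldextract

# No live fetch of the Public Suffix List; use the bundled snapshot instead.
no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
tld = no_fetch_extract('https://some.subdomain.example.co.uk/page')
print('{}.{}'.format(tld.domain, tld.suffix))  # example.co.uk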
Example #12
Source File: client.py From lexicon with MIT License

def __init__(self, config=None):
    if not config:
        # If there is not config specified, we load a non-interactive configuration.
        self.config = non_interactive_config_resolver()
    elif not isinstance(config, ConfigResolver):
        # If config is not a ConfigResolver, we are in a legacy situation.
        # We protect this part of the Client API.
        self.config = legacy_config_resolver(config)
    else:
        self.config = config

    # Validate configuration
    self._validate_config()

    runtime_config = {}

    # Process domain, strip subdomain
    domain_extractor = tldextract.TLDExtract(
        cache_file=TLDEXTRACT_CACHE_FILE, include_psl_private_domains=True)
    domain_parts = domain_extractor(
        self.config.resolve('lexicon:domain'))
    runtime_config['domain'] = '{0}.{1}'.format(
        domain_parts.domain, domain_parts.suffix)

    if self.config.resolve('lexicon:delegated'):
        # handle delegated domain
        delegated = self.config.resolve('lexicon:delegated').rstrip('.')
        if delegated != runtime_config.get('domain'):
            # convert to relative name
            if delegated.endswith(runtime_config.get('domain')):
                delegated = delegated[:-len(runtime_config.get('domain'))]
                delegated = delegated.rstrip('.')
            # update domain
            runtime_config['domain'] = '{0}.{1}'.format(
                delegated, runtime_config.get('domain'))

    self.action = self.config.resolve('lexicon:action')
    self.provider_name = (self.config.resolve('lexicon:provider_name')
                          or self.config.resolve('lexicon:provider'))

    self.config.add_config_source(DictConfigSource(runtime_config), 0)

    provider_module = importlib.import_module(
        'lexicon.providers.' + self.provider_name)
    provider_class = getattr(provider_module, 'Provider')
    self.provider = provider_class(self.config)
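The include_psl_private_domains=True flag makes the extractor honour the "private" section of the Public Suffix List, so hosted zones such as blogspot.com count as suffixes when lexicon works out the registered domain. A hedged sketch of the difference (the hostname is illustrative, and exact output depends on the tldextract version and suffix-list snapshot in use):

import tldextract

public_only = tldextract.TLDExtract(include_psl_private_domains=False)
with_private = tldextract.TLDExtract(include_psl_private_domains=True)

# Public rules only: blogspot.com is just a domain under .com.
print(public_only('foo.blogspot.com'))   # subdomain='foo', domain='blogspot', suffix='com'
# Private PSL rules: blogspot.com is treated as a suffix.
print(with_private('foo.blogspot.com'))  # subdomain='', domain='foo', suffix='blogspot.com'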
Example #13
Source File: BTG.py From BTG with GNU General Public License v3.0

def extend_IOC(self, argument, observable_list):
    """
    Extending IOC from URL into URL + DOMAIN + IP
    """
    if config['offline']:
        # Cache search
        # TODO
        if "TLDE_cache" in config:
            cache_file = "%s%s" % (config['temporary_cache_path'],
                                   config['TLDE_cache'])
            cache_extract = tldextract.TLDExtract(cache_file=cache_file)
            extract = cache_extract(argument)
    else:
        # Live search
        no_cache_extract = tldextract.TLDExtract(cache_file=False)
        extract = no_cache_extract(argument)

    try:
        registered_domain = extract.registered_domain
    except:
        registered_domain = None
    try:
        suffix_domain = extract.suffix
    except:
        suffix_domain = None
    try:
        complete_domain = '.'.join(part for part in extract if part)
    except:
        complete_domain = None

    domains = [registered_domain, suffix_domain, complete_domain]
    IPs = [None, None, None]

    if not config["offline"]:
        for domain in domains:
            try:
                IP = socket.gethostbyname(domain)
            except:
                IP = None
            IPs.append(IP)

    for domain in domains:
        if domain is not None and domain not in observable_list:
            observable_list.append(domain)
    for IP in IPs:
        if IP is not None and IP not in observable_list:
            observable_list.append(IP)
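The '.'.join(part for part in extract if part) idiom above works because, in the tldextract generation these projects target, ExtractResult is a plain named tuple, so iterating it yields the subdomain, domain, and suffix in order. A small illustration under that assumption (newer releases add extra fields, so prefer the named attributes there):

import tldextract

# Assumption: an older tldextract where ExtractResult is a 3-field named tuple.
result = tldextract.extract('http://www.example.co.uk/path')
# Iterating yields ('www', 'example', 'co.uk'); dropping empty parts and
# joining with '.' rebuilds the full hostname.
hostname = '.'.join(part for part in result if part)
print(hostname)  # www.example.co.uk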