Python tldextract.TLDExtract() Examples
The following are 13 code examples of tldextract.TLDExtract(), collected from open-source projects. The source file, project, and license are noted above each example.
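For context, here is a minimal sketch of what a default TLDExtract() call returns (the URL is illustrative):

import tldextract

# A reusable extractor; calling it splits a URL or hostname into its
# subdomain, registered domain, and public suffix.
extract = tldextract.TLDExtract()
result = extract('http://forums.news.cnn.com/')
print(result)  # subdomain='forums.news', domain='cnn', suffix='com'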
Example #1
Source File: action_handler.py From scrapy-cluster with MIT License

def setup(self, settings):
    '''
    Setup redis and tldextract
    '''
    self.extract = tldextract.TLDExtract()
    self.redis_conn = redis.Redis(host=settings['REDIS_HOST'],
                                  port=settings['REDIS_PORT'],
                                  db=settings.get('REDIS_DB'))

    try:
        self.redis_conn.info()
        self.logger.debug("Connected to Redis in ActionHandler")
    except ConnectionError:
        self.logger.error("Failed to connect to Redis in ActionHandler")
        # plugin is essential to functionality
        sys.exit(1)
Example #2
Source File: zookeeper_handler.py From scrapy-cluster with MIT License

def setup(self, settings):
    '''
    Setup redis and tldextract
    '''
    self.extract = tldextract.TLDExtract()
    self.redis_conn = redis.Redis(host=settings['REDIS_HOST'],
                                  port=settings['REDIS_PORT'],
                                  db=settings.get('REDIS_DB'))

    try:
        self.redis_conn.info()
        self.logger.debug("Connected to Redis in ZookeeperHandler")
    except ConnectionError:
        self.logger.error("Failed to connect to Redis in ZookeeperHandler")
        # plugin is essential to functionality
        sys.exit(1)
Example #3
Source File: scraper_handler.py From scrapy-cluster with MIT License

def setup(self, settings):
    '''
    Setup redis and tldextract
    '''
    self.extract = tldextract.TLDExtract()
    self.redis_conn = redis.Redis(host=settings['REDIS_HOST'],
                                  port=settings['REDIS_PORT'],
                                  db=settings.get('REDIS_DB'))

    try:
        self.redis_conn.info()
        self.logger.debug("Connected to Redis in ScraperHandler")
    except ConnectionError:
        self.logger.error("Failed to connect to Redis in ScraperHandler")
        # plugin is essential to functionality
        sys.exit(1)
Example #4
Source File: domain.py From OneForAll with GNU General Public License v3.0

def extract(self):
    """
    extract domain

    >>> d = Domain('www.example.com')
    <domain.Domain object>
    >>> d.extract()
    ExtractResult(subdomain='www', domain='example', suffix='com')

    :return: extracted domain results
    """
    data_storage_dir = setting.data_storage_dir
    extract_cache_file = data_storage_dir.joinpath('public_suffix_list.dat')
    tldext = tldextract.TLDExtract(extract_cache_file)
    result = self.match()
    if result:
        return tldext(result)
    else:
        return None
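In the tldextract release this project targets, the first positional argument is the cache location (cache_file in tldextract 2.x, renamed to cache_dir in 3.x), so the extractor reuses a locally stored copy of the Public Suffix List instead of downloading it on every run. A minimal sketch, assuming tldextract 2.x and an illustrative path:

import tldextract

# Assumption: tldextract 2.x, where the first parameter is cache_file.
# The Public Suffix List is cached at this path and reused on later runs.
tldext = tldextract.TLDExtract(cache_file='data/public_suffix_list.dat')
print(tldext('www.example.com'))
# ExtractResult(subdomain='www', domain='example', suffix='com')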
Example #5
Source File: url_checker.py From YaYaGen with BSD 2-Clause "Simplified" License

def __init__(self, jconfig, vtapikey):
    """
    Load the DOMAINS_WHITELIST and setup the tld-extractor
    """
    try:
        with open(jconfig['DOMAINS_WHITELIST'], 'rb') as f_in:
            UrlChecker.whitelist = pickle.load(f_in)
    except:
        log.error("URL whitelist loading error")
        UrlChecker.whitelist = list()
    cache_file = jconfig['TOP_DOMAINS_CACHE']
    UrlChecker.__tld = tldextract.TLDExtract(cache_file=cache_file)
    UrlChecker.__vtapikey = vtapikey
Example #6
Source File: test_plugins.py From scrapy-cluster with MIT License

def test_scrape_handler(self):
    valid = {
        "url": "www.stuff.com",
        "crawlid": "abc124",
        "appid": "testapp",
        "spiderid": "link",
        "priority": 5,
    }
    handler = ScraperHandler()
    handler.extract = tldextract.TLDExtract()
    handler.redis_conn = MagicMock()

    # check it is added to redis
    handler.redis_conn.zadd = MagicMock(side_effect=AssertionError("added"))
    try:
        handler.handle(valid)
        self.fail("Action not called")
    except AssertionError as e:
        self.assertEquals("added", str(e))

    # check timeout is added
    handler.redis_conn.zadd = MagicMock()
    handler.redis_conn.set = MagicMock(side_effect=AssertionError("expires"))
    valid['expires'] = 124242
    try:
        handler.handle(valid)
        self.fail("Expires not called")
    except AssertionError as e:
        self.assertEquals("expires", str(e))
Example #7
Source File: distributed_scheduler.py From scrapy-cluster with MIT License

def __init__(self, server, persist, update_int, timeout, retries, logger,
             hits, window, mod, ip_refresh, add_type, add_ip, ip_regex,
             backlog_blacklist, queue_timeout):
    '''
    Initialize the scheduler
    '''
    self.redis_conn = server
    self.persist = persist
    self.queue_dict = {}
    self.update_interval = update_int
    self.hits = hits
    self.window = window
    self.moderated = mod
    self.rfp_timeout = timeout
    self.ip_update_interval = ip_refresh
    self.add_type = add_type
    self.add_ip = add_ip
    self.item_retires = retries
    self.logger = logger
    self.ip_regex = re.compile(ip_regex)
    self.backlog_blacklist = backlog_blacklist
    self.queue_timeout = queue_timeout

    # set up tldextract
    self.extract = tldextract.TLDExtract()

    self.update_ipaddress()

    # if we need better uuid's mod this line
    self.my_uuid = str(uuid.uuid4()).split('-')[4]
Example #8
Source File: utils.py From yeti with Apache License 2.0

def tldextract_parser(url):
    parts = None

    try:
        parts = TLDExtract(**tld_extract_dict)(url)
    except Exception as e:
        logging.error(e)

    return parts
Example #9
Source File: web_utils.py From luscan-devel with GNU General Public License v2.0

def split_hostname(hostname):
    """
    Splits a hostname into its subdomain, domain and TLD parts.
    For example:

    >>> from golismero.api.net.web_utils import ParsedURL
    >>> d = ParsedURL("http://www.example.com/")
    >>> d.split_hostname()
    ('www', 'example', 'com')

    >>> d = ParsedURL("http://some.subdomain.of.example.co.uk/")
    >>> d.split_hostname()
    ('some.subdomain.of', 'example', 'co.uk')

    >>> '.'.join(d.split_hostname())
    'some.subdomain.of.example.co.uk'

    :param hostname: Hostname to split.
    :type hostname: str

    :returns: Subdomain, domain and TLD.
    :rtype: tuple(str, str, str)
    """
    extract = TLDExtract(fetch=False)
    result = extract(to_utf8(hostname))
    return result.subdomain, result.domain, result.suffix
Example #10
Source File: __init__.py From recipe-scrapers with MIT License

def get_domain(url):
    tldextract = TLDExtract(suffix_list_urls=None)
    url_info = tldextract(url)
    return "{}.{}".format(url_info.domain, url_info.suffix)
Example #11
Source File: domaintools.py From metadoc with MIT License

def get_domain(self, url):
    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
    tld = no_fetch_extract(url)
    self.domain = "{}.{}".format(tld.domain, tld.suffix)
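Examples #9 through #11 all build an extractor that never touches the network: fetch=False was the tldextract 1.x spelling, and suffix_list_urls=None is the later equivalent; both skip downloading the Public Suffix List and fall back to the snapshot bundled with the package. A minimal sketch, assuming a release that accepts suffix_list_urls:

import tldextract

# No live fetch of the Public Suffix List; use the bundled snapshot instead.
no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
tld = no_fetch_extract('https://some.subdomain.example.co.uk/page')
print('{}.{}'.format(tld.domain, tld.suffix))  # example.co.uk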
Example #12
Source File: client.py From lexicon with MIT License

def __init__(self, config=None):
    if not config:
        # If there is not config specified, we load a non-interactive configuration.
        self.config = non_interactive_config_resolver()
    elif not isinstance(config, ConfigResolver):
        # If config is not a ConfigResolver, we are in a legacy situation.
        # We protect this part of the Client API.
        self.config = legacy_config_resolver(config)
    else:
        self.config = config

    # Validate configuration
    self._validate_config()

    runtime_config = {}

    # Process domain, strip subdomain
    domain_extractor = tldextract.TLDExtract(
        cache_file=TLDEXTRACT_CACHE_FILE, include_psl_private_domains=True)
    domain_parts = domain_extractor(
        self.config.resolve('lexicon:domain'))
    runtime_config['domain'] = '{0}.{1}'.format(
        domain_parts.domain, domain_parts.suffix)

    if self.config.resolve('lexicon:delegated'):
        # handle delegated domain
        delegated = self.config.resolve('lexicon:delegated').rstrip('.')
        if delegated != runtime_config.get('domain'):
            # convert to relative name
            if delegated.endswith(runtime_config.get('domain')):
                delegated = delegated[:-len(runtime_config.get('domain'))]
                delegated = delegated.rstrip('.')
            # update domain
            runtime_config['domain'] = '{0}.{1}'.format(
                delegated, runtime_config.get('domain'))

    self.action = self.config.resolve('lexicon:action')
    self.provider_name = (self.config.resolve('lexicon:provider_name')
                          or self.config.resolve('lexicon:provider'))

    self.config.add_config_source(DictConfigSource(runtime_config), 0)

    provider_module = importlib.import_module(
        'lexicon.providers.' + self.provider_name)
    provider_class = getattr(provider_module, 'Provider')
    self.provider = provider_class(self.config)
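The include_psl_private_domains=True flag makes the extractor honour the "private" section of the Public Suffix List, so hosted zones such as blogspot.com count as suffixes when lexicon works out the registered domain. A hedged sketch of the difference (the hostname is illustrative, and exact output depends on the tldextract version and suffix-list snapshot in use):

import tldextract

public_only = tldextract.TLDExtract(include_psl_private_domains=False)
with_private = tldextract.TLDExtract(include_psl_private_domains=True)

# Public rules only: blogspot.com is just a domain under .com.
print(public_only('foo.blogspot.com'))   # subdomain='foo', domain='blogspot', suffix='com'
# Private PSL rules: blogspot.com is treated as a suffix.
print(with_private('foo.blogspot.com'))  # subdomain='', domain='foo', suffix='blogspot.com'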
Example #13
Source File: BTG.py From BTG with GNU General Public License v3.0

def extend_IOC(self, argument, observable_list):
    """
    Extending IOC from URL into URL + DOMAIN + IP
    """
    if config['offline']:
        # Cache search
        # TODO
        if "TLDE_cache" in config:
            cache_file = "%s%s" % (config['temporary_cache_path'],
                                   config['TLDE_cache'])
            cache_extract = tldextract.TLDExtract(cache_file=cache_file)
            extract = cache_extract(argument)
    else:
        # Live search
        no_cache_extract = tldextract.TLDExtract(cache_file=False)
        extract = no_cache_extract(argument)

    try:
        registered_domain = extract.registered_domain
    except:
        registered_domain = None
    try:
        suffix_domain = extract.suffix
    except:
        suffix_domain = None
    try:
        complete_domain = '.'.join(part for part in extract if part)
    except:
        complete_domain = None

    domains = [registered_domain, suffix_domain, complete_domain]
    IPs = [None, None, None]

    if not config["offline"]:
        for domain in domains:
            try:
                IP = socket.gethostbyname(domain)
            except:
                IP = None
            IPs.append(IP)

    for domain in domains:
        if domain is not None and domain not in observable_list:
            observable_list.append(domain)
    for IP in IPs:
        if IP is not None and IP not in observable_list:
            observable_list.append(IP)
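The '.'.join(part for part in extract if part) idiom above works because, in the tldextract generation these projects target, ExtractResult is a plain named tuple, so iterating it yields the subdomain, domain, and suffix in order. A small illustration under that assumption (newer releases add extra fields, so prefer the named attributes there):

import tldextract

# Assumption: an older tldextract where ExtractResult is a 3-field named tuple.
result = tldextract.extract('http://www.example.co.uk/path')
# Iterating yields ('www', 'example', 'co.uk'); dropping empty parts and
# joining with '.' rebuilds the full hostname.
hostname = '.'.join(part for part in result if part)
print(hostname)  # www.example.co.uk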