Python tld.get_tld() Examples
The following are 30 code examples of tld.get_tld(), collected from open-source projects; the source file, project, and license are noted above each example.
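As an orientation before the examples, here is a minimal usage sketch of the call styles that recur below: the plain-string form, the as_object=True form (whose result exposes .subdomain, .domain, .fld and .tld), fail_silently=True to get None instead of an exception, and fix_protocol=True to accept a bare domain without a scheme. The URLs are illustrative only, and the exact string returned by the plain form depends on the installed tld version.

# Minimal sketch, not taken from any of the projects below.
from tld import get_tld
from tld.exceptions import TldDomainNotFound

# String form: the effective TLD of the URL (recent tld releases return
# "co.uk" here; some older releases returned the registered domain instead).
print(get_tld("https://www.example.co.uk/some/path"))

# Object form: exposes the parsed parts used throughout the examples below.
res = get_tld("mail.example.co.uk", as_object=True,
              fail_silently=True, fix_protocol=True)
if res is not None:
    print(res.subdomain, res.domain, res.fld, res.tld)

# Without fail_silently=True, an unrecognised suffix raises an exception.
try:
    get_tld("http://intranet.local")
except TldDomainNotFound:
    print("not a recognised public suffix")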
Example #1
Source File: crawler.py From ITWSV with MIT License

def __init__(self, response: Response, url: str, empty: bool = False):
    """Create a new Page object.

    @type response: Response
    @param response: a requests Response instance.
    @type url: str
    @param url: URL of the Page.
    @type empty: bool
    @param empty: whether the Page is empty (body length == 0)
    """
    self._response = response
    self._url = url
    self._base = None
    self._soup = None
    self._is_empty = empty
    try:
        self._tld = get_tld(url)
    except TldDomainNotFound:
        self._tld = urlparse(url).netloc
Example #2
Source File: buckethunter.py From AttackSurfaceMapper with GNU General Public License v3.0

def passive_query(mswitch, hostx, key):
    keywords = get_tld(hostx.primary_domain, as_object=True, fail_silently=True, fix_protocol=True).domain
    if keywords is None:
        return
    if mswitch is True:
        print("[DEBUG] Keywords : ", keywords)
    par = {'access_token': key, 'keywords': keywords}
    try:
        response = requests.get("https://buckets.grayhatwarfare.com/api/v1/buckets", params=par, timeout=4)
        gwf_api = response.json()
        if gwf_api["buckets_count"] > 0:
            try:
                for bucket in gwf_api["buckets"]:
                    hostx.buckets.append(bucket["bucket"])
            except:
                pass
    except:
        cprint("error", "[*] Error: connecting with GrayHatWarfare API", 1)
Example #3
Source File: AddOneLetter.py From squatm3 with GNU General Public License v3.0

def add_one_letter(self):
    '''
    This function adds the same letter after the correct one
    tesla.com - ttesla.com - teesla.com - tessla.com - teslla.com - teslaa.com
    '''
    url = get_tld(self.url, as_object=True, fix_protocol=True)
    domain = url.domain
    new_urls_with_double_letter = []

    n = 0
    m = len(domain)

    while n < m:
        new_domain = domain[0:n] + domain[n] + domain[n] + domain[n+1:m]
        n = n + 1
        new_urls_with_double_letter.append(new_domain)

    return new_urls_with_double_letter
Example #4
Source File: RemoveOneLetter.py From squatm3 with GNU General Public License v3.0

def remove_letters(self):
    '''
    :return:
    '''
    url = get_tld(self.url, as_object=True, fix_protocol=True)
    domain = url.domain
    new_urls_without_letter = []

    n = 0
    m = len(domain)

    while n < m:
        new_domain = domain[0:n] + domain[n+1:m]
        n = n + 1
        new_urls_without_letter.append(new_domain)

    new_urls_list = list(set(new_urls_without_letter))
    return new_urls_list
Example #5
Source File: HomoglyphAttack2.py From squatm3 with GNU General Public License v3.0

def switch_all_letters(self):
    """
    The following function generates all the possible combinations
    using homoglyphs
    """
    url = get_tld(self.url, as_object=True, fix_protocol=True)
    domain = url.domain
    domains = hg.Homoglyphs().get_combinations(domain)
    a = []
    i = 0
    print("Generated " + str(len(domains)) + " domains\n")
    for domain in domains:
        idna_domain = domain.encode('idna').decode('idna')
        if not a.__contains__(idna_domain):
            a.append(domain.encode('idna').decode('idna'))
        i = i + 1
        print(str(i) + ' out of ' + str(len(domains)) + ' domains: ' + str(len(a)))
    return a
Example #6
Source File: medium.py From mma-dexter with Apache License 2.0

def is_tld_exception(cls, url):
    """
    Test if the url falls within one of the exceptions; this is intended
    to handle instances where get_tld() calls fail to recognise urls
    (eg: .co.tz fails...)
    """
    url_exceptions = [
        'thecitizen.co.tz',
        'dailynews.co.tz',
        'nigeriatoday.ng',
        'nta.ng',
        'nan.ng',
        'leadership.ng',
        'independent.ng',
        'guardian.ng',
        'dailytimes.ng',
        'theinterview.ng',
        'city-press.news24.com'
    ]
    for ex in url_exceptions:
        # check if it exists in the url, with a buffer for [https://www.] characters at the start
        if ex in url[:len(ex)+12]:
            return ex
    return None
Example #7
Source File: hook.py From letsencrypt-rackspace-hook with Apache License 2.0

def delete_txt_record(args):
    """
    Clean up the TXT record when it is no longer needed.

    Keyword arguments:
    args -- passed from letsencrypt.sh
    """
    domain_name = args[0]
    base_domain_name = get_tld("http://{0}".format(domain_name))
    domain = get_domain(base_domain_name)

    # Get the DNS record object(s) for our challenge record(s)
    name = "{0}.{1}".format('_acme-challenge', domain_name)
    dns_records = list(rax_dns.get_record_iterator(domain))
    text_records = [x for x in dns_records if x.type == 'TXT']

    # Delete any matching records we find
    for text_record in text_records:
        if text_record.name == name:
            text_record.delete()

    return True
Example #8
Source File: datasploit.py From datasploit with GNU General Public License v3.0

def auto_select_target(target, output=None):
    """Auto selection logic"""
    print "Target: %s" % target
    try:
        inp = IPAddress(target)
        if inp.is_private() or inp.is_loopback():
            print "Internal IP Detected : Skipping"
            sys.exit()
        else:
            print "Looks like an IP, running ipOsint...\n"
            ipOsint.run(target, output)
    except SystemExit:
        print "exiting"
    except AddrFormatError:
        if re.match('[^@]+@[^@]+\.[^@]+', target):
            print "Looks like an EMAIL, running emailOsint...\n"
            emailOsint.run(target, output)
        elif get_tld(target, fix_protocol=True, fail_silently=True) is not None:
            print "Looks like a DOMAIN, running domainOsint...\n"
            domainOsint.run(target, output)
        else:
            print "Nothing Matched assuming username, running usernameOsint...\n"
            usernameOsint.run(target, output)
    except:
        print "Unknown Error Occured"
Example #9
Source File: get_links3.py From Malicious_Domain_Whois with GNU General Public License v3.0

def judge_a_links():
    global white_list
    global black_list
    global soup_q
    global res_q
    while True:
        try:
            domain, soup = soup_q.get(timeout=50)
        except Queue.Empty:
            break
        mal_urls = []
        for a in soup.find_all('a'):
            try:
                url = a['href']
                url_domain = get_tld(url)
                if url_domain != domain:
                    # collect only links that point outside this site
                    if url_domain in black_list or url_domain not in white_list:
                        # for now, every URL whose domain is not in the white list goes into the
                        # malicious_link table; once that table is complete enough, switch to
                        # using only the black list
                        mal_urls.append((url, url_domain))
            except Exception, e:
                # logger.info(domain + ' GET LINKS WRONG ...')
                continue
        res_q.put([domain, mal_urls])
Example #10
Source File: crawler.py From ITWSV with MIT License

def is_in_scope(self, resource):
    if isinstance(resource, web.Request):
        if self._scope == Scope.FOLDER:
            return resource.url.startswith(self._base.path)
        elif self._scope == Scope.PAGE:
            return resource.path == self._base.path
        elif self._scope == Scope.URL:
            return resource.url == self._base.url
        else:
            # Scope.DOMAIN
            try:
                return get_tld(resource.url) == get_tld(self._base.url)
            except TldDomainNotFound:
                return resource.hostname == self._base.hostname
    else:
        if self._scope == Scope.FOLDER:
            return resource.startswith(self._base.path)
        elif self._scope == Scope.PAGE:
            return resource.split("?")[0] == self._base.path
        elif self._scope == Scope.URL:
            return resource == self._base.url
        else:
            # Scope.DOMAIN
            try:
                return get_tld(resource) == get_tld(self._base.url)
            except TldDomainNotFound:
                return urlparse(resource).netloc == self._base.hostname
Example #11
Source File: crawler.py From ITWSV with MIT License

def is_external_to_domain(self, url: str) -> bool:
    """Returns True if url is under another TLD than the crawled URL, False otherwise.

    @type url: str
    @param url: An absolute URL (with protocol prefix)
    @rtype: bool
    """
    try:
        tld = get_tld(url)
    except TldDomainNotFound:
        # Not yet known TLD or IP address or local hostname
        tld = urlparse(url).netloc
    except TldBadUrl:
        tld = None
        print("bad url", url, "found within", self._url)
    return tld != self._tld
Example #12
Source File: medium.py From mma-dexter with Apache License 2.0

def for_url(cls, url):
    domain = get_tld(url, fail_silently=True)  # fail silently
    if domain is None:
        domain = cls.is_tld_exception(url)
        if domain is None:
            return None

    parts = urlparse(url)
    # iol.co.za/isolezwe
    domain = domain + parts.path

    # explicitly look for city-press, subdomain does not play nice with current Dexter code
    if 'city-press.news24.com' in url:
        medium = Medium.query.get(5)
        return medium
    else:
        # find the medium with the longest matching domain
        for medium in sorted(Medium.query.all(), key=lambda m: len(m.domain or ''), reverse=True):
            if medium.domain and domain.startswith(medium.domain):
                return medium

    return None
Example #13
Source File: hook.py From letsencrypt-cloudflare-hook with MIT License

def _get_zone_id(domain):
    tld = get_tld('http://' + domain)
    url = "https://api.cloudflare.com/client/v4/zones?name={0}".format(tld)
    r = requests.get(url, headers=CF_HEADERS)
    r.raise_for_status()
    return r.json()['result'][0]['id']

# https://api.cloudflare.com/#dns-records-for-a-zone-dns-record-details
Example #14
Source File: domain.py From memex-explorer with BSD 2-Clause "Simplified" License

def extract_tld(self, url):
    try:
        return get_tld(url)
    except:
        traceback.print_exc()
        print "\n\nInvalid url: %s" % url
        return url
Example #15
Source File: get_links.py From Malicious_Domain_Whois with GNU General Public License v3.0

def get_a_links(url, white_list):
    try:
        source_domain = get_tld(url)
    except Exception, e:
        # print str(e)
        return {}
Example #16
Source File: commons.py From Analyst-Arsenal with GNU General Public License v3.0

def score_domain(config, domain, args):
    """ """
    score = 0

    for t in config["tlds"]:
        if domain.endswith(t):
            score += 20

    try:
        res = get_tld(domain, as_object=True, fail_silently=True, fix_protocol=True)
        if res is not None:
            domain = '.'.join([res.subdomain, res.domain])
    except Exception as err:
        message_failed(args, err, domain)
        pass

    score += int(round(entropy.shannon_entropy(domain)*50))

    domain = unconfuse(domain)
    words_in_domain = re.split(r"\W+", domain)

    if words_in_domain[0] in ["com", "net", "org"]:
        score += 10

    for word in config["keywords"]:
        if word in domain:
            score += config["keywords"][word]

    for key in [k for (k, s) in config["keywords"].items() if s >= 70]:
        for word in [w for w in words_in_domain if w not in ["email", "mail", "cloud"]]:
            if distance(str(word), str(key)) == 1:
                score += 70

    if "xn--" not in domain and domain.count("-") >= 4:
        score += domain.count("-") * 3

    if domain.count(".") >= 3:
        score += domain.count(".") * 3

    return score
Example #17
Source File: get_domain.py From Malicious_Domain_Whois with GNU General Public License v3.0

def get_domain():
    global url_q
    while True:
        if url_q.empty():
            break
        url = url_q.get()
        try:
            domain = str(get_tld(url))
        except:
            logger.info(url + 'get domian wrong ...')
            continue
        res_q.put([url, domain])
    print 'get domains over ...'
Example #18
Source File: bounty-monitor.py From bounty-monitor with MIT License

def update_subdomain(subdomain, alive):
    """Subdomain database is maintained locally to keep track of identified live and known subdomains."""
    tld = get_tld(subdomain, as_object=True, fail_silently=True, fix_protocol=True)
    try:
        # synchronize multithread DB_CURSOR.execute
        LOCK.acquire(True)
        if alive == "N":
            DB_CURSOR.execute("insert into subdomains(subdomain, domain, first_found, alive, source) values(?, ?, ?, ?, ?)",
                              (subdomain, tld.tld, datetime.now(), 0, "BountyMonitor"))
            CONNECTION.commit()
        elif alive == "Y":
            DB_CURSOR.execute("update subdomains set alive=1 where subdomain = ?", (subdomain, ))
            CONNECTION.commit()
    finally:
        LOCK.release()
Example #19
Source File: bounty-monitor.py From bounty-monitor with MIT License

def monitor(message, context):
    """certstream events callback handler"""
    all_domains = ""
    if message['message_type'] == "heartbeat":
        return
    if message["message_type"] == "certificate_update":
        all_domains = message["data"]["leaf_cert"]["all_domains"]
        for domain in set(all_domains):
            PBAR.update(1)
            # all magic happens here
            try:
                if domain.count(".") > 1 and not domain.startswith("*.") and not re.search("\d$", domain) and "cloudflaressl" not in domain and "xn--" not in domain and not domain.endswith("local"):
                    tld = get_tld(domain, as_object=True, fail_silently=True, fix_protocol=True)
                    if tld is not None and tld.tld in BOUNTY_LIST and tld.tld != domain and tld.subdomain != "www":
                        if check_subdomain_not_known_in_db(domain):
                            update_subdomain(domain, "N")
                            MONITOR_QUEUE.put(domain)
            except Exception as e:
                logging.exception("message")
                print(domain)
    t.sleep(.1)
Example #20
Source File: Flipper.py From squatm3 with GNU General Public License v3.0

def flip_letters(self):
    '''
    The following function
    '''
    url = get_tld(self.url, as_object=True, fix_protocol=True)
    domain = url.domain
    new_urls_without_letter = []

    n = 0
    m = len(domain)

    if m == 1:
        new_urls_without_letter.append(domain)
    elif m == 2:
        new_domain = domain[1] + domain[0]
        new_urls_without_letter.append(new_domain)
    else:
        while n < m and m > 2:
            if n == 0:
                new_domain = domain[n + 1] + domain[n] + domain[n + 2:m]
            elif n == 1:
                new_domain = domain[0] + domain[n + 1] + domain[n] + domain[n + 2:m]
            elif 1 < n < m - 1:
                new_domain = domain[0:n] + domain[n + 1] + domain[n] + domain[n + 2:m]
            n = n + 1
            new_urls_without_letter.append(new_domain)

    new_urls_list = list(set(new_urls_without_letter))
    return new_urls_list
Example #21
Source File: dns_oa.py From incubator-spot with Apache License 2.0

def _add_tld_column(self):
    qry_name_col = self._conf['dns_results_fields']['dns_qry_name']
    self._dns_scores = [conn + [
        get_tld("http://" + str(conn[qry_name_col]), fail_silently=True)
        if "http://" not in str(conn[qry_name_col])
        else get_tld(str(conn[qry_name_col]), fail_silently=True)
    ] for conn in self._dns_scores]
Example #22
Source File: hook.py From letsencrypt-rackspace-hook with Apache License 2.0

def get_domain(domain_name):
    """
    Query the Rackspace DNS API to get a domain object for the domain name.

    Keyword arguments:
    domain_name -- the domain name that needs a challenge record
    """
    base_domain_name = get_tld("http://{0}".format(domain_name))
    domain = rax_dns.find(name=base_domain_name)
    return domain
Example #23
Source File: wpad.py From pypac with Apache License 2.0

def proxy_urls_from_dns(local_hostname=None):
    """
    Generate URLs from which to look for a PAC file, based on a hostname.
    Fully-qualified hostnames are checked against the Mozilla Public Suffix List
    to ensure that generated URLs don't go outside the scope of the organization.
    If the fully-qualified hostname doesn't have a recognized TLD,
    such as in the case of intranets with '.local' or '.internal',
    the TLD is assumed to be the part following the rightmost dot.

    :param str local_hostname: Hostname to use for generating the WPAD URLs.
        If not provided, the local hostname is used.
    :return: PAC URLs to try in order, according to the WPAD protocol.
        If the hostname isn't qualified or is otherwise invalid, an empty list is returned.
    :rtype: list[str]
    """
    if not local_hostname:
        local_hostname = socket.getfqdn()
    if '.' not in local_hostname or len(local_hostname) < 3 or \
            local_hostname.startswith('.') or local_hostname.endswith('.'):
        return []
    try:
        parsed = get_tld('http://' + local_hostname, as_object=True)
        subdomain, tld = parsed.subdomain, parsed.fld
    except TldDomainNotFound:
        final_dot_index = local_hostname.rfind('.')
        subdomain, tld = local_hostname[0:final_dot_index], local_hostname[final_dot_index+1:]
    return wpad_search_urls(subdomain, tld)
Example #24
Source File: get_whois.py From armory with GNU General Public License v3.0

def run(domains):
    whois_domains = {}

    if type(domains) == str:
        domains = [domains]

    for domain in domains:
        tld = get_tld("blah://%s" % domain)

        if whois_domains.get(tld, False):
            whois_domains[tld]["subdomains"].append(domain.lower())
        else:
            whois_domains[tld] = {"subdomains": [domain.lower()]}

    return whois_domains
Example #25
Source File: utils.py From Photon with GNU General Public License v3.0

def top_level(url, fix_protocol=True):
    """Extract the top level domain from an URL."""
    ext = tld.get_tld(url, fix_protocol=fix_protocol)
    toplevel = '.'.join(urlparse(url).netloc.split('.')[-2:]).split(ext)[0] + ext
    return toplevel
Example #26
Source File: subdomain_brute.py From butian-src-domains with GNU General Public License v3.0

def update_subdomains(update_all=False):
    targets_list = load_all_targets()
    target_with_subdomains = load_target_with_subdomains()
    # targets_left_path = os.path.join(SUBS_DIR, 'targets_left.txt')
    # with open(targets_left_path, 'w') as f:
    #     for target in targets_list:
    #         if target['domain'] not in target_with_subdomains:
    #             f.write(json.dumps(target, ensure_ascii=False)+'\n')
    update_count = 0
    all_count = len(targets_list)
    with open(target_with_subdomains_path, 'a') as f:
        for target in targets_list:
            try:
                update_count += 1
                # get domain like: get aiyo.xyz from www.aiyo.xyz
                target_domain = target['domain']
                if not update_all and target_domain in target_with_subdomains:
                    # logger.warning('Already brute subdomain for {}, skip'.format(target_domain))
                    continue
                if not target_domain.startswith('http'):
                    target_domain = 'http://' + target_domain
                res = get_tld(target_domain, as_object=True, fail_silently=True)
                if res and res.fld:
                    subdomains = get_subdomains(res.fld)
                    if subdomains == None:
                        continue
                    target['subdomains'] = subdomains
                    # append new result
                    f.write(json.dumps(target, ensure_ascii=False)+'\n')
                    logger.info('add subdomains result for {} success, progress {}/{}'.format(target_domain, update_count, all_count))
            except Exception:
                logger.error('Unexpected error occured when update subdomains for {}'.format(target_domain))
Example #27
Source File: phishing_catcher.py From OSweep with MIT License

def score_domain(provided_ioc):
    """Return the scores of the provided domain."""
    score = 0

    for suspicious_tld in suspicious["tlds"]:
        if provided_ioc.endswith(suspicious_tld):
            score += 20

    try:
        res = tld.get_tld(provided_ioc, as_object=True, fail_silently=True, fix_protocol=True)
        domain = ".".join([res.subdomain, res.domain])
    except Exception:
        domain = provided_ioc

    score += int(round(entropy.shannon_entropy(domain)*50))
    domain = confusables.unconfuse(domain)
    words_in_domain = re.split("\W+", domain)

    if domain.startswith("*."):
        domain = domain[2:]

    if words_in_domain[0] in ["com", "net", "org"]:
        score += 10

    for word in suspicious["keywords"]:
        if word in domain:
            score += suspicious["keywords"][word]

    for key in [k for k, v in suspicious["keywords"].items() if v >= 70]:
        for word in [w for w in words_in_domain if w not in ["email", "mail", "cloud"]]:
            if pylev.levenshtein(str(word), str(key)) == 1:
                score += 70

    if "xn--" not in domain and domain.count("-") >= 4:
        score += domain.count("-") * 3

    if domain.count(".") >= 3:
        score += domain.count(".") * 3

    return score
Example #28
Source File: HomoglyphAttack.py From squatm3 with GNU General Public License v3.0

def switch_all_letters(self):
    """
    The following function generates all the possible combinations
    using homoglyphs
    """
    domains = []
    url = get_tld(self.url, as_object=True, fix_protocol=True)
    domain = url.domain
    a = []
    j = 0
    glyphs = self.dictionary

    result1 = set()
    for ws in range(1, len(domain)):
        for i in range(0, (len(domain)-ws)+1):
            win = domain[i:i+ws]
            j = 0
            while j < ws:
                c = win[j]
                if c in glyphs:
                    win_copy = win
                    for g in glyphs[c]:
                        win = win.replace(c, g)
                        result1.add(domain[:i] + win + domain[i+ws:])
                        win = win_copy
                j += 1

    result2 = set()
    for domain in result1:
        for ws in range(1, len(domain)):
            for i in range(0, (len(domain)-ws)+1):
                win = domain[i:i+ws]
                j = 0
                while j < ws:
                    c = win[j]
                    if c in glyphs:
                        win_copy = win
                        for g in glyphs[c]:
                            win = win.replace(c, g)
                            result2.add(domain[:i] + win + domain[i+ws:])
                            win = win_copy
                    j += 1

    return list(result1 | result2)
Example #29
Source File: catch_phishing.py From phishing_catcher with GNU General Public License v3.0

def score_domain(domain):
    """Score `domain`.

    The higher the score, the more likely `domain` is a phishing site.

    Args:
        domain (str): the domain to check.

    Returns:
        int: the score of `domain`.
    """
    score = 0
    for t in suspicious['tlds']:
        if domain.endswith(t):
            score += 20

    # Remove initial '*.' for wildcard certificates bug
    if domain.startswith('*.'):
        domain = domain[2:]

    # Removing TLD to catch inner TLD in subdomain (ie. paypal.com.domain.com)
    try:
        res = get_tld(domain, as_object=True, fail_silently=True, fix_protocol=True)
        domain = '.'.join([res.subdomain, res.domain])
    except Exception:
        pass

    # Higher entropy is kind of suspicious
    score += int(round(entropy(domain)*10))

    # Remove lookalike characters using list from http://www.unicode.org/reports/tr39
    domain = unconfuse(domain)

    words_in_domain = re.split("\W+", domain)

    # ie. detect fake .com (ie. *.com-account-management.info)
    if words_in_domain[0] in ['com', 'net', 'org']:
        score += 10

    # Testing keywords
    for word in suspicious['keywords']:
        if word in domain:
            score += suspicious['keywords'][word]

    # Testing Levenshtein distance for strong keywords (>= 70 points) (ie. paypol)
    for key in [k for (k, s) in suspicious['keywords'].items() if s >= 70]:
        # Removing too generic keywords (ie. mail.domain.com)
        for word in [w for w in words_in_domain if w not in ['email', 'mail', 'cloud']]:
            if distance(str(word), str(key)) == 1:
                score += 70

    # Lots of '-' (ie. www.paypal-datacenter.com-acccount-alert.com)
    if 'xn--' not in domain and domain.count('-') >= 4:
        score += domain.count('-') * 3

    # Deeply nested subdomains (ie. www.paypal.com.security.accountupdate.gq)
    if domain.count('.') >= 3:
        score += domain.count('.') * 3

    return score
Example #30
Source File: subdom.py From Vaile with GNU General Public License v3.0

def subdombrute(web):
    try:
        print(GR+' [*] Importing wordlist path to be bruteforced... "files/subdomains.lst"')
        with open('files/subdomains.lst', 'r') as lol:
            for path in lol:
                a = path.replace("\n", "")
                sublist.append(a)
    except IOError:
        print(R+' [-] Wordlist not found!')

    global found
    if 'http://' in web:
        web = web.replace('http://', '')
    elif 'https://' in web:
        web = web.replace('https://', '')
    else:
        pass
    web = 'http://' + web
    tld0 = get_tld(web, as_object=True)

    if len(sublist) > 0:
        for m in sublist:
            furl = str(m) + '.' + str(tld0)
            flist.append(furl)

    if flist:
        time.sleep(0.5)
        print(R+'\n B R U T E F O R C E R')
        print(R+' =======================\n')
        print(GR+' [*] Bruteforcing for possible subdomains...')
        for url in flist:
            if 'http://' in url:
                url = url.replace('http://', '')
            elif 'https://' in url:
                url = url.replace('https://', '')
            else:
                pass
            try:
                ip = socket.gethostbyname(url)
                print(G+'\n [!] Subdomain Found : '+O+url+P+' ['+str(ip)+']')
                found.append(url)
            except:
                sys.stdout.write(B+'\r [*] Checking : '+C+url)
                sys.stdout.flush()

    return found