Python Examples of tldextract.extract

Source File: SubDomainizer.py From SubDomainizer with MIT License

6 votes

def getDomain(url):
    """
    Return the registered domain of a URL and record its network location.

    When the URL has a non-empty ``netloc`` component, that value is added
    to the ``finalset`` collection defined outside this function.

    Parameters
    ----------
    url: str
        Original URL provided in the argument.

    Returns
    -------
    str
        The ``registered_domain`` reported by tldextract for the URL.
    """
    netloc = urlparse(url).netloc
    if netloc != '':
        finalset.add(netloc)
    return tldextract.extract(str(url)).registered_domain

Source File: string_utils.py From ph0neutria with Apache License 2.0

6 votes

def remove_tld(domain):
    """Remove the TLD from a domain name.

    Params:
    - domain: (type: string) FQDN.

    Returns:
    - domain: (type: string) FQDN without TLD. The input is returned
      unchanged when no suffix is found or when stripping fails.
    """
    try:
        tld = extract(domain).suffix
        # An empty suffix would make rsplit() raise "empty separator" and
        # log a misleading warning; there is simply nothing to strip.
        if tld:
            domain = ''.join(domain.rsplit(tld, 1)).strip('.')

    except Exception as e:
        LOGGING.warning(
            'Error stripping TLD ({0}): {1}'.format(
                domain, str(e)))

    return domain

Source File: crawler.py From FinalRecon with MIT License

6 votes

def external_links(target):
    """Collect anchor hrefs that point outside the target's registered domain.

    Scans every ``<a>`` tag of the ``soup`` object defined outside this
    function. An href is kept when it does not contain the registered domain
    and contains the text ``http``. Matches are appended to the global
    ``ext_total``, which is then rebound to a set, and the count is printed.
    """
    global total, ext_total
    print(G + '[+]' + C + ' Extracting External Links' + W, end = '')

    domain = tldextract.extract(target).registered_domain

    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None:
            continue
        if domain not in href and 'http' in href:
            ext_total.append(href)

    ext_total = set(ext_total)
    print(G + '['.rjust(6, '.') + ' {} ]'.format(str(len(ext_total))))

Source File: crawler.py From FinalRecon with MIT License

6 votes

def internal_links(target):
    """Collect anchor hrefs that mention the target's registered domain.

    Scans every ``<a>`` tag of the ``soup`` object defined outside this
    function. Hrefs containing the registered domain are appended to the
    global ``int_total``, which is then rebound to a set, and the count is
    printed.
    """
    global total, int_total
    print(G + '[+]' + C + ' Extracting Internal Links' + W, end = '')

    domain = tldextract.extract(target).registered_domain

    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None:
            continue
        if domain in href:
            int_total.append(href)

    int_total = set(int_total)
    print(G + '['.rjust(6, '.') + ' {} ]'.format(str(len(int_total))))

Source File: kostebek.py From kostebek with GNU General Public License v3.0

6 votes

def getGoogleDomains(self):
    """Print the distinct ``<domain>.<suffix>`` values found in the Google results file.

    Reads ``results/<self.org>/google.txt`` line by line, reduces each line
    with tldextract and prints every distinct value once, in order of first
    appearance. Prints "Not found" and calls ``exit()`` when the file does
    not exist.
    """
    path = "results/"+self.org+"/google.txt"
    if not os.path.exists(path):
        print("Not found")
        exit()

    domains = []
    seen = set()  # constant-time duplicate check; ``domains`` keeps the order
    # The with-block closes the file even if extraction raises.
    with open(path) as fh:
        for line in fh:
            extracted = tldextract.extract(line)
            name = extracted.domain+"."+extracted.suffix
            if name not in seen:
                seen.add(name)
                domains.append(name)
    for domain in domains:
        print(domain)

Source File: kostebek.py From kostebek with GNU General Public License v3.0

6 votes

def getBingDomains(self):
    """Print the distinct ``<domain>.<suffix>`` values found in the Bing results file.

    Reads ``results/<self.org>/bing.txt`` line by line, reduces each line
    with tldextract and prints every distinct value once, in order of first
    appearance. Prints "Not found" and calls ``exit()`` when the file does
    not exist.
    """
    path = "results/"+self.org+"/bing.txt"
    if not os.path.exists(path):
        print("Not found")
        exit()

    domains = []
    seen = set()  # constant-time duplicate check; ``domains`` keeps the order
    # The with-block closes the file even if extraction raises.
    with open(path) as fh:
        for line in fh:
            extracted = tldextract.extract(line)
            name = extracted.domain+"."+extracted.suffix
            if name not in seen:
                seen.add(name)
                domains.append(name)
    for domain in domains:
        print(domain)

Source File: kostebek.py From kostebek with GNU General Public License v3.0

6 votes

def getYahooDomains(self):
    """Print the distinct ``<domain>.<suffix>`` values found in the Yahoo results file.

    Reads ``results/<self.org>/yahoo.txt`` line by line, reduces each line
    with tldextract and prints every distinct value once, in order of first
    appearance. Prints "Not found" and calls ``exit()`` when the file does
    not exist.
    """
    path = "results/"+self.org+"/yahoo.txt"
    if not os.path.exists(path):
        print("Not found")
        exit()

    domains = []
    seen = set()  # constant-time duplicate check; ``domains`` keeps the order
    # The with-block closes the file even if extraction raises.
    with open(path) as fh:
        for line in fh:
            extracted = tldextract.extract(line)
            name = extracted.domain+"."+extracted.suffix
            if name not in seen:
                seen.add(name)
                domains.append(name)
    for domain in domains:
        print(domain)

Source File: URLHelper.py From WSPIH with MIT License

6 votes

def get_subdomain(url):
    """Get the subdomain of the given URL.

    Args:
        url (str): The URL to get the subdomain from.

    Returns:
        str: All dot-separated labels of the network location except the
            last two, joined with dots (empty when there are two or fewer).

    """
    # Parse each URL only once; the result is kept in the class-level cache.
    if url not in URLHelper.__cache:
        URLHelper.__cache[url] = urlparse(url)

    # NOTE: plain label counting - a multi-label suffix such as "co.uk"
    # is not recognised as a single suffix here.
    labels = URLHelper.__cache[url].netloc.split(".")
    return ".".join(labels[:-2])

Source File: URLHelper.py From WSPIH with MIT License

6 votes

def get_hostname(url):
    """Get the hostname of the given URL.

    Args:
        url (str): The URL to get the hostname from.

    Returns:
        str: The only label of the network location when it has a single
            label, otherwise the second-to-last dot-separated label.

    """
    # Parse each URL only once; the result is kept in the class-level cache.
    if url not in URLHelper.__cache:
        URLHelper.__cache[url] = urlparse(url)

    labels = URLHelper.__cache[url].netloc.split(".")
    return labels[0] if len(labels) == 1 else labels[-2]

Source File: utils.py From open-syllabus-project with Apache License 2.0

6 votes

def parse_domain(url):

    """
    Extract the registered domain from a URL.

    The URL is lower-cased and stripped, and only the text after the last
    ``http:/`` / ``https://``-style prefix is handed to tldextract.

    Args:
        url (str)

    Returns: str
    """

    cleaned = url.lower().strip()

    # Splitting on the scheme marker and taking the final piece keeps the
    # last `http://...` sequence.
    tail = re.split('http[s]?:/{1,2}', cleaned)[-1]

    return tldextract.extract(tail).registered_domain

Source File: cleanup.py From cccatalog-api with MIT License

6 votes

def cleanup_url(url, tls_support):
    """
    Prefix a scheme-less URL with ``https://`` or ``http://``.

    ``tls_support`` acts as a cache keyed by ``subdomain.domain.suffix``.
    On a miss, the result of ``TlsTest.test_tls_supported(url)`` is stored
    in it and logged. Returns the prefixed URL wrapped in single quotes, or
    None when the URL already has a scheme.
    """
    if urlparse(url).scheme != '':
        return None

    parts = tldextract.extract(url)
    host_key = f'{parts.subdomain}.{parts.domain}.{parts.suffix}'
    try:
        tls_supported = tls_support[host_key]
    except KeyError:
        tls_supported = TlsTest.test_tls_supported(url)
        tls_support[host_key] = tls_supported
        log.info('Tested domain {}'.format(host_key))

    template = "'https://{}'" if tls_supported else "'http://{}'"
    return template.format(url)

Source File: __main__.py From altdns with Apache License 2.0

6 votes

def join_words_subdomains(args, alteration_words):
    """Write variants of each input host with a word glued onto one subdomain label.

    For every line of ``args.input``, every word and every subdomain label,
    two hosts are passed to ``write_domain``: first with the word appended
    to the label, then with the word prepended to it.
    """
    with open(args.input, "r") as fp, open(args.output_tmp, "a") as wp:
        for line in fp:
            ext = tldextract.extract(line.strip())
            labels = ext.subdomain.split(".")
            for word in alteration_words:
                token = word.strip()
                for pos, label in enumerate(labels):
                    # suffix form first, prefix form second
                    for altered in (label + token, token + label):
                        variant = labels[:pos] + [altered] + labels[pos + 1:]
                        # join the list to make into actual subdomain (aa.bb.cc)
                        actual_sub = ".".join(variant)
                        full_url = "{0}.{1}.{2}\n".format(
                            actual_sub, ext.domain, ext.suffix)
                        write_domain(args, wp, full_url)

Source File: requestium.py From requestium with BSD 3-Clause "New" or "Revised" License

6 votes

def transfer_session_cookies_to_driver(self, domain=None):
    """Copy the Session's cookies into the webdriver.

    Only cookies whose domain contains ``domain`` are transferred. When no
    domain is given, the registered domain of the last requested URL is used.
    An Exception is raised when there is neither a domain nor a last URL.
    """
    if not domain:
        if not self._last_requests_url:
            raise Exception('Trying to transfer cookies to selenium without specifying a domain '
                            'and without having visited any page in the current session')
        domain = tldextract.extract(self._last_requests_url).registered_domain

    # Select the cookies up front, then hand each one to the driver.
    matching = [cookie for cookie in self.cookies if domain in cookie.domain]
    for cookie in matching:
        payload = {'name': cookie.name, 'value': cookie.value, 'path': cookie.path,
                   'expiry': cookie.expires, 'domain': cookie.domain}
        self.driver.ensure_add_cookie(payload)

Source File: textspider.py From ARGUS with GNU General Public License v3.0

6 votes

def subdomainGetter(self, response):
    """Return the host (subdomain plus registered domain) of a URL.

    ``response`` may be a URL string or an object exposing a ``url``
    attribute (a scrapy response). Without a subdomain, only the registered
    domain is returned.
    """
    # plain strings are used as-is; anything else is read through its .url
    target = response if isinstance(response, str) else response.url
    parts = tldextract.extract(target)
    if parts.subdomain != "":
        return parts.subdomain + "." + parts.registered_domain
    return parts.registered_domain
        
    #function which checks if there has been a redirect from the starting url

Source File: linkspider.py From ARGUS with GNU General Public License v3.0

6 votes

def subdomainGetter(self, response):
    """Return the host (subdomain plus registered domain) of a URL.

    ``response`` may be a URL string or an object exposing a ``url``
    attribute (a scrapy response). Without a subdomain, only the registered
    domain is returned.
    """
    # plain strings are used as-is; anything else is read through its .url
    target = response if isinstance(response, str) else response.url
    parts = tldextract.extract(target)
    if parts.subdomain != "":
        return parts.subdomain + "." + parts.registered_domain
    return parts.registered_domain
        
    #function which checks if there has been a redirect from the starting url

Source File: textspider.py From ARGUS with GNU General Public License v3.0

6 votes

def extractText(self, response):
    """Collect the direct text nodes of common HTML tags from a response.

    Returns a list of ``[tag, [joined_text]]`` pairs in a fixed tag order,
    where ``joined_text`` is every ``//tag/text()`` match joined by single
    spaces.
    """
    tags = (
        "p",       # paragraph
        "div",     # division
        "tr",      # table row
        "td",      # table data
        "th",      # table header
        "font",    # font size, css should be used (only relevant for old websites)
        "li",      # list item
        "small",   # barely emphasized text
        "strong",  # strongly emphasized text
        "h1", "h2", "h3", "h4", "h5", "h6",  # headers
        "span",    # division for styling
        "b",       # bold text
        "em",      # emphasized text
    )
    return [
        [tag, [" ".join(response.xpath("//" + tag + "/text()").extract())]]
        for tag in tags
    ]

    #function which extracts and returns meta information

Source File: subdomain_brute.py From butian-src-domains with GNU General Public License v3.0

6 votes

def get_subdomains(domain):
    """Validate ``domain`` and brute-force its subdomains.

    Returns the result of ``EnumSubDomain(domain).run()``. Returns None when
    the domain is empty, fails validation, or the enumeration raises (the
    error is logged in each case).
    """
    if not domain:
        logger.warning('domain is empty')
        return

    # validate domain
    p = domain.strip().lower()
    re_domain = re.findall(r'^(([a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,})$', p)
    is_valid = (len(re_domain) > 0 and re_domain[0][0] == p
                and tldextract.extract(p).suffix != '')
    if not is_valid:
        logger.error('Domain validation failed: {d}'.format(d=p))
        # Stop here: previously the failure was only logged and the
        # brute-force run started anyway on the invalid name.
        return

    try:
        esd = EnumSubDomain(domain)
        return esd.run()
    except Exception:
        logger.error('Unexpected error occured when brute subdomain for {}'.format(domain),exc_info=True)

Source File: crawl_bot.py From osint-suite-tools with GNU General Public License v3.0

6 votes

def collect_url(web_url):
    """Fetch a page, save its HTML to disk and return the links found in it.

    The body is decoded as latin-1 only when the Content-Type header contains
    ``text/html``; otherwise an empty string is used. The HTML is fed to a
    ``link_crawler`` and written to a file named after the URL's domain
    inside ``Crawl_bot.folder_name``. Any exception is printed and an empty
    set is returned.
    """
    html_data_string = ''
    try:
        # The context manager closes the HTTP response even if reading fails.
        with urlopen(web_url) as received_response:
            if 'text/html' in received_response.getheader('Content-Type'):
                data_bytes = received_response.read()
                html_data_string = data_bytes.decode("latin-1")
        link_finder = link_crawler(Crawl_bot.start_link, web_url)
        link_finder.feed(html_data_string)

        # FOR SCRAPING PURPOSES: keep a copy of the page on disk. The
        # with-block closes the file even if the write fails.
        with open(Crawl_bot.folder_name + '/' + ((tldextract.extract(web_url)).domain), 'w') as f:
            f.write(html_data_string)

    except Exception as e:
        print(str(e))
        return set()
    return link_finder.page_urls()

Source File: textspider.py From ARGUS with GNU General Public License v3.0

6 votes

def subdomainGetter(self, response):
    """Return the host (subdomain plus registered domain) of a URL.

    ``response`` may be a URL string or an object exposing a ``url``
    attribute (a scrapy response). Without a subdomain, only the registered
    domain is returned.
    """
    # plain strings are used as-is; anything else is read through its .url
    target = response if isinstance(response, str) else response.url
    parts = tldextract.extract(target)
    if parts.subdomain != "":
        return parts.subdomain + "." + parts.registered_domain
    return parts.registered_domain
        
    #function which checks if there has been a redirect from the starting url

Source File: linkspider.py From ARGUS with GNU General Public License v3.0

6 votes

def subdomainGetter(self, response):
    """Return the host (subdomain plus registered domain) of a URL.

    ``response`` may be a URL string or an object exposing a ``url``
    attribute (a scrapy response). Without a subdomain, only the registered
    domain is returned.
    """
    # plain strings are used as-is; anything else is read through its .url
    target = response if isinstance(response, str) else response.url
    parts = tldextract.extract(target)
    if parts.subdomain != "":
        return parts.subdomain + "." + parts.registered_domain
    return parts.registered_domain
        
    #function which checks if there has been a redirect from the starting url

Source File: textspider.py From ARGUS with GNU General Public License v3.0

6 votes

def extractText(self, response):
    """Collect the direct text nodes of common HTML tags from a response.

    Returns a list of ``[tag, [joined_text]]`` pairs in a fixed tag order,
    where ``joined_text`` is every ``//tag/text()`` match joined by single
    spaces.
    """
    tags = (
        "p", "div", "tr", "td", "font", "li", "small", "strong",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "span", "b", "em",
    )
    return [
        [tag, [" ".join(response.xpath("//" + tag + "/text()").extract())]]
        for tag in tags
    ]
    
    #function which reorders the urlstack, giving highest priority to short urls and language tagged urls

Source File: BuscadorPersonas.py From osint-suite-tools with GNU General Public License v3.0

6 votes

def search_google_(target):
    """Search Google for the quoted target and parse each successful result page.

    Every result is printed and then fetched with a 3 second timeout. When
    the response status is 2xx and the result's ``domain.suffix`` is not in
    ``config.BL_parserPhone``, the tag-stripped page text is passed to
    ``parser.parserMAIN``. Any exception for a result is printed and the
    loop continues.
    """
    engine = Google()
    results = engine.search("'" + target + "'")
    for r in results:
        print ("|--[INFO][GOOGLE][RESULTS][>] " + r["title"] + " | " + r["text"] + " | " + r["link"])

        try:
            tsd, td, tsu = extract(r["link"])
            domain = td + '.' + tsu

            web = requests.get(r["link"], timeout=3)
            print ("|----[INFO][WEB][HTTP CODE][>] " + str(web.status_code) + "\n")

            # Was ">= 200 or < 300", which is true for every status code.
            if 200 <= web.status_code < 300:

                if domain not in config.BL_parserPhone:
                    TEXT = er.remove_tags(str(web.text))
                    parser.parserMAIN(TEXT)

        except Exception as e:
            print ("|----[ERROR][HTTP CONNECTION][>] " + str(e))

Source File: shared.py From quay with Apache License 2.0

6 votes

def can_create_user(email_address, blacklisted_domains=None):
    """
    Returns true if a user with the specified e-mail address can be created.

    Checks run in this order: the e-mail's registered domain against the
    blacklist (when that feature is on), the global user-creation feature,
    and, in invite-only mode, the presence of a team invite for the address.
    """
    check_blacklist = features.BLACKLISTED_EMAILS and email_address and "@" in email_address
    if check_blacklist:
        denied = blacklisted_domains or []
        email_domain = email_address.split("@", 1)[1]
        registered = tldextract.extract(email_domain).registered_domain
        if registered.lower() in denied:
            return False

    if not features.USER_CREATION:
        return False

    if not features.INVITE_ONLY_USER_CREATION:
        # The user can be created (assuming it doesn't already exist, of course)
        return True

    if not email_address:
        return False

    # Invite-only: there must be an invite for the e-mail address.
    return bool(model.team.lookup_team_invites_by_email(email_address))

Source File: dnsgen.py From dnsgen with MIT License

5 votes

def partiate_domain(domain):
    '''
    Split a domain into its subdomain labels followed by the registered domain.
    Root+TLD is taken as one part, regardless of its levels (example.co.uk, example.com, ...)

    test.1.foo.example.com -> [test, 1, foo, example.com]
    test.2.foo.example.com.cn -> [test, 2, foo, example.com.cn]
    test.example.co.uk -> [test, example.co.uk]

    With an empty subdomain the first element is an empty string,
    because ''.split('.') yields [''].
    '''
    ext = tldextract.extract(domain.lower())
    labels = ext.subdomain.split('.')
    labels.append(ext.registered_domain)
    return labels

Source File: __main__.py From altdns with Apache License 2.0

5 votes

def insert_dash_subdomains(args, alteration_words):
    """Write variants of each input host with a word dash-joined to one subdomain label.

    For every line of ``args.input``, every word and every subdomain label,
    up to two hosts are passed to ``write_domain``: ``label-word`` and then
    ``word-label``. A variant is skipped when its subdomain would start
    (first form) or end (second form) with a dash.
    """
    with open(args.input, "r") as fp, open(args.output_tmp, "a") as wp:
        for line in fp:
            ext = tldextract.extract(line.strip())
            labels = ext.subdomain.split(".")
            for word in alteration_words:
                token = word.strip()
                for pos, label in enumerate(labels):
                    # first dash alteration: label-word
                    variant = labels[:pos] + [label + "-" + token] + labels[pos + 1:]
                    # join the list to make into actual subdomain (aa.bb.cc)
                    actual_sub = ".".join(variant)
                    full_url = "{0}.{1}.{2}\n".format(
                        actual_sub, ext.domain, ext.suffix)
                    # Compare by value: "is not" on a str literal tests object
                    # identity and only worked through interning.
                    if len(variant[0]) > 0 and actual_sub[:1] != "-":
                        write_domain(args, wp, full_url)

                    # second dash alteration: word-label
                    variant = labels[:pos] + [token + "-" + label] + labels[pos + 1:]
                    actual_sub = ".".join(variant)
                    full_url = "{0}.{1}.{2}\n".format(
                        actual_sub, ext.domain, ext.suffix)
                    if actual_sub[-1:] != "-":
                        write_domain(args, wp, full_url)

# adds prefix and suffix word to each subdomain

Source File: __main__.py From altdns with Apache License 2.0

5 votes

def insert_all_indexes(args, alteration_words):
    """Write variants of each input host with a word inserted as a new subdomain label.

    For every line of ``args.input`` and every word, the word is inserted at
    each index of the subdomain label list and finally appended at the end.
    Each resulting host is passed to ``write_domain``. Inserted variants
    whose subdomain ends with a dot, and the appended variant when the first
    label is empty, are skipped.
    """
    with open(args.input, "r") as fp, open(args.output_tmp, "a") as wp:
        for line in fp:
            ext = tldextract.extract(line.strip())
            current_sub = ext.subdomain.split(".")
            for word in alteration_words:
                token = word.strip()
                for index in range(len(current_sub)):
                    current_sub.insert(index, token)
                    # join the list to make into actual subdomain (aa.bb.cc)
                    actual_sub = ".".join(current_sub)
                    # save full URL as line in file
                    full_url = "{0}.{1}.{2}\n".format(
                        actual_sub, ext.domain, ext.suffix)
                    # Compare by value: "is not" on a str literal tests
                    # object identity and only worked through interning.
                    if actual_sub[-1:] != ".":
                        write_domain(args, wp, full_url)
                    # undo the insertion before trying the next index
                    current_sub.pop(index)
                current_sub.append(token)
                actual_sub = ".".join(current_sub)
                full_url = "{0}.{1}.{2}\n".format(
                    actual_sub, ext.domain, ext.suffix)
                if len(current_sub[0]) > 0:
                    write_domain(args, wp, full_url)
                current_sub.pop()

# adds word-NUM and wordNUM to each subdomain at each unique position

Source File: WebSpider.py From Pansidong with GNU General Public License v3.0

5 votes

def check_domain_limit(self, url):
    """Return True when ``url`` is allowed by an entry of ``self.limit_domain``.

    An entry whose subdomain part is ``*`` or empty matches any URL with the
    same domain label. Any other entry matches only when it equals the URL's
    ``subdomain.domain.suffix`` string exactly.
    """
    # Extract the URL once instead of on every loop iteration. Named fields
    # are used so the code does not depend on the result being a 3-tuple.
    url_ext = tldextract.extract(url)
    url_host = ".".join((url_ext.subdomain, url_ext.domain, url_ext.suffix))

    for domain in self.limit_domain:
        rule = tldextract.extract(domain)
        # "*" (or no subdomain) matches every subdomain; otherwise only the
        # specific host matches.
        # NOTE(review): the wildcard branch compares only the domain label,
        # not the suffix - confirm this is intended.
        wildcard = rule.subdomain == "*" or rule.subdomain == ""
        if (wildcard and url_ext.domain == rule.domain) or url_host == domain:
            return True

    return False

Source File: BuscadorNoticiasFalsas.py From osint-suite-tools with GNU General Public License v3.0

5 votes

def search_DDG_(target, TEXT_0):
    """Search DuckDuckGo for the quoted target and compare each result page with TEXT_0.

    Every result is printed and then fetched with a 3 second timeout. For a
    2xx response whose link does not contain ".pdf" and whose
    ``domain.suffix`` is not in ``config.BL_parserPhone``, the tag-stripped
    page text is run through the parsers, its similarity ratio against
    ``TEXT_0`` is printed, and a log line is written with ``generateLOG``.
    The loop sleeps 2 seconds after each fetched result. Exceptions are
    printed and the loop continues.
    """
    engine = Duckduckgo()
    results = engine.search("'" + target + "'")
    for r in results:
        # NOTE(review): the label says GOOGLE although this is the DuckDuckGo search.
        print ("|--[INFO][GOOGLE][RESULTS][>] " + r["title"] + " | " + r["text"] + " | " + r["link"])

        try:
            tsd, td, tsu = extract(r["link"])
            domain = td + '.' + tsu

            web = requests.get(r["link"], timeout=3)
            print ("|----[INFO][WEB][HTTP CODE][>] " + str(web.status_code) + "\n")

            # Was ">= 200 or < 300", which is true for every status code.
            if 200 <= web.status_code < 300:
                if ".pdf" not in r["link"] and domain not in config.BL_parserPhone:
                    TEXT = er.remove_tags(str(web.text))

                    # NOTE(review): this result is discarded and the ratio is
                    # recomputed below - confirm the call is still needed.
                    compareTEXT(TEXT, TEXT_0)
                    parser.FC_words_in_text(TEXT)
                    parser.parserMAIN(TEXT)

                    ratio = compareTEXT(TEXT_0, TEXT)
                    print(f"|----[INFO][COMPARE TEXTS][>] Ratio: {ratio}")

                    # Save the info in a log
                    data = f"{r['title']} ||| {r['link']} ||| {r['text']}, ||| {ratio} \n"
                    generateLOG(data, target)
            print("")
            time.sleep(2)

        except Exception as e:
            print ("|----[ERROR][HTTP CONNECTION][>] " + str(e))

Source File: BuscadorNoticiasFalsas.py From osint-suite-tools with GNU General Public License v3.0

5 votes

def search_google_(target, TEXT_0):
    """Search Google for the quoted target and compare each result page with TEXT_0.

    Every result is printed and then fetched with a 3 second timeout. For a
    2xx response whose link does not contain ".pdf" and whose
    ``domain.suffix`` is not in ``config.BL_parserPhone``, the tag-stripped
    page text is run through the parsers, its similarity ratio against
    ``TEXT_0`` is printed, and a log line is written with ``generateLOG``.
    Exceptions are printed and the loop continues.
    """
    engine = Google()
    results = engine.search("'" + target + "'")
    for r in results:
        print ("|--[INFO][GOOGLE][RESULTS][>] " + r["title"] + " | " + r["text"] + " | " + r["link"])

        try:
            tsd, td, tsu = extract(r["link"])
            domain = td + '.' + tsu

            web = requests.get(r["link"], timeout=3)
            print ("|----[INFO][WEB][HTTP CODE][>] " + str(web.status_code) + "\n")

            # Was ">= 200 or < 300", which is true for every status code.
            if 200 <= web.status_code < 300:
                if ".pdf" not in r["link"] and domain not in config.BL_parserPhone:
                    TEXT = er.remove_tags(str(web.text))
                    parser.parserMAIN(TEXT)
                    parser.FC_words_in_text(TEXT)

                    ratio = compareTEXT(TEXT_0, TEXT)
                    print(f"|----[INFO][COMPARE TEXTS][>] Ratio: {ratio}")

                    # Save the info in a log
                    data = f"{r['title']} ||| {r['link']} ||| {r['text']}, ||| {ratio} \n"
                    generateLOG(data, target)
            print("")

        except Exception as e:
            print ("|----[ERROR][HTTP CONNECTION][>] " + str(e))

Source File: __main__.py From altdns with Apache License 2.0

5 votes

def insert_number_suffix_subdomains(args, alternation_words):
    """Write variants of each input host with a digit 0-9 added to one subdomain label.

    For every line of ``args.input``, every digit and every subdomain label,
    two hosts are passed to ``write_domain``: ``label-N`` and then
    ``labelN``. ``alternation_words`` is not used by this function.
    """
    with open(args.input, "r") as fp, open(args.output_tmp, "a") as wp:
        for line in fp:
            ext = tldextract.extract(line.strip())
            labels = ext.subdomain.split(".")
            for number in range(0, 10):
                digit = str(number)
                for pos, label in enumerate(labels):
                    # word-NUM first, then wordNUM
                    for altered in (label + "-" + digit, label + digit):
                        variant = labels[:pos] + [altered] + labels[pos + 1:]
                        # join the list to make into actual subdomain (aa.bb.cc)
                        actual_sub = ".".join(variant)
                        full_url = "{0}.{1}.{2}\n".format(actual_sub, ext.domain, ext.suffix)
                        write_domain(args, wp, full_url)

# adds word- and -word to each subdomain at each unique position

Python tldextract.extract() Examples