Python Examples of idna.decode

Source File: tldextract.py From CrawlBox with The Unlicense

6 votes

def _cache_tlds(self, tlds):
        '''Logs a diff of the new TLDs and caches them on disk, according to
        settings passed to __init__.'''
        if LOG.isEnabledFor(logging.DEBUG):
            import difflib
            snapshot_stream = pkg_resources.resource_stream(__name__, '.tld_set_snapshot')
            with closing(snapshot_stream) as snapshot_file:
                snapshot = sorted(
                    json.loads(snapshot_file.read().decode('utf-8'))
                )
            new = sorted(tlds)
            LOG.debug('computed TLD diff:\n' + '\n'.join(difflib.unified_diff(
                snapshot,
                new,
                fromfile=".tld_set_snapshot",
                tofile=self.cache_file
            )))

        if self.cache_file:
            try:
                with open(self.cache_file, 'w') as cache_file:
                    json.dump(tlds, cache_file)
            except IOError as ioe:
                LOG.warn("unable to cache TLDs in file %s: %s", self.cache_file, ioe)

Source File: __init__.py From Galaxy_Plugin_Bethesda with MIT License

6 votes

def _encode_host(cls, host):
        try:
            ip, sep, zone = host.partition("%")
            ip = ip_address(ip)
        except ValueError:
            try:
                host = idna.encode(host, uts46=True).decode("ascii")
            except UnicodeError:
                host = host.encode("idna").decode("ascii")
        else:
            host = ip.compressed
            if sep:
                host += "%" + zone
            if ip.version == 6:
                host = "[" + host + "]"
        return host

Source File: __init__.py From Galaxy_Plugin_Bethesda with MIT License

6 votes

def host(self):
        """Decoded host part of URL.

        None for relative URLs.

        """
        raw = self.raw_host
        if raw is None:
            return None
        if "%" in raw:
            # Hack for scoped IPv6 addresses like
            # fe80::2%Проверка
            # presence of '%' sign means only IPv6 address, so idna is useless.
            return raw

        try:
            return idna.decode(raw.encode("ascii"))
        except UnicodeError:  # e.g. '::1'
            return raw.encode("ascii").decode("idna")

Source File: _idna.py From learn_python3_spider with MIT License

6 votes

def _idnaText(octets):
    """
    Convert some IDNA-encoded octets into some human-readable text.

    Currently only used by the tests.

    @param octets: Some bytes representing a hostname.
    @type octets: L{bytes}

    @return: A human-readable domain name.
    @rtype: L{unicode}
    """
    try:
        import idna
    except ImportError:
        return octets.decode("idna")
    else:
        return idna.decode(octets)

Source File: __init__.py From yarl with Apache License 2.0

6 votes

def _encode_host(cls, host):
            try:
                ip, sep, zone = host.partition("%")
                ip = ip_address(ip)
            except ValueError:
                for char in host:
                    if char > "\x7f":
                        break
                else:
                    return host
                try:
                    host = idna.encode(host, uts46=True).decode("ascii")
                except UnicodeError:
                    host = host.encode("idna").decode("ascii")
            else:
                host = ip.compressed
                if sep:
                    host += "%" + zone
                if ip.version == 6:
                    host = "[" + host + "]"
            return host

Source File: __init__.py From yarl with Apache License 2.0

6 votes

def _encode_host(cls, host):
            try:
                ip, sep, zone = host.partition("%")
                ip = ip_address(ip)
            except ValueError:
                # IDNA encoding is slow,
                # skip it for ASCII-only strings
                if host.isascii():
                    return host
                try:
                    host = idna.encode(host, uts46=True).decode("ascii")
                except UnicodeError:
                    host = host.encode("idna").decode("ascii")
            else:
                host = ip.compressed
                if sep:
                    host += "%" + zone
                if ip.version == 6:
                    host = "[" + host + "]"
            return host

Source File: __init__.py From yarl with Apache License 2.0

6 votes

def host(self):
        """Decoded host part of URL.

        None for relative URLs.

        """
        raw = self.raw_host
        if raw is None:
            return None
        if "%" in raw:
            # Hack for scoped IPv6 addresses like
            # fe80::2%Проверка
            # presence of '%' sign means only IPv6 address, so idna is useless.
            return raw
        try:
            return idna.decode(raw.encode("ascii"))
        except UnicodeError:  # e.g. '::1'
            return raw.encode("ascii").decode("idna")

Source File: _idna.py From Safejumper-for-Desktop with GNU General Public License v2.0

6 votes

def _idnaText(octets):
    """
    Convert some IDNA-encoded octets into some human-readable text.

    Currently only used by the tests.

    @param octets: Some bytes representing a hostname.
    @type octets: L{bytes}

    @return: A human-readable domain name.
    @rtype: L{unicode}
    """
    try:
        import idna
    except ImportError:
        return octets.decode("idna")
    else:
        return idna.decode(octets)

Source File: decode_asn1.py From Safejumper-for-Desktop with GNU General Public License v2.0

5 votes

def _decode_certificate_policies(backend, cp):
    cp = backend._ffi.cast("Cryptography_STACK_OF_POLICYINFO *", cp)
    cp = backend._ffi.gc(cp, backend._lib.sk_POLICYINFO_free)
    num = backend._lib.sk_POLICYINFO_num(cp)
    certificate_policies = []
    for i in range(num):
        qualifiers = None
        pi = backend._lib.sk_POLICYINFO_value(cp, i)
        oid = x509.ObjectIdentifier(_obj2txt(backend, pi.policyid))
        if pi.qualifiers != backend._ffi.NULL:
            qnum = backend._lib.sk_POLICYQUALINFO_num(pi.qualifiers)
            qualifiers = []
            for j in range(qnum):
                pqi = backend._lib.sk_POLICYQUALINFO_value(
                    pi.qualifiers, j
                )
                pqualid = x509.ObjectIdentifier(
                    _obj2txt(backend, pqi.pqualid)
                )
                if pqualid == CertificatePoliciesOID.CPS_QUALIFIER:
                    cpsuri = backend._ffi.buffer(
                        pqi.d.cpsuri.data, pqi.d.cpsuri.length
                    )[:].decode('ascii')
                    qualifiers.append(cpsuri)
                else:
                    assert pqualid == CertificatePoliciesOID.CPS_USER_NOTICE
                    user_notice = _decode_user_notice(
                        backend, pqi.d.usernotice
                    )
                    qualifiers.append(user_notice)

        certificate_policies.append(
            x509.PolicyInformation(oid, qualifiers)
        )

    return x509.CertificatePolicies(certificate_policies)

Source File: cachefile.py From URLExtract with MIT License

5 votes

def _load_cached_tlds(self):
        """
        Loads TLDs from cached file to set.

        :return: Set of current TLDs
        :rtype: set
        """

        # check if cached file is readable
        if not os.access(self._tld_list_path, os.R_OK):
            self._logger.error("Cached file is not readable for current "
                               "user. ({})".format(self._tld_list_path))
            raise CacheFileError(
                "Cached file is not readable for current user."
            )

        set_of_tlds = set()

        with filelock.FileLock(self._get_cache_lock_file_path()):
            with open(self._tld_list_path, 'r') as f_cache_tld:
                for line in f_cache_tld:
                    tld = line.strip().lower()
                    # skip empty lines
                    if not tld:
                        continue
                    # skip comments
                    if tld[0] == '#':
                        continue

                    set_of_tlds.add("." + tld)
                    set_of_tlds.add("." + idna.decode(tld))

        return set_of_tlds

Source File: decode_asn1.py From Safejumper-for-Desktop with GNU General Public License v2.0

5 votes

def _asn1_string_to_utf8(backend, asn1_string):
    buf = backend._ffi.new("unsigned char **")
    res = backend._lib.ASN1_STRING_to_UTF8(buf, asn1_string)
    if res == -1:
        raise ValueError(
            "Unsupported ASN1 string type. Type: {0}".format(asn1_string.type)
        )

    backend.openssl_assert(buf[0] != backend._ffi.NULL)
    buf = backend._ffi.gc(
        buf, lambda buffer: backend._lib.OPENSSL_free(buffer[0])
    )
    return backend._ffi.buffer(buf[0], res)[:].decode('utf8')

Source File: decode_asn1.py From Safejumper-for-Desktop with GNU General Public License v2.0

5 votes

def _asn1_string_to_ascii(backend, asn1_string):
    return _asn1_string_to_bytes(backend, asn1_string).decode("ascii")

Source File: decode_asn1.py From Safejumper-for-Desktop with GNU General Public License v2.0

5 votes

def _obj2txt(backend, obj):
    # Set to 80 on the recommendation of
    # https://www.openssl.org/docs/crypto/OBJ_nid2ln.html#return_values
    buf_len = 80
    buf = backend._ffi.new("char[]", buf_len)
    res = backend._lib.OBJ_obj2txt(buf, buf_len, obj, 1)
    backend.openssl_assert(res > 0)
    return backend._ffi.buffer(buf, res)[:].decode()

Source File: address.py From isthislegit with BSD 3-Clause "New" or "Revised" License

5 votes

def _lift_parse_list_result(parse_list_rs):
    addr_list_obj = AddressList()
    bad_list = []
    for parse_rs in parse_list_rs:
        addr_obj = _lift_parse_result(parse_rs)
        if not addr_obj:
            if isinstance(parse_rs, Mailbox):
                bad_list.append(u'%s@%s' % (parse_rs.local_part.decode('utf-8'),
                                            parse_rs.domain.decode('utf-8')))
            continue

        addr_list_obj.append(addr_obj)

    return addr_list_obj, bad_list

Source File: address.py From isthislegit with BSD 3-Clause "New" or "Revised" License

5 votes

def _lift_parse_result(parse_rs):
    if isinstance(parse_rs, Mailbox):
        try:
            return EmailAddress(
                display_name=smart_unquote(parse_rs.display_name.decode('utf-8')),
                mailbox=parse_rs.local_part.decode('utf-8'),
                hostname=parse_rs.domain.decode('utf-8'))
        except (UnicodeError, IDNAError):
            return None

    if isinstance(parse_rs, Url):
        return UrlAddress(address=parse_rs.address.decode('utf-8'))

    return None

Source File: x509.py From oss-ftp with MIT License

5 votes

def _decode_invalidity_date(backend, inv_date):
    generalized_time = backend._ffi.cast(
        "ASN1_GENERALIZEDTIME *", inv_date
    )
    generalized_time = backend._ffi.gc(
        generalized_time, backend._lib.ASN1_GENERALIZEDTIME_free
    )
    time = backend._ffi.string(
        backend._lib.ASN1_STRING_data(
            backend._ffi.cast("ASN1_STRING *", generalized_time)
        )
    ).decode("ascii")
    return datetime.datetime.strptime(time, "%Y%m%d%H%M%SZ")

Source File: x509.py From oss-ftp with MIT License

5 votes

def _decode_certificate_policies(backend, cp):
    cp = backend._ffi.cast("Cryptography_STACK_OF_POLICYINFO *", cp)
    cp = backend._ffi.gc(cp, backend._lib.sk_POLICYINFO_free)
    num = backend._lib.sk_POLICYINFO_num(cp)
    certificate_policies = []
    for i in range(num):
        qualifiers = None
        pi = backend._lib.sk_POLICYINFO_value(cp, i)
        oid = x509.ObjectIdentifier(_obj2txt(backend, pi.policyid))
        if pi.qualifiers != backend._ffi.NULL:
            qnum = backend._lib.sk_POLICYQUALINFO_num(pi.qualifiers)
            qualifiers = []
            for j in range(qnum):
                pqi = backend._lib.sk_POLICYQUALINFO_value(
                    pi.qualifiers, j
                )
                pqualid = x509.ObjectIdentifier(
                    _obj2txt(backend, pqi.pqualid)
                )
                if pqualid == CertificatePoliciesOID.CPS_QUALIFIER:
                    cpsuri = backend._ffi.buffer(
                        pqi.d.cpsuri.data, pqi.d.cpsuri.length
                    )[:].decode('ascii')
                    qualifiers.append(cpsuri)
                else:
                    assert pqualid == CertificatePoliciesOID.CPS_USER_NOTICE
                    user_notice = _decode_user_notice(
                        backend, pqi.d.usernotice
                    )
                    qualifiers.append(user_notice)

        certificate_policies.append(
            x509.PolicyInformation(oid, qualifiers)
        )

    return x509.CertificatePolicies(certificate_policies)

Source File: x509.py From oss-ftp with MIT License

5 votes

def _obj2txt(backend, obj):
    # Set to 80 on the recommendation of
    # https://www.openssl.org/docs/crypto/OBJ_nid2ln.html#return_values
    buf_len = 80
    buf = backend._ffi.new("char[]", buf_len)
    res = backend._lib.OBJ_obj2txt(buf, buf_len, obj, 1)
    backend.openssl_assert(res > 0)
    return backend._ffi.buffer(buf, res)[:].decode()

Source File: tldextract.py From CrawlBox with The Unlicense

5 votes

def _get_snapshot_tld_extractor():
        snapshot_stream = pkg_resources.resource_stream(__name__, '.tld_set_snapshot')
        with closing(snapshot_stream) as snapshot_file:
            return json.loads(snapshot_file.read().decode('utf-8'))

Source File: tldextract.py From CrawlBox with The Unlicense

4 votes

def __call__(self, url):
        """
        Takes a string URL and splits it into its subdomain, domain, and
        suffix (effective TLD, gTLD, ccTLD, etc.) component.

        >>> extract = TLDExtract()
        >>> extract('http://forums.news.cnn.com/')
        ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
        >>> extract('http://forums.bbc.co.uk/')
        ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
        """
        netloc = SCHEME_RE.sub("", url) \
            .partition("/")[0] \
            .partition("?")[0] \
            .partition("#")[0] \
            .split("@")[-1] \
            .partition(":")[0] \
            .strip() \
            .rstrip(".")

        labels = netloc.split(".")
        translations = []
        for label in labels:
            if label.startswith("xn--"):
                try:
                    translation = idna.decode(label.encode('ascii'))
                except UnicodeError:
                    translation = label
            else:
                translation = label
            translation = translation.lower()

            translations.append(translation)

        suffix_index = self._get_tld_extractor().suffix_index(translations)

        registered_domain = ".".join(labels[:suffix_index])
        suffix = ".".join(labels[suffix_index:])

        if not suffix and netloc and looks_like_ip(netloc):
            return ExtractResult('', netloc, '')

        subdomain, _, domain = registered_domain.rpartition('.')
        return ExtractResult(subdomain, domain, suffix)

Source File: address.py From isthislegit with BSD 3-Clause "New" or "Revised" License

4 votes

def __init__(self, raw_display_name=None, raw_addr_spec=None, display_name=None, mailbox=None, hostname=None):
        if isinstance(raw_display_name, unicode):
            raw_display_name = raw_display_name.encode('utf-8')
        if isinstance(raw_addr_spec, unicode):
            raw_addr_spec = raw_addr_spec.encode('utf-8')

        if raw_display_name and raw_addr_spec:
            parser = addr_spec_parser
            mailbox = parser.parse(raw_addr_spec.strip(), lexer=lexer.clone())

            self._display_name = raw_display_name
            self._mailbox = mailbox.local_part
            self._hostname = mailbox.domain

        elif raw_display_name:
            parser = mailbox_parser
            mailbox = parser.parse(raw_display_name.strip(), lexer=lexer.clone())

            self._display_name = mailbox.display_name
            self._mailbox = mailbox.local_part
            self._hostname = mailbox.domain

        elif raw_addr_spec:
            parser = addr_spec_parser
            mailbox = parser.parse(raw_addr_spec.strip(), lexer=lexer.clone())

            self._display_name = ''
            self._mailbox = mailbox.local_part
            self._hostname = mailbox.domain

        elif mailbox and hostname:
            self._display_name = display_name or ''
            self._mailbox = mailbox
            self._hostname = hostname

        else:
            raise SyntaxError('failed to create EmailAddress: bad parameters')

        # Convert display name to decoded unicode string.
        if (self._display_name.startswith('=?') and
                self._display_name.endswith('?=')):
            self._display_name = mime_to_unicode(self._display_name)
        if (self._display_name.startswith('"') and
                self._display_name.endswith('"') and
                len(self._display_name) > 2):
            self._display_name = smart_unquote(self._display_name)
        if isinstance(self._display_name, str):
            self._display_name = self._display_name.decode('utf-8')

        # Convert localpart to unicode string.
        if isinstance(self._mailbox, str):
            self._mailbox = self._mailbox.decode('utf-8')

        # Convert hostname to lowercase unicode string.
        self._hostname = self._hostname.lower()
        if self._hostname.startswith('xn--') or '.xn--' in self._hostname:
            self._hostname = idna.decode(self._hostname)
        if isinstance(self._hostname, str):
            self._hostname = self._hostname.decode('utf-8')
        if not is_pure_ascii(self._hostname):
            idna.encode(self._hostname)

Source File: _url.py From learn_python3_spider with MIT License

4 votes

def _decode_host(host):
    """Decode a host from ASCII-encodable text to IDNA-decoded text. If
    the host text is not ASCII, it is returned unchanged, as it is
    presumed that it is already IDNA-decoded.

    Some technical details: _decode_host is built on top of the "idna"
    package, which has some quirks:

    Capital letters are not valid IDNA2008. The idna package will
    raise an exception like this on capital letters:

    > idna.core.InvalidCodepoint: Codepoint U+004B at position 1 ... not allowed

    However, if a segment of a host (i.e., something in
    url.host.split('.')) is already ASCII, idna doesn't perform its
    usual checks. In fact, for capital letters it automatically
    lowercases them.

    This check and some other functionality can be bypassed by passing
    uts46=True to idna.encode/decode. This allows a more permissive and
    convenient interface. So far it seems like the balanced approach.

    Example output (from idna==2.6):

    >> idna.encode(u'mahmöud.io')
    'xn--mahmud-zxa.io'
    >> idna.encode(u'Mahmöud.io')
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 355, in encode
        result.append(alabel(label))
      File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 276, in alabel
        check_label(label)
      File "/home/mahmoud/virtualenvs/hyperlink/local/lib/python2.7/site-packages/idna/core.py", line 253, in check_label
        raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
    idna.core.InvalidCodepoint: Codepoint U+004D at position 1 of u'Mahm\xf6ud' not allowed
    >> idna.encode(u'Mahmoud.io')
    'Mahmoud.io'

    # Similar behavior for decodes below
    >> idna.decode(u'Mahmoud.io')
    u'mahmoud.io
    >> idna.decode(u'Méhmoud.io', uts46=True)
    u'm\xe9hmoud.io'
    """
    if not host:
        return u''
    try:
        host_bytes = host.encode("ascii")
    except UnicodeEncodeError:
        host_text = host
    else:
        try:
            host_text = idna_decode(host_bytes, uts46=True)
        except ValueError:
            # only reached on "narrow" (UCS-2) Python builds <3.4, see #7
            # NOTE: not going to raise here, because there's no
            # ambiguity in the IDNA, and the host is still
            # technically usable
            host_text = host
    return host_text

Source File: cachefile.py From URLExtract with MIT License

4 votes

def _download_tlds_list(self):
        """
        Function downloads list of TLDs from IANA.
        LINK: https://data.iana.org/TLD/tlds-alpha-by-domain.txt

        :return: True if list was downloaded, False in case of an error
        :rtype: bool
        """
        url_list = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt'

        # Default cache file exist (set by _default_cache_file)
        # and we want to write permission
        if self._default_cache_file and \
                not os.access(self._tld_list_path, os.W_OK):
            self._logger.info("Default cache file is not writable.")
            self._tld_list_path = self._get_cache_file_path()
            self._logger.info(
                "Changed path of cache file to: %s",
                self._tld_list_path
            )

        if os.access(self._tld_list_path, os.F_OK) and \
                not os.access(self._tld_list_path, os.W_OK):
            self._logger.error("ERROR: Cache file is not writable for current "
                               "user. ({})".format(self._tld_list_path))
            return False

        req = urllib.request.Request(url_list)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.0; '
                                     'WOW64; rv:24.0) Gecko/20100101 '
                                     'Firefox/24.0')
        try:
            with urllib.request.urlopen(req) as f:
                page = f.read().decode('utf-8')
        except HTTPError as e:
            self._logger.error("ERROR: Can not download list ot TLDs. "
                               "(HTTPError: {})".format(e.reason))
            return False
        except URLError as e:
            self._logger.error("ERROR: Can not download list ot TLDs. "
                               "(URLError: {})".format(e.reason))
            return False

        with filelock.FileLock(self._get_cache_lock_file_path()):
            with open(self._tld_list_path, 'w') as ftld:
                ftld.write(page)

        return True

Python idna.decode() Examples