Python Examples of urllib.parse.urlsplit

Source File: base.py From bioforum with MIT License

6 votes

def translate_url(url, lang_code):
    """
    Return `url` (absolute or relative) with its path re-reversed while the
    `lang_code` language is active.

    The path component is resolved to a view; if it does not resolve, or the
    resolved view cannot be reversed under `lang_code`, the original URL is
    returned untouched. Scheme, netloc, query and fragment are carried over
    unchanged from the input.
    """
    parsed = urlsplit(url)
    try:
        match = resolve(parsed.path)
    except Resolver404:
        # The path does not map to any view: nothing to translate.
        return url

    if match.namespace:
        view_name = "%s:%s" % (match.namespace, match.url_name)
    else:
        view_name = match.url_name

    with override(lang_code):
        try:
            translated_path = reverse(view_name, args=match.args, kwargs=match.kwargs)
        except NoReverseMatch:
            return url
        return urlunsplit((parsed.scheme, parsed.netloc, translated_path, parsed.query, parsed.fragment))

Source File: actionform.py From gprime with GNU General Public License v2.0

6 votes

def upload(url, filename=None):
    """
    Fetch `url` and write the response body to a local file.

    :param url: URL to fetch; passed to ``urllib.request.urlopen``.
    :param filename: destination path. When falsy, the file is written to
        ``/tmp/<name>`` where ``<name>`` comes from the response's
        Content-Disposition header or, failing that, from the last path
        segment of the final URL.
    :return: the path the data was written to.
    """
    import os
    import shutil
    from urllib.parse import urlsplit
    from urllib.request import Request, urlopen

    def getFilename(url, openUrl):
        # info() returns a message object, not a string, so the header has to
        # be read through the message API. get_filename() returns the
        # "filename" parameter of Content-Disposition (falling back to the
        # "name" parameter of Content-Type) or None.
        # basename() keeps a server-supplied name from pointing outside /tmp.
        fname = os.path.basename(openUrl.info().get_filename() or "")
        if fname:
            return fname
        # No usable header: take the last path segment of the final URL.
        return os.path.basename(urlsplit(openUrl.url)[2])

    r = urlopen(Request(url))
    try:
        filename = filename or "/tmp/%s" % getFilename(url, r)
        with open(filename, 'wb') as f:
            shutil.copyfileobj(r, f)
    finally:
        # Always release the connection, even if writing fails.
        r.close()
    return filename

Source File: http_headers_plugin.py From sslyze with GNU Affero General Public License v3.0

6 votes

def _detect_http_redirection(http_response: HTTPResponse, server_host_name: str, server_port: int) -> Optional[str]:
    """If the HTTP response contains a redirection to the same server, return the path to the new location.
    """
    next_location_path = None
    if 300 <= http_response.status < 400:
        location_header = _extract_first_header_value(http_response, "Location")
        if location_header:
            parsed_location = urlsplit(location_header)
            is_relative_url = False if parsed_location.hostname else True
            if is_relative_url:
                # Yes, to a relative URL; follow the redirection
                next_location_path = location_header
            else:
                is_absolute_url_to_same_hostname = parsed_location.hostname == server_host_name
                absolute_url_port = 443 if parsed_location.port is None else parsed_location.port
                is_absolute_url_to_same_port = absolute_url_port == server_port
                if is_absolute_url_to_same_hostname and is_absolute_url_to_same_port:
                    # Yes, to an absolute URL to the same server; follow the redirection
                    next_location_path = f"{parsed_location.path}"
                    if parsed_location.query:
                        next_location_path += f"?{parsed_location.query}"

    return next_location_path

Source File: zmirror.py From zmirror with MIT License

6 votes

def extract_url_path_and_query(full_url=None, no_query=False):
    """
    Reduce a full URL to its path and query string, e.g.
    http://foo.bar.com/aaa/p.html?x=y becomes /aaa/p.html?x=y

    :param full_url: full url; when None, ``request.url`` is used
    :type full_url: str
    :param no_query: when true, the query string is left out of the result
    :return: the path ("/" if the URL has none), optionally followed by "?query"
    :rtype: str
    """
    target = request.url if full_url is None else full_url
    parts = urlsplit(target)
    path = parts.path or "/"
    if no_query or not parts.query:
        return path
    return path + '?' + parts.query


# ################# End Client Request Handler #################


# ################# Begin Middle Functions #################

Source File: storage_manager.py From OasisPlatform with BSD 3-Clause "New" or "Revised" License

6 votes

def _strip_signing_parameters(self, url):
        """ Duplicated Unsiged URLs from Django-Stroage

        Method from: https://github.com/jschneier/django-storages/blob/master/storages/backends/s3boto3.py

        Boto3 does not currently support generating URLs that are unsigned. Instead we
        take the signed URLs and strip any querystring params related to signing and expiration.
        Note that this may end up with URLs that are still invalid, especially if params are
        passed in that only work with signed URLs, e.g. response header params.
        The code attempts to strip all query parameters that match names of known parameters
        from v2 and v4 signatures, regardless of the actual signature version used.
        """
        split_url = urlsplit(url)
        qs = parse_qsl(split_url.query, keep_blank_values=True)
        blacklist = {
            'x-amz-algorithm', 'x-amz-credential', 'x-amz-date',
            'x-amz-expires', 'x-amz-signedheaders', 'x-amz-signature',
            'x-amz-security-token', 'awsaccesskeyid', 'expires', 'signature',
        }
        filtered_qs = ((key, val) for key, val in qs if key.lower() not in blacklist)
        # Note: Parameters that did not have a value in the original query string will have
        # an '=' sign appended to it, e.g ?foo&bar becomes ?foo=&bar=
        joined_qs = ('='.join(keyval) for keyval in filtered_qs)
        split_url = split_url._replace(query="&".join(joined_qs))
        return split_url.geturl()

Source File: cache.py From openSUSE-release-tools with GNU General Public License v2.0

6 votes

def path(url, project, include_file=False, makedirs=False):
        """
        Build the cache location for `url`.

        The directory is CACHE_DIR/<url hostname>[/<project>]. With `makedirs`
        the directory is created if it does not exist yet. With `include_file`
        the returned path additionally ends in the SHA-1 hex digest of the URL;
        otherwise the directory itself is returned.

        Raises Exception if Cache.CACHE_DIR is not set.
        """
        if not Cache.CACHE_DIR:
            raise Exception('Cache.init() must be called first')

        components = [Cache.CACHE_DIR, urlsplit(url).hostname]
        if project:
            components.append(project)
        directory = os.path.join(*components)

        if not os.path.exists(directory) and makedirs:
            os.makedirs(directory)

        if not include_file:
            return directory

        digest = hashlib.sha1(url.encode('utf-8')).hexdigest()
        return os.path.join(*components, digest)

Source File: storage.py From bioforum with MIT License

6 votes

def stored_name(self, name):
        """
        Map `name` to its hashed file name, keeping query and fragment.

        The unquoted path of `name` is looked up in ``self.hashed_files``. If
        there is no entry, a ValueError is raised when ``self.manifest_strict``
        is set; otherwise the hashed name is computed on the fly.
        """
        parts = urlsplit(unquote(name))
        lookup_name = parts.path.strip()
        hashed = self.hashed_files.get(self.hash_key(lookup_name))
        if hashed is None:
            if self.manifest_strict:
                raise ValueError("Missing staticfiles manifest entry for '%s'" % lookup_name)
            hashed = self.clean_name(self.hashed_name(name))
        # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
        # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
        # The empty query would otherwise be dropped, so re-attach the "?".
        if '?#' in name and not parts.query:
            hashed += '?'
        return urlunsplit((parts.scheme, parts.netloc, hashed, parts.query, parts.fragment))

Source File: cache.py From openSUSE-release-tools with GNU General Public License v2.0

6 votes

def delete(url):
        """
        Remove the cached copy of `url`, plus the cached copy of the same URL
        without its query string.

        When the URL matches a cache pattern that carries a project, the
        project cache is deleted as well.
        """
        url = unquote(url)
        matched, project = Cache.match(url)
        if matched:
            cached_file = Cache.path(url, project, include_file=True)

            # Rather then wait for last updated statistics to expire, remove the
            # project cache if applicable.
            if project:
                apiurl, _unused = Cache.spliturl(url)
                if project.isdigit():
                    # Clear target project cache upon request acceptance.
                    request = osc.core.get_request(apiurl, project)
                    project = request.actions[0].tgt_project
                Cache.delete_project(apiurl, project)

            if os.path.exists(cached_file):
                if conf.config['debug']:
                    print('CACHE_DELETE', url, file=sys.stderr)
                os.remove(cached_file)

        # Also delete version without query. This does not handle other
        # variations using different query strings. Handy for PUT with ?force=1.
        parts = urlsplit(url)
        if parts.query != '':
            without_query = SplitResult(parts.scheme, parts.netloc, parts.path, '', parts.fragment)
            Cache.delete(without_query.geturl())

Source File: cache.py From openSUSE-release-tools with GNU General Public License v2.0

6 votes

def delete(url):
        """
        Remove the cached copy of `url`, plus the cached copy of the same URL
        without its query string.

        When the URL matches a cache pattern that carries a project, the
        project cache is deleted as well.
        """
        url = unquote(url)
        matched, project = Cache.match(url)
        if matched:
            cached_file = Cache.path(url, project, include_file=True)

            # Rather then wait for last updated statistics to expire, remove the
            # project cache if applicable.
            if project:
                apiurl, _unused = Cache.spliturl(url)
                if project.isdigit():
                    # Clear target project cache upon request acceptance.
                    request = osc.core.get_request(apiurl, project)
                    project = request.actions[0].tgt_project
                Cache.delete_project(apiurl, project)

            if os.path.exists(cached_file):
                if conf.config['debug']:
                    print('CACHE_DELETE', url, file=sys.stderr)
                os.remove(cached_file)

        # Also delete version without query. This does not handle other
        # variations using different query strings. Handy for PUT with ?force=1.
        parts = urlsplit(url)
        if parts.query != '':
            without_query = SplitResult(parts.scheme, parts.netloc, parts.path, '', parts.fragment)
            Cache.delete(without_query.geturl())

Source File: request.py From bioforum with MIT License

6 votes

def build_absolute_uri(self, location=None):
        """
        Return an absolute URI for ``location``, using this request's scheme,
        host and path as the base.

        Without a ``location``, request.get_full_path() is used. A location
        that already has both a scheme and a host is only passed through
        iri_to_uri(). Anything else (relative, or scheme-relative such as
        ``//example.com/``) is first urljoin()-ed onto a base URL built from
        the request variables.
        """
        if location is None:
            # Make it an absolute url (but schemeless and domainless) for the
            # edge case that the path starts with '//'.
            location = '//%s' % self.get_full_path()

        parts = urlsplit(location)
        if not parts.scheme or not parts.netloc:
            base = '{scheme}://{host}{path}'.format(
                scheme=self.scheme,
                host=self.get_host(),
                path=self.path,
            )
            # Joining lets ``location`` contribute a query string to the base
            # path, or replace the host when it starts with '//'.
            location = urljoin(base, location)
        return iri_to_uri(location)

Source File: tftp.py From dionaea with GNU General Public License v2.0

6 votes

def handle_incident(self, icd):
        """
        Start a TFTP download for incidents that carry a ``tftp://`` URL.

        :param icd: incident; its "url" entry is read with ``icd.get("url")``
            and its optional ``con`` attribute is handed to the TFTP client.
        :return: None. Incidents with a missing, undecodable or non-tftp URL,
            or with host 0.0.0.0, are ignored.
        """
        url = icd.get("url")
        if isinstance(url, bytes):
            try:
                url = url.decode(encoding="utf-8")
            except UnicodeDecodeError:
                # bytes.decode() raises UnicodeDecodeError (not
                # UnicodeEncodeError) on invalid input.
                logger.warning("Error decoding URL %s", url, exc_info=True)
                return

        # icd.get() yields None when the incident has no URL; nothing to do then.
        if not isinstance(url, str) or not url.startswith('tftp://'):
            return

        # python fails parsing tftp://, ftp:// works, so ...
        # (dropping the first character turns "tftp://" into "ftp://")
        logger.info("do download")
        x = parse.urlsplit(url[1:])
        if x.netloc == '0.0.0.0':
            logger.info("Discarding download from INADDR_ANY")
            return
        # The incident may or may not carry a connection object.
        con = getattr(icd, "con", None)
        t = TftpClient()
        t.download(con, x.netloc, 69, x.path[1:], url)

Source File: base.py From bugatsinho.github.io with GNU General Public License v3.0

6 votes

def unshorten(self, uri, type=None):
        """
        Dispatch `uri` to the unshortener that handles its domain.

        Each service's regex is searched (case-insensitively) in the URI's
        netloc, in a fixed order. The first four services can also be forced
        through the `type` argument. Returns whatever the selected handler
        returns, ``(uri, 200)`` when no service applies, or
        ``(uri, "No domain found in URI!")`` when the URI has no netloc.
        """
        domain = urlsplit(uri).netloc
        if not domain:
            return uri, "No domain found in URI!"

        _had_google_outbound, uri = self._clear_google_outbound_proxy(uri)

        # (service name, whether it may be selected via `type`), in priority order.
        services = (
            ('adfly', True),
            ('adfocus', True),
            ('linkbucks', True),
            ('lnxlu', True),
            ('shst', False),
            ('hrefli', False),
            ('anonymz', False),
        )
        for service, selectable in services:
            pattern = getattr(self, '_%s_regex' % service)
            if re.search(pattern, domain, re.IGNORECASE) or (selectable and type == service):
                handler = getattr(self, '_unshorten_%s' % service)
                return handler(uri)

        return uri, 200

Source File: yum.py From atomic-reactor with BSD 3-Clause "New" or "Revised" License

6 votes

def filename(self):
        '''Return the file name under which the repo file is saved.

        The name is the last path segment of the (unquoted) repo url, with
        REPO_SUFFIX appended if it is not already there. When ``add_hash`` is
        set, a dash and the first five hex digits of the md5 of the full
        repourl are inserted between the name and its extension, so that
        different repos do not end up in the same file.
        '''
        url_path = urlsplit(self.repourl, allow_fragments=False).path
        name = os.path.basename(unquote(url_path))
        if not name.endswith(REPO_SUFFIX):
            name = name + REPO_SUFFIX

        stem, extension = os.path.splitext(name)
        if not self.add_hash:
            return stem + extension

        digest = md5(self.repourl.encode('utf-8')).hexdigest()  # nosec
        return stem + '-' + digest[:5] + extension

Source File: modifier.py From selenium-wire with MIT License

6 votes

def _rewrite_url(self, request):
        with self._lock:
            rewrite_rules = self._rewrite_rules[:]

        original_netloc = urlsplit(request.path).netloc

        for pattern, replacement in rewrite_rules:
            modified, count = pattern.subn(replacement, request.path)

            if count > 0:
                request.path = modified
                break

        modified_netloc = urlsplit(request.path).netloc

        if original_netloc != modified_netloc:
            # Modify the Host header if it exists
            if 'Host' in request.headers:
                request.headers['Host'] = modified_netloc

Source File: test_confirmable.py From flask-security with MIT License

6 votes

def test_spa_get(app, client):
    """
    Registration confirmation with 'single-page-application' style redirects.

    Registers via JSON, follows the confirmation link and checks that the
    302 points at the configured redirect host/path with the email in the
    query string.
    """
    with capture_flashes() as flashes:
        with capture_registrations() as registrations:
            response = client.post(
                "/register",
                json={"email": "dude@lp.com", "password": "awesome sunset"},
                headers={"Content-Type": "application/json"},
            )
            assert response.headers["Content-Type"] == "application/json"
        confirm_token = registrations[0]["confirm_token"]

        response = client.get("/confirm/" + confirm_token)
        assert response.status_code == 302

        redirect = urlsplit(response.headers["Location"])
        assert redirect.netloc == "localhost:8081"
        assert redirect.path == "/confirm-redirect"
        query = dict(parse_qsl(redirect.query))
        assert query["email"] == "dude@lp.com"
    # Arguably for json we shouldn't have any - this is buried in register_user
    # but really shouldn't be.
    assert len(flashes) == 1

Source File: test_unified_signin.py From flask-security with MIT License

6 votes

def test_tf_link_spa(app, client, get_message):
    """
    Two-factor is required when using a magic link together with SPA redirects.

    This currently isn't supported, so following the emailed link should
    redirect to the error location with ``tf_required=1``.
    """
    with app.mail.record_messages() as outbox:
        response = client.post(
            "/us-signin/send-code",
            data={"identity": "matt@lp.com", "chosen_method": "email"},
            follow_redirects=True,
        )
        assert response.status_code == 200
        assert b"Sign In" in response.data

    # Pull the link out of the mail body.
    link_match = re.match(
        r".*(http://[^\s*]*).*", outbox[0].body, re.IGNORECASE | re.DOTALL
    )
    magic_link = link_match.group(1)

    response = client.get(magic_link, follow_redirects=False)
    redirect = urlsplit(response.location)
    assert redirect.netloc == "localhost:8081"
    assert redirect.path == "/login-error"

    query = dict(parse_qsl(redirect.query))
    assert query["tf_required"] == "1"
    assert query["email"] == "matt@lp.com"

Source File: test_passwordless.py From flask-security with MIT License

6 votes

def test_spa_get(app, client):
    """
    Passwordless login with 'single-page-application' style redirects.

    Requests a login token via JSON, follows the login link and checks that
    the 302 points at the configured redirect host/path with the email in
    the query string, and that nothing was flashed.
    """
    with capture_flashes() as flashes:
        with capture_passwordless_login_requests() as login_requests:
            response = client.post(
                "/login",
                json={"email": "matt@lp.com"},
                headers={"Content-Type": "application/json"},
            )
            assert response.headers["Content-Type"] == "application/json"
        login_token = login_requests[0]["login_token"]

        response = client.get("/login/" + login_token)
        assert response.status_code == 302

        redirect = urlsplit(response.headers["Location"])
        assert redirect.netloc == "localhost:8081"
        assert redirect.path == "/login-redirect"
        query = dict(parse_qsl(redirect.query))
        assert query["email"] == "matt@lp.com"
    assert len(flashes) == 0

Source File: utils.py From flask-security with MIT License

6 votes

def transform_url(url, qparams=None, **kwargs):
    """ Return a modified copy of url

    :param url: url to transform (can be relative); a falsy url is returned as is
    :param qparams: query params to merge into the existing query string;
        on a name clash the value from qparams wins
    :param kwargs: pieces of URL to replace - e.g. netloc=localhost:8000.
        These are applied after qparams.
    :return: Modified URL

    .. versionadded:: 3.2.0
    """
    if not url:
        return url

    parts = urlsplit(url)
    if qparams:
        merged = dict(parse_qsl(parts.query))
        merged.update(qparams)
        parts = parts._replace(query=urlencode(merged))
    parts = parts._replace(**kwargs)
    return urlunsplit(parts)

Source File: client.py From bioforum with MIT License

5 votes

def _handle_redirects(self, response, **extra):
        """
        Follow redirect responses (301, 302, 303, 307) with GET requests until
        a non-redirect response is reached, and return that response.

        Every visited (url, status_code) pair is recorded in the returned
        response's ``redirect_chain``. RedirectCycleError is raised when the
        same pair shows up twice or the chain grows beyond 20 entries.
        """
        response.redirect_chain = []
        while response.status_code in (301, 302, 303, 307):
            target = response.url
            chain = response.redirect_chain
            chain.append((target, response.status_code))

            # Carry scheme/host/port of the redirect target into the next request.
            parts = urlsplit(target)
            if parts.scheme:
                extra['wsgi.url_scheme'] = parts.scheme
            if parts.hostname:
                extra['SERVER_NAME'] = parts.hostname
            if parts.port:
                extra['SERVER_PORT'] = str(parts.port)

            # Prepend the request path to handle relative path redirects
            next_path = parts.path
            if not next_path.startswith('/'):
                next_path = urljoin(response.request['PATH_INFO'], next_path)

            response = self.get(next_path, QueryDict(parts.query), follow=False, **extra)
            response.redirect_chain = chain

            latest = chain[-1]
            if latest in chain[:-1]:
                # Check that we're not redirecting to somewhere we've already
                # been to, to prevent loops.
                raise RedirectCycleError("Redirect loop detected.", last_response=response)
            if len(chain) > 20:
                # Such a lengthy chain likely also means a loop, but one with
                # a growing path, changing view, or changing query argument;
                # 20 is the value of "network.http.redirection-limit" from Firefox.
                raise RedirectCycleError("Too many redirects.", last_response=response)

        return response

Source File: __init__.py From bdbag with Apache License 2.0

5 votes

def inspect_path(path):
    """
    Classify `path` as a file, a directory, or a URI.

    A path that does not exist locally counts as a URI when it has a URL
    scheme that is not simply its drive letter. Otherwise the file/directory
    flags reflect what is found at the absolute path.

    :return: tuple ``(is_file, is_dir, is_uri)``
    """
    absolute = os.path.abspath(path)
    if not os.path.exists(absolute):
        scheme = urlsplit(path).scheme
        drive = os.path.splitdrive(path)[0]
        # "C:\x" parses with scheme "c"; that is a drive, not a URI scheme.
        if scheme and scheme.lower() != drive.rstrip(":").lower():
            return False, False, True
    return os.path.isfile(absolute), os.path.isdir(absolute), False

Source File: Downloader.py From OMR-Datasets with MIT License

5 votes

def download_file(url, destination_filename=None) -> str:
        """
        Download `url` to a local file while showing a progress bar.

        :param url: URL to download.
        :param destination_filename: target file name; when not given, the
            last path segment of the URL is used, or 'downloaded.file' if the
            URL path has none.
        :return: absolute path of the written file.
        """
        u = urllib2.urlopen(url)
        try:
            filename = (
                destination_filename
                or os.path.basename(urlparse.urlsplit(url).path)
                or 'downloaded.file'
            )
            filename = os.path.abspath(filename)

            with open(filename, 'wb') as f:
                meta = u.info()
                # The header accessor is named differently depending on the
                # message class returned by info().
                meta_func = meta.getheaders if hasattr(meta, 'getheaders') else meta.get_all
                meta_length = meta_func("Content-Length")
                file_size = None
                if meta_length:
                    file_size = int(meta_length[0])
                print("Downloading: {0} Bytes: {1} into {2}".format(url, file_size, filename))

                with tqdm(total=file_size, desc="Downloading (bytes)") as progress_bar:
                    block_sz = 8192
                    while True:
                        buffer = u.read(block_sz)
                        if not buffer:
                            break

                        f.write(buffer)
                        # The bar only advances when the total size is known.
                        if file_size:
                            progress_bar.update(len(buffer))
                print()
        finally:
            # Release the connection whether or not the download succeeded.
            u.close()

        return filename

Source File: vcs_helpers.py From python-semantic-release with MIT License

5 votes

def get_repository_owner_and_name() -> Tuple[str, str]:
    """
    Check the 'origin' remote to get the owner and name of the remote repository.

    :return: A tuple of the owner and name.
    :raises HvcsRepoParseError: if the remote URL's path does not look like
        ``<owner>/<name>`` (optionally followed by ``.git``).
    """
    url = repo.remote("origin").url
    split_url = urlsplit(url)
    # Select the owner and name as regex groups. The dot of the optional
    # ".git" suffix is escaped: unescaped it matches any character, so a
    # repository called "legit" would be reported as "l".
    parts = re.search(r"[:/]([^:]+)/([^/]*?)(\.git)?$", split_url.path)
    if not parts:
        raise HvcsRepoParseError

    return parts.group(1), parts.group(2)

Source File: images.py From idunn with Apache License 2.0

5 votes

def get_url_remote_thumbnail(
        self, source, width=0, height=0, bestFit=True, progressive=False, animated=False
    ):
        """
        Build the thumbnail URL for the remote image `source`.

        The URL has the form
        ``<base>/<width>x<height>/<h0>/<h1>/<rest of hash>/<filename>?<params>``
        where the hash is the SHA-256 hex digest of source, size and salt, and
        the base URL is obtained from ``get_thumbr_url`` for that hash. The
        file name is taken from the source URL's path; ".jpg" is appended
        unless it already ends in jpg/jpeg/png/gif (case-insensitive).
        """
        dimensions = f"{width}x{height}"
        salt = self.get_salt()
        digest = hashlib.sha256(f"{source}{dimensions}{salt}".encode("utf8")).hexdigest()
        base_url = self.get_thumbr_url(digest)

        filename = posixpath.basename(unquote(urlsplit(source).path))
        if re.match(r"^.*\.(jpg|jpeg|png|gif)$", filename, re.IGNORECASE) is None:
            filename += ".jpg"

        query = urllib.parse.urlencode(
            {
                "u": source,
                "q": 0,  # the error-image flag is always off here
                "b": int(bool(bestFit)),
                "p": int(bool(progressive)),
                "a": int(bool(animated)),
            }
        )
        segments = [base_url, dimensions, digest[0], digest[1], digest[2:], filename]
        return "/".join(segments) + "?" + query

Source File: routing.py From plugin.video.sparkle with GNU General Public License v3.0

5 votes

def run(self, argv=sys.argv):
        """
        Dispatch the route named by ``argv[0]``.

        If a third argument is present it is parsed as a query string (leading
        '?' characters removed) and stored in ``self.args``. The path of
        ``argv[0]`` is then dispatched, with '/' standing in for an empty path.
        """
        if len(argv) > 2:
            query = argv[2].lstrip('?')
            self.args = parse_qs(query)
        route = urlsplit(argv[0]).path
        self._dispatch(route if route else '/')

Source File: request.py From selenium-wire with MIT License

5 votes

def querystring(self):
        """Return the query component of this request's path.

        Returns:
            The part of ``self.path`` after the '?', without the '?' itself;
            an empty string if there is none.
        """
        parts = urlsplit(self.path)
        return parts.query

Source File: routing.py From plugin.video.sparkle with GNU General Public License v3.0

5 votes

def match(self, path):
        """
        Search `path` with this rule's regex.

        Returns a dictionary of the named groups when the regex is found,
        otherwise None.
        """
        # The path is searched exactly as given; it is not passed through
        # urlsplit() first.
        found = self._regex.search(path)
        if not found:
            return None
        return found.groupdict()

Source File: web.py From teleport with Apache License 2.0

5 votes

def authenticated(method):
    """Decorate methods with this to require that the user be logged in.

    For a handler without a ``current_user``:

    * GET and HEAD requests are redirected to the handler's login url
      (`RequestHandler.get_login_url`). If that url has no query string, a
      ``next`` parameter is appended so the login page knows where to send
      the user afterwards; a login url that already contains ``?`` is used
      as-is.
    * any other request method raises ``HTTPError(403)``.
    """
    @functools.wraps(method)
    def wrapper(self, *args, **kwargs):
        if self.current_user:
            return method(self, *args, **kwargs)

        if self.request.method not in ("GET", "HEAD"):
            raise HTTPError(403)

        login_url = self.get_login_url()
        if "?" not in login_url:
            if urlparse.urlsplit(login_url).scheme:
                # if login url is absolute, make next absolute too
                next_url = self.request.full_url()
            else:
                next_url = self.request.uri
            login_url += "?" + urlencode(dict(next=next_url))
        self.redirect(login_url)
        return None
    return wrapper

Source File: utils.py From flask-security with MIT License

5 votes

def propagate_next(url):
    """Return the first ``next`` query parameter of url, or url itself if it has none."""
    next_values = parse_qs(urlsplit(url).query).get("next")
    return next_values[0] if next_values else url

Source File: utils.py From zmirror with MIT License

5 votes

def embed_real_url_to_embedded_url(real_url_raw, url_mime, escape_slash=False):
    """
    Encode a URL's query string (?q=some&foo=bar) into its path and append a
    file extension to the end of the URL.
    This can reduce errors on CDNs that handle URL query parameters poorly.
    The `cdn_redirect_encode_query_str_into_url` setting relies on this
    function; see the corresponding part of the config file for details.
    Decoding is done by extract_real_url_from_embedded_url(); see that function
    for examples.

    A URL without a query string is returned unchanged.
    :rtype: str
    """
    real_url = real_url_raw.replace(r'\/', '/') if escape_slash else real_url_raw
    parts = urlsplit(real_url)
    if not parts.query:  # no query, needn't rewrite
        return real_url_raw

    payload = parts.query.encode()
    # Long queries (more than 128 bytes) are zlib-compressed; a 'z' in the
    # marker section flags the payload as compressed.
    compressed = len(payload) > 128
    if compressed:
        payload = zlib.compress(payload)
    marker = 'z' if compressed else ''

    encoded = base64.urlsafe_b64encode(payload).decode()
    embedded_path = ''.join([
        parts.path, '_', _url_salt, marker, '_.',
        encoded,
        '._', _url_salt, '_.', mime_to_use_cdn[url_mime],
    ])
    result = urlunsplit((parts.scheme, parts.netloc, embedded_path, '', ''))

    return s_esc(result) if escape_slash else result

Source File: humblebundle.py From humblebundle with GNU General Public License v3.0

5 votes

def _download_basename(self, d):
        basename = osp.basename(urlsplit(d.get('url', {}).get('web', "")).path)
        return basename

Python urllib.parse.urlsplit() Examples