Python six.moves.urllib_parse.urlparse() Examples

The following are 30 code examples of six.moves.urllib_parse.urlparse(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module six.moves.urllib_parse, or try the search function.
Example #1
Source File: general_name.py    From teleport with Apache License 2.0 6 votes vote down vote up
def _idna_encode(self, value):
        """Return ``value`` with its host name converted to IDNA (ASCII) form.

        The URL is split into its components, the host is encoded with the
        lazily imported ``idna`` package and the pieces are joined again.
        The rebuilt network location consists of the encoded host plus,
        when the port is truthy, ``:<port>``.
        """
        codec = _lazy_import_idna()
        parts = urllib_parse.urlparse(value)

        # Read the port before encoding the host.
        port = parts.port
        host = codec.encode(parts.hostname)
        if port:
            host = host + ":{}".format(port).encode("ascii")
        netloc = host.decode("ascii")

        # The rebuilt URL is meant to be semantically equivalent to the
        # input, but it is not guaranteed to match it character for
        # character.
        scheme, _, path, params, query, fragment = parts
        return urllib_parse.urlunparse(
            (scheme, netloc, path, params, query, fragment)
        )
Example #2
Source File: http.py    From flex with MIT License 6 votes vote down vote up
def data(self):
        """Return ``self.body`` parsed according to ``self.content_type``.

        * A falsy body is returned unchanged, and the ``EMPTY`` sentinel is
          returned as is.
        * ``application/json`` (prefix match) bodies are decoded with
          ``json.loads``; byte strings are decoded as UTF-8 first.
        * ``application/x-www-form-urlencoded`` (exact match) bodies are
          turned into a ``dict`` of their key/value pairs.

        Raises ``JSONDecodeError`` when a JSON body cannot be decoded and
        ``NotImplementedError`` for every other content type.

        TODO: What is the right way to do this?
        """
        body = self.body
        if not body:
            return body
        if body is EMPTY:
            return EMPTY

        content_type = self.content_type
        if content_type and content_type.startswith('application/json'):
            try:
                # The UTF-8 decode stays inside the ``try``: a failed decode
                # is a ValueError too and is reported like a JSON error.
                if isinstance(body, six.binary_type):
                    text = body.decode('utf-8')
                else:
                    text = body
                return json.loads(text)
            except ValueError as exc:
                if isinstance(exc, JSONDecodeError):
                    # this will only be True for Python3+
                    raise exc
                raise JSONDecodeError(str(exc))

        if content_type == 'application/x-www-form-urlencoded':
            return dict(urlparse.parse_qsl(body))

        raise NotImplementedError("No parser for content type")
Example #3
Source File: oauth.py    From Dailyfresh-B2C with Apache License 2.0 6 votes vote down vote up
def handle_state(self, start_url, target_url):
        """Carry state parameters from ``start_url`` over to ``target_url``.

        When the backend defines a truthy ``STATE_PARAMETER`` and the start
        URL has a ``state`` query value, that value is added to
        ``target_url``. When the backend defines a truthy ``REDIRECT_STATE``
        and the start URL carries a ``redirect_uri``, the ``redirect_state``
        value found in that URI's query is added as well.

        Returns the (possibly extended) target URL.
        """
        start_params = parse_qs(urlparse(start_url).query)
        redirect_uri = start_params.get('redirect_uri')

        uses_state = getattr(self.backend, 'STATE_PARAMETER', False)
        if uses_state and start_params.get('state'):
            target_url = url_add_parameters(
                target_url, {'state': start_params['state']}
            )

        if redirect_uri and getattr(self.backend, 'REDIRECT_STATE', False):
            redirect_params = parse_qs(urlparse(redirect_uri).query)
            redirect_state = redirect_params.get('redirect_state')
            if redirect_state:
                target_url = url_add_parameters(
                    target_url, {'redirect_state': redirect_state}
                )
        return target_url
Example #4
Source File: general_name.py    From learn_python3_spider with MIT License 6 votes vote down vote up
def _idna_encode(self, value):
        """Return ``value`` with its host name converted to IDNA (ASCII) form.

        The URL is split into its components, the host is encoded with the
        lazily imported ``idna`` package and the pieces are joined again.
        The rebuilt network location consists of the encoded host plus,
        when the port is truthy, ``:<port>``.
        """
        codec = _lazy_import_idna()
        parts = urllib_parse.urlparse(value)

        # Read the port before encoding the host.
        port = parts.port
        host = codec.encode(parts.hostname)
        if port:
            host = host + ":{}".format(port).encode("ascii")
        netloc = host.decode("ascii")

        # The rebuilt URL is meant to be semantically equivalent to the
        # input, but it is not guaranteed to match it character for
        # character.
        scheme, _, path, params, query, fragment = parts
        return urllib_parse.urlunparse(
            (scheme, netloc, path, params, query, fragment)
        )
Example #5
Source File: test_saml.py    From Dailyfresh-B2C with Apache License 2.0 6 votes vote down vote up
def do_start(self):
        """Run the SAML login round trip and return the backend's result.

        The start URL is normalised, HTTP intercepts are installed for it
        and for the return URL, the start URL is fetched, and the query
        values of the final response URL are handed to the strategy before
        ``self.backend.complete()`` is called.
        """
        # Normalise the start URL so the SAML request is the same on every
        # test run.
        start_url = self.modify_start_url(self.backend.start().url)
        # When the SAML Identity Provider recognises the user, it redirects
        # back to this address.
        return_url = self.backend.redirect_uri
        self.install_http_intercepts(start_url, return_url)

        response = requests.get(start_url)
        self.assertTrue(response.url.startswith(return_url))
        self.assertEqual(response.text, 'foobar')

        # Keep only the first value of every query parameter.
        returned = parse_qs(urlparse(response.url).query)
        query_values = {key: values[0] for key, values in returned.items()}
        self.assertNotIn(' ', query_values['SAMLResponse'])
        self.strategy.set_request_data(query_values, self.backend)
        return self.backend.complete()
Example #6
Source File: test_saml.py    From Dailyfresh-B2C with Apache License 2.0 6 votes vote down vote up
def modify_start_url(self, start_url):
        """
        Return ``start_url`` with the ID inside its SAML request pinned.

        The ``SAMLRequest`` query value is decoded and inflated, its ``ID``
        attribute is replaced by ``TEST_ID`` and the result is deflated,
        re-encoded and written back into the URL, so that the request is
        identical from one run to the next.
        """
        parsed = urlparse(start_url)
        # Keep only the first value of every query parameter.
        params = {key: values[0]
                  for key, values in parse_qs(parsed.query).items()}

        # Recover the XML document carried in the SAMLRequest parameter.
        request_xml = OneLogin_Saml2_Utils.decode_base64_and_inflate(
            params['SAMLRequest']
        ).decode()

        # Exactly one ID attribute must have been rewritten.
        request_xml, replaced = re.subn(
            r'ID="[^"]+"', 'ID="TEST_ID"', request_xml
        )
        self.assertEqual(replaced, 1)

        # Put the modified request back into the query string.
        params['SAMLRequest'] = OneLogin_Saml2_Utils.deflate_and_base64_encode(
            request_xml
        )
        return urlunparse(parsed._replace(query=urlencode(params)))
Example #7
Source File: __init__.py    From flex with MIT License 6 votes vote down vote up
def validate_deferred_references(schema, context, **kwargs):
    """Check every reference listed in ``context['deferred_references']``.

    A reference that has anything besides a fragment (scheme, netloc, path,
    params or query) is recorded as unsupported. Otherwise its fragment is
    resolved as a JSON pointer against ``schema`` and recorded as undefined
    when that fails. Errors are collected in an ``ErrorDict`` context.

    Raises ``KeyError`` when ``context`` has no ``deferred_references``.
    """
    try:
        pending = context['deferred_references']
    except KeyError:
        raise KeyError("`deferred_references` not found in context")

    with ErrorDict() as errors:
        for reference in pending:
            parts = urlparse.urlparse(reference)
            # The first five components are scheme, netloc, path, params
            # and query; only the fragment may be set.
            if any(parts[:5]):
                errors.add_error(
                    reference,
                    MESSAGES['reference']['unsupported'].format(reference),
                )
            else:
                try:
                    jsonpointer.resolve_pointer(schema, parts.fragment)
                except jsonpointer.JsonPointerException:
                    errors.add_error(
                        reference,
                        MESSAGES['reference']['undefined'].format(reference),
                    )
Example #8
Source File: example_utils.py    From taskflow with Apache License 2.0 6 votes vote down vote up
def _make_conf(backend_uri):
    """Build a backend configuration dictionary from a connection URI.

    The lower-cased URI scheme selects the shape of the result:

    * ``file`` / ``dir``: ``path`` and ``connection``
    * ``zookeeper``: ``path``, ``hosts`` (the URI's network location) and
      ``connection``
    * anything else: ``connection`` only

    :param backend_uri: URI describing the backend.
    :returns: the configuration dictionary.
    :raises ValueError: if ``backend_uri`` has no scheme.
    """
    parsed_url = urllib_parse.urlparse(backend_uri)
    backend_type = parsed_url.scheme.lower()
    if not backend_type:
        # Report the offending URI: the scheme is empty at this point, so
        # interpolating it would produce a message with no information.
        raise ValueError("Unknown backend type for uri: %s" % (backend_uri,))
    if backend_type in ('file', 'dir'):
        conf = {
            'path': parsed_url.path,
            'connection': backend_uri,
        }
    elif backend_type in ('zookeeper',):
        conf = {
            'path': parsed_url.path,
            'hosts': parsed_url.netloc,
            'connection': backend_uri,
        }
    else:
        conf = {
            'connection': backend_uri,
        }
    return conf
Example #9
Source File: oauth2.py    From spotipy with MIT License 6 votes vote down vote up
def get_auth_response(self):
        """Obtain the authorization response for ``self.redirect_uri``.

        For a plain-``http`` redirect to ``127.0.0.1`` or ``localhost`` with
        a port, the response comes from
        ``_get_auth_response_local_server``. The same kind of redirect
        without a port only logs a warning. In every remaining case the
        response comes from ``_get_auth_response_interactive``.
        """
        logger.info('User authentication requires interaction with your '
                    'web browser. Once you enter your credentials and '
                    'give authorization, you will be redirected to '
                    'a url.  Paste that url you were directed to to '
                    'complete the authorization.')

        redirect_info = urlparse(self.redirect_uri)
        redirect_host, redirect_port = get_host_port(redirect_info.netloc)

        is_local_http = (
            redirect_info.scheme == "http"
            and redirect_host in ("127.0.0.1", "localhost")
        )

        # A local http server is only started if a port is specified.
        if is_local_http and redirect_port:
            return self._get_auth_response_local_server(redirect_port)

        if is_local_http:
            logger.warning('Using `%s` as redirect URI without a port. '
                           'Specify a port (e.g. `%s:8080`) to allow '
                           'automatic retrieval of authentication code '
                           'instead of having to copy and paste '
                           'the URL your browser is redirected to.',
                           redirect_host, redirect_host)

        logger.info('Paste that url you were directed to in order to '
                    'complete the authorization')
        return self._get_auth_response_interactive()
Example #10
Source File: handlers.py    From jupyter_http_over_ws with Apache License 2.0 6 votes vote down vote up
def _attach_auth_cookies(self):
    """Add cookies obtained from the auth URL to the request headers.

    Written as a generator: it yields the pending cookie lookup and exits
    early by raising ``gen.Return({})`` when no auth URL argument was
    supplied. Any exception raised while validating the domain or fetching
    the cookies is passed to ``self._on_unhandled_exception`` and then
    re-raised.
    """
    url_argument = self.get_argument(_AUTH_URL_QUERY_PARAM, default='')
    if not url_argument:
      raise gen.Return({})

    target = urlparse.urlparse(url_argument)

    try:
      # The auth URL is validated against the current request first.
      _validate_same_domain(self.request, target)
      cookies = yield _perform_request_and_extract_cookies(
          target, self.ca_certs, self._get_http_client())
    except Exception as err:  # pylint:disable=broad-except
      self._on_unhandled_exception(err)
      raise

    self.request.headers.update(cookies)
Example #11
Source File: general_name.py    From teleport with Apache License 2.0 6 votes vote down vote up
def _idna_encode(self, value):
        """Return ``value`` with its host name converted to IDNA (ASCII) form.

        The URL is split into its components, the host is encoded with the
        lazily imported ``idna`` package and the pieces are joined again.
        The rebuilt network location consists of the encoded host plus,
        when the port is truthy, ``:<port>``.
        """
        codec = _lazy_import_idna()
        parts = urllib_parse.urlparse(value)

        # Read the port before encoding the host.
        port = parts.port
        host = codec.encode(parts.hostname)
        if port:
            host = host + ":{}".format(port).encode("ascii")
        netloc = host.decode("ascii")

        # The rebuilt URL is meant to be semantically equivalent to the
        # input, but it is not guaranteed to match it character for
        # character.
        scheme, _, path, params, query, fragment = parts
        return urllib_parse.urlunparse(
            (scheme, netloc, path, params, query, fragment)
        )
Example #12
Source File: general_name.py    From teleport with Apache License 2.0 6 votes vote down vote up
def _idna_encode(self, value):
        """Return ``value`` with its host name converted to IDNA (ASCII) form.

        The URL is split into its components, the host is passed through
        ``idna.encode`` and the pieces are joined again. The rebuilt network
        location consists of the encoded host plus, when the port is truthy,
        ``:<port>``.
        """
        parts = urllib_parse.urlparse(value)

        # Read the port before encoding the host.
        port = parts.port
        host = idna.encode(parts.hostname)
        if port:
            host = host + ":{0}".format(port).encode("ascii")
        netloc = host.decode("ascii")

        # The rebuilt URL is meant to be semantically equivalent to the
        # input, but it is not guaranteed to match it character for
        # character.
        scheme, _, path, params, query, fragment = parts
        return urllib_parse.urlunparse(
            (scheme, netloc, path, params, query, fragment)
        )
Example #13
Source File: helpers.py    From script.module.resolveurl with GNU General Public License v2.0 6 votes vote down vote up
def get_media_url(url, result_blacklist=None, patterns=None, generic_patterns=True):
    """Fetch ``url``, scrape it for media sources and return the chosen one.

    The page is requested with a random user agent and ``url`` as referer;
    a ``Set-Cookie`` response header, if present, is sent along as
    ``Cookie``. The returned string is the picked source followed by the
    output of ``append_headers`` for the request headers.

    ``result_blacklist`` may be ``None``, a single string or a list;
    ``'.smil'`` is always added to it. ``patterns`` defaults to an empty
    list.
    """
    patterns = [] if patterns is None else patterns
    scheme = urllib_parse.urlparse(url).scheme

    if result_blacklist is None:
        result_blacklist = []
    elif isinstance(result_blacklist, str):
        result_blacklist = [result_blacklist]
    # smil files are not playable themselves but contain potential sources,
    # so they are only blacklisted when called from here.
    result_blacklist = list(set(result_blacklist + ['.smil']))

    net = common.Net()
    request_headers = {'User-Agent': common.RAND_UA, 'Referer': url}
    response = net.http_GET(url, headers=request_headers)

    cookie = response.get_headers(as_dict=True).get('Set-Cookie', None)
    if cookie:
        request_headers['Cookie'] = cookie

    candidates = scrape_sources(response.content, result_blacklist, scheme,
                                patterns, generic_patterns)
    return pick_source(candidates) + append_headers(request_headers)
Example #14
Source File: ctl.py    From patroni with MIT License 6 votes vote down vote up
def parse_dcs(dcs):
    """Turn a DCS address string into a configuration loaded from YAML.

    ``None`` is passed through. A value without ``//`` gets that prefix so
    it is parsed as a network location. When no scheme is given, the first
    ``DCS_DEFAULTS`` entry whose port equals the given port is used, falling
    back to ``etcd``. The chosen entry's template is formatted with the host
    (default ``localhost``) and port (default: the entry's own port) and
    loaded with ``yaml.safe_load``.

    Raises ``PatroniCtlException`` for a scheme not in ``DCS_DEFAULTS``.
    """
    if dcs is None:
        return None
    if '//' not in dcs:
        dcs = '//' + dcs

    parsed = urlparse(dcs)
    # A missing (or zero) port is treated as "not specified".
    port = parsed.port or None

    scheme = parsed.scheme
    if scheme == '':
        by_port = [name for name, conf in DCS_DEFAULTS.items()
                   if conf['port'] == port]
        scheme = by_port[0] if by_port else 'etcd'
    elif scheme not in DCS_DEFAULTS:
        raise PatroniCtlException('Unknown dcs scheme: {}'.format(scheme))

    default = DCS_DEFAULTS[scheme]
    rendered = default['template'].format(
        host=parsed.hostname or 'localhost',
        port=port or default['port'],
    )
    return yaml.safe_load(rendered)
Example #15
Source File: handlers.py    From jupyter_http_over_ws with Apache License 2.0 6 votes vote down vote up
def _attach_auth_cookies(self):
    """Add cookies obtained from the auth URL to the request headers.

    Written as a generator: it yields the pending cookie lookup and exits
    early by raising ``gen.Return()`` when no auth URL argument was
    supplied. Any exception raised while validating the domain or fetching
    the cookies is logged through ``self.log.exception`` and then re-raised.
    """
    url_argument = self.get_argument(_AUTH_URL_QUERY_PARAM, default='')
    if not url_argument:
      raise gen.Return()

    target = urlparse.urlparse(url_argument)

    try:
      # The auth URL is validated against the current request first.
      _validate_same_domain(self.request, target)
      cookies = yield _perform_request_and_extract_cookies(
          target, self.ca_certs, self._get_http_client())
    except Exception:  # pylint:disable=broad-except
      self.log.exception('Uncaught error when proxying request')
      raise

    self.request.headers.update(cookies)
Example #16
Source File: general_name.py    From quickstart-git2s3 with Apache License 2.0 6 votes vote down vote up
def _idna_encode(self, value):
        """Return ``value`` with its host name converted to IDNA (ASCII) form.

        The URL is split into its components, the host is passed through
        ``idna.encode`` and the pieces are joined again. The rebuilt network
        location consists of the encoded host plus, when the port is truthy,
        ``:<port>``.
        """
        parts = urllib_parse.urlparse(value)

        # Read the port before encoding the host.
        port = parts.port
        host = idna.encode(parts.hostname)
        if port:
            host = host + ":{0}".format(port).encode("ascii")
        netloc = host.decode("ascii")

        # The rebuilt URL is meant to be semantically equivalent to the
        # input, but it is not guaranteed to match it character for
        # character.
        scheme, _, path, params, query, fragment = parts
        return urllib_parse.urlunparse(
            (scheme, netloc, path, params, query, fragment)
        )
Example #17
Source File: helpers.py    From script.module.urlresolver with GNU General Public License v2.0 6 votes vote down vote up
def get_media_url(url, result_blacklist=None, patterns=None, generic_patterns=True):
    """Fetch ``url``, scrape it for media sources and return the chosen one.

    The page is requested with a random user agent and ``url`` as referer;
    a ``Set-Cookie`` response header, if present, is sent along as
    ``Cookie``. The returned string is the picked source followed by the
    output of ``append_headers`` for the request headers.

    ``result_blacklist`` may be ``None``, a single string or a list;
    ``'.smil'`` is always added to it. ``patterns`` defaults to an empty
    list.
    """
    patterns = [] if patterns is None else patterns
    scheme = urllib_parse.urlparse(url).scheme

    if result_blacklist is None:
        result_blacklist = []
    elif isinstance(result_blacklist, str):
        result_blacklist = [result_blacklist]
    # smil files are not playable themselves but contain potential sources,
    # so they are only blacklisted when called from here.
    result_blacklist = list(set(result_blacklist + ['.smil']))

    net = common.Net()
    request_headers = {'User-Agent': common.RAND_UA, 'Referer': url}
    response = net.http_GET(url, headers=request_headers)

    cookie = response.get_headers(as_dict=True).get('Set-Cookie', None)
    if cookie:
        request_headers['Cookie'] = cookie

    candidates = scrape_sources(response.content, result_blacklist, scheme,
                                patterns, generic_patterns)
    return pick_source(candidates) + append_headers(request_headers)
Example #18
Source File: query.py    From pagure with GNU General Public License v2.0 6 votes vote down vote up
def save_report(session, repo, name, url, username):
    """ Save the report of issues based on the given URL of the project.

    The query string of ``url`` is turned into a dictionary: a key that
    occurs once maps to its value, a key that occurs several times maps to
    the list of its values in order of appearance. That dictionary is
    stored under ``name`` in ``repo.reports`` and ``repo`` is added to
    ``session``.

    :arg session: object whose ``add`` method is called with ``repo``.
    :arg repo: object holding the ``reports`` mapping to update.
    :arg name: key under which the report is stored.
    :arg url: URL whose query string defines the report.
    :arg username: not used by this function.
    """
    # Only the query string matters here; the URL stripped of its query
    # was previously computed but never used, so it is no longer built.
    query = {}
    for key, value in parse_qsl(urlparse(url).query):
        if key not in query:
            query[key] = value
        elif isinstance(query[key], list):
            query[key].append(value)
        else:
            # Second occurrence of the key: promote the value to a list.
            query[key] = [query[key], value]

    # Re-assign the attribute after updating the mapping.
    reports = repo.reports
    reports[name] = query
    repo.reports = reports
    session.add(repo)
Example #19
Source File: __init__.py    From patroni with MIT License 5 votes vote down vote up
def parse_connection_string(value):
    """Split a connection string stored in DCS into `conn_url` and `api_url`.

    The original Governor stores the connection string of each cluster
    member in the following format:
        postgres://{username}:{password}@{connect_address}/postgres
    Since each patroni instance provides its own REST API endpoint, that
    endpoint is stored in DCS together with the postgresql connection
    string. To avoid introducing new keys and to stay compatible with the
    original Governor, the connection string is extended like this:
        postgres://{username}:{password}@{connect_address}/postgres?application_name={api_url}
    The original Governor can use such a connection string as it is, thanks
    to a feature of the `libpq` library.

    Returns a ``(conn_url, api_url)`` tuple: ``conn_url`` is ``value``
    without its query string and ``api_url`` is the first
    ``application_name`` query value, or ``None`` when there is none."""

    parts = urlparse(value)
    conn_url = urlunparse(parts._replace(query=''))
    api_url = next(
        (val for key, val in parse_qsl(parts.query)
         if key == 'application_name'),
        None,
    )
    return conn_url, api_url
Example #20
Source File: oauth2.py    From spotipy with MIT License 5 votes vote down vote up
def get_auth_response(self, state=None):
        """ Gets a new auth **token** with user interaction

        The authorization URL is opened for ``state``, the user is asked to
        paste the URL they were redirected to, and that input is handed to
        ``parse_response_token`` whose result is returned. A warning is
        logged first when the redirect URI is a plain-``http`` local address
        with a port.
        """
        logger.info('User authentication requires interaction with your '
                    'web browser. Once you enter your credentials and '
                    'give authorization, you will be redirected to '
                    'a url.  Paste that url you were directed to to '
                    'complete the authorization.')

        redirect_info = urlparse(self.redirect_uri)
        redirect_host, redirect_port = get_host_port(redirect_info.netloc)

        # Implicit Grant tokens are returned in a hash fragment which is
        # only available to the browser, so interactive URL retrieval is
        # required.
        local_with_port = (
            redirect_host in ("127.0.0.1", "localhost")
            and redirect_info.scheme == "http"
            and redirect_port
        )
        if local_with_port:
            logger.warning('Using a local redirect URI with a '
                           'port, likely expecting automatic '
                           'retrieval. Due to technical limitations, '
                           'the authentication token cannot be '
                           'automatically retrieved and must be '
                           'copied and pasted.')

        self._open_auth_url(state)
        logger.info('Paste that url you were directed to in order to '
                    'complete the authorization')
        prompt = ("Enter the URL you "
                  "were redirected to: ")
        response = SpotifyImplicitGrant._get_user_input(prompt)
        return self.parse_response_token(response, state)
Example #21
Source File: sanitizer.py    From ImageFusion with MIT License 5 votes vote down vote up
def allowed_token(self, token, token_type):
        """Sanitize the attributes of ``token`` in place and return it.

        Tokens without a ``"data"`` entry are returned untouched. Otherwise:

        * attributes not listed in ``self.allowed_attributes`` are dropped
          (for duplicated names the first occurrence is kept);
        * attributes listed in ``self.attr_val_is_uri`` are dropped when
          their value cannot be parsed as a URL, when its scheme is not in
          ``self.allowed_protocols``, or when it is a ``data:`` URI whose
          content type is missing or not in ``self.allowed_content_types``;
        * in attributes listed in ``self.svg_attr_val_allows_ref``,
          ``url(...)`` references not starting with ``#`` are blanked out;
        * ``xlink:href`` values not starting with ``#`` are dropped for
          elements listed in ``self.svg_allow_local_href``;
        * ``style`` is passed through ``self.sanitize_css``.

        ``token_type`` is not used.
        """
        if "data" in token:
            # Iterate in reverse so that, for a duplicated attribute name,
            # the first occurrence is the one that ends up in the dict.
            attrs = dict((name, val) for name, val in
                         token["data"][::-1]
                         if name in self.allowed_attributes)
            for attr in self.attr_val_is_uri:
                if attr not in attrs:
                    continue
                val_unescaped = re.sub("[`\000-\040\177-\240\\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
                try:
                    uri = urlparse.urlparse(val_unescaped)
                except ValueError:
                    # A value that cannot even be parsed is never safe to
                    # keep; drop it instead of aborting the whole filter.
                    del attrs[attr]
                    continue
                if uri.scheme not in self.allowed_protocols:
                    del attrs[attr]
                elif uri.scheme == 'data':
                    # Reached only while the attribute is still present, so
                    # it can never be deleted twice.
                    m = content_type_rgx.match(uri.path)
                    if (not m or m.group('content_type')
                            not in self.allowed_content_types):
                        del attrs[attr]

            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))
            if (token["name"] in self.svg_allow_local_href and
                    'xlink:href' in attrs and
                    re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
                del attrs['xlink:href']
            if 'style' in attrs:
                attrs['style'] = self.sanitize_css(attrs['style'])
            token["data"] = [[name, val] for name, val in list(attrs.items())]
        return token
Example #22
Source File: webtest.py    From cheroot with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def strip_netloc(url):
    """Return absolute-URI path from URL.

    The scheme, the host and the fragment are removed; the path, its
    parameters and the query string are kept.

    Useful for wrapping an absolute-URI for which only the
    path is expected (such as in calls to :py:meth:`WebCase.getPage`).

    >>> strip_netloc('https://google.com/foo/bar?bing#baz')
    '/foo/bar?bing'

    >>> strip_netloc('//google.com/foo/bar?bing#baz')
    '/foo/bar?bing'

    >>> strip_netloc('/foo/bar?bing#baz')
    '/foo/bar?bing'
    """
    _, _, path, params, query, _ = urllib_parse.urlparse(url)
    return urllib_parse.urlunparse(('', '', path, params, query, ''))


# Add any exceptions which your web framework handles
# normally (that you don't want server_error to trap). 
Example #23
Source File: sanitizer.py    From Splunking-Crime with GNU Affero General Public License v3.0 5 votes vote down vote up
def allowed_token(self, token, token_type):
        """Sanitize the attributes of ``token`` in place and return it.

        Tokens without a ``"data"`` entry are returned untouched. Otherwise:

        * attributes not listed in ``self.allowed_attributes`` are dropped
          (for duplicated names the first occurrence is kept);
        * attributes listed in ``self.attr_val_is_uri`` are dropped when
          their value cannot be parsed as a URL, when its scheme is not in
          ``self.allowed_protocols``, or when it is a ``data:`` URI whose
          content type is missing or not in ``self.allowed_content_types``;
        * in attributes listed in ``self.svg_attr_val_allows_ref``,
          ``url(...)`` references not starting with ``#`` are blanked out;
        * ``xlink:href`` values not starting with ``#`` are dropped for
          elements listed in ``self.svg_allow_local_href``;
        * ``style`` is passed through ``self.sanitize_css``.

        ``token_type`` is not used.
        """
        if "data" in token:
            # Iterate in reverse so that, for a duplicated attribute name,
            # the first occurrence is the one that ends up in the dict.
            attrs = dict((name, val) for name, val in
                         token["data"][::-1]
                         if name in self.allowed_attributes)
            for attr in self.attr_val_is_uri:
                if attr not in attrs:
                    continue
                val_unescaped = re.sub("[`\000-\040\177-\240\\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
                try:
                    uri = urlparse.urlparse(val_unescaped)
                except ValueError:
                    # A value that cannot even be parsed is never safe to
                    # keep; drop it instead of aborting the whole filter.
                    del attrs[attr]
                    continue
                if uri.scheme not in self.allowed_protocols:
                    del attrs[attr]
                elif uri.scheme == 'data':
                    # Reached only while the attribute is still present, so
                    # it can never be deleted twice.
                    m = content_type_rgx.match(uri.path)
                    if (not m or m.group('content_type')
                            not in self.allowed_content_types):
                        del attrs[attr]

            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))
            if (token["name"] in self.svg_allow_local_href and
                    'xlink:href' in attrs and
                    re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
                del attrs['xlink:href']
            if 'style' in attrs:
                attrs['style'] = self.sanitize_css(attrs['style'])
            token["data"] = [[name, val] for name, val in list(attrs.items())]
        return token
Example #24
Source File: sanitizer.py    From datafari with Apache License 2.0 5 votes vote down vote up
def allowed_token(self, token, token_type):
        """Sanitize the attributes of ``token`` in place and return it.

        Tokens without a ``"data"`` entry are returned untouched. Otherwise:

        * attributes not listed in ``self.allowed_attributes`` are dropped
          (for duplicated names the first occurrence is kept);
        * attributes listed in ``self.attr_val_is_uri`` are dropped when
          their value cannot be parsed as a URL, when its scheme is not in
          ``self.allowed_protocols``, or when it is a ``data:`` URI whose
          content type is missing or not in ``self.allowed_content_types``;
        * in attributes listed in ``self.svg_attr_val_allows_ref``,
          ``url(...)`` references not starting with ``#`` are blanked out;
        * ``xlink:href`` values not starting with ``#`` are dropped for
          elements listed in ``self.svg_allow_local_href``;
        * ``style`` is passed through ``self.sanitize_css``.

        ``token_type`` is not used.
        """
        if "data" in token:
            # Iterate in reverse so that, for a duplicated attribute name,
            # the first occurrence is the one that ends up in the dict.
            attrs = dict((name, val) for name, val in
                         token["data"][::-1]
                         if name in self.allowed_attributes)
            for attr in self.attr_val_is_uri:
                if attr not in attrs:
                    continue
                val_unescaped = re.sub("[`\000-\040\177-\240\\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
                try:
                    uri = urlparse.urlparse(val_unescaped)
                except ValueError:
                    # A value that cannot even be parsed is never safe to
                    # keep; drop it instead of aborting the whole filter.
                    del attrs[attr]
                    continue
                if uri.scheme not in self.allowed_protocols:
                    del attrs[attr]
                elif uri.scheme == 'data':
                    # Reached only while the attribute is still present, so
                    # it can never be deleted twice.
                    m = content_type_rgx.match(uri.path)
                    if (not m or m.group('content_type')
                            not in self.allowed_content_types):
                        del attrs[attr]

            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))
            if (token["name"] in self.svg_allow_local_href and
                    'xlink:href' in attrs and
                    re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
                del attrs['xlink:href']
            if 'style' in attrs:
                attrs['style'] = self.sanitize_css(attrs['style'])
            token["data"] = [[name, val] for name, val in list(attrs.items())]
        return token
Example #25
Source File: videozoo.py    From script.module.resolveurl with GNU General Public License v2.0 5 votes vote down vote up
def _redirect_test(self, url):
        """Open ``url`` and return the URL it finally resolves to.

        The request is sent with an iOS user agent and the URL's network
        location as referer.

        :param url: address to open.
        :returns: the URL reported by the response after redirects; for a
            403 answer whose URL differs from ``url``, that URL.
        :raises ResolverError: for every other HTTP error.
        """
        opener = urllib_request.build_opener()
        # Both headers go into one list: assigning ``addheaders`` a second
        # time would replace the User-agent entry instead of adding to it.
        opener.addheaders = [
            ('User-agent', common.IOS_USER_AGENT),
            ('Referer', urllib_parse.urlparse(url).netloc),
        ]
        try:
            resp = opener.open(url)
        except urllib_error.HTTPError as e:
            if e.code == 403 and url != e.geturl():
                return e.geturl()
            raise ResolverError('File not found')

        try:
            # Equals ``url`` itself when no redirect took place.
            return resp.geturl()
        finally:
            # Release the connection instead of leaving it to the garbage
            # collector.
            resp.close()
Example #26
Source File: simplydebrid.py    From script.module.resolveurl with GNU General Public License v2.0 5 votes vote down vote up
def valid_url(self, url, host):
        """Tell whether the host of ``url`` (or ``host``) is a known hoster.

        ``self.hosts`` is filled from ``self.get_all_hosters()`` when it is
        empty. When ``url`` is given, its host name replaces ``host``; a URL
        that cannot be parsed or has no host name yields ``'unknown'``. A
        leading ``www.`` is removed before the comparison.

        :param url: media URL, may be empty.
        :param host: host name used when ``url`` is empty.
        :returns: ``True`` when the host is a substring of any entry of
            ``self.hosts``, ``False`` otherwise.
        """
        if not self.hosts:
            self.hosts = self.get_all_hosters()

        if url:
            try:
                host = urllib_parse.urlparse(url).hostname
            except (ValueError, TypeError, AttributeError):
                host = None
            # ``hostname`` is None for a URL without a network location;
            # treat that like a URL that could not be parsed.
            if host is None:
                host = 'unknown'

        if host.startswith('www.'):
            # Strip the prefix only; ``replace`` would also remove any
            # later ``www.`` inside the host name.
            host = host[len('www.'):]

        return any(host in item for item in self.hosts)
Example #27
Source File: hmf.py    From script.module.resolveurl with GNU General Public License v2.0 5 votes vote down vote up
def __top_domain(self, url):
        """Return the lower-cased domain part of ``url``.

        The network location (or, when there is none, the path) is taken,
        anything up to the last ``@`` and anything from the first ``:`` is
        cut off, and a trailing ``name.tld`` / ``name.tld.tld`` match (with
        an optional leading ``www.`` left out) replaces the value when the
        pattern is found.
        """
        parts = urllib_parse.urlparse(url)
        location = parts.netloc or parts.path
        # Drop credentials ("user:pw@") and the port (":8080").
        without_credentials = location.split('@')[-1]
        host = without_credentials.split(':')[0]

        pattern = r"(?:www\.)?([\w\-]*\.[\w\-]{2,5}(?:\.[\w\-]{2,5})?)$"
        found = re.search(pattern, host)
        return (found.group(1) if found else host).lower()
Example #28
Source File: __init__.py    From script.module.resolveurl with GNU General Public License v2.0 5 votes vote down vote up
def scrape_supported(html, regex=None, host_only=False):
    """
    Collect the links found in ``html`` that pass validation.

    args:
        html: the html to be scraped
        regex: optional pattern whose first group captures the link; the
            default is: href\\s*=\\s*['"]([^'"]+)
        host_only: if true, validate only the host of each link (links
            without a host are skipped and results already present in
            ``host_cache`` are reused); otherwise validate the full url
            (default False)

    Returns:
        a list of links scraped from the html that passed validation

    """
    pattern = r'''href\s*=\s*['"]([^'"]+)''' if regex is None else regex
    accepted = []
    for hit in re.finditer(pattern, html):
        stream_url = hit.group(1)
        host = urllib_parse.urlparse(stream_url).hostname

        if not host_only:
            candidate = HostedMediaFile(url=stream_url)
        elif host is None:
            continue
        elif host in host_cache:
            # Reuse the verdict recorded for this host earlier.
            if host_cache[host]:
                accepted.append(stream_url)
            continue
        else:
            # use dummy media_id to allow host validation
            candidate = HostedMediaFile(host=host, media_id='dummy')

        verdict = candidate.valid_url()
        host_cache[host] = verdict
        if verdict:
            accepted.append(stream_url)
    return accepted
Example #29
Source File: sanitizer.py    From ImageFusion with MIT License 5 votes vote down vote up
def allowed_token(self, token, token_type):
        """Sanitize the attributes of ``token`` in place and return it.

        Tokens without a ``"data"`` entry are returned untouched. Otherwise:

        * attributes not listed in ``self.allowed_attributes`` are dropped
          (for duplicated names the first occurrence is kept);
        * attributes listed in ``self.attr_val_is_uri`` are dropped when
          their value cannot be parsed as a URL, when its scheme is not in
          ``self.allowed_protocols``, or when it is a ``data:`` URI whose
          content type is missing or not in ``self.allowed_content_types``;
        * in attributes listed in ``self.svg_attr_val_allows_ref``,
          ``url(...)`` references not starting with ``#`` are blanked out;
        * ``xlink:href`` values not starting with ``#`` are dropped for
          elements listed in ``self.svg_allow_local_href``;
        * ``style`` is passed through ``self.sanitize_css``.

        ``token_type`` is not used.
        """
        if "data" in token:
            # Iterate in reverse so that, for a duplicated attribute name,
            # the first occurrence is the one that ends up in the dict.
            attrs = dict((name, val) for name, val in
                         token["data"][::-1]
                         if name in self.allowed_attributes)
            for attr in self.attr_val_is_uri:
                if attr not in attrs:
                    continue
                val_unescaped = re.sub("[`\000-\040\177-\240\\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
                try:
                    uri = urlparse.urlparse(val_unescaped)
                except ValueError:
                    # A value that cannot even be parsed is never safe to
                    # keep; drop it instead of aborting the whole filter.
                    del attrs[attr]
                    continue
                if uri.scheme not in self.allowed_protocols:
                    del attrs[attr]
                elif uri.scheme == 'data':
                    # Reached only while the attribute is still present, so
                    # it can never be deleted twice.
                    m = content_type_rgx.match(uri.path)
                    if (not m or m.group('content_type')
                            not in self.allowed_content_types):
                        del attrs[attr]

            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))
            if (token["name"] in self.svg_allow_local_href and
                    'xlink:href' in attrs and
                    re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
                del attrs['xlink:href']
            if 'style' in attrs:
                attrs['style'] = self.sanitize_css(attrs['style'])
            token["data"] = [[name, val] for name, val in list(attrs.items())]
        return token
Example #30
Source File: proxy.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def parse_url(cls, url):
        """Build a ``cls`` instance from the components of a proxy URL.

        The scheme, host name, port, user name and password of ``url`` are
        passed to ``cls`` as ``proxy_type``, ``proxy_address``,
        ``proxy_port``, ``proxy_login`` and ``proxy_password``.
        """
        parts = urlparse(url)
        settings = {
            'proxy_type': parts.scheme,
            'proxy_address': parts.hostname,
            'proxy_port': parts.port,
            'proxy_login': parts.username,
            'proxy_password': parts.password,
        }
        return cls(**settings)