Python Examples of urllib.parse.urlparse

Source File: qr_reader.py From Authenticator with GNU General Public License v2.0

11 votes

def read(self):
        """Decode the QR image at ``self.filename`` and return its ``secret``.

        The decoded payload is parsed as a URL and its query parameters are
        stored on ``self._codes``. The image file is removed once it has been
        decoded.

        :returns: the value of the ``secret`` query parameter (``None`` when
            the parameter is absent), ``None`` when no QR payload could be
            used, and implicitly ``None`` when the imaging libraries are
            missing.
        """
        try:
            from PIL import Image
            from pyzbar.pyzbar import decode
            symbols = decode(Image.open(self.filename))
            # The file is only needed for decoding; drop it if it is still there.
            if path.isfile(self.filename):
                remove(self.filename)
            try:
                payload = symbols[0].data.decode()
                self._codes = dict(parse_qsl(urlparse(payload).query))
            except (KeyError, IndexError):
                # An empty decode result means the image held no QR code.
                Logger.error("Invalid QR image")
                return None
            return self._codes.get("secret")
        except ImportError:
            # PIL / pyzbar are unavailable: flag QR scanning as unusable.
            from ..application import Application
            Application.USE_QRSCANNER = False
            QRReader.ZBAR_FOUND = False

Source File: download.py From gog-galaxy-plugin-downloader with GNU General Public License v3.0

9 votes

def get_plugin_config(config_uri):
    """
    Downloads/opens configuration yaml file, returns
    dict of Galaxy plugins

    :param config_uri: an ``http``/``https`` URL, or a local file path.
    :returns: the value stored under the ``plugins`` key of the parsed YAML,
        or ``None`` when the download fails or the YAML cannot be parsed.
    """
    # Try to open the URI as a URL or fall back to opening local file
    try:
        if urlparse(config_uri).scheme in ('https', 'http'):
            # Close the HTTP response as soon as the body has been read.
            with urlopen(config_uri) as response:
                yaml_data = response.read()
        else:
            with open(config_uri, 'r') as file_data:
                yaml_data = file_data.read()
    except URLError as e:
        print(e)
        # Nothing was fetched, so there is nothing to parse. Returning here
        # avoids touching the never-assigned ``yaml_data`` below.
        return None

    # Parse the YAML configuration
    try:
        plugin_data = yaml.safe_load(yaml_data)
    except yaml.YAMLError as e:
        print(e)
        return None

    return plugin_data['plugins']

Source File: asgi.py From quart with MIT License

7 votes

def _create_websocket_from_scope(self, send: Callable) -> Websocket:
        """Build the application's websocket object from ``self.scope``.

        Headers are copied from the scope (decoded as latin1, names
        title-cased) and a ``Remote-Addr`` header is added from the scope's
        ``client`` entry, falling back to ``"<local>"``.

        :param send: callable bound into the data-sending and
            connection-accepting callbacks handed to the websocket.
        :returns: an instance of ``self.app.websocket_class``.
        """
        headers = Headers()
        client = self.scope.get("client") or ["<local>"]
        headers["Remote-Addr"] = client[0]
        for raw_name, raw_value in self.scope["headers"]:
            headers.add(raw_name.decode("latin1").title(), raw_value.decode("latin1"))

        path = self.scope["path"]
        if path[0] != "/":
            # Not a plain absolute path: keep only the path component.
            path = urlparse(path).path

        send_data = partial(self.send_data, send)
        accept_connection = partial(self.accept_connection, send)
        return self.app.websocket_class(
            path,
            self.scope["query_string"],
            self.scope["scheme"],
            headers,
            self.scope.get("root_path", ""),
            self.scope.get("http_version", "1.1"),
            self.scope.get("subprotocols", []),
            self.queue.get,
            send_data,
            accept_connection,
        )

Source File: diagnose.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0

7 votes

def test_connection(name, url, timeout=10):
    """Simple connection test

    Resolves the URL's host, then opens the URL, and prints how long each
    step took. Any failure is reported on stdout and ends the check early.

    :param name: label used in the printed messages.
    :param url: URL to test.
    :param timeout: timeout in seconds passed to ``urlopen``.
    """
    urlinfo = urlparse(url)
    start = time.time()
    try:
        # Resolve the bare host name: ``netloc`` may also carry credentials
        # and a port ("user@host:8080"), which is not a resolvable name.
        socket.gethostbyname(urlinfo.hostname)
    except Exception as e:
        print('Error resolving DNS for {}: {}, {}'.format(name, url, e))
        return
    dns_elapsed = time.time() - start
    start = time.time()
    try:
        # Only reachability is of interest; release the response right away.
        urlopen(url, timeout=timeout).close()
    except Exception as e:
        print("Error open {}: {}, {}, DNS finished in {} sec.".format(name, url, e, dns_elapsed))
        return
    load_elapsed = time.time() - start
    print("Timing for {}: {}, DNS: {:.4f} sec, LOAD: {:.4f} sec.".format(name, url, dns_elapsed, load_elapsed))

Source File: sublist3r.py From subtake with GNU General Public License v2.0

6 votes

def extract_domains(self, resp):
        """Collect new subdomains from the result links found in ``resp``.

        Every link host that is non-empty, not yet known and different from
        ``self.domain`` is appended to ``self.subdomains``.

        :param resp: text of a search result page.
        :returns: the raw links matched in ``resp``; an empty list when
            nothing could be extracted.
        """
        link_regx = re.compile('<li class="b_algo"><h2><a href="(.*?)"')
        link_regx2 = re.compile('<div class="b_title"><h2><a href="(.*?)"')
        # Bound before the try so the final return works even when matching
        # itself fails (e.g. ``resp`` is not a string).
        links_list = []
        try:
            links_list = link_regx.findall(resp) + link_regx2.findall(resp)

            for link in links_list:
                # Strip markup left inside the href value.
                link = re.sub(r'<(\/)?strong>|<span.*?>|<|>', '', link)
                if not link.startswith('http'):
                    link = "http://" + link
                subdomain = urlparse.urlparse(link).netloc
                # Skip links that yield no host at all.
                if subdomain and subdomain not in self.subdomains and subdomain != self.domain:
                    if self.verbose:
                        self.print_("%s%s: %s%s" % (R, self.engine_name, W, subdomain))
                    self.subdomains.append(subdomain.strip())
        except Exception:
            # Extraction is best effort: a malformed page must not stop the run.
            pass

        return links_list

Source File: test_requests.py From sanic with MIT License

6 votes

def test_url_attributes_with_ssl_dict(app, path, query, expected_url):
    """Check that request URL attributes agree with ``urlparse`` when the
    test server is started with an SSL dict of cert and key paths."""
    here = os.path.dirname(os.path.realpath(__file__))
    ssl_dict = {
        "cert": os.path.join(here, "certs/selfsigned.cert"),
        "key": os.path.join(here, "certs/selfsigned.key"),
    }

    async def handler(request):
        return text("OK")

    app.add_route(handler, path)

    url = f"https://{HOST}:{PORT}" + path + f"?{query}"
    request, response = app.test_client.get(
        url,
        server_kwargs={"ssl": ssl_dict},
    )
    assert request.url == expected_url.format(HOST, request.server_port)

    # Each parsed component must match the attribute exposed on the request.
    parsed = urlparse(request.url)

    assert parsed.scheme == request.scheme
    assert parsed.path == request.path
    assert parsed.query == request.query_string
    assert parsed.netloc == request.host

Source File: tiny_proxy.py From sslyze with GNU Affero General Public License v3.0

6 votes

def do_GET(self):
        """Relay a proxied GET request to the server named in ``self.path``.

        Only absolute ``http`` URLs without a fragment are accepted; anything
        else is answered with a 400 error. The request line and the headers
        (with ``Connection: close`` and without ``Proxy-Connection``) are
        written to a new upstream socket, after which ``self._read_write``
        takes over. The upstream socket and the client connection are always
        closed afterwards.
        """
        (scm, netloc, path, params, query, fragment) = urlparse(self.path, "http")
        if scm != "http" or fragment or not netloc:
            self.send_error(400, "bad url %s" % self.path)
            return
        soc = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            if self._connect_to(netloc, soc):
                self.log_request()
                # The upstream server gets an origin-form target (no scheme/host).
                request_target = urlunparse(("", "", path, params, query, ""))
                request_line = "%s %s %s\r\n" % (self.command, request_target, self.request_version)
                # Sockets carry bytes, not str; sendall also guarantees that
                # the whole buffer is written, which a single send() does not.
                soc.sendall(request_line.encode("latin-1"))
                self.headers["Connection"] = "close"
                del self.headers["Proxy-Connection"]
                for key_val in self.headers.items():
                    soc.sendall(("%s: %s\r\n" % key_val).encode("latin-1"))
                soc.sendall(b"\r\n")
                self._read_write(soc)
        finally:
            logging.warning("Finished do_GET()")
            soc.close()
            self.connection.close()

Source File: utils.py From misp42splunk with GNU Lesser General Public License v3.0

6 votes

def extract_http_scheme_host_port(http_url):
    '''Extract scheme, host and port from a HTTP URL.

    :param http_url: HTTP URL to extract.
    :type http_url: ``string``
    :returns: A tuple of scheme, host and port
    :rtype: ``tuple``

    :raises ValueError: If `http_url` is not in http(s)://hostname:port format.
    '''

    try:
        http_info = urlparse.urlparse(http_url)
        # Reading ``port`` validates it and can raise on a malformed port,
        # so it belongs inside the guarded section with the parse itself.
        scheme = http_info.scheme
        hostname = http_info.hostname
        port = http_info.port
    except Exception:
        raise ValueError(
            str(http_url) + " is not in http(s)://hostname:port format")

    if not scheme or not hostname or not port:
        # str() keeps the message buildable for non-string input as well.
        raise ValueError(
            str(http_url) + " is not in http(s)://hostname:port format")

    return (scheme, hostname, port)

Source File: __init__.py From misp42splunk with GNU Lesser General Public License v3.0

6 votes

def __init__(self, splunkd_uri, session_key, schema):
        """
        Global Config.

        Creates a REST client from the splunkd URI's scheme, host and port,
        then the configuration, inputs, configs and settings helpers that
        share this client and the schema.

        :param splunkd_uri:
        :param session_key:
        :param schema:
        :type schema: GlobalConfigSchema
        """
        self._splunkd_uri = splunkd_uri
        self._session_key = session_key
        self._schema = schema

        splunkd_info = urlparse(self._splunkd_uri)
        connection = {
            'scheme': splunkd_info.scheme,
            'host': splunkd_info.hostname,
            'port': splunkd_info.port,
        }
        self._client = SplunkRestClient(
            self._session_key,
            self._schema.product,
            **connection
        )

        # Every helper is built from the same client/schema pair.
        client, config_schema = self._client, self._schema
        self._configuration = Configuration(client, config_schema)
        self._inputs = Inputs(client, config_schema)
        self._configs = Configs(client, config_schema)
        self._settings = Settings(client, config_schema)

Source File: promenade_client.py From drydock with Apache License 2.0

6 votes

def _get_prom_url(self):
        # Get promenade url from Keystone session object

        ks_session = self._get_ks_session()

        try:
            prom_endpoint = ks_session.get_endpoint(
                interface='internal', service_type='kubernetesprovisioner')
        except exc.EndpointNotFound:
            self.logger.error("Could not find an internal interface"
                              " defined in Keystone for Promenade")

            raise errors.DriverError("Could not find an internal interface"
                                     " defined in Keystone for Promenade")

        prom_url = urlparse(prom_endpoint)

        return prom_url

Source File: base.py From zun with Apache License 2.0

6 votes

def validate_link(self, link, bookmark=False):
        """Checks if the given link can get correct data.

        :param link: URL to validate.
        :param bookmark: when true, a path starting with ``PATH_PREFIX``
            makes the link invalid.
        :returns: ``True`` when fetching the link's path succeeds, ``False``
            otherwise.
        """
        parsed = urlparse.urlparse(link)

        # bookmark link should not have the version in the URL
        if bookmark and parsed.path.startswith(PATH_PREFIX):
            return False

        # removes the scheme and net location parts of the link
        relative_url = urlparse.urlunparse(('', '') + tuple(parsed[2:]))
        try:
            self.get_json(relative_url, path_prefix='')
        except Exception:
            return False
        return True

Source File: evillib.py From wafw00f with BSD 3-Clause "New" or "Revised" License

6 votes

def urlParser(target):
    """Break a target URL into the pieces needed to connect to it.

    :param target: URL whose scheme is ``http``, ``https`` or empty.
    :returns: ``(hostname, port, path, query, ssl)`` where ``port`` is the
        text after the first ``:`` of the network location (``None`` when
        there is none), ``path`` defaults to ``'/'`` and ``ssl`` is true for
        ``https``; ``None`` when the scheme is not supported.
    """
    log = logging.getLogger('urlparser')

    parts = urlparse(target)
    scheme = parts[0]
    if scheme not in ['http', 'https', '']:
        log.error('scheme %s not supported' % scheme)
        return

    ssl = scheme == 'https'
    path = parts[2] if len(parts[2]) > 0 else '/'

    # The network location is "host" or "host:port".
    host_and_port = parts[1].split(':')
    hostname = host_and_port[0]
    port = host_and_port[1] if len(host_and_port) > 1 else None

    return (hostname, port, path, parts[4], ssl)

Source File: url.py From Vxscan with Apache License 2.0

6 votes

def dedup_link(urls):
    """De-duplicate parameterised URLs.

    URLs without ``=`` or containing ``'``, ``@`` or ``+`` are dropped.
    Pseudo-static URLs (``/?<digits>=``) are handed to ``diff``; of the
    remaining URLs only one per host + path combination is kept.

    :param urls: iterable of URL strings.
    :returns: list of kept URLs, regular ones first, then the result of
        ``diff`` for the pseudo-static ones.
    """
    seen_locations = set()
    unique_urls = []
    pseudo_static = []
    for candidate in set(urls):
        # Only keep URLs that carry parameters; skip everything else
        if '=' not in candidate or re.search(r"'|@|\+", candidate):
            continue
        # Check whether the URL is pseudo-static; those are de-duplicated
        # differently from ordinary URLs
        if re.search(r'/\?\d+=', candidate):
            pseudo_static.append(candidate)
            continue
        # Strip the parameters via urlparse and discard repeated locations
        parts = parse.urlparse(candidate)
        location = parts.netloc + parts.path
        if location not in seen_locations:
            seen_locations.add(location)
            unique_urls.append(candidate)
    unique_urls.extend(diff(pseudo_static))
    return unique_urls

Source File: requests_usbmux.py From facebook-wda with MIT License

6 votes

def get_connection(self, url, proxies=None):
        """Return the connection pool for ``url``, creating it on first use.

        :param url: URL the pool is keyed by.
        :param proxies: optional mapping of scheme to proxy.
        :returns: the cached or newly created pool for ``url``.
        :raises ValueError: when ``proxies`` has a truthy entry for the
            URL's scheme.
        """
        scheme = urlparse(url.lower()).scheme
        if (proxies or {}).get(scheme):
            raise ValueError('%s does not support specifying proxies' %
                             self.__class__.__name__)

        # Lookup and creation happen under the pool lock.
        with self.pools.lock:
            cached = self.pools.get(url)
            if not cached:
                cached = UsbmuxHTTPConnectionPool(url, self.timeout)
                self.pools[url] = cached
            return cached

Source File: asgi.py From quart with MIT License

6 votes

def _create_request_from_scope(self, send: Callable) -> Request:
        """Build the application's request object from ``self.scope``.

        Headers are copied from the scope (decoded as latin1, names
        title-cased) and a ``Remote-Addr`` header is added from the scope's
        ``client`` entry, falling back to ``"<local>"``. For HTTP versions
        below 1.1 a ``Host`` header defaults to the configured server name.

        :param send: callable bound into the push-promise callback handed to
            the request.
        :returns: an instance of ``self.app.request_class``.
        """
        headers = Headers()
        client = self.scope.get("client") or ["<local>"]
        headers["Remote-Addr"] = client[0]
        for raw_name, raw_value in self.scope["headers"]:
            headers.add(raw_name.decode("latin1").title(), raw_value.decode("latin1"))
        if self.scope["http_version"] < "1.1":
            headers.setdefault("Host", self.app.config["SERVER_NAME"] or "")

        path = self.scope["path"]
        if path[0] != "/":
            # Not a plain absolute path: keep only the path component.
            path = urlparse(path).path

        config = self.app.config
        return self.app.request_class(
            self.scope["method"],
            self.scope["scheme"],
            path,
            self.scope["query_string"],
            headers,
            self.scope.get("root_path", ""),
            self.scope["http_version"],
            max_content_length=config["MAX_CONTENT_LENGTH"],
            body_timeout=config["BODY_TIMEOUT"],
            send_push_promise=partial(self._send_push_promise, send),
            scope=self.scope,
        )

Source File: sublist3r.py From subtake with GNU General Public License v2.0

6 votes

def extract_domains(self, resp):
        """Collect new subdomains from the ``<cite>`` entries found in ``resp``.

        Every link host that is non-empty, not yet known and different from
        ``self.domain`` is appended to ``self.subdomains``.

        :param resp: text of a search result page.
        :returns: the raw ``<cite>`` contents matched in ``resp``; an empty
            list when nothing could be extracted.
        """
        link_regx = re.compile(r'<cite.*?>(.*?)<\/cite>')
        # Bound before the try so the final return works even when matching
        # itself fails (e.g. ``resp`` is not a string).
        links_list = []
        try:
            links_list = link_regx.findall(resp)
            for link in links_list:
                link = re.sub('<span.*>', '', link)
                if not link.startswith('http'):
                    link = "http://" + link
                subdomain = urlparse.urlparse(link).netloc
                if subdomain and subdomain not in self.subdomains and subdomain != self.domain:
                    if self.verbose:
                        self.print_("%s%s: %s%s" % (R, self.engine_name, W, subdomain))
                    self.subdomains.append(subdomain.strip())
        except Exception:
            # Extraction is best effort: a malformed page must not stop the run.
            pass
        return links_list

Source File: sublist3r.py From subtake with GNU General Public License v2.0

6 votes

def extract_domains(self, resp):
        """Collect new subdomains of ``self.domain`` from the spans in ``resp``.

        Only hosts that end with ``self.domain``, are not yet known and
        differ from ``self.domain`` itself are appended to
        ``self.subdomains``.

        :param resp: text of a search result page.
        :returns: the raw span contents matched in ``resp``; an empty list
            when nothing could be extracted.
        """
        link_regx2 = re.compile('<span class=" fz-15px fw-m fc-12th wr-bw.*?">(.*?)</span>')
        link_regx = re.compile('<span class="txt"><span class=" cite fw-xl fz-15px">(.*?)</span>')
        links_list = []
        try:
            links_list = link_regx.findall(resp) + link_regx2.findall(resp)
            for raw_link in links_list:
                # Drop the bold markup left inside the matched text.
                candidate = re.sub("<(\/)?b>", "", raw_link)
                if not candidate.startswith('http'):
                    candidate = "http://" + candidate
                subdomain = urlparse.urlparse(candidate).netloc
                is_new = (
                    subdomain.endswith(self.domain)
                    and subdomain
                    and subdomain not in self.subdomains
                    and subdomain != self.domain
                )
                if is_new:
                    if self.verbose:
                        self.print_("%s%s: %s%s" % (R, self.engine_name, W, subdomain))
                    self.subdomains.append(subdomain.strip())
        except Exception:
            # Extraction is best effort; whatever was matched is still returned.
            pass

        return links_list

Source File: sublist3r.py From subtake with GNU General Public License v2.0

6 votes

def extract_domains(self, resp):
        """Collect new subdomains from the result URLs found in ``resp``.

        Every link host that is non-empty, not yet known and different from
        ``self.domain`` is appended to ``self.subdomains``.

        :param resp: text of a search result page.
        :returns: the raw result URLs matched in ``resp``; an empty list
            when nothing could be extracted.
        """
        link_regx = re.compile('<p class="web-result-url">(.*?)</p>')
        # Bound before the try so the final return works even when matching
        # itself fails (e.g. ``resp`` is not a string).
        links_list = []
        try:
            links_list = link_regx.findall(resp)
            for link in links_list:
                if not link.startswith('http'):
                    link = "http://" + link
                subdomain = urlparse.urlparse(link).netloc
                # Skip links that yield no host at all.
                if subdomain and subdomain not in self.subdomains and subdomain != self.domain:
                    if self.verbose:
                        self.print_("%s%s: %s%s" % (R, self.engine_name, W, subdomain))
                    self.subdomains.append(subdomain.strip())
        except Exception:
            # Extraction is best effort: a malformed page must not stop the run.
            pass

        return links_list

Source File: connections.py From python-esppy with Apache License 2.0

6 votes

def __init__(self,session,**kwargs):
        """Set up connection state from the session's connection URL.

        The URL scheme decides whether the connection is secure and the
        network location is split on ``:`` into host and port, so the URL is
        expected to carry an explicit port.

        :param session: object exposing ``conn_url``.
        :param kwargs: options forwarded to ``tools.Options``.
        """
        tools.Options.__init__(self,**kwargs)

        self._session = session

        parsed = urlparse(self._session.conn_url)

        # Secure only when the URL scheme is https.
        self._secure = parsed.scheme == "https"

        host_and_port = parsed.netloc.split(":")
        self._host = host_and_port[0]
        self._port = host_and_port[1]

        self._websocket = None
        self._handshakeComplete = False
        self._headers = None
        self._authorization = None

Source File: port_scan.py From Vxscan with Apache License 2.0

5 votes

def pool(self):
        """Normalise ``self.ipaddr``, run the scan and report the results.

        The scheme, trailing slash and (when the URL has a path) the path
        segments are stripped from ``self.ipaddr``; a host name is resolved
        to an address and any ``:port`` suffix is removed before
        ``self.run`` is called. Errors in this phase are ignored.

        :returns: a list of ``'server:port'`` strings built from
            ``self.out`` when ``self.num`` is 0, otherwise
            ``['Portspoof:0']``.
        """
        try:
            # Work out whether the given url looks like www.baidu.com or
            # like www.baidu.com/path
            has_path = bool(parse.urlparse(self.ipaddr).path)
            self.ipaddr = self.ipaddr.replace('http://', '').replace('https://', '').rstrip('/')
            if has_path:
                self.ipaddr = re.sub(r'/\w+', '', self.ipaddr)
            if re.search(r'\d+\.\d+\.\d+\.\d+', self.ipaddr):
                address = self.ipaddr
            else:
                address = socket.gethostbyname(self.ipaddr)
            if ':' in address:
                address = re.sub(r':\d+', '', address)
            self.run(address)
        except Exception:
            pass

        if self.num != 0:
            self.save(self.ipaddr, [{"server": 'Portspoof', "port": '0', "banner": ''}])
            console('PortScan', self.ipaddr, 'Portspoof:0\n')
            return ['Portspoof:0']

        self.save(self.ipaddr, self.out)
        found = []
        for entry in self.out:
            found.append('{}:{}'.format(entry.get('server'), entry.get('port')))
            console('PortScan', self.ipaddr, '{}:{}\n'.format(entry.get('server'), entry.get('port')))
        return found

Source File: file_utils.py From mrc-for-flat-nested-ner with Apache License 2.0

5 votes

def split_s3_path(url: str) -> Tuple[str, str]:
    """Split a full s3 path into the bucket name and path.

    :raises ValueError: when the URL has no network location or no path.
    """
    parts = urlparse(url)
    bucket_name, s3_path = parts.netloc, parts.path
    if not (bucket_name and s3_path):
        raise ValueError("bad s3 path {}".format(url))
    # Remove '/' at beginning of path.
    if s3_path[:1] == "/":
        s3_path = s3_path[1:]
    return bucket_name, s3_path

Source File: verify.py From Vxscan with Apache License 2.0

5 votes

def verify_https(url):
    """Find a URL form of ``url`` that answers a request.

    A URL that already carries a scheme is returned unchanged when a request
    to it succeeds. Otherwise the host (or path) part is tried with
    ``https://`` first and, if the response has no ``status_code``, with
    ``http://``.

    :returns: the working URL, or ``None`` implicitly when nothing worked.
    """
    # Verify whether the domain is served over http or https
    # If the domain answers with a 302 redirect, take the redirected address
    req = Requests()
    # noinspection PyBroadException
    if '://' in url:
        try:
            req.get(url)
            return url
        except Exception:
            pass
    host = parse_host(url)
    parsed = parse.urlparse(url)
    # Prefer the network location, then the path, else keep the input as is.
    target = parsed.netloc or parsed.path or url
    # noinspection PyBroadException
    try:
        https_url = 'https://' + target
        response = req.get(https_url)
        # A response without status_code raises AttributeError here and
        # triggers the http fallback below.
        getattr(response, 'status_code')
        console('Verify', host, https_url + '\n')
        return https_url
    except AttributeError:
        # noinspection PyBroadException
        try:
            http_url = 'http://' + target
            req.get(http_url)
            console('Verify', host, http_url + '\n')
            return http_url
        except Exception:
            pass
    except Exception as e:
        logging.exception(e)

Source File: file_utils.py From mrc-for-flat-nested-ner with Apache License 2.0

5 votes

def cached_path(url_or_filename: Union[str, Path], cache_dir: Union[str, Path] = None) -> str:
    """
    Resolve something that is either a URL or a local path to a local file.

    ``http``, ``https`` and ``s3`` URLs are fetched through the cache and the
    path of the cached file is returned. An existing local path is returned
    as is.

    :raises FileNotFoundError: for a scheme-less path that does not exist.
    :raises ValueError: for anything else that is neither a supported URL
        nor an existing path.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    # Work with plain strings from here on.
    if isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    scheme = urlparse(url_or_filename).scheme

    # URL, so get it from the cache (downloading if necessary)
    if scheme in ('http', 'https', 's3'):
        return get_from_cache(url_or_filename, cache_dir)

    # File, and it exists.
    if os.path.exists(url_or_filename):
        return url_or_filename

    # File, but it doesn't exist.
    if scheme == '':
        raise FileNotFoundError("file {} not found".format(url_or_filename))

    # Something unknown
    raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))

Source File: url_checker.py From YaYaGen with BSD 2-Clause "Simplified" License

5 votes

def find_url_strings(self, url):
        """
        Return a list holding the URL's path when that path ends with
        ``.php``, otherwise an empty list.
        """
        url_path = urlparse(url).path
        return [url_path] if url_path.endswith(".php") else []

Source File: _https_client.py From oscrypto with MIT License

5 votes

def setup_connection(self, url, timeout):
        """
        Point the client at the host and port of ``url``, closing the current
        socket first when it belongs to a different host/port pair.

        :param url:
            The URL to download

        :param timeout:
            The int number of seconds to set the timeout to

        :raises:
            HttpsClientException - when the URL scheme is plain http

        :return:
            A boolean indicating if the connection was reused
        """

        parsed = urlparse(url)
        if parsed.scheme == 'http':
            raise HttpsClientException('Can not connect to a non-TLS server')

        # Fall back to port 443 when the URL gives none.
        endpoint = (parsed.hostname, parsed.port or 443)

        if self.socket and self.url_info != endpoint:
            self.close()

        self.timeout = timeout
        self.url_info = endpoint

        return self.ensure_connected()

Source File: url.py From Vxscan with Apache License 2.0

5 votes

def diff(urls):
    """De-duplicate pseudo-static URLs by the name of their first parameter.

    A URL is kept when its host has not been seen, when its path has not
    been seen, or when the name of its first query parameter has not been
    seen. Input order is preserved.

    :param urls: iterable of URL strings such as
        ``https://www.xxx.com/?page=1``.
    :returns: list of the URLs that were kept.
    """
    seen_params = set()
    seen_hosts = set()
    seen_paths = set()
    result = []
    # url = 'https://www.xxx.com/?page=1
    # Pseudo-static de-duplication: urlparse yields the query page=1, the
    # text before '=' gives k=page, and only URLs with an unseen k are kept
    for i in urls:
        url = parse.urlparse(i)
        # partition() also copes with a query that has no '=' at all.
        k = url.query.partition('=')[0]
        if url.netloc not in seen_hosts:
            seen_hosts.add(url.netloc)
            seen_paths.add(url.path)
            seen_params.add(k)
            result.append(i)
        elif url.path not in seen_paths:
            # A new path is kept without recording its parameter name.
            seen_paths.add(url.path)
            result.append(i)
        elif k not in seen_params:
            seen_params.add(k)
            result.append(i)

    return result

Source File: url.py From Vxscan with Apache License 2.0

5 votes

def parse_host(url):
    """Return the host part of ``url`` without scheme, path and port."""
    # Derive the host from the url, e.g. http://1.1.1.1:80 gives 1.1.1.1
    has_path = bool(parse.urlparse(url).path)
    host = url.replace('http://', '').replace('https://', '').rstrip('/')
    if has_path:
        # Cut off the path segments that follow the host.
        host = re.sub(r'/\w+', '', host)
    # Drop a ":port" suffix; a host without one is left unchanged.
    return re.sub(r':\d+', '', host)

Source File: crawl.py From Vxscan with Apache License 2.0

5 votes

def dedup_url(urls):
    """Keep one URL per parent directory name.

    The second-to-last ``/``-separated piece of a URL's path is used as its
    key; only the first URL seen for each key is kept. URLs without such a
    piece (no path, a root path, or a path without ``/``) are always kept.
    Exact duplicates are dropped and input order is preserved.

    :param urls: iterable of URL strings.
    :returns: list of the URLs that were kept.
    """
    seen_keys = set()
    okurl = []
    # dict.fromkeys removes exact duplicates while keeping first-seen order,
    # so the result does not depend on set iteration order.
    for i in dict.fromkeys(urls):
        segments = parse.urlparse(i).path.split('/')
        # A path without '/' has a single segment and therefore no key.
        key = segments[-2] if len(segments) >= 2 else ''
        if not key:
            okurl.append(i)
        elif key not in seen_keys:
            seen_keys.add(key)
            okurl.append(i)
    return okurl

Source File: credentials.py From misp42splunk with GNU Lesser General Public License v3.0

5 votes

def __init__(
            self,
            splunkd_uri,
            session_key,
            endpoint
    ):
        """Store the connection details and derive the credential realm.

        The realm combines the base app name with the endpoint's internal
        endpoint, stripped of leading and trailing slashes.

        :param splunkd_uri: splunkd URI; also kept in parsed form.
        :param session_key: session key to store.
        :param endpoint: object exposing ``internal_endpoint``.
        """
        self._splunkd_uri = splunkd_uri
        self._splunkd_info = urlparse(self._splunkd_uri)
        self._session_key = session_key
        self._endpoint = endpoint

        base_app = get_base_app_name()
        endpoint_name = self._endpoint.internal_endpoint.strip('/')
        self._realm = '__REST_CREDENTIAL__#{base_app}#{endpoint}'.format(
            base_app=base_app,
            endpoint=endpoint_name,
        )

Source File: client.py From pywren-ibm-cloud with Apache License 2.0

5 votes

def invoke(self, package, action_name, payload={}, is_ow_action=False, self_invoked=False):
        """
        Invoke an IBM Cloud Function by using new request.

        :param package: package that contains the action.
        :param action_name: name of the action to invoke.
        :param payload: JSON-serialisable request body; it is only read.
        :param is_ow_action: when true the request goes through
            ``self.session``, otherwise through a new HTTPS connection.
        :param self_invoked: set on the automatic retry so that a second
            failure is not retried again.
        :returns: the activation id on a 202 response, ``None`` on a 429
            response or when the request failed twice.
        :raises Exception: for any other response status.
        """
        url = '/'.join([self.endpoint, 'api', 'v1', 'namespaces', self.namespace, 'actions', package, action_name])
        parsed_url = urlparse(url)

        # Bound up front so the error path can tell whether a connection was
        # actually created before closing it.
        conn = None
        try:
            if is_ow_action:
                # NOTE(review): certificate verification is disabled here.
                resp = self.session.post(url, json=payload, verify=False)
                resp_status = resp.status_code
                data = resp.json()
            else:
                # NOTE(review): this context does not verify certificates.
                ctx = ssl._create_unverified_context()
                conn = http.client.HTTPSConnection(parsed_url.netloc, context=ctx)
                conn.request("POST", parsed_url.geturl(),
                             body=json.dumps(payload),
                             headers=self.headers)
                resp = conn.getresponse()
                resp_status = resp.status
                data = json.loads(resp.read().decode("utf-8"))
                conn.close()
        except Exception:
            if conn is not None:
                conn.close()
            if self_invoked:
                return None
            # Retry exactly once.
            return self.invoke(package, action_name, payload, is_ow_action=is_ow_action, self_invoked=True)

        if resp_status == 202 and 'activationId' in data:
            return data["activationId"]
        elif resp_status == 429:
            return None  # "Too many concurrent requests in flight"
        else:
            logger.debug(data)
            if resp_status == 401:
                raise Exception('Unauthorized - Invalid API Key')
            elif resp_status == 404:
                raise Exception('Runtime: {} not deployed'.format(action_name))
            else:
                raise Exception(data['error'])

Python urllib.parse.urlparse() Examples