Python split url

Source File: utils.py From pywren-ibm-cloud with Apache License 2.0

7 votes

def split_object_url(obj_url):
    """
    Split an object URL into its storage backend, bucket, prefix and name.

    Accepted shapes are ``<backend>://<bucket>/<key>`` and the scheme-less
    ``<bucket>/<key>``.  The backend ``cos`` is reported as ``ibm_cos``.

    :param obj_url: object URL or ``bucket/key`` path
    :return: tuple ``(sb, bucket, prefix, obj_name)`` where ``sb`` is the
        backend name (``None`` when the URL has no ``://``), ``prefix`` is
        the "directory" part of the key and ``obj_name`` its last component.
        When the key ends with ``/`` the whole key (trailing slash included)
        is returned as ``prefix`` and ``obj_name`` is empty.
    """
    if '://' in obj_url:
        # Split on the first separator only: a key may itself contain '://',
        # which would otherwise break the two-way unpacking with ValueError.
        sb, path = obj_url.split('://', 1)
    else:
        sb = None
        path = obj_url

    sb = 'ibm_cos' if sb == 'cos' else sb

    # Everything up to the first '/' is the bucket, the rest is the key.
    bucket, full_key = path.split('/', 1) if '/' in path else (path, '')

    if full_key.endswith('/'):
        # The key denotes a "directory": no object name.
        prefix = full_key
        obj_name = ''
    elif full_key:
        prefix, obj_name = full_key.rsplit('/', 1) if '/' in full_key else ('', full_key)
    else:
        prefix = ''
        obj_name = ''

    return sb, bucket, prefix, obj_name

Source File: data.py From razzy-spinner with GNU General Public License v3.0

6 votes

def split_resource_url(resource_url):
    """
    Break a resource url of the form "<protocol>:<path>" into its two parts.

    The text before the first ':' is the protocol.  The path is then
    normalised depending on the protocol:

    - ``nltk``: returned untouched;
    - ``file``: a run of leading slashes is collapsed into a single one;
    - anything else: up to two leading slashes are removed.

    A url without any ':' raises ValueError (the split cannot be unpacked).

    >>> windows = sys.platform.startswith('win')
    >>> split_resource_url('nltk:home/nltk')
    ('nltk', 'home/nltk')
    >>> split_resource_url('nltk:/home/nltk')
    ('nltk', '/home/nltk')
    >>> split_resource_url('file:/home/nltk')
    ('file', '/home/nltk')
    >>> split_resource_url('file:///home/nltk')
    ('file', '/home/nltk')
    >>> split_resource_url('file:///C:/home/nltk')
    ('file', '/C:/home/nltk')
    """
    protocol, path_ = resource_url.split(':', 1)
    if protocol == 'file':
        # Keep exactly one leading slash, but only if there was one at all.
        if path_.startswith('/'):
            path_ = '/' + path_.lstrip('/')
    elif protocol != 'nltk':
        # Generic protocols: drop the "//" (or "/") that follows the colon.
        path_ = re.sub(r'^/{0,2}', '', path_)
    return protocol, path_

Source File: globalfunctions.py From aeneas with GNU Affero General Public License v3.0

6 votes

def split_url(url):
    """
    Separate a URL of the form ``base#anchor`` into ``(base, anchor)``.

    When the URL has no ``#`` the result is ``(base, None)``; when the URL
    itself is ``None`` the result is ``(None, None)``.

    Only the first two ``#``-delimited tokens are kept, so
    ``a#b#c`` yields ``(a, b)``.

    :param string url: the url
    :rtype: tuple
    """
    if url is None:
        return (None, None)
    base, hash_found, remainder = url.partition("#")
    if not hash_found:
        return (base, None)
    # Discard anything after a second '#'.
    return (base, remainder.partition("#")[0])

Source File: split_url_events.py From aw-core with Mozilla Public License 2.0

6 votes

def split_url_events(events: List[Event]) -> List[Event]:
    """
    Annotate every event that carries a "url" with the parsed URL components.

    For each such event the following keys are written into ``event.data``:
    "$protocol" (scheme), "$domain" (netloc without a leading "www."),
    "$path", "$params", "$options" (query string) and "$identifier"
    (fragment).  Events without a "url" key are left untouched.

    The events are modified in place and the same list is returned.
    """
    for event in events:
        if "url" not in event.data:
            continue

        parts = urlparse(event.data["url"])

        # Strip a leading "www." so that both spellings share one domain.
        netloc = parts.netloc
        if netloc[:4] == "www.":
            domain = netloc[4:]
        else:
            domain = netloc

        event.data["$protocol"] = parts.scheme
        event.data["$domain"] = domain
        event.data["$path"] = parts.path
        event.data["$params"] = parts.params
        event.data["$options"] = parts.query
        event.data["$identifier"] = parts.fragment
        # TODO: Parse user, port etc. as well
    return events

Source File: misc.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International

5 votes

def split_auth_netloc_from_url(url):
    # type: (str) -> Tuple[str, str, Tuple[str, str]]
    """
    Break a url into the url without auth, its netloc, and the auth pair.

    The work is delegated to ``_transform_url`` with ``_get_netloc`` as the
    netloc transformer; this function only flattens the nested result.

    Returns: (url_without_auth, netloc, (username, password))
    """
    stripped_url, netloc_and_auth = _transform_url(url, _get_netloc)
    netloc, auth = netloc_and_auth
    return stripped_url, netloc, auth

Source File: s3.py From cli with MIT License

5 votes

def split_url(url: urllib.parse.ParseResult) -> Tuple[S3Bucket, str]:
    """
    Splits the given s3:// *url* into a Bucket object and normalized path
    with some sanity checking.

    The bucket name is taken from ``url.netloc`` and the returned prefix is
    ``url.path`` with leading slashes removed.

    Raises UserError when the url has no bucket name, when S3 credentials
    are missing or incomplete, or when the ``head_bucket`` check on the
    bucket fails with a ClientError.
    """
    # Require a bucket name
    if not url.netloc:
        raise UserError("No bucket name specified in url (%s)" % url.geturl())

    # Remove leading slashes from any destination path in order to use it as a
    # prefix for uploaded files.  Internal and trailing slashes are untouched.
    prefix = url.path.lstrip("/")

    try:
        bucket = boto3.resource("s3").Bucket(url.netloc)

    except (NoCredentialsError, PartialCredentialsError) as error:
        raise UserError("Unable to authenticate with S3: %s" % error) from error

    # Find the bucket and ensure we have access and that it already exists so
    # we don't automagically create new buckets.
    try:
        boto3.client("s3").head_bucket(Bucket = bucket.name)

    except ClientError as error:
        # Chain the original ClientError (as the handler above does) so the
        # underlying cause stays visible in tracebacks.
        raise UserError(dedent('''\
            No bucket exists with the name "%s".

            Buckets are not automatically created for safety reasons.
            ''' % bucket.name)) from error

    return bucket, prefix

Source File: browser.py From FODI with GNU General Public License v3.0

5 votes

def split_url(url):
    """
    Return the path and the query parameters of *url* as a dict.

    The result has two keys: 'path' (the URL path) and 'params' (a dict
    built from the parsed query string; for a repeated key the last value
    wins).
    """
    parsed = urlparse(url)
    query_pairs = parse_qsl(parsed.query)
    return {'path': parsed.path, 'params': dict(query_pairs)}

Source File: wrenutil.py From pywren with Apache License 2.0

5 votes

def split_s3_url(s3_url):
    """
    Split an ``s3://bucket/key`` URL into ``(bucket_name, key)``.

    The key is everything after the first '/' that follows the bucket name
    and is empty when there is no such '/'.

    Raises ValueError when *s3_url* does not start with ``s3://``.
    """
    if s3_url[:5] != "s3://":
        raise ValueError("URL {} is not valid".format(s3_url))

    # The first '/' after the scheme separates the bucket from the key.
    bucket_name, _, key = s3_url[5:].partition("/")
    return bucket_name, key

Source File: plugin.py From robotframework-seleniumtestability with Apache License 2.0

5 votes

def split_url_to_host_and_path(url: str) -> dict:
    """
    Returns the given url as a dict with two string entries: "base", the url
    with its path removed, and "path", the path on its own.
    This is useful when constructing requests sessions from urls used within SeleniumLibrary.
    """
    parsed = furl(url)
    # Work on a copy so that removing the path does not affect `parsed`.
    without_path = parsed.copy().remove(path=True)
    return {"base": str(without_path), "path": str(parsed.path)}

Source File: utils.py From oreilly-flask-apis-video with MIT License

5 votes

def split_url(url, method='GET'):
    """Returns the endpoint name and arguments that match a given URL. In
    other words, this is the reverse of Flask's url_for().

    The URL adapter of the current request context is used when one is
    available, otherwise the adapter of the application context.

    Raises RuntimeError when no application context is pushed or when the
    application context has no URL adapter.  Raises ValidationError when the
    URL names a host other than the adapter's server name, or when no route
    matches the URL's path for the given method."""
    appctx = _app_ctx_stack.top
    reqctx = _request_ctx_stack.top
    if appctx is None:
        raise RuntimeError('Attempted to match a URL without the '
                           'application context being pushed. This has to be '
                           'executed when application context is available.')

    if reqctx is not None:
        url_adapter = reqctx.url_adapter
    else:
        url_adapter = appctx.url_adapter
        if url_adapter is None:
            raise RuntimeError('Application was not able to create a URL '
                               'adapter for request independent URL matching. '
                               'You might be able to fix this by setting '
                               'the SERVER_NAME config variable.')
    parsed_url = url_parse(url)
    # Compare by value: an identity test (`is not ''`) against a string
    # literal depends on interning and is not a reliable emptiness check.
    if parsed_url.netloc != '' and \
            parsed_url.netloc != url_adapter.server_name:
        raise ValidationError('Invalid URL: ' + url)
    try:
        result = url_adapter.match(parsed_url.path, method)
    except NotFound:
        raise ValidationError('Invalid URL: ' + url)
    return result

Source File: bulkload_client.py From python-compat-runtime with Apache License 2.0

5 votes

def SplitURL(url):
  """Extracts the network location and path from an HTTP URL.

  Args:
    url: String containing a full URL string (e.g.,
      'http://blah.com:8080/stuff?param=1#foo')

  Returns:
    Tuple (netloc, uri) where:
      netloc: String containing the host/port combination from the URL. The
        port is optional. (e.g., 'blah.com:8080').
      uri: String containing the path component of the URL, without query
        or fragment. (e.g., '/stuff').
  """
  parts = urlparse.urlsplit(url)
  # Scheme, query and fragment are deliberately discarded.
  return parts.netloc, parts.path

Source File: net.py From luci-py with Apache License 2.0

5 votes

def split_server_request_url(url):
  """Returns (scheme://netloc, path;params?query#fragment) for the url.

  The first element is always formatted as '<scheme>://<netloc>', even when
  one of the two is empty; the second is the rest of the url re-assembled
  without scheme and netloc.
  """
  parsed = urllib.parse.urlparse(url)
  urlhost = '%s://%s' % (parsed.scheme, parsed.netloc)
  # Blank out scheme and netloc and re-assemble the remaining components.
  urlpath = urllib.parse.urlunparse(('', '') + tuple(parsed[2:]))
  return urlhost, urlpath

Source File: file_utils.py From flux-ci with MIT License

5 votes

def split_url_path(path):
  """
  Splits a URL path into the repository name and the path inside it.

  The first two '/'-separated components form the repository name; anything
  after them is the path to be listed (empty if there is nothing left).

  # Parameters
  path (str): The path from URL.

  # Return
  tuple (str, str): The repository name and the path to be listed.
  """

  separator = '/'
  # At most two cuts: [first, second, everything else].
  pieces = path.split(separator, 2)
  repo_name = separator.join(pieces[:2])
  remainder = pieces[2] if len(pieces) > 2 else ''
  return repo_name, remainder

Source File: common.py From hivemind with MIT License

5 votes

def split_url(url, allow_empty=False):
    """Validate a post url and split it into an (author, permlink) tuple.

    A falsy url yields None when `allow_empty` is set and fails an assertion
    otherwise. A non-empty url must be a string made of exactly two
    '/'-separated parts; the first is passed through `valid_account` and
    the second through `valid_permlink`."""
    if not url:
        assert allow_empty, 'url must be specified'
        return None
    assert isinstance(url, str), 'url must be a string'

    segments = url.split('/')
    assert len(segments) == 2, 'invalid url parts'

    return (valid_account(segments[0]), valid_permlink(segments[1]))

Source File: censys_search.py From CloudBunny with MIT License

5 votes

def split_url(url):
    """
    Return the network location of an http(s) URL.

    Anything that does not start with ``http:`` or ``https:`` is returned
    unchanged.
    """
    if not re.match(r'http(s?)\:', url):
        return url
    return urlsplit(url).netloc

Source File: http.py From honeything with GNU General Public License v3.0

5 votes

def SplitUrl(url):
  """Splits a url into a named tuple Url(method, host, port, path).

  The port is converted to an int; it is 0 when the url does not carry one.
  """
  Url = collections.namedtuple('Url', ('method host port path'))
  # Peel the url apart from left to right: scheme, then host[:port], then
  # the port itself.
  method, remainder = urllib.splittype(url)
  authority, path = urllib.splithost(remainder)
  host, port = urllib.splitport(authority)
  return Url(method=method, host=host, port=int(port or 0), path=path)

Source File: utils.py From fence with Apache License 2.0

5 votes

def split_url_and_query_params(url):
    """
    Separate *url* into the url without its query string and the parsed
    query parameters.

    Returns a tuple ``(url_without_query, query_params)`` where
    ``query_params`` is the ``parse_qs`` mapping of each parameter name to
    the list of its values. Scheme, netloc, path and fragment are kept.
    """
    parts = urlsplit(url)
    query_params = parse_qs(parts.query)
    # Re-assemble the url with the query component blanked out.
    stripped_url = urlunsplit(
        (parts.scheme, parts.netloc, parts.path, None, parts.fragment)
    )
    return stripped_url, query_params

Source File: boltdepot.py From cqparts with Apache License 2.0

5 votes

def split_url(url):
    """
    Split *url* into its base and a dict of its query parameters.

    The split happens at the last ``?`` in the url.  Parameters are
    separated by ``&`` and each one is cut at its first ``=``, so values may
    themselves contain ``=``; a parameter without ``=`` maps to an empty
    string and empty segments are skipped.  For a repeated key the last
    value wins.

    :param url: url, with or without a query string
    :return: tuple ``(base, params)``; a url without ``?`` is returned
        whole as ``base`` together with an empty dict
    """
    match = re.search(r'^(?P<base>.*)\?(?P<params>.*)$', url, flags=re.I)
    if match is None:
        # No query string at all: nothing to split off.
        return (url, {})

    params = {}
    for pair in match.group('params').split('&'):
        if not pair:
            # Produced by an empty query or a stray '&'.
            continue
        key, _, value = pair.partition('=')
        params[key] = value
    return (match.group('base'), params)

Source File: dsio.py From xcube with MIT License

5 votes

def split_obs_url(path: str) -> Tuple[Optional[str], str]:
    """
    Split *path* into ``(endpoint_url, root)`` if it is a URL, otherwise
    return ``(None, path)``.

    *path* counts as a URL only when it has a scheme, a host and a path and
    its scheme is not ``s3``.  The endpoint is ``scheme://host`` (plus
    ``:port`` if a port is given) and the root is the URL path without its
    leading slash.
    """
    url = urllib3.util.parse_url(path)
    if not all((url.scheme, url.host, url.path)) or url.scheme == 's3':
        return None, path

    endpoint_url = (
        f'{url.scheme}://{url.host}:{url.port}'
        if url.port is not None
        else f'{url.scheme}://{url.host}'
    )
    root = url.path
    # Drop a single leading slash only.
    root = root[1:] if root.startswith('/') else root
    return endpoint_url, root

Source File: util.py From Computable with MIT License

5 votes

def split_url(url):
    """Break a zmq url (tcp://ip:port) into the tuple ('tcp','ip','port').

    The port is returned as a string.  An assertion fails unless the url
    contains exactly one '://' and the part after it exactly one ':'.
    """
    pieces = url.split('://')
    assert len(pieces) == 2, 'Invalid url: %r'%url
    proto, location = pieces
    host_and_port = location.split(':')
    assert len(host_and_port) == 2, 'Invalid url: %r'%url
    host, s_port = host_and_port
    return proto, host, s_port

Source File: tools.py From JobFunnel with MIT License

5 votes

def split_url(url):
    """
    Extract protocol, address and port from an ``http(s)://host:port`` url.

    Returns a dict with the keys 'protocol', 'ip_address' and 'port' (all
    strings), or None when the url does not match.  The ':' after the host
    is mandatory but the digits after it are not, in which case 'port' is
    None.  The host may only consist of letters, digits and dots.
    """
    # groups: 1 = protocol, 2 = host/ip, 3 = optional port, 4 = rest (unused)
    match = re.match(r'^(http[s]?):\/\/([A-Za-z0-9.]+):([0-9]+)?(.*)$', url)
    if match is None:
        return None

    protocol, ip_address, port = match.group(1, 2, 3)
    return {
        'protocol': protocol,
        'ip_address': ip_address,
        'port': port,
    }

Source File: utils.py From URLNet with Apache License 2.0

4 votes

def split_url(line, part):
    """
    Decompose a URL string into primary domain, path, argument (query),
    sub-directory, file name and file extension.

    A leading "http://", "https://", "ftp://" and "www." are stripped first
    (checked in that order, each against the result of the previous check).
    The text before the first '/' is the primary domain.  Only when that
    '/' is neither the first nor the last character is the remainder
    analysed further; otherwise every component but the domain is empty.

    `part` selects what is returned: 'pd' (primary domain), 'path',
    'argument', 'sub_dir', 'filename', 'fe' (file extension) or 'others'
    (path plus '?argument' when an argument exists).  Any other value
    returns the tuple (primarydomain, pathtoken, argument, sub_dir,
    filename, file_extension).
    """
    for prefix in ("http://", "https://", "ftp://", "www."):
        if line.startswith(prefix):
            line = line[len(prefix):]

    # Defaults cover every case without an analysable path.
    primarydomain = line                            # line = "fsdfsdfsdfsdfsd"
    pathtoken = argument = sub_dir = filename = file_extension = ""

    slash_pos = line.find('/')
    last_index = len(line) - 1
    if 0 < slash_pos < last_index:                  # line = "fsdfsdf/sdfsdfsd"
        primarydomain = line[:slash_pos]
        tokens = line[slash_pos + 1:].split('/')
        if len(tokens) > 2 and tokens[-1] == '':
            # Trailing slash: the real last token is the one before it.
            leading, tail = tokens[:-2], tokens[-2]
        else:
            leading, tail = tokens[:-1], tokens[-1]
        # A query is only looked for in the last token, at its first '?'.
        tail_path, _, argument = tail.partition('?')
        pathtoken = "/".join(leading) + "/" + tail_path
        sub_dir, _, filename = pathtoken.rpartition('/')
        stem, dot, extension = filename.rpartition('.')
        if dot:
            filename, file_extension = stem, extension
    elif slash_pos == 0:                            # line = "/fsdfsdfsdfsdfsd"
        primarydomain = line[1:]
    elif slash_pos == last_index:                   # line = "fsdfsdfsdfsdfsd/"
        primarydomain = line[:-1]

    single_parts = (
        ('pd', primarydomain),
        ('path', pathtoken),
        ('argument', argument),
        ('sub_dir', sub_dir),
        ('filename', filename),
        ('fe', file_extension),
    )
    for key, value in single_parts:
        if part == key:
            return value
    if part == 'others':
        return pathtoken + '?' + argument if argument else pathtoken
    return primarydomain, pathtoken, argument, sub_dir, filename, file_extension