org.apache.commons.codec.net.URLCodec#encodeUrl

Source File: TestURIBuilder.java From davmail with GNU General Public License v2.0

4 votes

/**
 * Encodes the same path once through {@code URIUtil.encode} and once through
 * {@code URLCodec.encodeUrl}, using a copy of {@code URI.allowed_abs_path} from
 * which '@' has been removed, and asserts that both results are equal.
 * Both encoded forms are also printed to standard output.
 */
public void testEncodeSpecial() {
    final String path = "[email protected]";

    // Copy the stock allowed set, then drop '@' from it so it is no longer "allowed".
    final BitSet allowedWithoutAt = new BitSet(256);
    allowedWithoutAt.or(org.apache.commons.httpclient.URI.allowed_abs_path);
    allowedWithoutAt.clear('@');

    // Encoding via URIUtil.
    final String encoded = URIUtil.encode(path, allowedWithoutAt);
    System.out.println(encoded);

    // Encoding via commons-codec, working on the UTF-8 bytes of the path.
    final byte[] encodedBytes = URLCodec.encodeUrl(allowedWithoutAt, path.getBytes(Consts.UTF_8));
    final String newEncoded = new String(encodedBytes, Consts.UTF_8);
    System.out.println(newEncoded);

    assertEquals(newEncoded, encoded);
}

Source File: HttpUriDissector.java From logparser with Apache License 2.0

4 votes

/**
 * Dissects the URI found in the given input field into its parts.
 * <p>
 * The raw value is first repaired in several passes (escaping of bad characters,
 * normalizing the query string separators, fixing stray '%' and '#' characters)
 * and then handed to {@link URI#create(String)}. Only the parts that were
 * requested (the {@code want*} flags) are added to the parsable.
 *
 * @param parsable  the record to read the input field from and add the dissections to
 * @param inputname the name of the input field
 * @throws DissectionFailure if the repaired value still cannot be parsed as a URI
 */
@Override
public void dissect(final Parsable<?> parsable, final String inputname) throws DissectionFailure {
    final ParsedField field = parsable.getParsableField(INPUT_TYPE, inputname);

    String cleaned = field.getValue().getString();
    if (cleaned == null || cleaned.isEmpty()) {
        return; // Nothing to do here
    }

    // Step 1: escape the characters that make the standard parser fail on 'garbage' URIs.
    // See: https://stackoverflow.com/questions/11038967/brackets-in-a-request-url-are-legal-but-not-in-a-uri-java
    final byte[] escapedBytes = URLCodec.encodeUrl(BAD_URI_CHARS, cleaned.getBytes(UTF_8));
    cleaned = new String(escapedBytes, US_ASCII);

    // Step 2: normalize the query string separators.
    // Real clickstreams contain illegal forms, so there are 3 possible situations:
    //   1) There is no query string at all.
    //   2) The query string starts with a '?'
    //        (optionally followed by more '&' or '?').
    //   3) The query string starts with a '&'. This is invalid but does occur!
    // Something like ?x=x&y=y?z=z is always rewritten to ?&x=x&y=y&z=z
    // so the query string returned by the parser consistently starts with '&'.
    if (cleaned.indexOf('&') != -1 || cleaned.indexOf('?') != -1) {
        cleaned = cleaned.replaceAll("\\?", "&");
        cleaned = cleaned.replaceFirst("&", "?&");
    }

    // Step 3: any '%' that is NOT followed by two 'hex' letters is not a real escape
    // sequence, so it is escaped itself. The replacement is applied twice.
    for (int pass = 0; pass < 2; pass++) {
        cleaned = BAD_EXCAPE_PATTERN.matcher(cleaned).replaceAll("%25$1");
    }

    // Step 4: repair broken HTML encoded chars in URIs with fragments like
    //    /path/?_requestid=1234#x3D;12341234&Referrer&#x3D;blablabla
    // and then decode the HTML entities.
    cleaned = ALMOST_HTML_ENCODED.matcher(cleaned).replaceAll("$1&$2");
    cleaned = StringEscapeUtils.unescapeHtml4(cleaned);

    // Step 5: clean up URIs like
    //    /path/?Referrer=ADV1234#&f=API&subid=#&name=12341234
    cleaned = EQUALS_HASH_PATTERN.matcher(cleaned).replaceAll("=");
    cleaned = HASH_AMP_PATTERN.matcher(cleaned).replaceAll("&");

    // Step 6: as long as there are still multiple '#' present, replace them with '~'.
    Matcher doubleHashMatcher = DOUBLE_HASH_PATTERN.matcher(cleaned);
    while (doubleHashMatcher.find()) {
        cleaned = doubleHashMatcher.replaceAll("~$1#");
        doubleHashMatcher = DOUBLE_HASH_PATTERN.matcher(cleaned);
    }

    // A value starting with '/' has no protocol/host, so a fake prefix is added to be
    // able to parse it. The faked values are never returned.
    final boolean isUrl = cleaned.charAt(0) != '/';
    URI uri;
    try {
        uri = URI.create(isUrl ? cleaned : "dummy-protocol://dummy.host.name" + cleaned);
    } catch (IllegalArgumentException e) {
        throw new DissectionFailure("Failed to parse URI >>" + field.getValue().getString()+"<< because of : " +e.getMessage());
    }

    // The parts that are available for both full URLs and path-only values.
    if (wantQuery) {
        final String rawQuery = uri.getRawQuery();
        parsable.addDissection(inputname, "HTTP.QUERYSTRING", "query", rawQuery == null ? "" : rawQuery);
    }
    if (wantPath) {
        parsable.addDissection(inputname, "HTTP.PATH", "path", uri.getPath());
    }
    if (wantRef) {
        parsable.addDissection(inputname, "HTTP.REF", "ref", uri.getFragment());
    }

    if (!isUrl) {
        return; // The remaining parts would only be the values we faked.
    }

    if (wantProtocol) {
        parsable.addDissection(inputname, "HTTP.PROTOCOL", "protocol", uri.getScheme());
    }
    if (wantUserinfo) {
        parsable.addDissection(inputname, "HTTP.USERINFO", "userinfo", uri.getUserInfo());
    }
    if (wantHost) {
        parsable.addDissection(inputname, "HTTP.HOST", "host", uri.getHost());
    }
    // A port of -1 means no port was present, so nothing is added in that case.
    if (wantPort && uri.getPort() != -1) {
        parsable.addDissection(inputname, "HTTP.PORT", "port", uri.getPort());
    }
}

Source File: URIUtil.java From knopflerfish.org with BSD 3-Clause "New" or "Revised" License

3 votes

/**
 * Escapes and encodes a string: the string is converted to bytes using the
 * given charset, the bytes are passed through {@code URLCodec.encodeUrl}
 * together with the set of characters that must be left unescaped, and the
 * result is returned as an ASCII string.
 *
 * @param unescaped the string to escape
 * @param allowed the characters that are allowed and therefore not escaped
 * @param charset the name of the charset used to turn the string into bytes
 * @return the escaped string
 * @throws URIException declared by the signature; NOTE(review): confirm which
 *         of the called helpers actually raises it
 */
public static String encode(String unescaped, BitSet allowed,
        String charset) throws URIException {
    final byte[] octets = EncodingUtil.getBytes(unescaped, charset);
    final byte[] escaped = URLCodec.encodeUrl(allowed, octets);
    return EncodingUtil.getAsciiString(escaped);
}

Source File: URI.java From knopflerfish.org with BSD 3-Clause "New" or "Revised" License

3 votes

/**
 * Encodes a URI string.
 *
 * The encoding consists of two mappings: the first from the original
 * characters to octets, the second from those octets to URI characters:
 * <p><blockquote><pre>
 *   original character sequence-&gt;octet sequence-&gt;URI character sequence
 * </pre></blockquote><p>
 *
 * An escaped octet is written as a character triplet: the percent character
 * "%" followed by the two hexadecimal digits of the octet code. For example,
 * "%20" is the escaped encoding of the US-ASCII space character.
 * <p>
 * Converting from the local filesystem character set to UTF-8 normally takes
 * two steps. First the local character set is converted to the UCS, which can
 * be done with a mapping table holding the local character set code and the
 * corresponding UCS code. Then the UCS character code is converted to the
 * UTF-8 encoding.
 * <p>
 * Mapping between vendor codepages can be done in a very similar manner.
 * <p>
 * Escape encodings may only be made when a URI is being created from its
 * component parts.
 *
 * @param original the original character sequence, must not be null
 * @param allowed the characters that are allowed within a component, must
 *        not be null
 * @param charset the protocol charset
 * @return the URI character sequence
 * @throws IllegalArgumentException if {@code original} or {@code allowed}
 *         is null
 * @throws URIException declared by the signature; NOTE(review): presumably
 *         for an unsupported character encoding - confirm
 */
protected static char[] encode(String original, BitSet allowed,
        String charset) throws URIException {
    // Reject missing arguments up front; the string is checked before the bitset.
    if (original == null) {
        throw new IllegalArgumentException("Original string may not be null");
    }
    if (allowed == null) {
        throw new IllegalArgumentException("Allowed bitset may not be null");
    }

    // characters -> octets -> escaped octets -> ASCII characters
    final byte[] octets = EncodingUtil.getBytes(original, charset);
    final byte[] escaped = URLCodec.encodeUrl(allowed, octets);
    final String ascii = EncodingUtil.getAsciiString(escaped);
    return ascii.toCharArray();
}

Source File: CodecUtil.java From common_gui_tools with Apache License 2.0

3 votes

/**
 * Encode string for URL.
 * <p>
 * The string is converted to bytes using the given charset and those bytes are
 * passed to {@code URLCodec.encodeUrl}. The encoded bytes are then turned back
 * into a string as US-ASCII, independent of the given charset.
 *
 * @param string  String, may be <code>null</code>
 * @param charSet CharSet used to convert the string into bytes
 * @return <code>String</code> url string, or <code>null</code> if the given string is <code>null</code>
 * @throws UnsupportedEncodingException unsupported encoding exception
 */
public static String encodeURL(String string, String charSet) throws UnsupportedEncodingException {
    if (string == null) {
        return null;
    }
    // No explicit set of safe characters is given (null).
    // NOTE(review): presumably URLCodec then uses its default safe set - confirm.
    final byte[] encoded = URLCodec.encodeUrl(null, string.getBytes(charSet));
    // The encoded bytes are plain ASCII characters. Decoding them with the caller's
    // charset corrupts the result for charsets that are not ASCII compatible
    // (e.g. UTF-16), so they are always decoded as US-ASCII.
    return new String(encoded, "US-ASCII");
}

Source File: URIUtil.java From bintray-client-java with Apache License 2.0

3 votes

/**
 * Escapes and encodes a string: the string is converted to bytes using the
 * given charset, the bytes are passed through {@code URLCodec.encodeUrl}
 * together with the set of characters that must be left unescaped, and the
 * result is returned as an ASCII string.
 *
 * @param unescaped the string to escape
 * @param allowed   the characters that are allowed and therefore not escaped
 * @param charset   the name of the charset used to turn the string into bytes
 * @return the escaped string
 * @throws HttpException declared by the signature; NOTE(review): confirm which
 *                       of the called helpers actually raises it
 */
public static String encode(String unescaped, BitSet allowed,
                            String charset) throws HttpException {
    final byte[] octets = EncodingUtils.getBytes(unescaped, charset);
    final byte[] escaped = URLCodec.encodeUrl(allowed, octets);
    return EncodingUtils.getAsciiString(escaped);
}

Source File: URI.java From bintray-client-java with Apache License 2.0

3 votes

/**
 * Encodes a URI string.
 * <p/>
 * The encoding consists of two mappings: the first from the original
 * characters to octets, the second from those octets to URI characters:
 * <p><blockquote><pre>
 *   original character sequence-&gt;octet sequence-&gt;URI character sequence
 * </pre></blockquote><p>
 * <p/>
 * An escaped octet is written as a character triplet: the percent character
 * "%" followed by the two hexadecimal digits of the octet code. For example,
 * "%20" is the escaped encoding of the US-ASCII space character.
 * <p/>
 * Converting from the local filesystem character set to UTF-8 normally takes
 * two steps. First the local character set is converted to the UCS, which can
 * be done with a mapping table holding the local character set code and the
 * corresponding UCS code. Then the UCS character code is converted to the
 * UTF-8 encoding.
 * <p/>
 * Mapping between vendor codepages can be done in a very similar manner.
 * <p/>
 * Escape encodings may only be made when a URI is being created from its
 * component parts.
 *
 * @param original the original character sequence, must not be null
 * @param allowed  the characters that are allowed within a component, must
 *                 not be null
 * @param charset  the protocol charset
 * @return the URI character sequence
 * @throws IllegalArgumentException if {@code original} or {@code allowed}
 *                                  is null
 * @throws HttpException            declared by the signature; NOTE(review):
 *                                  presumably for an unsupported character
 *                                  encoding - confirm
 */
protected static char[] encode(String original, BitSet allowed,
                               String charset) throws HttpException {
    // Reject missing arguments up front; the string is checked before the bitset.
    if (original == null) {
        throw new IllegalArgumentException("Original string may not be null");
    }
    if (allowed == null) {
        throw new IllegalArgumentException("Allowed bitset may not be null");
    }

    // characters -> octets -> escaped octets -> ASCII characters
    final byte[] octets = EncodingUtils.getBytes(original, charset);
    final byte[] escaped = URLCodec.encodeUrl(allowed, octets);
    final String ascii = EncodingUtils.getAsciiString(escaped);
    return ascii.toCharArray();
}

Source File: UrlUtils.java From htmlunit with Apache License 2.0

2 votes

/**
 * Escapes and encodes the given string. Modelled after the <tt>URIUtil.encode()</tt>
 * method of HttpClient 3.1.
 * <p>
 * The string is converted to bytes with the given charset, run through
 * {@code URLCodec.encodeUrl} and finally handed to {@code encodePercentSign},
 * which produces the returned string.
 *
 * @param unescaped the string to encode
 * @param allowed the characters that are allowed and should not be escaped
 * @param charset the charset used to convert the string into bytes
 * @return the escaped string
 */
private static String encode(final String unescaped, final BitSet allowed, final Charset charset) {
    final byte[] escaped = URLCodec.encodeUrl(allowed, unescaped.getBytes(charset));
    return encodePercentSign(escaped);
}

Source File: UrlUtils.java From HtmlUnit-Android with Apache License 2.0

2 votes

/**
 * Escapes and encodes the given string. Modelled after the <tt>URIUtil.encode()</tt>
 * method of HttpClient 3.1.
 * <p>
 * The string is converted to bytes with the given charset, run through
 * {@code URLCodec.encodeUrl} and finally handed to {@code encodePercentSign},
 * which produces the returned string.
 *
 * @param unescaped the string to encode
 * @param allowed the characters that are allowed and should not be escaped
 * @param charset the charset used to convert the string into bytes
 * @return the escaped string
 */
private static String encode(final String unescaped, final BitSet allowed, final Charset charset) {
    final byte[] rawBytes = unescaped.getBytes(charset);
    return encodePercentSign(URLCodec.encodeUrl(allowed, rawBytes));
}

Java Code Examples for org.apache.commons.codec.net.URLCodec#encodeUrl()