java.text.Normalizer.Form Java Examples

Source File: Users.java From packagedrone with Eclipse Public License 1.0

7 votes

/**
 * Computes a salted, iterated hash of {@code data} and returns it Base64-encoded.
 * <p>
 * The data is first converted to Unicode NFC form. The first round digests
 * {@code salt + data}; each of the 1000 following rounds digests
 * {@code salt + previous digest}. Both strings are encoded as UTF-8.
 *
 * @param salt the salt to mix into every round; must not be null
 * @param data the text to hash; must not be null
 * @return the Base64 encoding of the final digest
 */
public static String hashIt ( final String salt, String data )
{
    final String canonical = Normalizer.normalize ( data, Form.NFC );

    final byte[] dataBytes = canonical.getBytes ( StandardCharsets.UTF_8 );
    final byte[] saltBytes = salt.getBytes ( StandardCharsets.UTF_8 );

    final MessageDigest md = createDigest ();

    // feeding salt and payload one after the other hashes the same byte
    // sequence as digesting their concatenation
    md.update ( saltBytes );
    byte[] digest = md.digest ( dataBytes );

    for ( int round = 0; round < 1000; round++ )
    {
        md.update ( saltBytes );
        digest = md.digest ( digest );
    }

    return Base64.getEncoder ().encodeToString ( digest );
}

Source File: PicaEncoder.java From metafacture-core with Apache License 2.0

6 votes

/**
 * Appends one subfield to the output being built: the sub-delimiter, the
 * one-character subfield name and the NFD-normalized value.
 *
 * @param name  the subfield name; must be exactly one character long
 * @param value the subfield value
 * @throws FormatException    if the name is not exactly one character long
 *                            or no entity is currently open
 * @throws MissingIdException if a record id check is pending and the value
 *                            differs from the expected id
 */
@Override
public void literal(final String name, final String value) {
    // A subfield has exactly one character or digit as its name, and a
    // literal is only allowed inside an open entity.
    if (name.length() != 1 || !entityOpen) {
        throw new FormatException(name);
    }

    final String decomposedValue = Normalizer.normalize(value, Form.NFD);

    if (idnControlSubField) {
        // it is a 003@ field, the same record id delivered with record should follow;
        // note that the comparison uses the value as received, not the normalized one
        if (!this.id.equals(value)) {
            throw new MissingIdException(value);
        }
        // only one record id will be checked
        idnControlSubField = false;
    }

    builder.append(SUB_DELIMITER);
    builder.append(name);
    builder.append(decomposedValue);
}

Source File: TextSection.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * Joins the given words into one string: the unicode text of every text
 * position of a word is concatenated, words are separated by a single
 * space, and the result is normalized to NFKC.
 *
 * @param words the words, each given as the list of its text positions
 * @return the NFKC-normalized text of all words
 */
String toString(List<List<TextPosition>> words)
{
    StringBuilder text = new StringBuilder();
    // empty before the first word, a blank before every later one
    String separator = "";
    for (List<TextPosition> word : words)
    {
        text.append(separator);
        separator = " ";
        for (TextPosition glyph : word)
        {
            text.append(glyph.getUnicode());
        }
    }
    // cf. http://stackoverflow.com/a/7171932/1729265
    return Normalizer.normalize(text, Form.NFKC);
}

Source File: StringLibExt.java From CloverETL-Engine with GNU Lesser General Public License v2.1

6 votes

/**
 * <p>Tests whether the input string is already in the given Unicode normalization form.</p>
 * 
 * <p>The form is selected by name:
 * <ul>
 *   <li>NFD: canonical Unicode decomposition</li>
 *   <li>NFC: canonical Unicode decomposition followed by canonical composition</li>
 *   <li>NFKD: compatibility decomposition</li>
 *   <li>NFKC: compatibility decomposition followed by canonical composition</li>
 * </ul>
 * </p>
 * <p>A null input is accepted and reported as normalized. The form is validated
 * before the input is looked at, so an invalid form fails even for null input.</p>
 * 
 * @param context function call context; not read by this function.
 * @param input string to check. May be null.
 * @param form name of the normalization form, matched case-insensitively. Cannot be null.
 * 
 * @return true if input is normalized with respect to the selected form or if input is null. False otherwise.
 * 
 * @throws NullPointerException if form is null.
 * @throws IllegalArgumentException if form does not name a supported normalization form.
 * 
 * @see Normalizer#isNormalized(CharSequence, Form)
 */
@TLFunctionAnnotation("Determine if given string is Unicode normalized.")
@CTL2FunctionDeclaration(impl = IsUnicodeNormalizedFunction.class)
public static final boolean isUnicodeNormalized(TLFunctionCallContext context, String input, String form) {

	if (form == null) {
		throw new NullPointerException("Null form is not allowed.");
	}

	final Form requestedForm;
	try {
		requestedForm = Form.valueOf(form.toUpperCase());
	} catch (IllegalArgumentException unknownForm) {
		throw new IllegalArgumentException("Unsupported normalization form '" + form + "'.", unknownForm);
	}

	// null counts as normalized
	return input == null || Normalizer.isNormalized(input, requestedForm);
}

Source File: StringLibExt.java From CloverETL-Engine with GNU Lesser General Public License v2.1

6 votes

/**
 * <p>Converts the input string to the given Unicode normalization form.</p>
 * 
 * <p>The form is selected by name:
 * <ul>
 *   <li>NFD: canonical Unicode decomposition</li>
 *   <li>NFC: canonical Unicode decomposition followed by canonical composition</li>
 *   <li>NFKD: compatibility decomposition</li>
 *   <li>NFKC: compatibility decomposition followed by canonical composition</li>
 * </ul>
 * </p>
 * <p>A null input is passed through as null. The form is validated before the
 * input is looked at, so an invalid form fails even for null input.</p>
 * 
 * @param context function call context; not read by this function.
 * @param input string to normalize. May be null.
 * @param form name of the normalization form, matched case-insensitively. Cannot be null.
 * 
 * @return the normalized string, or null if input is null.
 * 
 * @throws NullPointerException if form is null.
 * @throws IllegalArgumentException if form does not name a supported normalization form.
 * 
 * @see Normalizer#normalize(CharSequence, Form)
 */
@TLFunctionAnnotation("Perform Unicode normalization of given string.")
@CTL2FunctionDeclaration(impl = UnicodeNormalizeFunction.class)
public static final String unicodeNormalize(TLFunctionCallContext context, String input, String form) {

	if (form == null) {
		throw new NullPointerException("Null form is not allowed.");
	}

	final Form requestedForm;
	try {
		requestedForm = Form.valueOf(form.toUpperCase());
	} catch (IllegalArgumentException unknownForm) {
		throw new IllegalArgumentException("Unsupported normalization form '" + form + "'.", unknownForm);
	}

	// null in, null out
	return input == null ? null : Normalizer.normalize(input, requestedForm);
}

Source File: SlugifyUtils.java From voj with GNU General Public License v3.0

6 votes

/**
 * Builds the slug of a string.
 * <p>
 * The input is trimmed, every match of the {@code WHITESPACE} pattern is
 * replaced by a dash, the result is URL-encoded as UTF-8, normalized to NFD
 * and finally lower-cased.
 * 
 * @param str - the string to build the slug for
 * @return the slug of the string, or an empty string if {@code str} is null
 */
public static String getSlug(String str) {
	if ( str == null ) {
		return "";
	}
	
	// Rid of White Spaces
	String noWhiteSpace = WHITESPACE.matcher(str.trim()).replaceAll("-");
	// Processing Non-ASCII Characters
	try {
		noWhiteSpace = URLEncoder.encode(noWhiteSpace, "UTF-8");
	} catch (UnsupportedEncodingException ignored) {
		// Deliberately ignored: UTF-8 is one of the charsets every Java
		// platform is required to support, so this branch is not expected
		// to run. If it ever did, the un-encoded text is used as before.
	}
	// Slugify String
	String normalized = Normalizer.normalize(noWhiteSpace, Form.NFD);
	
	// Lower-case independently of the default locale: under e.g. a Turkish
	// default locale "I" would otherwise become the non-ASCII dotless "ı".
	return normalized.toLowerCase(java.util.Locale.ROOT);
}

Source File: CDataTransferer.java From jdk8u60 with GNU General Public License v2.0

6 votes

/**
 * Translates native clipboard/drag bytes into a Java object.
 * <p>
 * For {@code CF_URL} data requested as a {@link URL}, the bytes are decoded
 * with the default text charset, or with the charset named by the
 * transferable's {@code javaTextEncodingFlavor} data if it offers one, and
 * returned as a URL. For {@code CF_STRING} data the bytes are re-encoded as
 * NFC-normalized UTF-8 before the superclass does the actual translation.
 */
@Override
public Object translateBytes(byte[] bytes, DataFlavor flavor,
                                long format, Transferable transferable) throws IOException {

        final boolean urlRequested = format == CF_URL && URL.class.equals(flavor.getRepresentationClass());
        if (urlRequested) {
            String encoding = getDefaultTextCharset();
            final boolean encodingOffered =
                    transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor);
            if (encodingOffered) {
                try {
                    final byte[] encodingName = (byte[])transferable.getTransferData(javaTextEncodingFlavor);
                    encoding = new String(encodingName, "UTF-8");
                } catch (UnsupportedFlavorException cannotHappen) {
                    // the flavor was reported as supported just above
                }
            }

            final String spec = new String(bytes, encoding);
            return new URL(spec);
        }

        if (format == CF_STRING) {
            final String text = new String(bytes, "UTF8");
            bytes = Normalizer.normalize(text, Form.NFC).getBytes("UTF8");
        }

        return super.translateBytes(bytes, flavor, format, transferable);
}

Source File: MCRUtils.java From mycore with GNU General Public License v3.0

6 votes

/**
 * Hashes a text with the given algorithm, optional salt and iteration count
 * and returns the result via {@code toHexString}.
 * <p>
 * The text is normalized to NFC and encoded as UTF-8. The first digest is
 * taken over the salt (if any) followed by the text; every further iteration
 * digests the previous digest. An iteration count below one behaves like one.
 *
 * @param iterations total number of digest rounds
 * @param salt       bytes to prepend in the first round, or null for none
 * @param text       the text to hash
 * @param algorithm  the {@link MessageDigest} algorithm name
 * @return the final digest as produced by {@code toHexString}
 * @throws NoSuchAlgorithmException if the algorithm is not available
 */
private static String getHash(int iterations, byte[] salt, String text, String algorithm)
    throws NoSuchAlgorithmException {
    // the initial digest below already counts as the first iteration
    final int remainingRounds = Math.max(iterations - 1, 0);

    final MessageDigest md = MessageDigest.getInstance(algorithm);
    final String canonicalText = Normalizer.normalize(text, Form.NFC);
    if (salt != null) {
        md.update(salt);
    }

    byte[] hash = md.digest(canonicalText.getBytes(StandardCharsets.UTF_8));
    for (int round = 0; round < remainingRounds; round++) {
        hash = md.digest(hash);
    }
    return toHexString(hash);
}

Source File: MCRTextNormalizer.java From mycore with GNU General Public License v3.0

6 votes

/**
 * Reduces a text to a simplified form: lower case (default locale), hyphens
 * turned into blanks, combining marks removed, a few digraphs collapsed
 * (ue/oe/ae, ß, ss), punctuation replaced by blanks and whitespace runs
 * collapsed to a single blank.
 *
 * @param text the text to normalize; must not be null
 * @return the normalized text
 */
public static String normalizeText(String text) {
    final String lowered = text.toLowerCase(Locale.getDefault());
    final String dehyphenated = new MCRHyphenNormalizer().normalize(lowered).replace("-", " ");

    // canonical decomposition, then drop the combining marks (accents)
    final String unaccented = Normalizer.normalize(dehyphenated, Form.NFD).replaceAll("\\p{M}", "");

    String result = unaccented.replace("ue", "u");
    result = result.replace("oe", "o");
    result = result.replace("ae", "a");
    result = result.replace("ß", "s");
    result = result.replace("ss", "s");

    // remove all non-alphabetic characters
    // NOTE(review): as written this matches one character outside a-z0-9
    // followed by a whitespace character and a literal ']'. Presumably
    // "[^a-z0-9\\s]" was intended - confirm before changing, since a fix
    // alters the normalized output.
    result = result.replaceAll("[^a-z0-9]\\s]", "");

    // remove all words with fewer than four characters
    // normalizedText = normalizedText.replaceAll("\\b.{1,3}\\b", " ").trim();

    // replace all punctuation by blanks
    result = result.replaceAll("\\p{Punct}", " ").trim();
    // collapse whitespace runs
    return result.replaceAll("\\s+", " ");
}

Source File: CDataTransferer.java From jdk8u-dev-jdk with GNU General Public License v2.0

6 votes

/**
 * Translates native clipboard/drag bytes into a Java object.
 * <p>
 * For {@code CF_URL} data requested as a {@link URL}, the bytes are decoded
 * with the default text charset, or with the charset named by the
 * transferable's {@code javaTextEncodingFlavor} data if it offers one, and
 * returned as a URL. For {@code CF_STRING} data the bytes are re-encoded as
 * NFC-normalized UTF-8 before the superclass does the actual translation.
 */
@Override
public Object translateBytes(byte[] bytes, DataFlavor flavor,
                                long format, Transferable transferable) throws IOException {

        final boolean urlRequested = format == CF_URL && URL.class.equals(flavor.getRepresentationClass());
        if (urlRequested) {
            String encoding = getDefaultTextCharset();
            final boolean encodingOffered =
                    transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor);
            if (encodingOffered) {
                try {
                    final byte[] encodingName = (byte[])transferable.getTransferData(javaTextEncodingFlavor);
                    encoding = new String(encodingName, "UTF-8");
                } catch (UnsupportedFlavorException cannotHappen) {
                    // the flavor was reported as supported just above
                }
            }

            final String spec = new String(bytes, encoding);
            return new URL(spec);
        }

        if (format == CF_STRING) {
            final String text = new String(bytes, "UTF8");
            bytes = Normalizer.normalize(text, Form.NFC).getBytes("UTF8");
        }

        return super.translateBytes(bytes, flavor, format, transferable);
}

Source File: CDataTransferer.java From jdk8u-jdk with GNU General Public License v2.0

6 votes

/**
 * Translates native clipboard/drag bytes into a Java object.
 * <p>
 * For {@code CF_URL} data requested as a {@link URL}, the bytes are decoded
 * with the default text charset, or with the charset named by the
 * transferable's {@code javaTextEncodingFlavor} data if it offers one, and
 * returned as a URL. For {@code CF_STRING} data the bytes are re-encoded as
 * NFC-normalized UTF-8 before the superclass does the actual translation.
 */
@Override
public Object translateBytes(byte[] bytes, DataFlavor flavor,
                                long format, Transferable transferable) throws IOException {

        final boolean urlRequested = format == CF_URL && URL.class.equals(flavor.getRepresentationClass());
        if (urlRequested) {
            String encoding = getDefaultTextCharset();
            final boolean encodingOffered =
                    transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor);
            if (encodingOffered) {
                try {
                    final byte[] encodingName = (byte[])transferable.getTransferData(javaTextEncodingFlavor);
                    encoding = new String(encodingName, "UTF-8");
                } catch (UnsupportedFlavorException cannotHappen) {
                    // the flavor was reported as supported just above
                }
            }

            final String spec = new String(bytes, encoding);
            return new URL(spec);
        }

        if (format == CF_STRING) {
            final String text = new String(bytes, "UTF8");
            bytes = Normalizer.normalize(text, Form.NFC).getBytes("UTF8");
        }

        return super.translateBytes(bytes, flavor, format, transferable);
}

Source File: WiktionaryPage.java From dkpro-jwktl with Apache License 2.0

5 votes

/** Static helper method for normalizing a title: the title is decomposed
 *  (NFD), every non-ASCII character is dropped - which removes the
 *  separated accents as well as characters without an ASCII base - and
 *  the rest is converted into lower case. A null title yields null. */
public static String normalizeTitle(final String title) {
	if (title == null) {
		return null;
	}

	final String decomposed = Normalizer.normalize(title, Form.NFD);
	final String asciiOnly = decomposed.replaceAll("[^\\p{ASCII}]", "");
	return asciiOnly.toLowerCase(Locale.US);
}

Source File: SearchUtil.java From FHIR with Apache License 2.0

5 votes

/**
 * Normalizes a string to be used as a search parameter value. The value is
 * decomposed (NFD), all combining diacritical marks are removed, and the
 * result is transformed to lower case.
 * <p>
 * Lower-casing uses {@code Locale.ROOT}, so the result does not depend on
 * the JVM's default locale (under a Turkish default locale "I" would
 * otherwise become the dotless "ı").
 *
 * @param value the value to normalize; may be null
 * @return the normalized value, or null if {@code value} is null
 */
public static String normalizeForSearch(String value) {

    if (value == null) {
        return null;
    }

    String withoutMarks = Normalizer.normalize(value, Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    return withoutMarks.toLowerCase(java.util.Locale.ROOT);
}

Source File: TagFilter.java From JavaSCR with MIT License

5 votes

/**
 * Normalizes a string to NFKC, replaces every unassigned code point
 * (general category Cn) with U+FFFD and rejects the result if it contains
 * the text {@code <script>}.
 *
 * @param str the string to filter
 * @return the normalized and filtered string
 * @throws IllegalArgumentException if the filtered string contains {@code <script>}
 */
private static String filterString(String str) {
	// normalize first so that compatibility variants of '<' and '>' are
	// already folded when the check below runs
	final String filtered = Normalizer.normalize(str, Form.NFKC).replaceAll("[\\p{Cn}]", "\uFFFD");

	// Validate input
	if (Pattern.compile("<script>").matcher(filtered).find()) {
		throw new IllegalArgumentException("Invalid input");
	}
	return filtered;
}

Source File: TechGalleryUtil.java From tech-gallery with Apache License 2.0

5 votes

/**
 * Turns a name into a slug: matches of {@code WHITESPACE} become
 * underscores, the text is decomposed (NFD), matches of {@code NONLATIN}
 * are removed and the result is lower-cased using the English locale.
 *
 * @param name name to be changed.
 * @return Changed name.
 */
public static String slugify(String name) {
  final String underscored = WHITESPACE.matcher(name).replaceAll("_");
  final String decomposed = Normalizer.normalize(underscored, Form.NFD);
  return NONLATIN.matcher(decomposed).replaceAll("").toLowerCase(Locale.ENGLISH);
}

Source File: StringUtils.java From termsuite-core with Apache License 2.0

5 votes

/**
 * Decomposes the string (NFD) and replaces every match of
 * {@code ASCII_REPLACEMENT} with {@code EMPTY_STRING}. If that leaves
 * nothing of a non-empty input, the input is returned unchanged.
 *
 * @param string the string to process
 * @return the processed string, or the input itself when processing emptied it
 */
public static String replaceAccents(String string) {
	final String decomposed = Normalizer.normalize(string, Form.NFD);
	final String stripped = decomposed.replaceAll(ASCII_REPLACEMENT, EMPTY_STRING);

	//FIXME accent removal fails for russian. This is a quick fix
	final boolean everythingRemoved = stripped.isEmpty() && !string.isEmpty();
	return everythingRemoved ? string : stripped;
}

Source File: ValidateString.java From JavaSCR with MIT License

5 votes

/**
 * Normalizes the input to NFKC and then rejects it if the normalized text
 * contains an angle bracket. Accepted input is reported on standard output.
 *
 * @param input the string to normalize and validate
 * @return the normalized string
 * @throws IllegalStateException if the normalized string contains '<' or '>'
 */
private static String NormalizeThenValidate(String input) {
	// Normalize before validating, so that compatibility variants of the
	// brackets are already folded when the check runs
	final String normalized = Normalizer.normalize(input, Form.NFKC);

	// Validate: check for angle brackets
	if (Pattern.compile("[<>]").matcher(normalized).find()) {
		// Found black listed tag
		throw new IllegalStateException();
	}

	System.out.println("valid input");
	return normalized;
}

Source File: Function.java From aliada-tool with GNU General Public License v3.0

5 votes

/**
 * Normalizes a given string as {@link Function#normalize} but also removing all spaces and punctuation:
 * combining diacritical marks are stripped after decomposition (NFD), every character
 * other than A-Z, a-z and 0-9 is dropped, and the rest is passed to {@code uuid}.
 * A null value yields a freshly generated random UUID string instead.
 * 
 * @param value the string to be normalized.
 * @return the normalized string.
 */
public String normalizeStrong(final String value) {
	if (value == null) {
		return UUID.randomUUID().toString();
	}

	final String unaccented = Normalizer.normalize(value, Form.NFD)
			.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
	final String alphanumeric = unaccented.replaceAll("[^A-Za-z0-9]", "");
	return uuid(alphanumeric);
}

Source File: Strings.java From aliada-tool with GNU General Public License v3.0

5 votes

/**
 * Converts the given value to a string that can be used as local name in URIs.
 * The value is decomposed (NFD); then combining diacritical marks, blanks,
 * the copyright sign, punctuation and the replacement character U+FFFD are
 * removed, in that order, and the result is trimmed.
 * 
 * @param value the source string.
 * @return a string that can be used as local name in URIs, or null if the value is null.
 */
public static String toURILocalName(final String value) {
	if (value == null) {
		return null;
	}

	// patterns to delete, applied one after the other
	final String[] unwanted = {
			"\\p{InCombiningDiacriticalMarks}+",
			" ",
			"©",
			"\\p{Punct}",
			"\\uFFFD"
	};

	String localName = Normalizer.normalize(value, Form.NFD);
	for (final String pattern : unwanted) {
		localName = localName.replaceAll(pattern, "");
	}
	return localName.trim();
}

Source File: NormalizerBrutal.java From mamute with Apache License 2.0

5 votes

/**
 * Builds a slug from the input: every whitespace character becomes a dash,
 * the text is decomposed (NFD), every character that is neither a word
 * character nor a dash is removed, and the result is lower-cased using the
 * English locale.
 *
 * @param input the text to convert; may be null
 * @return the slug, or an empty string for null or empty input
 */
public static String toSlug(String input) {
	if (input == null || input.isEmpty()) {
		return "";
	}

	final String dashed = Pattern.compile("[\\s]").matcher(input).replaceAll("-");
	final String decomposed = Normalizer.normalize(dashed, Form.NFD);
	// also drops the combining marks that the decomposition split off
	final String slug = Pattern.compile("[^\\w-]").matcher(decomposed).replaceAll("");
	return slug.toLowerCase(Locale.ENGLISH);
}

Source File: InputHomogenization.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Returns the normalized form of the input text held by this object.
 * <p>
 * Character by character: characters listed in
 * {@code ignoreCharactersContaining} are kept as they are, digits become
 * "d", and upper-case characters are lower-cased unless
 * {@code preserveCase} is set. The result is decomposed (NFD), a fixed set
 * of punctuation symbols is removed and runs of "!" are collapsed to one.
 *
 * @return the normalized text
 */
public String transform() {
    StringBuilder buffer = new StringBuilder();
    for (int i = 0; i < input.length(); i++) {
        final char current = input.charAt(i);
        final boolean keepVerbatim = ignoreCharactersContaining != null
                        && ignoreCharactersContaining.contains(String.valueOf(current));

        char mapped = current;
        if (!keepVerbatim) {
            if (Character.isDigit(current)) {
                mapped = 'd';
            } else if (Character.isUpperCase(current) && !preserveCase) {
                mapped = Character.toLowerCase(current);
            }
        }
        buffer.append(mapped);
    }

    // symbols that are deleted outright
    final String[] removedSymbols = {
        ".", ",", "\"", "'", "(", ")", "“", "”", "…", "|", "/", "\\", "[", "]", "‘", "’"
    };

    String normalized = Normalizer.normalize(buffer.toString(), Form.NFD);
    for (final String symbol : removedSymbols) {
        normalized = normalized.replace(symbol, "");
    }
    return normalized.replaceAll("[!]+", "!");
}

Source File: Speller.java From morfologik-stemming with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * Tells whether two characters count as equal under the dictionary settings.
 * <p>
 * Characters are equal if they are identical, or if {@code y} is listed as
 * an equivalent of {@code x} in the dictionary metadata. When the dictionary
 * ignores diacritics, the first characters of the NFD decompositions are
 * compared instead, and - if case conversion is enabled - compared once
 * more in lower case when they are letters of different case.
 *
 * @param x the first character
 * @param y the second character
 * @return true if the characters are considered equal
 */
private boolean areEqual(final char x, final char y) {
  if (x == y) {
    return true;
  }

  if (dictionaryMetadata.getEquivalentChars() != null) {
    final List<Character> equivalents = dictionaryMetadata.getEquivalentChars().get(x);
    if (equivalents != null && equivalents.contains(y)) {
      return true;
    }
  }

  if (!dictionaryMetadata.isIgnoringDiacritics()) {
    return false;
  }

  final char xBase = Normalizer.normalize(Character.toString(x), Form.NFD).charAt(0);
  final char yBase = Normalizer.normalize(Character.toString(y), Form.NFD).charAt(0);
  if (xBase == yBase) { // avoid case conversion, if possible
    return true;
  }

  // Case conversion only when needed. String lower-casing is not required
  // because only single characters are checked, so the cheaper
  // per-character methods are enough.
  if (dictionaryMetadata.isConvertingCase()
      && Character.isLetter(xBase)
      && Character.isLowerCase(xBase) != Character.isLowerCase(yBase)) {
    return Character.toLowerCase(xBase) == Character.toLowerCase(yBase);
  }

  // the base characters are known to differ at this point
  return false;
}

Source File: CDataTransferer.java From openjdk-8 with GNU General Public License v2.0

5 votes

/**
 * Translates native data, given either as a byte array or as a stream,
 * into a Java object.
 * <p>
 * HTML text data is wrapped into an {@code HTMLDecodingInputStream}.
 * {@code CF_URL} data requested as a {@link URL} is decoded with the
 * default text charset, or with the charset named by the transferable's
 * {@code javaTextEncodingFlavor} data if it offers one, and returned as a
 * URL. {@code CF_STRING} bytes are re-encoded as NFC-normalized UTF-8.
 * Everything else is left to the superclass.
 */
protected Object translateBytesOrStream(InputStream stream, byte[] bytes, DataFlavor flavor, long format,
                                    Transferable transferable) throws IOException
{
    // 5-28-03 VL: [Radar 3266030]
    // We need to do like Windows does here.
    if (format == CF_HTML && flavor.isFlavorTextType()) {
        InputStream htmlSource = stream;
        if (htmlSource == null) {
            htmlSource = new ByteArrayInputStream(bytes);
            bytes = null;
        }

        stream = new HTMLDecodingInputStream(htmlSource);
    }

    final boolean urlRequested = format == CF_URL && URL.class.equals(flavor.getRepresentationClass());
    if (urlRequested) {
        if (bytes == null) {
            bytes = inputStreamToByteArray(stream);
            stream = null;
        }

        String encoding = getDefaultTextCharset();
        final boolean encodingOffered =
                transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor);
        if (encodingOffered) {
            try {
                final byte[] encodingName = (byte[])transferable.getTransferData(javaTextEncodingFlavor);
                encoding = new String(encodingName, "UTF-8");
            } catch (UnsupportedFlavorException cannotHappen) {
                // the flavor was reported as supported just above
            }
        }

        return new URL(new String(bytes, encoding));
    }

    // NOTE(review): from here on only `bytes` is used; `stream` is not
    // forwarded to the superclass call - confirm that is intended.
    if (format == CF_STRING) {
        final String text = new String(bytes, "UTF8");
        bytes = Normalizer.normalize(text, Form.NFC).getBytes("UTF8");
    }

    return super.translateBytes(bytes, flavor, format, transferable);
}

Source File: NSString.java From CrossMobile with GNU Lesser General Public License v3.0

5 votes

/**
 * Compares the given Strings using the specified options.
 * <p>
 * Null sorts before non-null, and two nulls are the same. With the numeric
 * option the strings are compared by their {@code stringToRelaxedDouble}
 * values. Otherwise they are compared as text, optionally lower-cased
 * first (case-insensitive option) and/or with combining diacritical marks
 * removed (diacritic-insensitive option).
 *
 * @param from                   The first String to be compared.
 * @param with                   The second String to be compared.
 * @param NSStringCompareOptions The option for searching the Strings.
 * @return The result of comparing the two Strings.
 * @see crossmobile.ios.foundation.NSOrdered
 */
@CMSelector(value = "- (NSComparisonResult)compare:(NSString *)aString options:(NSStringCompareOptions)mask", staticMapping = true)
public static int compare(String from, String with, int NSStringCompareOptions) {
    if (from == null || with == null) {
        if (from == with)
            return NSOrdered.Same;
        return with == null ? NSOrdered.Descending : NSOrdered.Ascending;
    }

    final int order;
    if ((NSStringCompareOptions & crossmobile.ios.foundation.NSStringCompareOptions.NSNumericSearch) != 0) {
        final double left = stringToRelaxedDouble(from);
        final double right = stringToRelaxedDouble(with);
        if (left == right)
            order = 0;
        else
            order = left < right ? -1 : 1;
    } else {
        final boolean ignoreCase =
                (NSStringCompareOptions & crossmobile.ios.foundation.NSStringCompareOptions.NSCaseInsensitiveSearch) != 0;
        final boolean ignoreDiacritics =
                (NSStringCompareOptions & crossmobile.ios.foundation.NSStringCompareOptions.NSDiacriticInsensitiveSearch) != 0;

        if (ignoreCase) {
            from = from.toLowerCase();
            with = with.toLowerCase();
        }
        if (ignoreDiacritics) {
            // decompose, then drop the separated combining marks
            from = Normalizer.normalize(from, Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
            with = Normalizer.normalize(with, Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
        }
        order = from.compareTo(with);
    }

    if (order < 0)
        return NSOrdered.Ascending;
    if (order > 0)
        return NSOrdered.Descending;
    return NSOrdered.Same;
}

Source File: PersonNameExtractor.java From yago3 with GNU General Public License v3.0

5 votes

/**
 * Writes an additional English label fact for the entity when removing the
 * combining diacritical marks from the name changes the name. Nothing is
 * written if the name contains no such marks.
 *
 * @param entity the entity the name belongs to
 * @param name   the name to strip of diacritics
 * @param source the source recorded for the written fact
 * @throws IOException declared for the underlying write
 */
private void writeNormalized(String entity, String name, String source) throws IOException {
  final String withoutMarks = Normalizer.normalize(name, Form.NFD)
      .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
  if (withoutMarks.equals(name)) {
    return;
  }

  final Fact label = new Fact(entity, RDFS.label, FactComponent.forStringWithLanguage(withoutMarks, "eng"));
  write(PERSONNAMEHEURISTICS, label, PERSONNAMESOURCES, source, "PersonNameExtractor_normalized");
}

Source File: JabArchivesRipper.java From ripme with MIT License

5 votes

/**
 * Gets a URL/file-safe version of a string: matches of {@code WHITESPACE}
 * become dashes, the text is decomposed (NFD), matches of {@code NONLATIN}
 * are removed and the result is lower-cased using the English locale.
 *
 * @param input the text to convert
 * @return the slug
 */
protected String getSlug(String input) {
    final String dashed = WHITESPACE.matcher(input).replaceAll("-");
    final String decomposed = Normalizer.normalize(dashed, Form.NFD);
    return NONLATIN.matcher(decomposed).replaceAll("").toLowerCase(Locale.ENGLISH);
}

Source File: CDataTransferer.java From openjdk-jdk9 with GNU General Public License v2.0

5 votes

/**
 * Translates native clipboard/drag bytes into a Java object.
 * <p>
 * {@code CF_URL} data requested as a {@link URL} is decoded with the
 * default charset, or with the charset named by the transferable's
 * {@code javaTextEncodingFlavor} data if it offers one; the URL is then
 * extracted from the decoded text. {@code CF_FILE} data for a URI-list
 * flavor is converted into a line-separated string and handled as
 * {@code CF_STRING} by the superclass. Plain {@code CF_STRING} bytes are
 * re-encoded as NFC-normalized UTF-8 first.
 */
@Override
public Object translateBytes(byte[] bytes, DataFlavor flavor,
                             long format, Transferable transferable) throws IOException {

    final boolean urlRequested = format == CF_URL && URL.class.equals(flavor.getRepresentationClass());
    if (urlRequested) {
        String encoding = Charset.defaultCharset().name();
        final boolean encodingOffered =
                transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor);
        if (encodingOffered) {
            try {
                final byte[] encodingName = (byte[]) transferable.getTransferData(javaTextEncodingFlavor);
                encoding = new String(encodingName, StandardCharsets.UTF_8);
            } catch (UnsupportedFlavorException cannotHappen) {
                // the flavor was reported as supported just above
            }
        }

        // macosx pasteboard returns a property list that consists of one URL
        // let's extract it.
        final String propertyList = new String(bytes, encoding);
        return new URL(extractURL(propertyList));
    }

    if (isUriListFlavor(flavor) && format == CF_FILE) {
        // dragQueryFile works fine with files and url,
        // it parses and extracts values from property list.
        // macosx always returns property list for
        // CF_URL and CF_FILE
        final String[] entries = dragQueryFile(bytes);
        if (entries == null) {
            return null;
        }
        final String lineSeparator = System.getProperty("line.separator");
        bytes = String.join(lineSeparator, entries).getBytes();
        // now we extracted uri from xml, now we should treat it as
        // regular string that allows to translate data to target representation
        // class by base method
        format = CF_STRING;
    } else if (format == CF_STRING) {
        final String text = new String(bytes, "UTF8");
        bytes = Normalizer.normalize(text, Form.NFC).getBytes("UTF8");
    }

    return super.translateBytes(bytes, flavor, format, transferable);
}

Source File: Slug.java From sunbird-lms-service with MIT License

5 votes

/**
 * Builds a lower-case slug from the input.
 * <p>
 * The input is trimmed and URL-decoded, optionally transliterated, then
 * matches of {@code WHITESPACE} become dashes, the text is decomposed
 * (NFD), matches of {@code NONLATIN} are removed and dashes are cleaned up
 * by {@code normalizeDashes}. The result is passed to
 * {@code validateResult} together with the original input before it is
 * lower-cased using the English locale.
 *
 * @param input         the text to convert; may be null
 * @param transliterate whether to transliterate the text first
 * @return the slug, or null if the input is null
 */
public static String makeSlug(String input, boolean transliterate) {
  // Validate the input
  if (input == null) {
    ProjectLogger.log("Provided input value is null");
    return null;
  }

  // Remove extra spaces and URL encoding
  String slug = urlDecode(input.trim());
  if (transliterate) {
    // Transliterate & cleanup
    slug = transliterate(slug);
  }

  // Replace all whitespace with dashes
  slug = WHITESPACE.matcher(slug).replaceAll("-");
  // Decompose accented characters so the accents can be dropped below
  slug = Normalizer.normalize(slug, Form.NFD);
  // Remove all non-latin special characters
  slug = NONLATIN.matcher(slug).replaceAll("");
  // Remove any consecutive dashes
  slug = normalizeDashes(slug);

  // Validate before returning
  validateResult(slug, input);
  // Slug is always lowercase
  return slug.toLowerCase(Locale.ENGLISH);
}

Source File: CDataTransferer.java From openjdk-8-source with GNU General Public License v2.0

5 votes

/**
 * Translates native data, given either as a byte array or as a stream,
 * into a Java object.
 * <p>
 * HTML text data is wrapped into an {@code HTMLDecodingInputStream}.
 * {@code CF_URL} data requested as a {@link URL} is decoded with the
 * default text charset, or with the charset named by the transferable's
 * {@code javaTextEncodingFlavor} data if it offers one, and returned as a
 * URL. {@code CF_STRING} bytes are re-encoded as NFC-normalized UTF-8.
 * Everything else is left to the superclass.
 */
protected Object translateBytesOrStream(InputStream stream, byte[] bytes, DataFlavor flavor, long format,
                                    Transferable transferable) throws IOException
{
    // 5-28-03 VL: [Radar 3266030]
    // We need to do like Windows does here.
    if (format == CF_HTML && flavor.isFlavorTextType()) {
        InputStream htmlSource = stream;
        if (htmlSource == null) {
            htmlSource = new ByteArrayInputStream(bytes);
            bytes = null;
        }

        stream = new HTMLDecodingInputStream(htmlSource);
    }

    final boolean urlRequested = format == CF_URL && URL.class.equals(flavor.getRepresentationClass());
    if (urlRequested) {
        if (bytes == null) {
            bytes = inputStreamToByteArray(stream);
            stream = null;
        }

        String encoding = getDefaultTextCharset();
        final boolean encodingOffered =
                transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor);
        if (encodingOffered) {
            try {
                final byte[] encodingName = (byte[])transferable.getTransferData(javaTextEncodingFlavor);
                encoding = new String(encodingName, "UTF-8");
            } catch (UnsupportedFlavorException cannotHappen) {
                // the flavor was reported as supported just above
            }
        }

        return new URL(new String(bytes, encoding));
    }

    // NOTE(review): from here on only `bytes` is used; `stream` is not
    // forwarded to the superclass call - confirm that is intended.
    if (format == CF_STRING) {
        final String text = new String(bytes, "UTF8");
        bytes = Normalizer.normalize(text, Form.NFC).getBytes("UTF8");
    }

    return super.translateBytes(bytes, flavor, format, transferable);
}

Source File: MCRNameMerger.java From mycore with GNU General Public License v3.0

5 votes

/**
 * Reduces a name fragment to a simplified form: lower case (default
 * locale), hyphens turned into blanks, combining marks removed, a few
 * digraphs collapsed (ue/oe/ae, ß, ss), punctuation replaced by blanks,
 * whitespace runs collapsed to a single blank and the result trimmed.
 *
 * @param nameFragment the name fragment to normalize; must not be null
 * @return the normalized name fragment
 */
private String normalize(String nameFragment) {
    final String lowered = nameFragment.toLowerCase(Locale.getDefault());
    final String dehyphenated = new MCRHyphenNormalizer().normalize(lowered).replace("-", " ");

    // canonical decomposition, then remove accents
    final String unaccented = Normalizer.normalize(dehyphenated, Form.NFD).replaceAll("\\p{M}", "");

    String result = unaccented.replace("ue", "u");
    result = result.replace("oe", "o");
    result = result.replace("ae", "a");
    result = result.replace("ß", "s");
    result = result.replace("ss", "s");

    // remove all non-alphabetic characters
    // NOTE(review): as written this matches one character outside a-z0-9
    // followed by a whitespace character and a literal ']'. Presumably
    // "[^a-z0-9\\s]" was intended - confirm before changing, since a fix
    // alters the normalized output.
    result = result.replaceAll("[^a-z0-9]\\s]", "");

    // replace all punctuation by blanks
    result = result.replaceAll("\\p{Punct}", " ").trim();
    // collapse whitespace runs
    result = result.replaceAll("\\s+", " ");
    return result.trim();
}

java.text.Normalizer.Form Java Examples