java.text.Normalizer.Form Java Examples
The following examples show how to use
java.text.Normalizer.Form.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Users.java From packagedrone with Eclipse Public License 1.0 | 7 votes |
/**
 * Derives a Base64-encoded, salted and iterated digest of {@code data}.
 *
 * <p>The data is first normalized to NFC so that canonically equivalent
 * inputs hash identically. The initial digest is taken over
 * {@code salt || data}; it is then re-hashed 1000 times, each round over
 * {@code salt || previousDigest}.</p>
 *
 * @param salt the salt, encoded as UTF-8 and prepended in every round
 * @param data the text to hash
 * @return the Base64 encoding of the final digest
 */
public static String hashIt(final String salt, String data) {
    final String canonical = Normalizer.normalize(data, Form.NFC);
    final byte[] payload = canonical.getBytes(StandardCharsets.UTF_8);
    final byte[] saltBytes = salt.getBytes(StandardCharsets.UTF_8);

    final MessageDigest md = createDigest();

    // Feeding salt and payload separately hashes the same byte sequence
    // as digesting their concatenation.
    md.update(saltBytes);
    byte[] digest = md.digest(payload);

    int remaining = 1000;
    while (remaining-- > 0) {
        md.update(saltBytes);
        digest = md.digest(digest);
    }

    return Base64.getEncoder().encodeToString(digest);
}
Example #2
Source File: PicaEncoder.java From metafacture-core with Apache License 2.0 | 6 votes |
@Override public void literal(final String name, final String value) { //A Subfield has one character or digit exactly. if (name.length() != 1) { throw new FormatException(name); } if (!entityOpen) { throw new FormatException(name); //new exceptions definition for literal out of entity } final String valueNew = Normalizer.normalize(value, Form.NFD); if (idnControlSubField) { // it is a 003@ field, the same record id delivered with record should follow if (!this.id.equals(value)) { throw new MissingIdException(value); } idnControlSubField = false; //only one record Id will be checked. } builder.append(SUB_DELIMITER); builder.append(name); builder.append(valueNew); }
Example #3
Source File: TextSection.java From testarea-pdfbox2 with Apache License 2.0 | 6 votes |
String toString(List<List<TextPosition>> words) { StringBuilder stringBuilder = new StringBuilder(); boolean first = true; for (List<TextPosition> word : words) { if (first) first = false; else stringBuilder.append(' '); for (TextPosition textPosition : word) { stringBuilder.append(textPosition.getUnicode()); } } // cf. http://stackoverflow.com/a/7171932/1729265 return Normalizer.normalize(stringBuilder, Form.NFKC); }
Example #4
Source File: StringLibExt.java From CloverETL-Engine with GNU Lesser General Public License v2.1 | 6 votes |
/**
 * <p>Tells whether the input string is already in the requested Unicode
 * normalization form.</p>
 *
 * <p>Supported forms:
 * <ul>
 *   <li>NFD: canonical decomposition</li>
 *   <li>NFC: canonical decomposition followed by canonical composition</li>
 *   <li>NFKD: compatibility decomposition</li>
 *   <li>NFKC: compatibility decomposition followed by canonical composition</li>
 * </ul>
 * </p>
 * <p>The form name is validated before the input is inspected, so an invalid
 * form is rejected even when the input is null.</p>
 *
 * @param context function call context.
 * @param input string to check. May be null.
 * @param form name of the normalization form, case insensitive. Cannot be null.
 *
 * @return true if the input is normalized with respect to the selected form
 *         or if the input is null; false otherwise.
 * @throws NullPointerException if {@code form} is null.
 * @throws IllegalArgumentException if {@code form} does not name a supported form.
 *
 * @see Normalizer#isNormalized(CharSequence, Form)
 */
@TLFunctionAnnotation("Determine if given string is Unicode normalized.")
@CTL2FunctionDeclaration(impl = IsUnicodeNormalizedFunction.class)
public static final boolean isUnicodeNormalized(TLFunctionCallContext context, String input, String form) {
    if (form == null) {
        throw new NullPointerException("Null form is not allowed.");
    }

    final Form requestedForm;
    try {
        requestedForm = Form.valueOf(form.toUpperCase());
    } catch (IllegalArgumentException unknownForm) {
        throw new IllegalArgumentException("Unsupported normalization form '" + form + "'.", unknownForm);
    }

    // A null input is reported as normalized.
    return input == null || Normalizer.isNormalized(input, requestedForm);
}
Example #5
Source File: StringLibExt.java From CloverETL-Engine with GNU Lesser General Public License v2.1 | 6 votes |
/**
 * <p>Converts the input string to the requested Unicode normalization form.</p>
 *
 * <p>Supported forms:
 * <ul>
 *   <li>NFD: canonical decomposition</li>
 *   <li>NFC: canonical decomposition followed by canonical composition</li>
 *   <li>NFKD: compatibility decomposition</li>
 *   <li>NFKC: compatibility decomposition followed by canonical composition</li>
 * </ul>
 * </p>
 * <p>The form name is validated before the input is inspected, so an invalid
 * form is rejected even when the input is null.</p>
 *
 * @param context function call context.
 * @param input string to normalize. May be null.
 * @param form name of the normalization form, case insensitive. Cannot be null.
 *
 * @return the normalized string, or null if the input is null.
 * @throws NullPointerException if {@code form} is null.
 * @throws IllegalArgumentException if {@code form} does not name a supported form.
 *
 * @see Normalizer#normalize(CharSequence, Form)
 */
@TLFunctionAnnotation("Perform Unicode normalization of given string.")
@CTL2FunctionDeclaration(impl = UnicodeNormalizeFunction.class)
public static final String unicodeNormalize(TLFunctionCallContext context, String input, String form) {
    if (form == null) {
        throw new NullPointerException("Null form is not allowed.");
    }

    final Form requestedForm;
    try {
        requestedForm = Form.valueOf(form.toUpperCase());
    } catch (IllegalArgumentException unknownForm) {
        throw new IllegalArgumentException("Unsupported normalization form '" + form + "'.", unknownForm);
    }

    // Null is passed straight through.
    return input == null ? null : Normalizer.normalize(input, requestedForm);
}
Example #6
Source File: SlugifyUtils.java From voj with GNU General Public License v3.0 | 6 votes |
/** * 获取字符串的Slug. * @param str - 待获取Slug的字符串 * @return 字符串对应的Slug */ public static String getSlug(String str) { if ( str == null ) { return ""; } // Rid of White Spaces String noWhiteSpace = WHITESPACE.matcher(str.trim()).replaceAll("-"); // Processing Non-ASCII Characters try { noWhiteSpace = URLEncoder.encode(noWhiteSpace, "UTF-8"); } catch (UnsupportedEncodingException e) { // Never reach here } // Slugify String String normalized = Normalizer.normalize(noWhiteSpace, Form.NFD); return normalized.toLowerCase(); }
Example #7
Source File: CDataTransferer.java From jdk8u60 with GNU General Public License v2.0 | 6 votes |
/**
 * Translates raw transfer bytes into an object of the requested flavor.
 *
 * <p>{@code CF_URL} data requested as {@link URL} is decoded with the charset
 * advertised by the transferable (falling back to the default text charset)
 * and returned directly. {@code CF_STRING} data is re-encoded as
 * NFC-normalized UTF-8 before being delegated to the superclass.</p>
 *
 * @throws IOException if decoding or the superclass translation fails
 */
@Override
public Object translateBytes(byte[] bytes, DataFlavor flavor, long format,
        Transferable transferable) throws IOException {
    final boolean urlRequested =
            format == CF_URL && URL.class.equals(flavor.getRepresentationClass());
    if (urlRequested) {
        String charset = getDefaultTextCharset();
        final boolean encodingAdvertised =
                transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor);
        if (encodingAdvertised) {
            try {
                final byte[] rawCharsetName =
                        (byte[]) transferable.getTransferData(javaTextEncodingFlavor);
                charset = new String(rawCharsetName, "UTF-8");
            } catch (UnsupportedFlavorException cannotHappen) {
                // Support was checked just above; the default charset stays in effect.
            }
        }
        return new URL(new String(bytes, charset));
    }

    if (format == CF_STRING) {
        final String decoded = new String(bytes, "UTF8");
        bytes = Normalizer.normalize(decoded, Form.NFC).getBytes("UTF8");
    }

    return super.translateBytes(bytes, flavor, format, transferable);
}
Example #8
Source File: MCRUtils.java From mycore with GNU General Public License v3.0 | 6 votes |
/**
 * Computes an optionally salted, iterated hash of {@code text} and renders it
 * via {@code toHexString}.
 *
 * <p>The text is NFC-normalized and encoded as UTF-8. The first digest covers
 * the salt (if any) followed by the text; every further round digests the
 * previous round's output. One round is always performed, so values of
 * {@code iterations} below 1 behave like 1.</p>
 *
 * @param iterations total number of digest rounds requested
 * @param salt       bytes fed into the first round before the text; may be null
 * @param text       the text to hash
 * @param algorithm  the {@link MessageDigest} algorithm name
 * @return the result of {@code toHexString} applied to the final digest
 * @throws NoSuchAlgorithmException if {@code algorithm} is not available
 */
private static String getHash(int iterations, byte[] salt, String text, String algorithm)
    throws NoSuchAlgorithmException {
    // The first round is unconditional; only the rest are counted.
    int extraRounds = Math.max(iterations - 1, 0);

    final MessageDigest md = MessageDigest.getInstance(algorithm);
    final String canonicalText = Normalizer.normalize(text, Form.NFC);

    if (salt != null) {
        md.update(salt);
    }
    byte[] hash = md.digest(canonicalText.getBytes(StandardCharsets.UTF_8));

    while (extraRounds-- > 0) {
        hash = md.digest(hash);
    }
    return toHexString(hash);
}
Example #9
Source File: MCRTextNormalizer.java From mycore with GNU General Public License v3.0 | 6 votes |
public static String normalizeText(String text) { String normalizedText = text.toLowerCase(Locale.getDefault()); normalizedText = new MCRHyphenNormalizer().normalize(normalizedText).replace("-", " "); //canonical decomposition, remove accents normalizedText = Normalizer.normalize(normalizedText, Form.NFD).replaceAll("\\p{M}", ""); normalizedText = normalizedText.replace("ue", "u") .replace("oe", "o").replace("ae", "a") .replace("ß", "s").replace("ss", "s"); //remove all non-alphabetic characters normalizedText = normalizedText.replaceAll("[^a-z0-9]\\s]", ""); // remove all words with fewer than four characters // normalizedText = normalizedText.replaceAll("\\b.{1,3}\\b", " ").trim(); normalizedText = normalizedText.replaceAll("\\p{Punct}", " ").trim(); // remove all punctuation normalizedText = normalizedText.replaceAll("\\s+", " "); // normalize whitespace return normalizedText; }
Example #10
Source File: CDataTransferer.java From jdk8u-dev-jdk with GNU General Public License v2.0 | 6 votes |
/**
 * Translates raw transfer bytes into an object of the requested flavor.
 *
 * <p>{@code CF_URL} data requested as {@link URL} is decoded with the charset
 * advertised by the transferable (falling back to the default text charset)
 * and returned directly. {@code CF_STRING} data is re-encoded as
 * NFC-normalized UTF-8 before being delegated to the superclass.</p>
 *
 * @throws IOException if decoding or the superclass translation fails
 */
@Override
public Object translateBytes(byte[] bytes, DataFlavor flavor, long format,
        Transferable transferable) throws IOException {
    final boolean urlRequested =
            format == CF_URL && URL.class.equals(flavor.getRepresentationClass());
    if (urlRequested) {
        String charset = getDefaultTextCharset();
        final boolean encodingAdvertised =
                transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor);
        if (encodingAdvertised) {
            try {
                final byte[] rawCharsetName =
                        (byte[]) transferable.getTransferData(javaTextEncodingFlavor);
                charset = new String(rawCharsetName, "UTF-8");
            } catch (UnsupportedFlavorException cannotHappen) {
                // Support was checked just above; the default charset stays in effect.
            }
        }
        return new URL(new String(bytes, charset));
    }

    if (format == CF_STRING) {
        final String decoded = new String(bytes, "UTF8");
        bytes = Normalizer.normalize(decoded, Form.NFC).getBytes("UTF8");
    }

    return super.translateBytes(bytes, flavor, format, transferable);
}
Example #11
Source File: CDataTransferer.java From jdk8u-jdk with GNU General Public License v2.0 | 6 votes |
/**
 * Translates raw transfer bytes into an object of the requested flavor.
 *
 * <p>{@code CF_URL} data requested as {@link URL} is decoded with the charset
 * advertised by the transferable (falling back to the default text charset)
 * and returned directly. {@code CF_STRING} data is re-encoded as
 * NFC-normalized UTF-8 before being delegated to the superclass.</p>
 *
 * @throws IOException if decoding or the superclass translation fails
 */
@Override
public Object translateBytes(byte[] bytes, DataFlavor flavor, long format,
        Transferable transferable) throws IOException {
    final boolean urlRequested =
            format == CF_URL && URL.class.equals(flavor.getRepresentationClass());
    if (urlRequested) {
        String charset = getDefaultTextCharset();
        final boolean encodingAdvertised =
                transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor);
        if (encodingAdvertised) {
            try {
                final byte[] rawCharsetName =
                        (byte[]) transferable.getTransferData(javaTextEncodingFlavor);
                charset = new String(rawCharsetName, "UTF-8");
            } catch (UnsupportedFlavorException cannotHappen) {
                // Support was checked just above; the default charset stays in effect.
            }
        }
        return new URL(new String(bytes, charset));
    }

    if (format == CF_STRING) {
        final String decoded = new String(bytes, "UTF8");
        bytes = Normalizer.normalize(decoded, Form.NFC).getBytes("UTF8");
    }

    return super.translateBytes(bytes, flavor, format, transferable);
}
Example #12
Source File: WiktionaryPage.java From dkpro-jwktl with Apache License 2.0 | 5 votes |
/**
 * Static helper that normalizes a title: the title is decomposed (NFD),
 * every character outside the ASCII range is dropped (which removes the
 * accents split off by the decomposition), and the rest is lower-cased
 * using {@link Locale#US}.
 *
 * @param title the title to normalize; may be null
 * @return the normalized title, or null if {@code title} is null
 */
public static String normalizeTitle(final String title) {
    if (title == null) {
        return null;
    }
    final String decomposed = Normalizer.normalize(title, Form.NFD);
    final String asciiOnly = decomposed.replaceAll("[^\\p{ASCII}]", "");
    return asciiOnly.toLowerCase(Locale.US);
}
Example #13
Source File: SearchUtil.java From FHIR with Apache License 2.0 | 5 votes |
/**
 * Normalizes a string to be used as a search parameter value. The value is
 * decomposed (NFD), all combining diacritical marks are removed, and the
 * result is transformed to lower case.
 *
 * <p>Lower-casing uses {@link java.util.Locale#ROOT} so that the result does
 * not depend on the JVM's default locale. With a Turkish default locale, for
 * example, a locale-sensitive conversion would turn "I" into a dotless "ı",
 * so the same value would normalize differently on different servers.</p>
 *
 * @param value the value to normalize; may be null
 * @return the normalized value, or null if {@code value} is null
 */
public static String normalizeForSearch(String value) {
    if (value == null) {
        return null;
    }
    final String withoutMarks = Normalizer.normalize(value, Form.NFD)
            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    return withoutMarks.toLowerCase(java.util.Locale.ROOT);
}
Example #14
Source File: TagFilter.java From JavaSCR with MIT License | 5 votes |
private static String filterString(String str) { String s = Normalizer.normalize(str, Form.NFKC); // Replaces all noncharacter code points with Unicode U+FFFD s = s.replaceAll("[\\p{Cn}]", "\uFFFD"); // Validate input Pattern pattern = Pattern.compile("<script>"); Matcher matcher = pattern.matcher(s); if (matcher.find()) { throw new IllegalArgumentException("Invalid input"); } return s; }
Example #15
Source File: TechGalleryUtil.java From tech-gallery with Apache License 2.0 | 5 votes |
/**
 * Turns a name into a slug.
 *
 * <p>Every match of {@code WHITESPACE} becomes an underscore, the text is
 * decomposed (NFD), every match of {@code NONLATIN} is removed, and the
 * result is lower-cased using {@link Locale#ENGLISH}.</p>
 *
 * @param name name to be changed.
 * @return the slug for the name.
 */
public static String slugify(String name) {
    final String underscored = WHITESPACE.matcher(name).replaceAll("_");
    final String decomposed = Normalizer.normalize(underscored, Form.NFD);
    return NONLATIN.matcher(decomposed).replaceAll("").toLowerCase(Locale.ENGLISH);
}
Example #16
Source File: StringUtils.java From termsuite-core with Apache License 2.0 | 5 votes |
public static String replaceAccents(String string) { String withoutAccent = Normalizer .normalize(string, Form.NFD) .replaceAll(ASCII_REPLACEMENT, EMPTY_STRING); //FIXME accent removal fails for russian. This is a quick fix if(withoutAccent.isEmpty() && !string.isEmpty()) withoutAccent = string; return withoutAccent; }
Example #17
Source File: ValidateString.java From JavaSCR with MIT License | 5 votes |
private static String NormalizeThenValidate(String input) { // Normalize String s = Normalizer.normalize(input, Form.NFKC); // Validate Pattern pattern = Pattern.compile("[<>]"); // Check for angle brackets Matcher matcher = pattern.matcher(s); if (matcher.find()) { // Found black listed tag throw new IllegalStateException(); } System.out.println("valid input"); return s; }
Example #18
Source File: Function.java From aliada-tool with GNU General Public License v3.0 | 5 votes |
/**
 * Normalizes a given string as {@link Function#normalize} but also removes
 * everything that is not an ASCII letter or digit (spaces and punctuation
 * included) before passing the result to {@code uuid}.
 *
 * @param value the string to be normalized.
 * @return the result of {@code uuid} for the cleaned string, or a random
 *         UUID string if {@code value} is null.
 */
public String normalizeStrong(final String value) {
    if (value == null) {
        return UUID.randomUUID().toString();
    }
    final String withoutMarks = Normalizer.normalize(value, Form.NFD)
            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    final String alphanumericOnly = withoutMarks.replaceAll("[^A-Za-z0-9]", "");
    return uuid(alphanumericOnly);
}
Example #19
Source File: Strings.java From aliada-tool with GNU General Public License v3.0 | 5 votes |
/**
 * Converts the given value to a string that can be used as local name in URIs.
 * Diacritics are stripped (NFD decomposition, then removal of the combining
 * marks), and blanks, the copyright sign, ASCII punctuation and the Unicode
 * replacement character U+FFFD are removed.
 *
 * @param value the source string.
 * @return a string that can be used as local name in URIs, or null if
 *         {@code value} is null.
 */
public static String toURILocalName(final String value) {
    if (value == null) {
        return null;
    }
    String localName = Normalizer.normalize(value, Form.NFD);
    localName = localName.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    localName = localName.replace(" ", "");
    localName = localName.replace("©", "");
    localName = localName.replaceAll("\\p{Punct}", "");
    localName = localName.replaceAll("\\uFFFD", "");
    return localName.trim();
}
Example #20
Source File: NormalizerBrutal.java From mamute with Apache License 2.0 | 5 votes |
/** Matches every character that is neither a word character ({@code \w}) nor a hyphen. */
private static final Pattern SLUG_NON_WORD = Pattern.compile("[^\\w-]");

/** Matches a single whitespace character. */
private static final Pattern SLUG_WHITESPACE = Pattern.compile("[\\s]");

/**
 * Builds a URL slug from the input.
 *
 * <p>Each whitespace character becomes a hyphen, the text is decomposed
 * (NFD) so that accents split off from their base letters, every character
 * that is not a word character or hyphen is dropped, and the result is
 * lower-cased using {@link Locale#ENGLISH}.</p>
 *
 * <p>The patterns are compiled once as constants instead of on every call.</p>
 *
 * @param input the text to slugify; may be null
 * @return the slug, or an empty string if {@code input} is null or empty
 */
public static String toSlug(String input) {
    if (input == null || input.isEmpty()) {
        return "";
    }
    final String hyphenated = SLUG_WHITESPACE.matcher(input).replaceAll("-");
    final String decomposed = Normalizer.normalize(hyphenated, Form.NFD);
    final String slug = SLUG_NON_WORD.matcher(decomposed).replaceAll("");
    return slug.toLowerCase(Locale.ENGLISH);
}
Example #21
Source File: InputHomogenization.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Returns the homogenized form of the text held in {@code input}.
 *
 * <p>Characters found in {@code ignoreCharactersContaining} are copied
 * unchanged; every other digit becomes the letter {@code d}, and upper-case
 * letters are lower-cased unless {@code preserveCase} is set. The result is
 * decomposed (NFD), a fixed set of punctuation characters is removed, and
 * runs of exclamation marks are collapsed into one.</p>
 *
 * @return the normalized text
 */
public String transform() {
    final StringBuilder homogenized = new StringBuilder();
    for (int pos = 0; pos < input.length(); pos++) {
        final char c = input.charAt(pos);
        if (ignoreCharactersContaining != null
                && ignoreCharactersContaining.contains(String.valueOf(c))) {
            homogenized.append(c);
        } else if (Character.isDigit(c)) {
            homogenized.append("d");
        } else if (Character.isUpperCase(c) && !preserveCase) {
            homogenized.append(Character.toLowerCase(c));
        } else {
            homogenized.append(c);
        }
    }

    String normalized = Normalizer.normalize(homogenized.toString(), Form.NFD);

    // Punctuation that is dropped entirely, as literal strings.
    final String[] removed = {
        ".", ",", "\"", "'", "(", ")", "“", "”", "…", "|", "/", "\\", "[", "]", "‘", "’"
    };
    for (String token : removed) {
        normalized = normalized.replace(token, "");
    }

    // Collapse "!!!" into a single "!".
    return normalized.replaceAll("[!]+", "!");
}
Example #22
Source File: Speller.java From morfologik-stemming with BSD 3-Clause "New" or "Revised" License | 5 votes |
private boolean areEqual(final char x, final char y) { if (x == y) { return true; } if (dictionaryMetadata.getEquivalentChars() != null) { List<Character> chars = dictionaryMetadata.getEquivalentChars().get(x); if (chars != null && chars.contains(y)) { return true; } } if (dictionaryMetadata.isIgnoringDiacritics()) { String xn = Normalizer.normalize(Character.toString(x), Form.NFD); String yn = Normalizer.normalize(Character.toString(y), Form.NFD); if (xn.charAt(0) == yn.charAt(0)) { // avoid case conversion, if possible return true; } if (dictionaryMetadata.isConvertingCase()) { //again case conversion only when needed -- we // do not need String.lowercase because we only check // single characters, so a cheaper method is enough if (Character.isLetter(xn.charAt(0))) { boolean testNeeded = Character.isLowerCase(xn.charAt(0)) != Character.isLowerCase(yn.charAt(0)); if (testNeeded) { return Character.toLowerCase(xn.charAt(0)) == Character.toLowerCase(yn.charAt(0)); } } } return xn.charAt(0) == yn.charAt(0); } return false; }
Example #23
Source File: CDataTransferer.java From openjdk-8 with GNU General Public License v2.0 | 5 votes |
protected Object translateBytesOrStream(InputStream stream, byte[] bytes, DataFlavor flavor, long format, Transferable transferable) throws IOException { // 5-28-03 VL: [Radar 3266030] // We need to do like Windows does here. if (format == CF_HTML && flavor.isFlavorTextType()) { if (stream == null) { stream = new ByteArrayInputStream(bytes); bytes = null; } stream = new HTMLDecodingInputStream(stream); } if (format == CF_URL && URL.class.equals(flavor.getRepresentationClass())) { if (bytes == null) { bytes = inputStreamToByteArray(stream); stream = null; } String charset = getDefaultTextCharset(); if (transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor)) { try { charset = new String((byte[])transferable.getTransferData(javaTextEncodingFlavor), "UTF-8"); } catch (UnsupportedFlavorException cannotHappen) { } } return new URL(new String(bytes, charset)); } if (format == CF_STRING) { bytes = Normalizer.normalize(new String(bytes, "UTF8"), Form.NFC).getBytes("UTF8"); } return super.translateBytes(bytes, flavor, format, transferable); }
Example #24
Source File: NSString.java From CrossMobile with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Compares the given Strings using the specified options.
 *
 * <p>Null sorts before any non-null string, and two nulls are equal. With
 * the numeric option, both strings are converted via
 * {@code stringToRelaxedDouble} and compared as numbers; the other options
 * are then ignored. Otherwise the case-insensitive option lower-cases both
 * strings, the diacritic-insensitive option strips combining diacritical
 * marks after NFD decomposition, and the strings are compared
 * lexicographically.</p>
 *
 * @param from                   The first String to be compared.
 * @param with                   The second String to be compared.
 * @param NSStringCompareOptions The option mask for comparing the Strings.
 * @return The result of comparing the two Strings.
 * @see crossmobile.ios.foundation.NSOrdered
 */
@CMSelector(value = "- (NSComparisonResult)compare:(NSString *)aString options:(NSStringCompareOptions)mask", staticMapping = true)
public static int compare(String from, String with, int NSStringCompareOptions) {
    if (with == null) {
        return from == null ? NSOrdered.Same : NSOrdered.Descending;
    }
    if (from == null) {
        return NSOrdered.Ascending;
    }

    final int order;
    final boolean numeric =
            (NSStringCompareOptions & crossmobile.ios.foundation.NSStringCompareOptions.NSNumericSearch) != 0;
    if (numeric) {
        final double fromD = stringToRelaxedDouble(from);
        final double withD = stringToRelaxedDouble(with);
        if (fromD == withD) {
            order = 0;
        } else if (fromD < withD) {
            order = -1;
        } else {
            order = 1;
        }
    } else {
        final boolean ignoreCase =
                (NSStringCompareOptions & crossmobile.ios.foundation.NSStringCompareOptions.NSCaseInsensitiveSearch) != 0;
        if (ignoreCase) {
            from = from.toLowerCase();
            with = with.toLowerCase();
        }
        final boolean ignoreDiacritics =
                (NSStringCompareOptions & crossmobile.ios.foundation.NSStringCompareOptions.NSDiacriticInsensitiveSearch) != 0;
        if (ignoreDiacritics) {
            from = Normalizer.normalize(from, Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
            with = Normalizer.normalize(with, Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
        }
        order = from.compareTo(with);
    }

    if (order < 0) {
        return NSOrdered.Ascending;
    }
    if (order > 0) {
        return NSOrdered.Descending;
    }
    return NSOrdered.Same;
}
Example #25
Source File: PersonNameExtractor.java From yago3 with GNU General Public License v3.0 | 5 votes |
/**
 * Writes an additional label fact for the entity when stripping the
 * diacritics from {@code name} yields a different string.
 *
 * @param entity the entity the name belongs to
 * @param name   the original name
 * @param source the source passed along to {@code write}
 * @throws IOException if writing the fact fails
 */
private void writeNormalized(String entity, String name, String source) throws IOException {
    final String withoutDiacritics = Normalizer.normalize(name, Form.NFD)
            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");

    // Nothing to add when the name carries no diacritics.
    if (withoutDiacritics.equals(name)) {
        return;
    }

    final Fact label = new Fact(entity, RDFS.label,
            FactComponent.forStringWithLanguage(withoutDiacritics, "eng"));
    write(PERSONNAMEHEURISTICS, label, PERSONNAMESOURCES, source, "PersonNameExtractor_normalized");
}
Example #26
Source File: JabArchivesRipper.java From ripme with MIT License | 5 votes |
protected String getSlug(String input) { // Get a URL/file-safe version of a string String nowhitespace = WHITESPACE.matcher(input).replaceAll("-"); String normalized = Normalizer.normalize(nowhitespace, Form.NFD); String slug = NONLATIN.matcher(normalized).replaceAll(""); return slug.toLowerCase(Locale.ENGLISH); }
Example #27
Source File: CDataTransferer.java From openjdk-jdk9 with GNU General Public License v2.0 | 5 votes |
@Override public Object translateBytes(byte[] bytes, DataFlavor flavor, long format, Transferable transferable) throws IOException { if (format == CF_URL && URL.class.equals(flavor.getRepresentationClass())) { String charset = Charset.defaultCharset().name(); if (transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor)) { try { charset = new String((byte[]) transferable.getTransferData(javaTextEncodingFlavor), StandardCharsets.UTF_8); } catch (UnsupportedFlavorException cannotHappen) { } } String xml = new String(bytes, charset); // macosx pasteboard returns a property list that consists of one URL // let's extract it. return new URL(extractURL(xml)); } if(isUriListFlavor(flavor) && format == CF_FILE) { // dragQueryFile works fine with files and url, // it parses and extracts values from property list. // maxosx always returns property list for // CF_URL and CF_FILE String[] strings = dragQueryFile(bytes); if(strings == null) { return null; } bytes = String.join(System.getProperty("line.separator"), strings).getBytes(); // now we extracted uri from xml, now we should treat it as // regular string that allows to translate data to target represantation // class by base method format = CF_STRING; } else if (format == CF_STRING) { bytes = Normalizer.normalize(new String(bytes, "UTF8"), Form.NFC).getBytes("UTF8"); } return super.translateBytes(bytes, flavor, format, transferable); }
Example #28
Source File: Slug.java From sunbird-lms-service with MIT License | 5 votes |
public static String makeSlug(String input, boolean transliterate) { String origInput = input; String tempInputValue = ""; // Validate the input if (input == null) { ProjectLogger.log("Provided input value is null"); return input; } // Remove extra spaces tempInputValue = input.trim(); // Remove URL encoding tempInputValue = urlDecode(tempInputValue); // If transliterate is required if (transliterate) { // Tranlisterate & cleanup String transliterated = transliterate(tempInputValue); tempInputValue = transliterated; } // Replace all whitespace with dashes tempInputValue = WHITESPACE.matcher(tempInputValue).replaceAll("-"); // Remove all accent chars tempInputValue = Normalizer.normalize(tempInputValue, Form.NFD); // Remove all non-latin special characters tempInputValue = NONLATIN.matcher(tempInputValue).replaceAll(""); // Remove any consecutive dashes tempInputValue = normalizeDashes(tempInputValue); // Validate before returning validateResult(tempInputValue, origInput); // Slug is always lowercase return tempInputValue.toLowerCase(Locale.ENGLISH); }
Example #29
Source File: CDataTransferer.java From openjdk-8-source with GNU General Public License v2.0 | 5 votes |
protected Object translateBytesOrStream(InputStream stream, byte[] bytes, DataFlavor flavor, long format, Transferable transferable) throws IOException { // 5-28-03 VL: [Radar 3266030] // We need to do like Windows does here. if (format == CF_HTML && flavor.isFlavorTextType()) { if (stream == null) { stream = new ByteArrayInputStream(bytes); bytes = null; } stream = new HTMLDecodingInputStream(stream); } if (format == CF_URL && URL.class.equals(flavor.getRepresentationClass())) { if (bytes == null) { bytes = inputStreamToByteArray(stream); stream = null; } String charset = getDefaultTextCharset(); if (transferable != null && transferable.isDataFlavorSupported(javaTextEncodingFlavor)) { try { charset = new String((byte[])transferable.getTransferData(javaTextEncodingFlavor), "UTF-8"); } catch (UnsupportedFlavorException cannotHappen) { } } return new URL(new String(bytes, charset)); } if (format == CF_STRING) { bytes = Normalizer.normalize(new String(bytes, "UTF8"), Form.NFC).getBytes("UTF8"); } return super.translateBytes(bytes, flavor, format, transferable); }
Example #30
Source File: MCRNameMerger.java From mycore with GNU General Public License v3.0 | 5 votes |
private String normalize(String nameFragment) { String text = nameFragment.toLowerCase(Locale.getDefault()); text = new MCRHyphenNormalizer().normalize(text).replace("-", " "); // canonical decomposition, then remove accents text = Normalizer.normalize(text, Form.NFD).replaceAll("\\p{M}", ""); text = text.replace("ue", "u").replace("oe", "o").replace("ae", "a").replace("ß", "s").replace("ss", "s"); text = text.replaceAll("[^a-z0-9]\\s]", ""); //remove all non-alphabetic characters text = text.replaceAll("\\p{Punct}", " ").trim(); // remove all punctuation text = text.replaceAll("\\s+", " "); // normalize whitespace return text.trim(); }