Java Code Examples for java.lang.Character.UnicodeBlock#LATIN_EXTENDED_ADDITIONAL

The following examples show how to use java.lang.Character.UnicodeBlock#LATIN_EXTENDED_ADDITIONAL . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: NGram.java    From weslang with Apache License 2.0 5 votes vote down vote up
/**
 * Character Normalization
 * @param ch
 * @return Normalized character
 */
static public char normalize(char ch) {
    Character.UnicodeBlock block = Character.UnicodeBlock.of(ch);
    if (block == UnicodeBlock.BASIC_LATIN) {
        if (ch<'A' || (ch<'a' && ch >'Z') || ch>'z') ch = ' ';
    } else if (block == UnicodeBlock.LATIN_1_SUPPLEMENT) {
        if (LATIN1_EXCLUDED.indexOf(ch)>=0) ch = ' ';
    } else if (block == UnicodeBlock.LATIN_EXTENDED_B) {
        // normalization for Romanian
        if (ch == '\u0219') ch = '\u015f';  // Small S with comma below => with cedilla
        if (ch == '\u021b') ch = '\u0163';  // Small T with comma below => with cedilla
    } else if (block == UnicodeBlock.GENERAL_PUNCTUATION) {
        ch = ' ';
    } else if (block == UnicodeBlock.ARABIC) {
        if (ch == '\u06cc') ch = '\u064a';  // Farsi yeh => Arabic yeh
    } else if (block == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL) {
        if (ch >= '\u1ea0') ch = '\u1ec3';
    } else if (block == UnicodeBlock.HIRAGANA) {
        ch = '\u3042';
    } else if (block == UnicodeBlock.KATAKANA) {
        ch = '\u30a2';
    } else if (block == UnicodeBlock.BOPOMOFO || block == UnicodeBlock.BOPOMOFO_EXTENDED) {
        ch = '\u3105';
    } else if (block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
        if (cjk_map.containsKey(ch)) ch = cjk_map.get(ch);
    } else if (block == UnicodeBlock.HANGUL_SYLLABLES) {
        ch = '\uac00';
    }
    return ch;
}
 
Example 2
Source File: NGram.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
public static char normalize(char c) {
    char ch = c;
    Character.UnicodeBlock block = Character.UnicodeBlock.of(ch);
    if (block == UnicodeBlock.BASIC_LATIN) {
        if (ch < 'A' || (ch < 'a' && ch > 'Z') || ch > 'z') {
            ch = ' ';
        }
    } else if (block == UnicodeBlock.LATIN_1_SUPPLEMENT) {
        if (LATIN1_EXCLUDED.indexOf(ch) >= 0) {
            ch = ' ';
        }
    } else if (block == UnicodeBlock.GENERAL_PUNCTUATION) {
        ch = ' ';
    } else if (block == UnicodeBlock.ARABIC) {
        if (ch == '\u06cc') {
            ch = '\u064a';
        }
    } else if (block == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL) {
        if (ch >= '\u1ea0') {
            ch = '\u1ec3';
        }
    } else if (block == UnicodeBlock.HIRAGANA) {
        ch = '\u3042';
    } else if (block == UnicodeBlock.KATAKANA) {
        ch = '\u30a2';
    } else if (block == UnicodeBlock.BOPOMOFO || block == UnicodeBlock.BOPOMOFO_EXTENDED) {
        ch = '\u3105';
    } else if (block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
        if (cjk_map.containsKey(ch)) {
            ch = cjk_map.get(ch);
        }
    } else if (block == UnicodeBlock.HANGUL_SYLLABLES) {
        ch = '\uac00';
    }
    return ch;
}
 
Example 3
Source File: NGram.java    From language-detection with Apache License 2.0 5 votes vote down vote up
/**
 * Character Normalization
 * @param ch
 * @return Normalized character
 */
static public char normalize(char ch) {
    Character.UnicodeBlock block = Character.UnicodeBlock.of(ch);
    if (block == UnicodeBlock.BASIC_LATIN) {
        if (ch<'A' || (ch<'a' && ch >'Z') || ch>'z') ch = ' ';
    } else if (block == UnicodeBlock.LATIN_1_SUPPLEMENT) {
        if (LATIN1_EXCLUDED.indexOf(ch)>=0) ch = ' ';
    } else if (block == UnicodeBlock.LATIN_EXTENDED_B) {
        // normalization for Romanian
        if (ch == '\u0219') ch = '\u015f';  // Small S with comma below => with cedilla
        if (ch == '\u021b') ch = '\u0163';  // Small T with comma below => with cedilla
    } else if (block == UnicodeBlock.GENERAL_PUNCTUATION) {
        ch = ' ';
    } else if (block == UnicodeBlock.ARABIC) {
        if (ch == '\u06cc') ch = '\u064a';  // Farsi yeh => Arabic yeh
    } else if (block == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL) {
        if (ch >= '\u1ea0') ch = '\u1ec3';
    } else if (block == UnicodeBlock.HIRAGANA) {
        ch = '\u3042';
    } else if (block == UnicodeBlock.KATAKANA) {
        ch = '\u30a2';
    } else if (block == UnicodeBlock.BOPOMOFO || block == UnicodeBlock.BOPOMOFO_EXTENDED) {
        ch = '\u3105';
    } else if (block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
        if (cjk_map.containsKey(ch)) ch = cjk_map.get(ch);
    } else if (block == UnicodeBlock.HANGUL_SYLLABLES) {
        ch = '\uac00';
    }
    return ch;
}