Java Code Examples for com.ibm.icu.lang.UScript#getScript()
The following examples show how to use com.ibm.icu.lang.UScript#getScript().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ScriptIterator.java From lucene-solr with Apache License 2.0 | 6 votes |
/** fast version of UScript.getScript(). Basic Latin is an array lookup */ private int getScript(int codepoint) { if (0 <= codepoint && codepoint < basicLatin.length) { return basicLatin[codepoint]; } else { int script = UScript.getScript(codepoint); if (combineCJ) { if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) { return UScript.JAPANESE; } else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) { // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise // they are treated as punctuation. we currently have no cleaner way to fix this! return UScript.LATIN; } else { return script; } } else { return script; } } }
Example 2
Source File: ScriptIterator.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
/** * fast version of UScript.getScript(). Basic Latin is an array lookup */ private int getScript(int codepoint) { if (0 <= codepoint && codepoint < basicLatin.length) { return basicLatin[codepoint]; } else { int script = UScript.getScript(codepoint); if (combineCJ) { if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) { return UScript.JAPANESE; } else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) { // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise // they are treated as punctuation. we currently have no cleaner way to fix this! return UScript.LATIN; } else { return script; } } else { return script; } } }
Example 3
Source File: MCRLanguageDetector.java From mycore with GNU General Public License v3.0 | 5 votes |
/**
 * Looks up the ICU script code of every code point in {@code text} and passes each one to
 * {@code increaseScoreFor(scores, code)}.
 *
 * <p>The text is walked code point by code point, not {@code char} by {@code char}, so a
 * supplementary character (encoded as a surrogate pair) contributes exactly one script code.
 * Stepping one {@code char} at a time would additionally submit the script of the lone low
 * surrogate for every such character.
 *
 * <p>This is a best-effort operation: any exception raised while scanning (including a
 * {@code null} text) is swallowed and scoring simply stops at that point.
 *
 * @param text   the text whose code points are classified
 * @param scores the score map handed to {@code increaseScoreFor}
 */
private static void buildScores(String text, Map<Integer, AtomicInteger> scores) {
    try {
        final int length = text.length();
        int index = 0;
        while (index < length) {
            final int codePoint = text.codePointAt(index);
            increaseScoreFor(scores, UScript.getScript(codePoint));
            // Skip both halves of a surrogate pair so the low surrogate is not scored on its own.
            index += Character.charCount(codePoint);
        }
    } catch (Exception ignored) {
        // Deliberately best-effort: a failure here must not abort the caller.
    }
}
Example 4
Source File: AnyTransliterator.java From fitnotifications with Apache License 2.0 | 4 votes |
/** * Returns TRUE if there are any more runs. TRUE is always * returned at least once. Upon return, the caller should * examine scriptCode, start, and limit. */ public boolean next() { int ch; int s; scriptCode = UScript.INVALID_CODE; // don't know script yet start = limit; // Are we done? if (start == textLimit) { return false; } // Move start back to include adjacent COMMON or INHERITED // characters while (start > textStart) { ch = text.char32At(start - 1); // look back s = UScript.getScript(ch); if (s == UScript.COMMON || s == UScript.INHERITED) { --start; } else { break; } } // Move limit ahead to include COMMON, INHERITED, and characters // of the current script. while (limit < textLimit) { ch = text.char32At(limit); // look ahead s = UScript.getScript(ch); if (s != UScript.COMMON && s != UScript.INHERITED) { if (scriptCode == UScript.INVALID_CODE) { scriptCode = s; } else if (s != scriptCode) { break; } } ++limit; } // Return TRUE even if the entire text is COMMON / INHERITED, in // which case scriptCode will be UScript.INVALID_CODE. return true; }
Example 5
Source File: UTS46.java From fitnotifications with Apache License 2.0 | 4 votes |
private void checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) { int labelEnd=labelStart+labelLength-1; // inclusive int arabicDigits=0; // -1 for 066x, +1 for 06Fx for(int i=labelStart; i<=labelEnd; ++i) { int c=label.charAt(i); if(c<0xb7) { // ASCII fastpath } else if(c<=0x6f9) { if(c==0xb7) { // Appendix A.3. MIDDLE DOT (U+00B7) // Rule Set: // False; // If Before(cp) .eq. U+006C And // After(cp) .eq. U+006C Then True; if(!(labelStart<i && label.charAt(i-1)=='l' && i<labelEnd && label.charAt(i+1)=='l')) { addLabelError(info, Error.CONTEXTO_PUNCTUATION); } } else if(c==0x375) { // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375) // Rule Set: // False; // If Script(After(cp)) .eq. Greek Then True; if(!(i<labelEnd && UScript.GREEK==UScript.getScript(Character.codePointAt(label, i+1)))) { addLabelError(info, Error.CONTEXTO_PUNCTUATION); } } else if(c==0x5f3 || c==0x5f4) { // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3) // Rule Set: // False; // If Script(Before(cp)) .eq. Hebrew Then True; // // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4) // Rule Set: // False; // If Script(Before(cp)) .eq. Hebrew Then True; if(!(labelStart<i && UScript.HEBREW==UScript.getScript(Character.codePointBefore(label, i)))) { addLabelError(info, Error.CONTEXTO_PUNCTUATION); } } else if(0x660<=c /* && c<=0x6f9 */) { // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669) // Rule Set: // True; // For All Characters: // If cp .in. 06F0..06F9 Then False; // End For; // // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9) // Rule Set: // True; // For All Characters: // If cp .in. 0660..0669 Then False; // End For; if(c<=0x669) { if(arabicDigits>0) { addLabelError(info, Error.CONTEXTO_DIGITS); } arabicDigits=-1; } else if(0x6f0<=c) { if(arabicDigits<0) { addLabelError(info, Error.CONTEXTO_DIGITS); } arabicDigits=1; } } } else if(c==0x30fb) { // Appendix A.7. 
KATAKANA MIDDLE DOT (U+30FB) // Rule Set: // False; // For All Characters: // If Script(cp) .in. {Hiragana, Katakana, Han} Then True; // End For; for(int j=labelStart;; j+=Character.charCount(c)) { if(j>labelEnd) { addLabelError(info, Error.CONTEXTO_PUNCTUATION); break; } c=Character.codePointAt(label, j); int script=UScript.getScript(c); if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) { break; } } } } }
Example 6
Source File: UCharacterProperty.java From fitnotifications with Apache License 2.0 | 4 votes |
/**
 * Returns the value of this property for a code point by delegating to
 * {@code UScript.getScript}.
 *
 * @param c the code point to look up
 * @return the script code reported by {@code UScript.getScript(c)}
 */
@Override
int getValue(int c) {
    final int scriptCode = UScript.getScript(c);
    return scriptCode;
}
Example 7
Source File: CharScriptsSet.java From jasperreports with GNU Lesser General Public License v3.0 | 4 votes |
public boolean includesCharacter(int codePoint) { if (includedScripts == null && excludedScripts == null) { return true; } int codeScript = UScript.getScript(codePoint); if (codeScript == UScript.UNKNOWN) { //include by default return true; } if (codeScript == UScript.COMMON) { //COMMON is included unless explicitly excluded return !excludedCommon; } if (codeScript == UScript.INHERITED) { //INHERITED is included unless explicitly excluded return !excludedInherited; } if (includedScripts != null && includedScripts.contains(codeScript)) { //the codepoint script is explicitly included return true; } if (excludedScripts != null && excludedScripts.contains(codeScript)) { //the codepoint script is explicitly excluded return false; } if (includedScripts == null) { //not excluded return true; } for (Integer script : includedScripts) { if (UScript.hasScript(codePoint, script)) { //included as a secondary/extension script return true; } } //not included return false; }
Example 8
Source File: UTS46.java From trekarta with GNU General Public License v3.0 | 4 votes |
private void checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) { int labelEnd=labelStart+labelLength-1; // inclusive int arabicDigits=0; // -1 for 066x, +1 for 06Fx for(int i=labelStart; i<=labelEnd; ++i) { int c=label.charAt(i); if(c<0xb7) { // ASCII fastpath } else if(c<=0x6f9) { if(c==0xb7) { // Appendix A.3. MIDDLE DOT (U+00B7) // Rule Set: // False; // If Before(cp) .eq. U+006C And // After(cp) .eq. U+006C Then True; if(!(labelStart<i && label.charAt(i-1)=='l' && i<labelEnd && label.charAt(i+1)=='l')) { addLabelError(info, Error.CONTEXTO_PUNCTUATION); } } else if(c==0x375) { // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375) // Rule Set: // False; // If Script(After(cp)) .eq. Greek Then True; if(!(i<labelEnd && UScript.GREEK==UScript.getScript(Character.codePointAt(label, i+1)))) { addLabelError(info, Error.CONTEXTO_PUNCTUATION); } } else if(c==0x5f3 || c==0x5f4) { // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3) // Rule Set: // False; // If Script(Before(cp)) .eq. Hebrew Then True; // // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4) // Rule Set: // False; // If Script(Before(cp)) .eq. Hebrew Then True; if(!(labelStart<i && UScript.HEBREW==UScript.getScript(Character.codePointBefore(label, i)))) { addLabelError(info, Error.CONTEXTO_PUNCTUATION); } } else if(0x660<=c /* && c<=0x6f9 */) { // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669) // Rule Set: // True; // For All Characters: // If cp .in. 06F0..06F9 Then False; // End For; // // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9) // Rule Set: // True; // For All Characters: // If cp .in. 0660..0669 Then False; // End For; if(c<=0x669) { if(arabicDigits>0) { addLabelError(info, Error.CONTEXTO_DIGITS); } arabicDigits=-1; } else if(0x6f0<=c) { if(arabicDigits<0) { addLabelError(info, Error.CONTEXTO_DIGITS); } arabicDigits=1; } } } else if(c==0x30fb) { // Appendix A.7. 
KATAKANA MIDDLE DOT (U+30FB) // Rule Set: // False; // For All Characters: // If Script(cp) .in. {Hiragana, Katakana, Han} Then True; // End For; for(int j=labelStart;; j+=Character.charCount(c)) { if(j>labelEnd) { addLabelError(info, Error.CONTEXTO_PUNCTUATION); break; } c=Character.codePointAt(label, j); int script=UScript.getScript(c); if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) { break; } } } } }
Example 9
Source File: UCharacterProperty.java From trekarta with GNU General Public License v3.0 | 4 votes |
/**
 * Looks up the property value for the given code point; the value is whatever
 * {@code UScript.getScript} reports for it.
 *
 * @param c the code point
 * @return the result of {@code UScript.getScript(c)}
 */
@Override
int getValue(int c) {
    final int script = UScript.getScript(c);
    return script;
}