com.ibm.icu.lang.UScript#getScript

Source File: ScriptIterator.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Fast replacement for UScript.getScript(): code points that fall inside the
 * bounds of the {@code basicLatin} table are answered straight from that table,
 * everything else is delegated to ICU and then optionally folded for CJ text.
 */
private int getScript(int codepoint) {
  if (codepoint >= 0 && codepoint < basicLatin.length) {
    return basicLatin[codepoint];
  }

  final int icuScript = UScript.getScript(codepoint);
  if (!combineCJ) {
    return icuScript;
  }

  final boolean chineseOrJapanese =
      icuScript == UScript.HAN || icuScript == UScript.HIRAGANA || icuScript == UScript.KATAKANA;
  if (chineseOrJapanese) {
    return UScript.JAPANESE;
  }

  // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
  // they are treated as punctuation. we currently have no cleaner way to fix this!
  final boolean fullWidthDigit = codepoint >= 0xFF10 && codepoint <= 0xFF19;
  return fullWidthDigit ? UScript.LATIN : icuScript;
}

Source File: ScriptIterator.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

6 votes

/**
 * Fast replacement for UScript.getScript(): code points that fall inside the
 * bounds of the {@code basicLatin} table are answered straight from that table,
 * everything else is delegated to ICU and then optionally folded for CJ text.
 */
private int getScript(int codepoint) {
    if (codepoint >= 0 && codepoint < basicLatin.length) {
        return basicLatin[codepoint];
    }

    final int icuScript = UScript.getScript(codepoint);
    if (!combineCJ) {
        return icuScript;
    }

    final boolean chineseOrJapanese =
        icuScript == UScript.HAN || icuScript == UScript.HIRAGANA || icuScript == UScript.KATAKANA;
    if (chineseOrJapanese) {
        return UScript.JAPANESE;
    }

    // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
    // they are treated as punctuation. we currently have no cleaner way to fix this!
    final boolean fullWidthDigit = codepoint >= 0xFF10 && codepoint <= 0xFF19;
    return fullWidthDigit ? UScript.LATIN : icuScript;
}

Source File: MCRLanguageDetector.java From mycore with GNU General Public License v3.0

5 votes

/**
 * Passes the script code of every code point in {@code text} to
 * {@code increaseScoreFor}, so that {@code scores} is updated once per
 * code point.
 *
 * <p>The text is walked by code point rather than by UTF-16 unit: a
 * supplementary character occupies two chars, and stepping one char at a
 * time would score it a second time as a lone trail surrogate.
 *
 * @param text   the text to analyse
 * @param scores the per-script tallies handed to {@code increaseScoreFor}
 */
private static void buildScores(String text, Map<Integer, AtomicInteger> scores) {
    try {
        char[] chararray = text.toCharArray();
        int i = 0;
        while (i < chararray.length) {
            int codePoint = UCharacter.codePointAt(chararray, i);
            increaseScoreFor(scores, UScript.getScript(codePoint));
            // Advance by 1 for BMP characters and by 2 for surrogate pairs.
            i += Character.charCount(codePoint);
        }
    } catch (Exception ignored) {
        // Deliberately best-effort (kept from the original): any failure,
        // e.g. a null text, ends scoring quietly with whatever was tallied so far.
    }
}

Source File: AnyTransliterator.java From fitnotifications with Apache License 2.0

4 votes

/**
 * Advances to the next script run. Returns TRUE if a run was produced;
 * TRUE is always returned at least once. After a TRUE result the caller
 * should read scriptCode, start, and limit.
 */
public boolean next() {
    // The script of the new run is unknown until a non-neutral character is seen.
    scriptCode = UScript.INVALID_CODE;
    start = limit;

    // Nothing left to scan.
    if (start == textLimit) {
        return false;
    }

    // Pull start backwards across any directly preceding COMMON or
    // INHERITED characters so they join this run.
    while (start > textStart) {
        int previousScript = UScript.getScript(text.char32At(start - 1));
        if (previousScript != UScript.COMMON && previousScript != UScript.INHERITED) {
            break;
        }
        --start;
    }

    // Push limit forwards across COMMON, INHERITED, and characters that
    // share the script of the run.
    for (; limit < textLimit; ++limit) {
        int currentScript = UScript.getScript(text.char32At(limit));
        if (currentScript == UScript.COMMON || currentScript == UScript.INHERITED) {
            continue;
        }
        if (scriptCode == UScript.INVALID_CODE) {
            // First real script encountered defines the run.
            scriptCode = currentScript;
        } else if (currentScript != scriptCode) {
            // A different script starts here; the run ends before it.
            break;
        }
    }

    // TRUE is returned even when the whole text is COMMON / INHERITED; in
    // that case scriptCode is still UScript.INVALID_CODE.
    return true;
}

Source File: UTS46.java From fitnotifications with Apache License 2.0

4 votes

/**
 * Checks the characters of one label against the contextual rule sets quoted
 * in the inline comments below (Appendix A.3 through A.9) and records every
 * violation on {@code info} through {@code addLabelError}.
 *
 * <p>The label is scanned one UTF-16 unit at a time; every character that
 * triggers a rule here has a value of at most 0x30fb, so a single unit is
 * enough to recognise it. Neighbouring characters whose script matters are
 * read as full code points.
 *
 * @param label       the text containing the label
 * @param labelStart  index of the first unit of the label within {@code label}
 * @param labelLength number of UTF-16 units in the label
 * @param info        receives CONTEXTO_PUNCTUATION / CONTEXTO_DIGITS errors
 */
private void
checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) {
    int labelEnd=labelStart+labelLength-1;  // inclusive
    // Remembers which digit family was seen most recently so that a mix of
    // the two families can be reported.
    int arabicDigits=0;  // -1 for 066x, +1 for 06Fx
    for(int i=labelStart; i<=labelEnd; ++i) {
        int c=label.charAt(i);
        if(c<0xb7) {
            // ASCII fastpath
            // (also skips everything else below U+00B7; no rule applies there)
        } else if(c<=0x6f9) {
            if(c==0xb7) {
                // Appendix A.3. MIDDLE DOT (U+00B7)
                // Rule Set:
                //  False;
                //  If Before(cp) .eq.  U+006C And
                //     After(cp) .eq.  U+006C Then True;
                // The index guards keep both neighbour lookups inside the label.
                if(!(labelStart<i && label.charAt(i-1)=='l' &&
                     i<labelEnd && label.charAt(i+1)=='l')) {
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                }
            } else if(c==0x375) {
                // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
                // Rule Set:
                //  False;
                //  If Script(After(cp)) .eq.  Greek Then True;
                if(!(i<labelEnd &&
                     UScript.GREEK==UScript.getScript(Character.codePointAt(label, i+1)))) {
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                }
            } else if(c==0x5f3 || c==0x5f4) {
                // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
                // Rule Set:
                //  False;
                //  If Script(Before(cp)) .eq.  Hebrew Then True;
                //
                // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
                // Rule Set:
                //  False;
                //  If Script(Before(cp)) .eq.  Hebrew Then True;
                if(!(labelStart<i &&
                     UScript.HEBREW==UScript.getScript(Character.codePointBefore(label, i)))) {
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                }
            } else if(0x660<=c /* && c<=0x6f9 */) {
                // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
                // Rule Set:
                //  True;
                //  For All Characters:
                //    If cp .in. 06F0..06F9 Then False;
                //  End For;
                //
                // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
                // Rule Set:
                //  True;
                //  For All Characters:
                //    If cp .in. 0660..0669 Then False;
                //  End For;
                //
                // Values from 0x66a to 0x6ef match neither test below and are
                // left alone.
                if(c<=0x669) {
                    if(arabicDigits>0) {
                        addLabelError(info, Error.CONTEXTO_DIGITS);
                    }
                    arabicDigits=-1;
                } else if(0x6f0<=c) {
                    if(arabicDigits<0) {
                        addLabelError(info, Error.CONTEXTO_DIGITS);
                    }
                    arabicDigits=1;
                }
            }
        } else if(c==0x30fb) {
            // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
            // Rule Set:
            //  False;
            //  For All Characters:
            //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
            //  End For;
            //
            // Rescans the whole label by code point, reusing c as the scan
            // variable; the outer loop reloads c from label.charAt(i) on its
            // next iteration. The increment expression only runs after c has
            // been assigned in the loop body.
            for(int j=labelStart;; j+=Character.charCount(c)) {
                if(j>labelEnd) {
                    // Reached the end without finding a qualifying script.
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                    break;
                }
                c=Character.codePointAt(label, j);
                int script=UScript.getScript(c);
                if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) {
                    break;
                }
            }
        }
    }
}

Source File: UCharacterProperty.java From fitnotifications with Apache License 2.0

4 votes

/**
 * Returns the value of this property for {@code c}, which is the script
 * code reported by {@code UScript.getScript}.
 */
@Override
int getValue(int c) {
    final int scriptCode = UScript.getScript(c);
    return scriptCode;
}

Source File: CharScriptsSet.java From jasperreports with GNU Lesser General Public License v3.0

4 votes

/**
 * Decides whether a code point belongs to this set, based on its script and
 * on the configured included/excluded scripts.
 *
 * @param codePoint the code point to test
 * @return {@code true} if the code point is considered part of the set
 */
public boolean includesCharacter(int codePoint)
{
	final boolean hasIncludes = includedScripts != null;
	final boolean hasExcludes = excludedScripts != null;
	if (!hasIncludes && !hasExcludes)
	{
		//no restrictions configured, everything is accepted
		return true;
	}
	
	final int primaryScript = UScript.getScript(codePoint);
	if (primaryScript == UScript.UNKNOWN)
	{
		//unknown scripts are accepted by default
		return true;
	}
	
	if (primaryScript == UScript.COMMON)
	{
		//COMMON is accepted unless it was explicitly excluded
		return !excludedCommon;
	}
	
	if (primaryScript == UScript.INHERITED)
	{
		//INHERITED is accepted unless it was explicitly excluded
		return !excludedInherited;
	}
	
	if (hasIncludes && includedScripts.contains(primaryScript))
	{
		//explicit inclusion of the primary script
		return true;
	}
	
	if (hasExcludes && excludedScripts.contains(primaryScript))
	{
		//explicit exclusion of the primary script
		return false;
	}
	
	if (!hasIncludes)
	{
		//only exclusions are configured and none matched
		return true;
	}
	
	//fall back to secondary/extension scripts of the code point
	for (Integer candidate : includedScripts)
	{
		if (UScript.hasScript(codePoint, candidate))
		{
			return true;
		}
	}
	
	//no included script applies
	return false;
}

Source File: UTS46.java From trekarta with GNU General Public License v3.0

4 votes

/**
 * Checks the characters of one label against the contextual rule sets quoted
 * in the inline comments below (Appendix A.3 through A.9) and records every
 * violation on {@code info} through {@code addLabelError}.
 *
 * <p>The label is scanned one UTF-16 unit at a time; every character that
 * triggers a rule here has a value of at most 0x30fb, so a single unit is
 * enough to recognise it. Neighbouring characters whose script matters are
 * read as full code points.
 *
 * @param label       the text containing the label
 * @param labelStart  index of the first unit of the label within {@code label}
 * @param labelLength number of UTF-16 units in the label
 * @param info        receives CONTEXTO_PUNCTUATION / CONTEXTO_DIGITS errors
 */
private void
checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) {
    int labelEnd=labelStart+labelLength-1;  // inclusive
    // Remembers which digit family was seen most recently so that a mix of
    // the two families can be reported.
    int arabicDigits=0;  // -1 for 066x, +1 for 06Fx
    for(int i=labelStart; i<=labelEnd; ++i) {
        int c=label.charAt(i);
        if(c<0xb7) {
            // ASCII fastpath
            // (also skips everything else below U+00B7; no rule applies there)
        } else if(c<=0x6f9) {
            if(c==0xb7) {
                // Appendix A.3. MIDDLE DOT (U+00B7)
                // Rule Set:
                //  False;
                //  If Before(cp) .eq.  U+006C And
                //     After(cp) .eq.  U+006C Then True;
                // The index guards keep both neighbour lookups inside the label.
                if(!(labelStart<i && label.charAt(i-1)=='l' &&
                     i<labelEnd && label.charAt(i+1)=='l')) {
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                }
            } else if(c==0x375) {
                // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
                // Rule Set:
                //  False;
                //  If Script(After(cp)) .eq.  Greek Then True;
                if(!(i<labelEnd &&
                     UScript.GREEK==UScript.getScript(Character.codePointAt(label, i+1)))) {
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                }
            } else if(c==0x5f3 || c==0x5f4) {
                // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
                // Rule Set:
                //  False;
                //  If Script(Before(cp)) .eq.  Hebrew Then True;
                //
                // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
                // Rule Set:
                //  False;
                //  If Script(Before(cp)) .eq.  Hebrew Then True;
                if(!(labelStart<i &&
                     UScript.HEBREW==UScript.getScript(Character.codePointBefore(label, i)))) {
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                }
            } else if(0x660<=c /* && c<=0x6f9 */) {
                // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
                // Rule Set:
                //  True;
                //  For All Characters:
                //    If cp .in. 06F0..06F9 Then False;
                //  End For;
                //
                // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
                // Rule Set:
                //  True;
                //  For All Characters:
                //    If cp .in. 0660..0669 Then False;
                //  End For;
                //
                // Values from 0x66a to 0x6ef match neither test below and are
                // left alone.
                if(c<=0x669) {
                    if(arabicDigits>0) {
                        addLabelError(info, Error.CONTEXTO_DIGITS);
                    }
                    arabicDigits=-1;
                } else if(0x6f0<=c) {
                    if(arabicDigits<0) {
                        addLabelError(info, Error.CONTEXTO_DIGITS);
                    }
                    arabicDigits=1;
                }
            }
        } else if(c==0x30fb) {
            // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
            // Rule Set:
            //  False;
            //  For All Characters:
            //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
            //  End For;
            //
            // Rescans the whole label by code point, reusing c as the scan
            // variable; the outer loop reloads c from label.charAt(i) on its
            // next iteration. The increment expression only runs after c has
            // been assigned in the loop body.
            for(int j=labelStart;; j+=Character.charCount(c)) {
                if(j>labelEnd) {
                    // Reached the end without finding a qualifying script.
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                    break;
                }
                c=Character.codePointAt(label, j);
                int script=UScript.getScript(c);
                if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) {
                    break;
                }
            }
        }
    }
}

Source File: UCharacterProperty.java From trekarta with GNU General Public License v3.0

4 votes

/**
 * Returns the value of this property for {@code c}, which is the script
 * code reported by {@code UScript.getScript}.
 */
@Override
int getValue(int c) {
    final int scriptCode = UScript.getScript(c);
    return scriptCode;
}

Java Code Examples for com.ibm.icu.lang.UScript#getScript()