com.ibm.icu.lang.UScript#HIRAGANA

Source File: DefaultICUTokenizerConfig.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Translates a break-iterator rule status into one of this config's token type strings.
 * The script is consulted only for kana and letter statuses, where it picks between two types.
 *
 * @param script script code of the token, compared against {@code UScript} constants
 * @param ruleStatus rule status value to classify
 * @return the matching type constant, or {@code "<OTHER>"} when the status is not one of the
 *     values handled here
 */
@Override
public String getType(int script, int ruleStatus) {
  if (ruleStatus == RuleBasedBreakIterator.WORD_IDEO) {
    return WORD_IDEO;
  }
  if (ruleStatus == RuleBasedBreakIterator.WORD_KANA) {
    // Anything that is not Hiragana is reported as Katakana.
    if (script == UScript.HIRAGANA) {
      return WORD_HIRAGANA;
    }
    return WORD_KATAKANA;
  }
  if (ruleStatus == RuleBasedBreakIterator.WORD_LETTER) {
    // Hangul gets its own type; every other script is a plain letter token.
    if (script == UScript.HANGUL) {
      return WORD_HANGUL;
    }
    return WORD_LETTER;
  }
  if (ruleStatus == RuleBasedBreakIterator.WORD_NUMBER) {
    return WORD_NUMBER;
  }
  if (ruleStatus == EMOJI_SEQUENCE_STATUS) {
    return WORD_EMOJI;
  }
  // Some other custom status code.
  return "<OTHER>";
}

Source File: ScriptIterator.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Fast variant of {@code UScript.getScript()}: code points inside the {@code basicLatin} table
 * are answered by an array lookup, everything else is delegated to ICU.
 *
 * <p>When {@code combineCJ} is set, Han, Hiragana and Katakana are all reported as
 * {@code UScript.JAPANESE}, and U+FF10..U+FF19 are reported as {@code UScript.LATIN}.
 *
 * @param codepoint the code point to classify
 * @return the script code for {@code codepoint}
 */
private int getScript(int codepoint) {
  final boolean inTable = codepoint >= 0 && codepoint < basicLatin.length;
  if (inTable) {
    return basicLatin[codepoint];
  }

  final int icuScript = UScript.getScript(codepoint);
  if (!combineCJ) {
    return icuScript;
  }

  if (icuScript == UScript.HAN
      || icuScript == UScript.HIRAGANA
      || icuScript == UScript.KATAKANA) {
    return UScript.JAPANESE;
  }

  if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
    // When using CJK dictionary breaking, full width numbers must not reach the dictionary,
    // otherwise they are treated as punctuation. There is currently no cleaner way to fix this.
    return UScript.LATIN;
  }

  return icuScript;
}

Source File: ScriptIterator.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

6 votes

/**
 * Fast variant of {@code UScript.getScript()}. Code points that fall inside the
 * {@code basicLatin} table are resolved with an array lookup; all others go through ICU.
 *
 * <p>With {@code combineCJ} enabled, Han, Hiragana and Katakana collapse into
 * {@code UScript.JAPANESE}, and U+FF10..U+FF19 are mapped to {@code UScript.LATIN}.
 *
 * @param codepoint the code point to classify
 * @return the script code for {@code codepoint}
 */
private int getScript(int codepoint) {
    if (codepoint >= 0 && codepoint < basicLatin.length) {
        return basicLatin[codepoint];
    }

    final int resolved = UScript.getScript(codepoint);
    if (combineCJ) {
        switch (resolved) {
            case UScript.HAN:
            case UScript.HIRAGANA:
            case UScript.KATAKANA:
                return UScript.JAPANESE;
            default:
                break;
        }
        final boolean fullWidthDigit = codepoint >= 0xFF10 && codepoint <= 0xFF19;
        if (fullWidthDigit) {
            // When using CJK dictionary breaking, keep full width numbers away from it,
            // otherwise they are treated as punctuation. No cleaner fix exists at the moment.
            return UScript.LATIN;
        }
    }
    return resolved;
}

Source File: DefaultIcuTokenizerConfig.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

5 votes

/**
 * Translates a break-iterator rule status into one of this config's token type strings.
 * The script only matters for kana and letter statuses, where it selects between two types.
 *
 * @param script script code of the token, compared against {@code UScript} constants
 * @param ruleStatus rule status value to classify
 * @return the matching type constant, or {@code "<OTHER>"} when the status is not one of the
 *     values handled here
 */
@Override
public String getType(int script, int ruleStatus) {
    final String type;
    if (ruleStatus == RuleBasedBreakIterator.WORD_IDEO) {
        type = WORD_IDEO;
    } else if (ruleStatus == RuleBasedBreakIterator.WORD_KANA) {
        // Anything that is not Hiragana is reported as Katakana.
        if (script == UScript.HIRAGANA) {
            type = WORD_HIRAGANA;
        } else {
            type = WORD_KATAKANA;
        }
    } else if (ruleStatus == RuleBasedBreakIterator.WORD_LETTER) {
        // Hangul gets its own type; every other script is a plain letter token.
        if (script == UScript.HANGUL) {
            type = WORD_HANGUL;
        } else {
            type = WORD_LETTER;
        }
    } else if (ruleStatus == RuleBasedBreakIterator.WORD_NUMBER) {
        type = WORD_NUMBER;
    } else {
        // Some other custom status code.
        type = "<OTHER>";
    }
    return type;
}

Source File: RuleBasedBreakIterator.java From fitnotifications with Apache License 2.0

4 votes

/**
 * Finds or creates the language break engine to use for a dictionary character.
 *
 * <p>Engines already held in {@code fBreakEngines} are asked first through
 * {@code handles(c, fBreakType)}. If none accepts the character, an engine is chosen from the
 * character's script and, unless it is {@code fUnhandledBreakEngine}, registered in
 * {@code fBreakEngines} under that script.
 *
 * @param c the character to find an engine for
 * @return the engine selected for {@code c}, or {@code null} if constructing a new engine
 *         threw an {@code IOException}
 */
private LanguageBreakEngine getLanguageBreakEngine(int c) {

        // We have a dictionary character.
        // Does an already instantiated break engine handle it?
        for (LanguageBreakEngine candidate : fBreakEngines.values()) {
            if (candidate.handles(c, fBreakType)) {
                return candidate;
            }
        }

        // if we don't have an existing engine, build one.
        int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
        if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
            // Katakana, Hiragana and Han are handled by the same dictionary engine.
            // Fold them together for mapping from script -> engine.
            script = UScript.HAN;
        }

        // NOTE(review): every path through the try/catch below reassigns eng, so the result of
        // this lookup is never used as written — confirm before removing it.
        LanguageBreakEngine eng = fBreakEngines.get(script);
        // The block comment below is disabled code; only the braces after it are live.
        /*
        if (eng != null && !eng.handles(c, fBreakType)) {
            fUnhandledBreakEngine.handleChar(c, getBreakType());
            eng = fUnhandledBreakEngine;
        } else  */  {
            try {
                switch (script) {
                case UScript.THAI:
                    eng = new ThaiBreakEngine();
                    break;
                case UScript.LAO:
                    eng = new LaoBreakEngine();
                    break;
                case UScript.MYANMAR:
                    eng = new BurmeseBreakEngine();
                    break;
                case UScript.KHMER:
                    eng = new KhmerBreakEngine();
                    break;
                case UScript.HAN:
                    // A CJK engine is only created for word breaking; any other break type
                    // hands the character to the shared unhandled engine.
                    if (getBreakType() == KIND_WORD) {
                        eng = new CjkBreakEngine(false);
                    }
                    else {
                        fUnhandledBreakEngine.handleChar(c, getBreakType());
                        eng = fUnhandledBreakEngine;
                    }
                    break;
                case UScript.HANGUL:
                    // Same rule as HAN, but the engine is constructed with true instead of false.
                    if (getBreakType() == KIND_WORD) {
                        eng = new CjkBreakEngine(true);
                    } else {
                        fUnhandledBreakEngine.handleChar(c, getBreakType());
                        eng = fUnhandledBreakEngine;
                    }
                    break;
                default:
                    // No dedicated engine for this script.
                    fUnhandledBreakEngine.handleChar(c, getBreakType());
                    eng = fUnhandledBreakEngine;
                    break;
                }
            } catch (IOException e) {
                // Engine construction failed; the method ends up returning null.
                eng = null;
            }
        }

        // Only newly built, script-specific engines are registered; the unhandled engine is not.
        if (eng != null && eng != fUnhandledBreakEngine) {
            LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng);
            if (existingEngine != null) {
                // There was a race & another thread was first to register an engine for this script.
                // Use theirs and discard the one we just created.
                eng = existingEngine;
            }
            // assert eng.handles(c, fBreakType);
        }
        return eng;
    }

Source File: UTS46.java From fitnotifications with Apache License 2.0

4 votes

/**
 * Checks the CONTEXTO characters of one label against their contextual rules and calls
 * {@code addLabelError} for each violation found.
 *
 * <p>Rules checked, by the appendix numbers cited in the comments below: A.3 MIDDLE DOT,
 * A.4 GREEK KERAIA, A.5/A.6 HEBREW GERESH and GERSHAYIM, A.7 KATAKANA MIDDLE DOT and
 * A.8/A.9 ARABIC-INDIC and EXTENDED ARABIC-INDIC DIGITS.
 *
 * @param label text that contains the label
 * @param labelStart index in {@code label} of the label's first char
 * @param labelLength number of chars in the label
 * @param info passed through to {@code addLabelError}
 */
private void
checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) {
    final int lastIndex = labelStart + labelLength - 1;  // inclusive
    // Digit family of the most recent Arabic digit: -1 for 066x, +1 for 06Fx, 0 if none yet.
    int digitFamily = 0;
    for (int pos = labelStart; pos <= lastIndex; ++pos) {
        final int unit = label.charAt(pos);
        if (unit < 0xb7) {
            // ASCII fastpath: nothing below U+00B7 is checked.
            continue;
        }
        if (unit > 0x6f9) {
            if (unit == 0x30fb) {
                // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
                // Rule Set:
                //  False;
                //  For All Characters:
                //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
                //  End For;
                boolean sawKanaOrHan = false;
                int scan = labelStart;
                while (scan <= lastIndex) {
                    final int cp = Character.codePointAt(label, scan);
                    final int script = UScript.getScript(cp);
                    if (script == UScript.HIRAGANA
                            || script == UScript.KATAKANA
                            || script == UScript.HAN) {
                        sawKanaOrHan = true;
                        break;
                    }
                    scan += Character.charCount(cp);
                }
                if (!sawKanaOrHan) {
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                }
            }
            continue;
        }

        // From here on 0xb7 <= unit <= 0x6f9.
        if (unit == 0xb7) {
            // Appendix A.3. MIDDLE DOT (U+00B7)
            // Rule Set:
            //  False;
            //  If Before(cp) .eq.  U+006C And
            //     After(cp) .eq.  U+006C Then True;
            if (pos <= labelStart || label.charAt(pos - 1) != 'l'
                    || pos >= lastIndex || label.charAt(pos + 1) != 'l') {
                addLabelError(info, Error.CONTEXTO_PUNCTUATION);
            }
        } else if (unit == 0x375) {
            // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
            // Rule Set:
            //  False;
            //  If Script(After(cp)) .eq.  Greek Then True;
            if (pos >= lastIndex
                    || UScript.GREEK != UScript.getScript(Character.codePointAt(label, pos + 1))) {
                addLabelError(info, Error.CONTEXTO_PUNCTUATION);
            }
        } else if (unit == 0x5f3 || unit == 0x5f4) {
            // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
            // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
            // Rule Set (same for both):
            //  False;
            //  If Script(Before(cp)) .eq.  Hebrew Then True;
            if (pos <= labelStart
                    || UScript.HEBREW != UScript.getScript(Character.codePointBefore(label, pos))) {
                addLabelError(info, Error.CONTEXTO_PUNCTUATION);
            }
        } else if (unit >= 0x660) {
            // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
            // Rule Set:
            //  True;
            //  For All Characters:
            //    If cp .in. 06F0..06F9 Then False;
            //  End For;
            //
            // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
            // Rule Set:
            //  True;
            //  For All Characters:
            //    If cp .in. 0660..0669 Then False;
            //  End For;
            if (unit <= 0x669) {
                if (digitFamily > 0) {
                    addLabelError(info, Error.CONTEXTO_DIGITS);
                }
                digitFamily = -1;
            } else if (unit >= 0x6f0) {
                if (digitFamily < 0) {
                    addLabelError(info, Error.CONTEXTO_DIGITS);
                }
                digitFamily = 1;
            }
        }
    }
}

Source File: UTS46.java From trekarta with GNU General Public License v3.0

4 votes

/**
 * Checks the CONTEXTO characters of one label against their contextual rules and calls
 * {@code addLabelError} for each violation found.
 *
 * <p>Rules checked, by the appendix numbers cited in the comments below: A.3 MIDDLE DOT,
 * A.4 GREEK KERAIA, A.5/A.6 HEBREW GERESH and GERSHAYIM, A.7 KATAKANA MIDDLE DOT and
 * A.8/A.9 ARABIC-INDIC and EXTENDED ARABIC-INDIC DIGITS.
 *
 * @param label text that contains the label
 * @param labelStart index in {@code label} of the label's first char
 * @param labelLength number of chars in the label
 * @param info passed through to {@code addLabelError}
 */
private void
checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) {
    final int lastIndex = labelStart + labelLength - 1;  // inclusive
    // Digit family of the most recent Arabic digit: -1 for 066x, +1 for 06Fx, 0 if none yet.
    int digitFamily = 0;
    for (int pos = labelStart; pos <= lastIndex; ++pos) {
        final int unit = label.charAt(pos);
        if (unit < 0xb7) {
            // ASCII fastpath: nothing below U+00B7 is checked.
            continue;
        }
        if (unit > 0x6f9) {
            if (unit == 0x30fb) {
                // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
                // Rule Set:
                //  False;
                //  For All Characters:
                //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
                //  End For;
                boolean sawKanaOrHan = false;
                int scan = labelStart;
                while (scan <= lastIndex) {
                    final int cp = Character.codePointAt(label, scan);
                    final int script = UScript.getScript(cp);
                    if (script == UScript.HIRAGANA
                            || script == UScript.KATAKANA
                            || script == UScript.HAN) {
                        sawKanaOrHan = true;
                        break;
                    }
                    scan += Character.charCount(cp);
                }
                if (!sawKanaOrHan) {
                    addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                }
            }
            continue;
        }

        // From here on 0xb7 <= unit <= 0x6f9.
        if (unit == 0xb7) {
            // Appendix A.3. MIDDLE DOT (U+00B7)
            // Rule Set:
            //  False;
            //  If Before(cp) .eq.  U+006C And
            //     After(cp) .eq.  U+006C Then True;
            if (pos <= labelStart || label.charAt(pos - 1) != 'l'
                    || pos >= lastIndex || label.charAt(pos + 1) != 'l') {
                addLabelError(info, Error.CONTEXTO_PUNCTUATION);
            }
        } else if (unit == 0x375) {
            // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
            // Rule Set:
            //  False;
            //  If Script(After(cp)) .eq.  Greek Then True;
            if (pos >= lastIndex
                    || UScript.GREEK != UScript.getScript(Character.codePointAt(label, pos + 1))) {
                addLabelError(info, Error.CONTEXTO_PUNCTUATION);
            }
        } else if (unit == 0x5f3 || unit == 0x5f4) {
            // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
            // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
            // Rule Set (same for both):
            //  False;
            //  If Script(Before(cp)) .eq.  Hebrew Then True;
            if (pos <= labelStart
                    || UScript.HEBREW != UScript.getScript(Character.codePointBefore(label, pos))) {
                addLabelError(info, Error.CONTEXTO_PUNCTUATION);
            }
        } else if (unit >= 0x660) {
            // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
            // Rule Set:
            //  True;
            //  For All Characters:
            //    If cp .in. 06F0..06F9 Then False;
            //  End For;
            //
            // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
            // Rule Set:
            //  True;
            //  For All Characters:
            //    If cp .in. 0660..0669 Then False;
            //  End For;
            if (unit <= 0x669) {
                if (digitFamily > 0) {
                    addLabelError(info, Error.CONTEXTO_DIGITS);
                }
                digitFamily = -1;
            } else if (unit >= 0x6f0) {
                if (digitFamily < 0) {
                    addLabelError(info, Error.CONTEXTO_DIGITS);
                }
                digitFamily = 1;
            }
        }
    }
}

Source File: AnyTransliterator.java From fitnotifications with Apache License 2.0

2 votes

/**
 * Tests whether a script code is one of Bopomofo, Han, Hangul, Hiragana or Katakana.
 *
 * @param script the {@code UScript} code to test
 * @return {@code true} if {@code script} is one of those five scripts, {@code false} otherwise
 */
private boolean isWide(int script) {
    switch (script) {
        case UScript.BOPOMOFO:
        case UScript.HAN:
        case UScript.HANGUL:
        case UScript.HIRAGANA:
        case UScript.KATAKANA:
            return true;
        default:
            return false;
    }
}

Java Code Examples for com.ibm.icu.lang.UScript#HIRAGANA