com.ibm.icu.text.BreakIterator Java Examples
The following examples show how to use
com.ibm.icu.text.BreakIterator.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SegmenterObject.java From es6draft with MIT License | 6 votes |
private BreakIterator createBreakIterator() { ULocale locale = ULocale.forLanguageTag(this.locale); if ("line".equals(granularity)) { // "strictness" cannot be set through unicode extensions (u-lb-strict), handle here: locale = locale.setKeywordValue("lb", strictness); } BreakIterator breakIterator; switch (granularity) { case "grapheme": breakIterator = BreakIterator.getCharacterInstance(locale); break; case "word": breakIterator = BreakIterator.getWordInstance(locale); break; case "sentence": breakIterator = BreakIterator.getSentenceInstance(locale); break; case "line": breakIterator = BreakIterator.getLineInstance(locale); break; default: throw new AssertionError(); } return breakIterator; }
Example #2
Source File: LocaleDisplayNamesImpl.java From fitnotifications with Apache License 2.0 | 6 votes |
private String adjustForUsageAndContext(CapitalizationContextUsage usage, String name) { if (name != null && name.length() > 0 && UCharacter.isLowerCase(name.codePointAt(0)) && (capitalization==DisplayContext.CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE || (capitalizationUsage != null && capitalizationUsage[usage.ordinal()]) )) { // Note, won't have capitalizationUsage != null && capitalizationUsage[usage.ordinal()] // unless capitalization is CAPITALIZATION_FOR_UI_LIST_OR_MENU or CAPITALIZATION_FOR_STANDALONE synchronized (this) { if (capitalizationBrkIter == null) { // should only happen when deserializing, etc. capitalizationBrkIter = BreakIterator.getSentenceInstance(locale); } return UCharacter.toTitleCase(locale, name, capitalizationBrkIter, UCharacter.TITLECASE_NO_LOWERCASE | UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT); } } return name; }
Example #3
Source File: SpellCheckIterator.java From Eclipse-Postfix-Code-Completion with Eclipse Public License 1.0 | 6 votes |
/**
 * Advances past the content starting at {@code begin} until the stop character
 * is found, then repositions the iterator state on that character.
 * If no stop character is found, the successor is set to
 * {@link BreakIterator#DONE}.
 *
 * @param begin the index at which scanning starts
 * @param stop the stop character, or {@code WHITE_SPACE_TOKEN} to stop on any
 *            whitespace character
 */
protected final void skipTokens(final int begin, final int stop) {
    final boolean whiteSpaceStops = stop == WHITE_SPACE_TOKEN;

    int index = begin;
    for (; index < fContent.length(); index++) {
        final char current = fContent.charAt(index);
        if (current == stop || (whiteSpaceStops && Character.isWhitespace(current))) {
            break;
        }
    }

    if (index >= fContent.length()) {
        // Ran off the end of the content without meeting the stop character.
        fSuccessor = BreakIterator.DONE;
        return;
    }

    fNext = index;
    fPredecessor = fNext;
    fSuccessor = fWordIterator.following(fNext);
}
Example #4
Source File: SimpleFilteredSentenceBreakIterator.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Given that the delegate has already given its "initial" answer, * find the NEXT actual (non-suppressed) break. * @param n initial position from delegate * @return new break position or BreakIterator.DONE */ private final int internalNext(int n) { if (n == BreakIterator.DONE || // at end or backwardsTrie == null) { // .. no backwards table loaded == no exceptions return n; } resetState(); final int textLen = text.getLength(); while (n != BreakIterator.DONE && n != textLen) { // outer loop runs once per underlying break (from fDelegate). // loops while 'n' points to an exception. if (breakExceptionAt(n)) { // n points to a break exception n = delegate.next(); } else { // no exception at this spot return n; } } return n; //hit underlying DONE or break at end of text }
Example #5
Source File: SimpleFilteredSentenceBreakIterator.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Given that the delegate has already given its "initial" answer, * find the PREV actual (non-suppressed) break. * @param n initial position from delegate * @return new break position or BreakIterator.DONE */ private final int internalPrev(int n) { if (n == 0 || n == BreakIterator.DONE || // at end or backwardsTrie == null) { // .. no backwards table loaded == no exceptions return n; } resetState(); while (n != BreakIterator.DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate). // loops while 'n' points to an exception. if (breakExceptionAt(n)) { // n points to a break exception n = delegate.previous(); } else { // no exception at this spot return n; } } return n; //hit underlying DONE or break at end of text }
Example #6
Source File: LineBreaker.java From ttt with BSD 2-Clause "Simplified" License | 6 votes |
/**
 * Returns the cached line break iterator, creating it on first use.
 *
 * <p>When a compiled rules resource is located for {@code name}, a rule based
 * break iterator is loaded from it; when none is located, a character break
 * iterator is used instead. If reading the rules fails, the failure is
 * reported and {@code null} is returned without caching anything.
 *
 * @param reporter receives informational messages about the load
 * @return the line break iterator, or {@code null} if the rules could not be read
 */
private LineBreakIterator maybeLoad(Reporter reporter) {
    LineBreakIterator iterator = this.iterator;
    if (iterator != null)
        return iterator;

    BreakIterator bi = null;
    InputStream is = null;
    try {
        URL rulesLocator = getRulesLocator(name, RULES_BINARY_EXT);
        if (rulesLocator != null) {
            is = rulesLocator.openStream();
            bi = RuleBasedBreakIterator.getInstanceFromCompiledRules(is);
            reporter.logInfo(reporter.message("*KEY*",
                "Loaded rules based break iterator from ''{0}''.", rulesLocator.toString()));
        } else
            bi = BreakIterator.getCharacterInstance();
    } catch (IOException e) {
        // Loading stays best-effort (the caller gets null), but the failure is
        // no longer swallowed silently.
        if (reporter != null) {
            reporter.logInfo(reporter.message("*KEY*",
                "Failed to load rules based break iterator: {0}.", e.toString()));
        }
    } finally {
        IOUtil.closeSafely(is);
    }

    if (bi != null) {
        return this.iterator = new LineBreakIterator(bi);
    } else
        return null;
}
Example #7
Source File: BreakIteratorWrapper.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Derives a rule status for the text between two break positions by scanning
 * its code points: the first digit found yields {@code WORD_NUMBER}, the first
 * letter found yields {@code WORD_LETTER}, and a span containing neither yields
 * {@code WORD_NONE}.
 *
 * @param current the break position that starts the span, relative to {@code start}
 * @param next    the break position that ends the span, relative to {@code start}
 * @return the rule status, or {@code WORD_NONE} if either position is
 *         {@link BreakIterator#DONE}
 */
private int calcStatus(int current, int next) {
    if (current == BreakIterator.DONE || next == BreakIterator.DONE) {
        return RuleBasedBreakIterator.WORD_NONE;
    }
    int begin = start + current;
    int end = start + next;
    int codepoint;
    for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
        // Read at the loop index: reading at 'begin' here would re-examine the
        // first code point on every iteration and never look at the rest of the span.
        codepoint = UTF16.charAt(text, 0, end, i);
        if (UCharacter.isDigit(codepoint)) {
            return RuleBasedBreakIterator.WORD_NUMBER;
        } else if (UCharacter.isLetter(codepoint)) {
            return RuleBasedBreakIterator.WORD_LETTER;
        }
    }
    return RuleBasedBreakIterator.WORD_NONE;
}
Example #8
Source File: CaseMapImpl.java From trekarta with GNU General Public License v3.0 | 6 votes |
public static String toTitle(int caseLocale, int options, BreakIterator iter, CharSequence src) { if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { if (src.length() == 0) { return src.toString(); } // Collect and apply only changes. // Good if no or few changes. Bad (slow) if many changes. Edits edits = new Edits(); StringBuilder replacementChars = toTitle( caseLocale, options | OMIT_UNCHANGED_TEXT, iter, src, new StringBuilder(), edits); return applyEdits(src, replacementChars, edits); } else { return toTitle(caseLocale, options, iter, src, new StringBuilder(src.length()), null).toString(); } }
Example #9
Source File: IcuTokenizer.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
private boolean incrementTokenBuffer() { int start = breaker.current(); if (start == BreakIterator.DONE) { throw new IllegalStateException(); } // find the next set of boundaries, skipping over non-tokens (rule status 0) int end = breaker.next(); while (end != BreakIterator.DONE && breaker.getRuleStatus() == 0) { start = end; end = breaker.next(); } if (end == BreakIterator.DONE) { return false; } termAtt.copyBuffer(buffer, start, end - start); offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end)); typeAtt.setType(config.getType(breaker.getScriptCode(), breaker.getRuleStatus())); scriptAtt.setCode(breaker.getScriptCode()); return true; }
Example #10
Source File: CaseMapImpl.java From trekarta with GNU General Public License v3.0 | 6 votes |
/**
 * Resolves the break iterator to use for title-casing.
 *
 * <p>An explicit iterator is returned as is; otherwise one is created
 * according to the iterator-selection bits of {@code options}.
 *
 * @param locale  the locale for a newly created word or sentence iterator
 * @param options title-casing options; only the bits in
 *                {@code TITLECASE_ITERATOR_MASK} are examined
 * @param iter    an explicit iterator, or {@code null} to create one
 * @return the iterator to use
 * @throws IllegalArgumentException if an iterator option is combined with an
 *         explicit iterator, or the iterator option is not recognized
 */
public static BreakIterator getTitleBreakIterator(
        ULocale locale, int options, BreakIterator iter) {
    final int iteratorOption = options & TITLECASE_ITERATOR_MASK;
    if (iter != null) {
        if (iteratorOption != 0) {
            throw new IllegalArgumentException(
                    "titlecasing iterator option together with an explicit iterator");
        }
        return iter;
    }
    switch (iteratorOption) {
    case 0:
        return BreakIterator.getWordInstance(locale);
    case TITLECASE_WHOLE_STRING:
        return new WholeStringBreakIterator();
    case TITLECASE_SENTENCES:
        return BreakIterator.getSentenceInstance(locale);
    default:
        throw new IllegalArgumentException("unknown titlecasing iterator option");
    }
}
Example #11
Source File: CaseMapImpl.java From trekarta with GNU General Public License v3.0 | 6 votes |
/**
 * Resolves the break iterator to use for title-casing, for a
 * {@code Locale}-typed locale.
 *
 * <p>An explicit iterator is returned as is; otherwise one is created
 * according to the iterator-selection bits of {@code options}.
 *
 * @param locale  the locale for a newly created word or sentence iterator
 * @param options title-casing options; only the bits in
 *                {@code TITLECASE_ITERATOR_MASK} are examined
 * @param iter    an explicit iterator, or {@code null} to create one
 * @return the iterator to use
 * @throws IllegalArgumentException if an iterator option is combined with an
 *         explicit iterator, or the iterator option is not recognized
 */
public static BreakIterator getTitleBreakIterator(
        Locale locale, int options, BreakIterator iter) {
    final int iteratorOption = options & TITLECASE_ITERATOR_MASK;
    final boolean explicitIterator = iter != null;
    if (explicitIterator && iteratorOption != 0) {
        throw new IllegalArgumentException(
                "titlecasing iterator option together with an explicit iterator");
    }
    if (explicitIterator) {
        return iter;
    }
    switch (iteratorOption) {
    case 0:
        return BreakIterator.getWordInstance(locale);
    case TITLECASE_WHOLE_STRING:
        return new WholeStringBreakIterator();
    case TITLECASE_SENTENCES:
        return BreakIterator.getSentenceInstance(locale);
    default:
        throw new IllegalArgumentException("unknown titlecasing iterator option");
    }
}
Example #12
Source File: BreakIteratorWrapper.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Wraps a break iterator in the matching wrapper type. A
 * {@code RuleBasedBreakIterator} exposes a rule status that can be used for
 * the token type; for any other {@code BreakIterator} the rule status method
 * is not available, so it is treated like a generic break iterator.
 *
 * @param breakIterator the iterator to wrap
 * @return an {@code RBBIWrapper} for rule based iterators, otherwise a
 *         {@code BIWrapper}
 */
static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
    if (!(breakIterator instanceof RuleBasedBreakIterator)) {
        return new BIWrapper(breakIterator);
    }
    return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
}
Example #13
Source File: JavaEditor.java From Eclipse-Postfix-Code-Completion with Eclipse Public License 1.0 | 5 votes |
@Override public void run() { // Check whether we are in a java code partition and the preference is enabled final IPreferenceStore store= getPreferenceStore(); if (!store.getBoolean(PreferenceConstants.EDITOR_SUB_WORD_NAVIGATION)) { super.run(); return; } final ISourceViewer viewer= getSourceViewer(); final IDocument document= viewer.getDocument(); try { fIterator.setText((CharacterIterator)new DocumentCharacterIterator(document)); int position= widgetOffset2ModelOffset(viewer, viewer.getTextWidget().getCaretOffset()); if (position == -1) return; int previous= findPreviousPosition(position); if (isBlockSelectionModeEnabled() && document.getLineOfOffset(previous) != document.getLineOfOffset(position)) { super.run(); // may navigate into virtual white space } else if (previous != BreakIterator.DONE) { setCaretPosition(previous); getTextWidget().showSelection(); fireSelectionChanged(); } } catch (BadLocationException x) { // ignore - getLineOfOffset failed } }
Example #14
Source File: GlobalizationPreferences.java From fitnotifications with Apache License 2.0 | 5 votes |
/** * Get a copy of the break iterator for the specified type according to the * settings. * * @param type break type - BI_CHARACTER or BI_WORD, BI_LINE, BI_SENTENCE, BI_TITLE * @return break iterator explicit or implicit * @draft ICU 3.6 * @provisional This API might change or be removed in a future release. */ public BreakIterator getBreakIterator(int type) { if (type < BI_CHARACTER || type >= BI_LIMIT) { throw new IllegalArgumentException("Illegal break iterator type"); } if (breakIterators == null || breakIterators[type] == null) { return guessBreakIterator(type); } return (BreakIterator) breakIterators[type].clone(); // clone for safety }
Example #15
Source File: SpellCheckIterator.java From Eclipse-Postfix-Code-Completion with Eclipse Public License 1.0 | 5 votes |
/**
 * Creates a new spell check iterator.
 *
 * <p>The region's text is read from the document; if it cannot be read, or if
 * it starts with {@code NLSElement.TAG_PREFIX}, empty content is used. All
 * sentence break offsets of the content are recorded up front.
 *
 * @param document the document containing the specified partition
 * @param region the region to spell check
 * @param locale the locale to use for spell checking
 * @param breakIterator the break-iterator
 */
public SpellCheckIterator(IDocument document, IRegion region, Locale locale, BreakIterator breakIterator) {
    fOffset= region.getOffset();
    fWordIterator= breakIterator;
    fDelimiter= TextUtilities.getDefaultLineDelimiter(document);

    String content;
    try {
        content= document.get(region.getOffset(), region.getLength());
        if (content.startsWith(NLSElement.TAG_PREFIX))
            content= ""; //$NON-NLS-1$
    } catch (Exception exception) {
        content= ""; //$NON-NLS-1$
    }
    fContent= content;

    fWordIterator.setText(content);
    fPredecessor= fWordIterator.first();
    fSuccessor= fWordIterator.next();

    final BreakIterator iterator= BreakIterator.getSentenceInstance(locale);
    iterator.setText(content);

    int offset= iterator.current();
    while (offset != BreakIterator.DONE) {
        // Integer.valueOf replaces the deprecated boxing constructor.
        fSentenceBreaks.add(Integer.valueOf(offset));
        offset= iterator.next();
    }
}
Example #16
Source File: DefaultIcuTokenizerConfig.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Returns a fresh clone of the break iterator configured for the given script.
 *
 * @param script a {@code UScript} code
 * @return a clone of the Myanmar, CJK or default iterator, depending on
 *         {@code script} and the {@code myanmarAsWords} setting
 */
@Override
public BreakIterator getBreakIterator(int script) {
    final BreakIterator prototype;
    if (script == UScript.MYANMAR) {
        prototype = myanmarAsWords ? defaultBreakIterator : myanmarSyllableIterator;
    } else if (script == UScript.JAPANESE) {
        prototype = cjkBreakIterator;
    } else {
        prototype = defaultBreakIterator;
    }
    return (BreakIterator) prototype.clone();
}
Example #17
Source File: JavaEditor.java From Eclipse-Postfix-Code-Completion with Eclipse Public License 1.0 | 5 votes |
/** * Finds the previous position before the given position. * * @param position the current position * @return the previous position */ protected int findPreviousPosition(int position) { ISourceViewer viewer= getSourceViewer(); int widget= -1; int previous= position; while (previous != BreakIterator.DONE && widget == -1) { // XXX: optimize previous= fIterator.preceding(previous); if (previous != BreakIterator.DONE) widget= modelOffset2WidgetOffset(viewer, previous); } IDocument document= viewer.getDocument(); LinkedModeModel model= LinkedModeModel.getModel(document, position); if (model != null && previous != BreakIterator.DONE) { LinkedPosition linkedPosition= model.findPosition(new LinkedPosition(document, position, 0)); if (linkedPosition != null) { int linkedPositionOffset= linkedPosition.getOffset(); if (position != linkedPositionOffset && previous < linkedPositionOffset) previous= linkedPositionOffset; } else { LinkedPosition previousLinkedPosition= model.findPosition(new LinkedPosition(document, previous, 0)); if (previousLinkedPosition != null) { int previousLinkedPositionEnd= previousLinkedPosition.getOffset() + previousLinkedPosition.getLength(); if (position != previousLinkedPositionEnd && previous < previousLinkedPositionEnd) previous= previousLinkedPositionEnd; } } } return previous; }
Example #18
Source File: WordRecognizerWrapper.java From birt with Eclipse Public License 1.0 | 5 votes |
/**
 * Advances to the next break and returns the word between the previous break
 * and the new one.
 *
 * @return the next word, or {@code null} when the break iterator is exhausted
 */
public Word getNextWord( )
{
	start = end;
	end = breakIterator.next( );
	return ( end == BreakIterator.DONE ) ? null : new Word( text, start, end );
}
Example #19
Source File: ICUWordRecognizer.java From birt with Eclipse Public License 1.0 | 5 votes |
public ICUWordRecognizer( String text, Locale locale ) { if(locale!=null) { wordBreaker = BreakIterator.getWordInstance(locale); } else { wordBreaker = BreakIterator.getWordInstance(Locale.getDefault( )); } this.text = text; wordBreaker.setText( text); }
Example #20
Source File: ICUWordRecognizer.java From birt with Eclipse Public License 1.0 | 5 votes |
/**
 * Returns the word between the word breaker's current position and its next
 * break.
 *
 * @return the next word, or {@code null} when the word breaker is exhausted
 */
public Word getNextWord( )
{
	final int wordStart = wordBreaker.current( );
	end = wordBreaker.next( );
	if ( end == BreakIterator.DONE )
	{
		return null;
	}
	return new Word( text, wordStart, end );
}
Example #21
Source File: SegmenterObject.java From es6draft with MIT License | 5 votes |
/**
 * Returns a clone of the ICU {@link BreakIterator} instance, creating and
 * caching the underlying instance on first use.
 *
 * @return the BreakIterator instance
 */
public BreakIterator getBreakIterator() {
    BreakIterator cached = breakIterator;
    if (cached == null) {
        cached = createBreakIterator();
        breakIterator = cached;
    }
    return (BreakIterator) cached.clone();
}
Example #22
Source File: TitleCaseConverter.java From tutorials with MIT License | 5 votes |
/**
 * Converts {@code text} to title case using ICU4J's title break iterator.
 *
 * @param text the text to convert; may be {@code null} or empty
 * @return the title-cased text, or {@code text} itself when it is
 *         {@code null} or empty
 */
public static String convertToTitleCaseIcu4j(String text) {
    final boolean nothingToConvert = text == null || text.isEmpty();
    return nothingToConvert
            ? text
            : UCharacter.toTitleCase(text, BreakIterator.getTitleInstance());
}
Example #23
Source File: JavaEditor.java From Eclipse-Postfix-Code-Completion with Eclipse Public License 1.0 | 5 votes |
@Override public void run() { // Check whether we are in a java code partition and the preference is enabled final IPreferenceStore store= getPreferenceStore(); if (!store.getBoolean(PreferenceConstants.EDITOR_SUB_WORD_NAVIGATION)) { super.run(); return; } final ISourceViewer viewer= getSourceViewer(); final IDocument document= viewer.getDocument(); try { fIterator.setText((CharacterIterator)new DocumentCharacterIterator(document)); int position= widgetOffset2ModelOffset(viewer, viewer.getTextWidget().getCaretOffset()); if (position == -1) return; int next= findNextPosition(position); if (isBlockSelectionModeEnabled() && document.getLineOfOffset(next) != document.getLineOfOffset(position)) { super.run(); // may navigate into virtual white space } else if (next != BreakIterator.DONE) { setCaretPosition(next); getTextWidget().showSelection(); fireSelectionChanged(); } } catch (BadLocationException x) { // ignore } }
Example #24
Source File: GlobalizationPreferences.java From fitnotifications with Apache License 2.0 | 5 votes |
/** * Explicitly set the break iterator for this object. * * @param type break type - BI_CHARACTER or BI_WORD, BI_LINE, BI_SENTENCE, BI_TITLE * @param iterator a break iterator * @return this, for chaining * @draft ICU 3.6 * @provisional This API might change or be removed in a future release. */ public GlobalizationPreferences setBreakIterator(int type, BreakIterator iterator) { if (type < BI_CHARACTER || type >= BI_LIMIT) { throw new IllegalArgumentException("Illegal break iterator type"); } if (isFrozen()) { throw new UnsupportedOperationException("Attempt to modify immutable object"); } if (breakIterators == null) breakIterators = new BreakIterator[BI_LIMIT]; breakIterators[type] = (BreakIterator) iterator.clone(); // clone for safety return this; }
Example #25
Source File: GlobalizationPreferences.java From fitnotifications with Apache License 2.0 | 5 votes |
/**
 * This function can be overridden by subclasses to use different heuristics.
 * <b>It MUST return a 'safe' value,
 * one whose modification will not affect this object.</b>
 *
 * <p>This implementation creates a new iterator of the requested type for the
 * available break-iterator locale, falling back to {@code ULocale.ROOT} when
 * none is available.
 *
 * @param type break type - one of BI_CHARACTER, BI_TITLE, BI_WORD, BI_LINE, BI_SENTENCE
 * @return a break iterator of the requested type
 * @throws IllegalArgumentException if {@code type} is not a known break type
 * @draft ICU 3.6
 * @provisional This API might change or be removed in a future release.
 */
protected BreakIterator guessBreakIterator(int type) {
    final ULocale available = getAvailableLocale(TYPE_BREAKITERATOR);
    final ULocale brkLocale = (available != null) ? available : ULocale.ROOT;
    switch (type) {
    case BI_CHARACTER:
        return BreakIterator.getCharacterInstance(brkLocale);
    case BI_TITLE:
        return BreakIterator.getTitleInstance(brkLocale);
    case BI_WORD:
        return BreakIterator.getWordInstance(brkLocale);
    case BI_LINE:
        return BreakIterator.getLineInstance(brkLocale);
    case BI_SENTENCE:
        return BreakIterator.getSentenceInstance(brkLocale);
    default:
        throw new IllegalArgumentException("Unknown break iterator type");
    }
}
Example #26
Source File: RelativeDateFormat.java From fitnotifications with Apache License 2.0 | 5 votes |
/**
 * Sets the display context, lazily initializing the capitalization context
 * info and the sentence break iterator when the new context needs them.
 *
 * @param context the display context to set
 */
@Override
public void setContext(DisplayContext context) {
    super.setContext(context);

    final boolean forListOrMenu = context == DisplayContext.CAPITALIZATION_FOR_UI_LIST_OR_MENU;
    final boolean forStandalone = context == DisplayContext.CAPITALIZATION_FOR_STANDALONE;

    if (!capitalizationInfoIsSet && (forListOrMenu || forStandalone)) {
        initCapitalizationContextInfo(fLocale);
        capitalizationInfoIsSet = true;
    }

    if (capitalizationBrkIter != null) {
        return;
    }
    // Evaluated after the initialization above, so the per-usage flags are current.
    final boolean needsSentenceIterator =
            context == DisplayContext.CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE
            || (forListOrMenu && capitalizationOfRelativeUnitsForListOrMenu)
            || (forStandalone && capitalizationOfRelativeUnitsForStandAlone);
    if (needsSentenceIterator) {
        capitalizationBrkIter = BreakIterator.getSentenceInstance(fLocale);
    }
}
Example #27
Source File: BreakIteratorWrapper.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Returns current rule status for the text between breaks. (determines token type) */ private int calcStatus(int current, int next) { // to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing. // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i= if (next != BreakIterator.DONE && isEmoji(current, next)) { return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS; } else { return rbbi.getRuleStatus(); } }
Example #28
Source File: ICUTokenizerFactory.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Reads a break iterator rules file and compiles it into a
 * {@code RuleBasedBreakIterator}.
 *
 * <p>The resource is decoded as UTF-8. Lines starting with {@code #} are
 * replaced by empty lines, so the line count of the rules text is preserved.
 *
 * @param filename the name of the rules resource
 * @param loader   the loader used to open the resource
 * @return a break iterator built from the rules
 * @throws IOException if the resource cannot be opened or read
 */
private BreakIterator parseRules(String filename, ResourceLoader loader) throws IOException {
    StringBuilder rules = new StringBuilder();
    // try-with-resources closes the reader and the stream even when reading
    // fails part-way through.
    try (InputStream rulesStream = loader.openResource(filename);
         BufferedReader reader = new BufferedReader(
                 IOUtils.getDecodingReader(rulesStream, StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            if (!line.startsWith("#")) {
                rules.append(line);
            }
            rules.append('\n');
        }
    }
    return new RuleBasedBreakIterator(rules.toString());
}
Example #29
Source File: JavaEditor.java From Eclipse-Postfix-Code-Completion with Eclipse Public License 1.0 | 5 votes |
/** * Finds the next position after the given position. * * @param position the current position * @return the next position */ protected int findNextPosition(int position) { ISourceViewer viewer= getSourceViewer(); int widget= -1; int next= position; while (next != BreakIterator.DONE && widget == -1) { // XXX: optimize next= fIterator.following(next); if (next != BreakIterator.DONE) widget= modelOffset2WidgetOffset(viewer, next); } IDocument document= viewer.getDocument(); LinkedModeModel model= LinkedModeModel.getModel(document, position); if (model != null && next != BreakIterator.DONE) { LinkedPosition linkedPosition= model.findPosition(new LinkedPosition(document, position, 0)); if (linkedPosition != null) { int linkedPositionEnd= linkedPosition.getOffset() + linkedPosition.getLength(); if (position != linkedPositionEnd && linkedPositionEnd < next) next= linkedPositionEnd; } else { LinkedPosition nextLinkedPosition= model.findPosition(new LinkedPosition(document, next, 0)); if (nextLinkedPosition != null) { int nextLinkedPositionOffset= nextLinkedPosition.getOffset(); if (position != nextLinkedPositionOffset && nextLinkedPositionOffset < next) next= nextLinkedPositionOffset; } } } return next; }
Example #30
Source File: RenamingNameSuggestor.java From Eclipse-Postfix-Code-Completion with Eclipse Public License 1.0 | 5 votes |
/**
 * Splits the type name at the breaks reported by a {@code JavaWordIterator}
 * and returns the pieces in order, for example:
 *
 * "JavaElementName" => { "Java", "Element", "Name" }
 *
 * "ASTNode" => { "AST", "Node" }
 *
 * @param typeName the type name to split
 * @return the pieces of the type name, in order of appearance
 */
private String[] getSuffixes(String typeName) {
    final JavaWordIterator words= new JavaWordIterator();
    words.setText(typeName);

    final List<String> parts= new ArrayList<String>();
    int from= 0;
    for (int to= words.next(); to != BreakIterator.DONE; to= words.next()) {
        parts.add(typeName.substring(from, to));
        from= to;
    }
    return parts.toArray(new String[0]);
}