Java Code Examples for java.text.BreakIterator#next()
The following examples show how to use
java.text.BreakIterator#next().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TextComponent.java From openjdk-jdk8u with GNU General Public License v2.0 | 6 votes |
/** * Needed to unify forward and backward searching. * The method assumes that s is the text assigned to words. */ private int findWordLimit(int index, BreakIterator words, boolean direction, String s) { // Fix for 4256660 and 4256661. // Words iterator is different from character and sentence iterators // in that end of one word is not necessarily start of another word. // Please see java.text.BreakIterator JavaDoc. The code below is // based on nextWordStartAfter example from BreakIterator.java. int last = (direction == NEXT) ? words.following(index) : words.preceding(index); int current = (direction == NEXT) ? words.next() : words.previous(); while (current != BreakIterator.DONE) { for (int p = Math.min(last, current); p < Math.max(last, current); p++) { if (Character.isLetter(s.charAt(p))) { return last; } } last = current; current = (direction == NEXT) ? words.next() : words.previous(); } return BreakIterator.DONE; }
Example 2
Source File: TextComponent.java From JDKSourceCode1.8 with MIT License | 6 votes |
/** * Needed to unify forward and backward searching. * The method assumes that s is the text assigned to words. */ private int findWordLimit(int index, BreakIterator words, boolean direction, String s) { // Fix for 4256660 and 4256661. // Words iterator is different from character and sentence iterators // in that end of one word is not necessarily start of another word. // Please see java.text.BreakIterator JavaDoc. The code below is // based on nextWordStartAfter example from BreakIterator.java. int last = (direction == NEXT) ? words.following(index) : words.preceding(index); int current = (direction == NEXT) ? words.next() : words.previous(); while (current != BreakIterator.DONE) { for (int p = Math.min(last, current); p < Math.max(last, current); p++) { if (Character.isLetter(s.charAt(p))) { return last; } } last = current; current = (direction == NEXT) ? words.next() : words.previous(); } return BreakIterator.DONE; }
Example 3
Source File: TextComponent.java From openjdk-jdk9 with GNU General Public License v2.0 | 6 votes |
/** * Needed to unify forward and backward searching. * The method assumes that s is the text assigned to words. */ private int findWordLimit(int index, BreakIterator words, boolean direction, String s) { // Fix for 4256660 and 4256661. // Words iterator is different from character and sentence iterators // in that end of one word is not necessarily start of another word. // Please see java.text.BreakIterator JavaDoc. The code below is // based on nextWordStartAfter example from BreakIterator.java. int last = (direction == NEXT) ? words.following(index) : words.preceding(index); int current = (direction == NEXT) ? words.next() : words.previous(); while (current != BreakIterator.DONE) { for (int p = Math.min(last, current); p < Math.max(last, current); p++) { if (Character.isLetter(s.charAt(p))) { return last; } } last = current; current = (direction == NEXT) ? words.next() : words.previous(); } return BreakIterator.DONE; }
Example 4
Source File: SimpleTokenAndSentenceAnnotator.java From uima-uimaj with Apache License 2.0 | 6 votes |
void makeAnnotations(Maker m, BreakIterator b) { b.setText(input); for (int end = b.next(), start = b.first(); end != BreakIterator.DONE; start = end, end = b .next()) { // eliminate all-whitespace tokens boolean isWhitespace = true; for (int i = start; i < end; i++) { if (!Character.isWhitespace(input.charAt(i))) { isWhitespace = false; break; } } if (!isWhitespace) { m.newAnnotation(jcas, start, end).addToIndexes(); } } }
Example 5
Source File: BreakIteratorTest.java From TencentKona-8 with GNU General Public License v2.0 | 5 votes |
/**
 * Bug 4068137: checks the word boundaries reported for the text "boo.",
 * including the boundary at the very end of the string.
 */
public void TestEndBehavior() {
    final String text = "boo.";
    BreakIterator words = BreakIterator.getWordInstance();
    words.setText(text);

    int start = words.first();
    if (start != 0) {
        errln("Didn't get break at beginning of string.");
    }

    int beforePeriod = words.next();
    if (beforePeriod != 3) {
        errln("Didn't get break before period in \"boo.\"");
    }

    // Only advance again when the iterator is not already sitting at the end.
    boolean reachedEnd = words.current() == 4 || words.next() == 4;
    if (!reachedEnd) {
        errln("Didn't get break at end of string.");
    }
}
Example 6
Source File: TestSplittingBreakIterator.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * Returns a string comprised of spaces and '^' only at the boundaries. */ private String readBoundariesToString(BreakIterator bi, String text) { // init markers to spaces StringBuilder markers = new StringBuilder(); markers.setLength(text.length() + 1); for (int k = 0; k < markers.length(); k++) { markers.setCharAt(k, ' '); } bi.setText(text); for (int boundary = bi.current(); boundary != BreakIterator.DONE; boundary = bi.next()) { markers.setCharAt(boundary, '^'); } return markers.toString(); }
Example 7
Source File: SexpBaseForwardHandler.java From e4macs with Eclipse Public License 1.0 | 5 votes |
/**
 * Advances the iterator by one boundary and, unless the iterator is exhausted,
 * passes the new position through {@code checkDot} and then {@code checkUnder}.
 *
 * @see com.mulgasoft.emacsplus.commands.SexpHandler#getNextPosition(org.eclipse.jface.text.IDocument, java.text.BreakIterator)
 */
@Override
protected int getNextPosition(IDocument document, BreakIterator iter) {
    final int pos = iter.current();
    final int next = iter.next();
    if (next == BreakIterator.DONE) {
        // Nothing further to move to; hand the sentinel back unchanged.
        return next;
    }
    int adjusted = checkDot(document, pos, next);
    return checkUnder(document, adjusted);
}
Example 8
Source File: DictionaryResource.java From newsleak with GNU Affero General Public License v3.0 | 5 votes |
/** * Checks if a String is a multi word unit. * * @param t * the t * @return true, if is multi word */ private boolean isMultiWord(String t) { BreakIterator tokenBreaker = BreakIterator.getWordInstance(locale); tokenBreaker.setText(t); // count tokens int pos = tokenBreaker.first(); int nTokens = 0; while (pos != BreakIterator.DONE) { nTokens++; pos = tokenBreaker.next(); } nTokens = nTokens / 2; return nTokens > 1; }
Example 9
Source File: MirroredBreakIterator.java From jdk8u-jdk with GNU General Public License v2.0 | 5 votes |
/**
 * Snapshots every boundary of the given iterator, from {@code first()} until
 * it reports {@code DONE}, into an unmodifiable list and records the first
 * boundary as the current index.
 *
 * @param bi the iterator whose boundaries are copied; it is advanced to its end
 */
MirroredBreakIterator(BreakIterator bi) {
    final int firstBoundary = bi.first();
    charIndex = firstBoundary;

    List<Integer> collected = new ArrayList<Integer>();
    int pos = firstBoundary;
    while (pos != DONE) {
        collected.add(pos);
        pos = bi.next();
    }
    boundaries = Collections.unmodifiableList(collected);
}
Example 10
Source File: MirroredBreakIterator.java From jdk8u-jdk with GNU General Public License v2.0 | 5 votes |
/**
 * Copies all boundaries reported by {@code bi} (starting at {@code first()})
 * into an unmodifiable list; the first boundary also becomes the initial
 * value of {@code charIndex}.
 *
 * @param bi the iterator to mirror; it is walked until it returns {@code DONE}
 */
MirroredBreakIterator(BreakIterator bi) {
    List<Integer> seen = new ArrayList<Integer>();

    int boundary = bi.first();
    charIndex = boundary;
    while (boundary != DONE) {
        seen.add(Integer.valueOf(boundary));
        boundary = bi.next();
    }

    boundaries = Collections.unmodifiableList(seen);
}
Example 11
Source File: NavigationActions.java From RichTextFX with BSD 2-Clause "Simplified" License | 5 votes |
/**
 * Skips n number of word boundaries forward.
 * <p>
 * Does nothing when {@code getLength()} is zero. Otherwise the word iterator
 * is positioned on the first boundary following the caret, advanced
 * {@code n - 1} further times, and the caret is moved to the iterator's
 * current position.
 *
 * @param n               number of word boundaries to skip
 * @param selectionPolicy passed through to {@code moveTo}
 */
default void wordBreaksForwards(int n, SelectionPolicy selectionPolicy) {
    if (getLength() != 0) {
        BreakIterator breaks = BreakIterator.getWordInstance();
        breaks.setText(getText());

        // The first boundary comes from following(); the rest from next().
        breaks.following(getCaretPosition());
        int step = 1;
        while (step < n) {
            breaks.next();
            step++;
        }

        moveTo(breaks.current(), selectionPolicy);
    }
}
Example 12
Source File: MtasDocumentIndex.java From inception with Apache License 2.0 | 5 votes |
private String preprocessQuery(String aQuery) { String result; if (!(aQuery.contains("\"") || aQuery.contains("[") || aQuery.contains("]") || aQuery.contains("{") || aQuery.contains("}") || aQuery.contains("<") || aQuery.contains(">"))) { // Convert raw words query to a Mtas CQP query result = ""; BreakIterator words = BreakIterator.getWordInstance(); words.setText(aQuery); int start = words.first(); int end = words.next(); while (end != BreakIterator.DONE) { String word = aQuery.substring(start, end); if (!word.trim().isEmpty()) { // Add the word to the query result += "\"" + word + "\""; } start = end; end = words.next(); if (end != BreakIterator.DONE) { result += " "; } } } else { result = aQuery; } return result; }
Example 13
Source File: DatePicker.java From nebula with Eclipse Public License 2.0 | 5 votes |
/** * set / update the text of the displayLabels. these are the Week column * headers above the days on the Calendar part of the <code>CDateTime</code> * . */ private void updateDaysOfWeek() { if (dayPanel != null) { Calendar tmpcal = cdt.getCalendarInstance(); tmpcal.set(Calendar.DAY_OF_WEEK, tmpcal.getFirstDayOfWeek()); Locale locale = cdt.getLocale(); boolean ltr = ComponentOrientation.getOrientation(locale) .isLeftToRight() && !locale.getLanguage().equals("zh"); //$NON-NLS-1$ BreakIterator iterator = BreakIterator.getCharacterInstance(locale); for (VLabel dayLabel : dayLabels) { String str = getFormattedDate("E", tmpcal.getTime()); //$NON-NLS-1$ if (dayLabel.getData(CDT.Key.Compact, Boolean.class)) { iterator.setText(str); int start, end; if (ltr) { start = iterator.first(); end = iterator.next(); } else { end = iterator.last(); start = iterator.previous(); } dayLabel.setText(str.substring(start, end)); } else { dayLabel.setText(str); } tmpcal.add(Calendar.DAY_OF_WEEK, 1); } } }
Example 14
Source File: BaseUtilities.java From netbeans with Apache License 2.0 | 4 votes |
/** Wrap multi-line strings (and get the individual lines). * @param original the original string to wrap * @param width the maximum width of lines * @param breakIterator breaks original to chars, words, sentences, depending on what instance you provide. * @param removeNewLines if <code>true</code>, any newlines in the original string are ignored * @return the lines after wrapping */ public static String[] wrapStringToArray( String original, int width, BreakIterator breakIterator, boolean removeNewLines ) { if (original.length() == 0) { return new String[] { original }; } String[] workingSet; // substitute original newlines with spaces, // remove newlines from head and tail if (removeNewLines) { original = trimString(original); original = original.replace('\n', ' '); workingSet = new String[] { original }; } else { StringTokenizer tokens = new StringTokenizer(original, "\n"); // NOI18N int len = tokens.countTokens(); workingSet = new String[len]; for (int i = 0; i < len; i++) { workingSet[i] = tokens.nextToken(); } } if (width < 1) { width = 1; } if (original.length() <= width) { return workingSet; } widthcheck: { boolean ok = true; for (int i = 0; i < workingSet.length; i++) { ok = ok && (workingSet[i].length() < width); if (!ok) { break widthcheck; } } return workingSet; } java.util.ArrayList<String> lines = new java.util.ArrayList<String>(); int lineStart = 0; // the position of start of currently processed line in the original string for (int i = 0; i < workingSet.length; i++) { if (workingSet[i].length() < width) { lines.add(workingSet[i]); } else { breakIterator.setText(workingSet[i]); int nextStart = breakIterator.next(); int prevStart = 0; do { while (((nextStart - lineStart) < width) && (nextStart != BreakIterator.DONE)) { prevStart = nextStart; nextStart = breakIterator.next(); } if (nextStart == BreakIterator.DONE) { nextStart = prevStart = workingSet[i].length(); } if (prevStart == 0) { prevStart = nextStart; } 
lines.add(workingSet[i].substring(lineStart, prevStart)); lineStart = prevStart; prevStart = 0; } while (lineStart < workingSet[i].length()); lineStart = 0; } } String[] s = new String[lines.size()]; return lines.toArray(s); }
Example 15
Source File: PlainText.java From gcs with Mozilla Public License 2.0 | 4 votes |
/** * Break the paragraph into individual lines. * * @param font the font used for rendering the text. * @param fontSize the fontSize used for rendering the text. * @param width the width of the box holding the content. * @return the individual lines. * @throws IOException */ List<Line> getLines(PDFont font, float fontSize, float width) throws IOException { BreakIterator iterator = BreakIterator.getLineInstance(); iterator.setText(textContent); final float scale = fontSize/FONTSCALE; int start = iterator.first(); int end = iterator.next(); float lineWidth = 0; List<Line> textLines = new ArrayList<Line>(); Line textLine = new Line(); while (end != BreakIterator.DONE) { String word = textContent.substring(start,end); float wordWidth = font.getStringWidth(word) * scale; lineWidth = lineWidth + wordWidth; // check if the last word would fit without the whitespace ending it if (lineWidth >= width && Character.isWhitespace(word.charAt(word.length()-1))) { float whitespaceWidth = font.getStringWidth(word.substring(word.length()-1)) * scale; lineWidth = lineWidth - whitespaceWidth; } if (lineWidth >= width) { textLine.setWidth(textLine.calculateWidth(font, fontSize)); textLines.add(textLine); textLine = new Line(); lineWidth = font.getStringWidth(word) * scale; } AttributedString as = new AttributedString(word); as.addAttribute(TextAttribute.WIDTH, wordWidth); Word wordInstance = new Word(word); wordInstance.setAttributes(as); textLine.addWord(wordInstance); start = end; end = iterator.next(); } textLine.setWidth(textLine.calculateWidth(font, fontSize)); textLines.add(textLine); return textLines; }
Example 16
Source File: BreakIteratorTest.java From TencentKona-8 with GNU General Public License v2.0 | 4 votes |
/**
 * Bug 4638433: checks word and line boundaries for several character
 * sequences (Latin Extended-B, a low double prime quotation mark, Mongolian,
 * Khmer, Ogham and Thai). Each check positions the iterator with
 * {@code first()} and compares the boundaries returned by {@code next()}
 * with the expected offsets.
 */
public void TestLineBreakBasedOnUnicode3_0_0() {
    /* Latin Extend-B characters
     * 0x0218-0x0233 which have been added since Unicode 3.0.0.
     */
    BreakIterator iter = BreakIterator.getWordInstance(Locale.US);
    iter.setText("\u0216\u0217\u0218\u0219\u021A");
    iter.first();
    int i = iter.next();
    if (i != 5) {
        errln("Word break failure: failed to stop at 5 and bounded at " + i);
    }

    iter = BreakIterator.getLineInstance(Locale.US);

    /* <Three(Nd)><Two(Nd)><Low Double Prime Quotation Mark(Pe)><One(Nd)>
     * U+301F has changed its category from Ps to Pe since Unicode 2.1.
     */
    iter.setText("32\u301f1");
    iter.first();
    i = iter.next();
    if (i != 3) {
        errln("Line break failure: failed to skip before \\u301F(Pe) at 3 and bounded at " + i);
    }

    /* Mongolian <Letter A(Lo)><Todo Soft Hyphen(Pd)><Letter E(Lo)>
     * which have been added since Unicode 3.0.0.
     */
    iter.setText("\u1820\u1806\u1821");
    iter.first();
    i = iter.next();
    if (i != 2) {
        errln("Mongolian line break failure: failed to skip position before \\u1806(Pd) at 2 and bounded at " + i);
    }

    /* Khmer <ZERO(Nd)><Currency Symbol(Sc)><ONE(Nd)> which have
     * been added since Unicode 3.0.0.
     */
    iter.setText("\u17E0\u17DB\u17E1");
    iter.first();
    i = iter.next();
    if (i != 1) {
        errln("Khmer line break failure: failed to stop before \\u17DB(Sc) at 1 and bounded at " + i);
    }
    i = iter.next();
    if (i != 3) {
        errln("Khmer line break failure: failed to skip position after \\u17DB(Sc) at 3 and bounded at " + i);
    }

    /* Ogham <Letter UR(Lo)><Space Mark(Zs)><Letter OR(Lo)> which have
     * been added since Unicode 3.0.0.
     */
    iter.setText("\u1692\u1680\u1696");
    iter.first();
    i = iter.next();
    if (i != 2) {
        errln("Ogham line break failure: failed to skip postion before \\u1680(Zs) at 2 and bounded at " + i);
    }

    // Confirm changes in BreakIteratorRules_th.java have been reflected.
    iter = BreakIterator.getLineInstance(new Locale("th", ""));

    /* Thai <Seven(Nd)>
     *      <Left Double Quotation Mark(Pi)>
     *      <Five(Nd)>
     *      <Right Double Quotation Mark(Pf)>
     *      <Three(Nd)>
     */
    iter.setText("\u0E57\u201C\u0E55\u201D\u0E53");
    iter.first();
    i = iter.next();
    if (i != 1) {
        errln("Thai line break failure: failed to stop before \\u201C(Pi) at 1 and bounded at " + i);
    }
    i = iter.next();
    if (i != 4) {
        errln("Thai line break failure: failed to stop after \\u201D(Pf) at 4 and bounded at " + i);
    }
}
Example 17
Source File: Utils.java From moa with GNU General Public License v3.0 | 4 votes |
/**
 * Breaks up the string, if wider than "columns" characters.
 * <p>
 * The input is first split at newlines; each piece is then assembled word by
 * word (using a word {@link BreakIterator}). Once the line being assembled
 * has reached at least {@code columns} characters it is emitted before the
 * next word is added; a single punctuation or space character is still
 * attached to the line being emitted. Empty lines are not emitted.
 *
 * @param s		the string to process
 * @param columns	the width in columns
 * @return		the processed string
 */
public static String[] breakUp(String s, int columns) {
    final String punctuation = " .,;:!?'\"";
    Vector<String> result = new Vector<String>();

    for (String raw : s.split("\n")) {
        BreakIterator boundary = BreakIterator.getWordInstance();
        boundary.setText(raw);

        StringBuilder line = new StringBuilder();
        int from = boundary.first();
        for (int to = boundary.next(); to != BreakIterator.DONE; from = to, to = boundary.next()) {
            String word = raw.substring(from, to);
            if (line.length() >= columns) {
                // Keep a lone punctuation/space character with the line it ends.
                boolean closesLine = word.length() == 1
                        && punctuation.indexOf(word.charAt(0)) > -1;
                if (closesLine) {
                    line.append(word);
                    word = "";
                }
                result.add(line.toString());
                line.setLength(0);
            }
            line.append(word);
        }

        if (line.length() > 0) {
            result.add(line.toString());
        }
    }

    return result.toArray(new String[result.size()]);
}
Example 18
Source File: BreakIteratorTest.java From dragonwell8_jdk with GNU General Public License v2.0 | 4 votes |
private void doBreakInvariantTest(BreakIterator tb, String testChars) { StringBuffer work = new StringBuffer("aaa"); int errorCount = 0; // a break should always occur after CR (unless followed by LF), LF, PS, and LS String breaks = /*"\r\n\u2029\u2028"*/"\n\u2029\u2028"; // change this back when new BI code is added for (int i = 0; i < breaks.length(); i++) { work.setCharAt(1, breaks.charAt(i)); for (int j = 0; j < testChars.length(); j++) { work.setCharAt(0, testChars.charAt(j)); for (int k = 0; k < testChars.length(); k++) { char c = testChars.charAt(k); // if a cr is followed by lf, don't do the check (they stay together) if (work.charAt(1) == '\r' && (c == '\n')) continue; // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored // for breaking purposes as per UTR14 int type1 = Character.getType(work.charAt(1)); int type2 = Character.getType(c); if (type1 == Character.CONTROL || type1 == Character.FORMAT || type2 == Character.CONTROL || type2 == Character.FORMAT) { continue; } work.setCharAt(2, c); tb.setText(work.toString()); boolean seen2 = false; for (int l = tb.first(); l != BreakIterator.DONE; l = tb.next()) { if (l == 2) seen2 = true; } if (!seen2) { errln("No break between U+" + Integer.toHexString((int)(work.charAt(1))) + " and U+" + Integer.toHexString((int)(work.charAt(2)))); errorCount++; if (errorCount >= 75) return; } } } } }
Example 19
Source File: BreakIteratorTest.java From openjdk-jdk9 with GNU General Public License v2.0 | 4 votes |
private void doOtherInvariantTest(BreakIterator tb, String testChars) { StringBuffer work = new StringBuffer("a\r\na"); int errorCount = 0; // a break should never occur between CR and LF for (int i = 0; i < testChars.length(); i++) { work.setCharAt(0, testChars.charAt(i)); for (int j = 0; j < testChars.length(); j++) { work.setCharAt(3, testChars.charAt(j)); tb.setText(work.toString()); for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next()) if (k == 2) { errln("Break between CR and LF in string U+" + Integer.toHexString( (int)(work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString( (int)(work.charAt(3)))); errorCount++; if (errorCount >= 75) return; } } } // a break should never occur before a non-spacing mark, unless it's preceded // by a line terminator work.setLength(0); work.append("aaaa"); for (int i = 0; i < testChars.length(); i++) { char c = testChars.charAt(i); if (c == '\n' || c == '\r' || c == '\u2029' || c == '\u2028' || c == '\u0003') continue; work.setCharAt(1, c); for (int j = 0; j < testChars.length(); j++) { c = testChars.charAt(j); if (Character.getType(c) != Character.NON_SPACING_MARK && Character.getType(c) != Character.ENCLOSING_MARK) continue; work.setCharAt(2, c); // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored // for breaking purposes as per UTR14 int type1 = Character.getType(work.charAt(1)); int type2 = Character.getType(work.charAt(2)); if (type1 == Character.CONTROL || type1 == Character.FORMAT || type2 == Character.CONTROL || type2 == Character.FORMAT) { continue; } tb.setText(work.toString()); for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next()) if (k == 2) { errln("Break between U+" + Integer.toHexString((int)(work.charAt(1))) + " and U+" + Integer.toHexString((int)(work.charAt(2)))); errorCount++; if (errorCount >= 75) return; } } } }
Example 20
Source File: StemmingLemaEx.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 4 votes |
public static void main(String args[]){ String words[] = {"bank", "banking", "banks", "banker", "banked", "bankart"}; PorterStemmer ps = new PorterStemmer(); for(String w : words){ String stem = ps.stem(w); System.out.println("Word : " + w + " Stem : " + stem); } String paragraph = "When determining the end of sentences " + "we need to consider several factors. Sentences may end with " + "exclamation marks! Or possibly questions marks? Within " + "sentences we may find numbers like 3.14159, abbreviations " + "such as found in Mr. Smith, and possibly ellipses either " + "within a sentence …, or at the end of a sentence…"; String simple = "[.?!]"; String[] splitString = (paragraph.split(simple)); for (String string : splitString) { System.out.println(string); } System.out.println("-------------Using Pattern and Matcher-------------"); Pattern sentencePattern = Pattern.compile( "# Match a sentence ending in punctuation or EOS.\n" + "[^.!?\\s] # First char is non-punct, non-ws\n" + "[^.!?]* # Greedily consume up to punctuation.\n" + "(?: # Group for unrolling the loop.\n" + " [.!?] # (special) inner punctuation ok if\n" + " (?!['\"]?\\s|$) # not followed by ws or EOS.\n" + " [^.!?]* # Greedily consume up to punctuation.\n" + ")* # Zero or more (special normal*)\n" + "[.!?]? # Optional ending punctuation.\n" + "['\"]? 
# Optional closing quote.\n" + "(?=\\s|$)", Pattern.MULTILINE | Pattern.COMMENTS); Matcher matcher = sentencePattern.matcher(paragraph); while (matcher.find()) { System.out.println(matcher.group()); } System.out.println("-------------Using BreakIterator-------------"); BreakIterator si = BreakIterator.getSentenceInstance(); Locale cl = new Locale("en", "US"); si.setText(paragraph); int boundary = si.first(); while(boundary!=BreakIterator.DONE){ int begin = boundary; System.out.println(boundary + " - "); boundary = si.next(); int end = boundary; if(end == BreakIterator.DONE){ break; } System.out.println(boundary + " [ " + paragraph.substring(begin,end) + " ] "); } System.out.println("-------------Using SentenceDetectorME-------------"); try{ InputStream is = new FileInputStream(new File("/home/ashish/Downloads/" + "en-sent.bin")); SentenceModel sm = new SentenceModel(is); SentenceDetectorME detector = new SentenceDetectorME(sm); String sentences [] = detector.sentDetect(paragraph); for(String s : sentences){ System.out.println(s); } } catch(IOException e){ System.out.println("Error Detected" + e); e.printStackTrace(); } }