Java Code Examples for org.apache.lucene.analysis.Tokenizer#reset()
The following examples show how to use
org.apache.lucene.analysis.Tokenizer#reset().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Exercises {@code SimplePatternSplitTokenizer} with a long split pattern:
 * one hundred {@code 'a'} characters followed by a single {@code 'b'}.
 * The input is two hundred {@code 'a'} characters, and the test expects the
 * entire input to come back as exactly one token.
 */
public void testBigLookahead() throws Exception {
  // Split pattern: "aaa...ab" (100 x 'a', then 'b').
  StringBuilder pattern = new StringBuilder();
  for (int count = 0; count < 100; count++) {
    pattern.append('a');
  }
  pattern.append('b');

  Tokenizer tokenizer = new SimplePatternSplitTokenizer(pattern.toString());
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);

  // Input: 200 x 'a', with no 'b' anywhere.
  StringBuilder input = new StringBuilder();
  for (int count = 0; count < 200; count++) {
    input.append('a');
  }
  String text = input.toString();

  tokenizer.setReader(new StringReader(text));
  tokenizer.reset();

  assertTrue(tokenizer.incrementToken());
  assertEquals(text, term.toString());
  assertFalse(tokenizer.incrementToken());
}
Example 2
Source File: QueryAutoFilteringComponent.java From query-autofiltering-component with Apache License 2.0 | 6 votes |
/**
 * Tokenizes {@code input} with the tokenizer supplied by
 * {@code getTokenizerImpl} and returns each term as its own char array.
 *
 * @param input the text to tokenize
 * @return one freshly allocated array per token, in the order the tokenizer produced them
 * @throws IOException if the tokenizer fails while reading or closing
 */
private ArrayList<char[]> tokenize( String input ) throws IOException {
  Log.debug( "tokenize '" + input + "'" );
  ArrayList<char[]> tokens = new ArrayList<char[]>( );
  Tokenizer tk = getTokenizerImpl( input );
  try {
    CharTermAttribute term = tk.addAttribute( CharTermAttribute.class );
    tk.reset( );
    while (tk.incrementToken( ) ) {
      // The term buffer is reused between tokens and may be longer than the
      // term itself, so copy only the first length() characters.
      int bufLen = term.length();
      char[] copy = new char[ bufLen ];
      System.arraycopy(term.buffer( ), 0, copy, 0, bufLen );
      tokens.add( copy );
    }
    // Complete the TokenStream consumer workflow: end() after the last token.
    tk.end( );
  } finally {
    // Always release the tokenizer and its reader, even if tokenizing failed.
    tk.close( );
  }
  return tokens;
}
Example 3
Source File: QueryAutoFilteringComponent.java From query-autofiltering-component with Apache License 2.0 | 6 votes |
/**
 * Splits {@code input} into terms using the tokenizer obtained from
 * {@code getTokenizerImpl}; every term is returned as a separate char array.
 *
 * @param input the text to tokenize
 * @return the terms in emission order, each one an independent copy of the term buffer
 * @throws IOException if the tokenizer fails while reading or closing
 */
private ArrayList<char[]> tokenize( String input ) throws IOException {
  Log.debug( "tokenize '" + input + "'" );
  ArrayList<char[]> tokens = new ArrayList<char[]>( );
  Tokenizer tk = getTokenizerImpl( input );
  try {
    CharTermAttribute term = tk.addAttribute( CharTermAttribute.class );
    tk.reset( );
    while (tk.incrementToken( ) ) {
      // Copy exactly length() chars: the attribute's backing buffer is shared
      // across tokens and can be larger than the current term.
      int bufLen = term.length();
      char[] copy = new char[ bufLen ];
      System.arraycopy(term.buffer( ), 0, copy, 0, bufLen );
      tokens.add( copy );
    }
    // end() must follow the last incrementToken() per the TokenStream workflow.
    tk.end( );
  } finally {
    // Release the tokenizer on both the success and the failure path.
    tk.close( );
  }
  return tokens;
}
Example 4
Source File: HanLPAnalyzerTest.java From hanlp-lucene-plugin with Apache License 2.0 | 6 votes |
public void testIssue() throws Exception { Map<String, String> args = new TreeMap<>(); args.put("enableTraditionalChineseMode", "true"); args.put("enableNormalization", "true"); HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args); Tokenizer tokenizer = factory.create(); String text = "會辦台星保證最低價的原因?"; tokenizer.setReader(new StringReader(text)); tokenizer.reset(); while (tokenizer.incrementToken()) { CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class); // 偏移量 OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class); // 距离 PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class); // 词性 TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class); System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type()); } }
Example 5
Source File: NlpSegmenterTestCase.java From jstarcraft-nlp with Apache License 2.0 | 6 votes |
@Test public void testSegmenter() throws Exception { Tokenizer segmenter = getSegmenter(); String text = "中华人民共和国(People's Republic of China),简称'中国'"; segmenter.setReader(new StringReader(text)); segmenter.reset(); while (segmenter.incrementToken()) { // 词元 CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class); // 偏移量 OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class); // 距离 PositionIncrementAttribute position = segmenter.getAttribute(PositionIncrementAttribute.class); // 词性 TypeAttribute type = segmenter.getAttribute(TypeAttribute.class); LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset())); Assert.assertEquals(term.toString().toLowerCase(), text.substring(offset.startOffset(), offset.endOffset()).toLowerCase()); } }
Example 6
Source File: MeCabKoStandardTokenizerTest.java From mecab-ko-lucene-analyzer with Apache License 2.0 | 6 votes |
/**
 * Runs two short Korean inputs through one tokenizer instance and compares
 * the output of {@code tokenizerToString} with the expected serialized tokens.
 * The second input is attached with {@code setReader} after a {@code reset()}
 * call; that call order is kept exactly as the test was written.
 */
@Test
public void testShortSentence() throws Exception {
  // NOTE(review): the meaning of the second createTokenizer argument (2) is
  // not visible from this snippet - confirm against the helper.
  Tokenizer korean = createTokenizer(
      new StringReader("꽃배달 꽃망울 오토바이"), 2);
  String expectedFirst =
      "꽃:N:NNG:null:1:1:0:1,배달:N:NNG:null:1:1:1:3,"
      + "꽃:N:NNG:null:1:1:4:5,꽃망울:COMPOUND:Compound:null:0:2:4:7,"
      + "망울:N:NNG:null:1:1:5:7,오토바이:N:NNG:null:1:1:8:12,";
  assertEquals(expectedFirst, tokenizerToString(korean));

  korean.reset();
  korean.setReader(new StringReader("소설 무궁화꽃이 피었습니다."));
  String expectedSecond =
      "소설:N:NNG:null:1:1:0:2,무궁:N:NNG:null:1:1:3:5,"
      + "무궁화:COMPOUND:Compound:null:0:2:3:6,화:N:NNG:null:1:1:5:6,"
      + "꽃이:EOJEOL:NNG+JKS:null:1:1:6:8,꽃:N:NNG:null:0:1:6:7,"
      + "피었습니다:EOJEOL:VV+EP+EF:null:1:1:9:14,";
  assertEquals(expectedSecond, tokenizerToString(korean));

  korean.close();
}
Example 7
Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Exercises {@code SimplePatternTokenizer} with a long pattern: one hundred
 * {@code 'a'} characters followed by a single {@code 'b'}. The input is two
 * hundred {@code 'a'} characters with no {@code 'b'}, and the test expects
 * the tokenizer to produce no tokens.
 */
public void testBigLookahead() throws Exception {
  // Pattern: 100 x 'a', then 'b'.
  StringBuilder pattern = new StringBuilder();
  for (int count = 0; count < 100; count++) {
    pattern.append('a');
  }
  pattern.append('b');
  Tokenizer tokenizer = new SimplePatternTokenizer(pattern.toString());

  // Input: 200 x 'a'.
  StringBuilder input = new StringBuilder();
  for (int count = 0; count < 200; count++) {
    input.append('a');
  }

  tokenizer.setReader(new StringReader(input.toString()));
  tokenizer.reset();
  assertFalse(tokenizer.incrementToken());
}
Example 8
Source File: TestOpenNLPTokenizerFactory.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Checks that a tokenizer created by {@code OpenNLPTokenizerFactory} can be
 * closed and then reused: the stream is reset and closed once without being
 * consumed, then twice given a fresh reader over {@code SENTENCES} and
 * verified against {@code SENTENCES_punc}.
 *
 * @throws IOException if the factory or the tokenizer fails
 */
@Test
public void testClose() throws IOException {
  // Plain HashMap instead of double-brace initialization, which would create
  // an anonymous subclass holding a reference to this test instance.
  Map<String,String> args = new HashMap<String,String>();
  args.put("sentenceModel", "en-test-sent.bin");
  args.put("tokenizerModel", "en-test-tokenizer.bin");

  OpenNLPTokenizerFactory factory = new OpenNLPTokenizerFactory(args);
  factory.inform(new ClasspathResourceLoader(getClass()));

  Tokenizer ts = factory.create(newAttributeFactory());
  ts.setReader(new StringReader(SENTENCES));

  // The exact order of the calls below is what the test exercises; keep it.
  ts.reset();
  ts.close();
  ts.reset();
  ts.setReader(new StringReader(SENTENCES));
  assertTokenStreamContents(ts, SENTENCES_punc);
  ts.close();
  ts.reset();
  ts.setReader(new StringReader(SENTENCES));
  assertTokenStreamContents(ts, SENTENCES_punc);
}
Example 9
Source File: HanLpQueryAnalyzerTestCase.java From jstarcraft-nlp with Apache License 2.0 | 6 votes |
@Test public void testIssue() throws Exception { Map<String, String> args = new TreeMap<>(); args.put("enableTraditionalChineseMode", "true"); args.put("enableNormalization", "true"); HanLpTokenizerFactory factory = new HanLpTokenizerFactory(args); Tokenizer tokenizer = factory.create(); String text = "會辦台星保證最低價的原因?"; tokenizer.setReader(new StringReader(text)); tokenizer.reset(); while (tokenizer.incrementToken()) { CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class); // 偏移量 OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class); // 距离 PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class); // 词性 TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class); System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type()); } }
Example 10
Source File: HanLpTokenizerFactoryTestCase.java From jstarcraft-nlp with Apache License 2.0 | 6 votes |
@Test public void testCreate() throws Exception { Map<String, String> args = new TreeMap<>(); args.put("enableTraditionalChineseMode", "true"); TokenizerFactory factory = new HanLpTokenizerFactory(args); Tokenizer tokenizer = factory.create(null); tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" + "突出外表、百變髮型及正面的形象,以至自己" + "品牌的男士香水等商品,及長期擔任運動品牌" + "Adidas的代言人,因此對大眾傳播媒介和時尚界" + "等方面都具很大的影響力,在足球圈外所獲得的" + "認受程度可謂前所未見。")); tokenizer.reset(); while (tokenizer.incrementToken()) { CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class); // 偏移量 OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class); // 距离 PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class); // 词性 TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class); System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type()); } }
Example 11
Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Uses the pattern {@code a*}, which can match the empty string, on the input
 * {@code "bbab"} and expects exactly one token, {@code "a"}.
 */
public void testEmptyStringPatternOneMatch() throws Exception {
  Tokenizer tokenizer = new SimplePatternTokenizer("a*");
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);

  StringReader input = new StringReader("bbab");
  tokenizer.setReader(input);
  tokenizer.reset();

  boolean hasFirst = tokenizer.incrementToken();
  assertTrue(hasFirst);
  assertEquals("a", term.toString());

  boolean hasSecond = tokenizer.incrementToken();
  assertFalse(hasSecond);
}
Example 12
Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Tokenizes {@code "aaabbb"} with the pattern {@code a+}, expects the single
 * token {@code "aaa"}, and then checks that after {@code end()} the end
 * offset is 6, the length of the input.
 */
public void testEndOffset() throws Exception {
  Tokenizer tokenizer = new SimplePatternTokenizer("a+");
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsets = tokenizer.getAttribute(OffsetAttribute.class);

  StringReader input = new StringReader("aaabbb");
  tokenizer.setReader(input);
  tokenizer.reset();

  boolean hasFirst = tokenizer.incrementToken();
  assertTrue(hasFirst);
  assertEquals("aaa", term.toString());

  boolean hasSecond = tokenizer.incrementToken();
  assertFalse(hasSecond);

  tokenizer.end();
  assertEquals(6, offsets.endOffset());
}
Example 13
Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Tokenizes a random, non-empty Unicode string with the pattern {@code .*}
 * and expects the first token to be the whole string.
 */
public void testOneToken() throws Exception {
  Tokenizer tokenizer = new SimplePatternTokenizer(".*");
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);

  // Draw random strings until a non-empty one comes up.
  String text;
  do {
    text = TestUtil.randomUnicodeString(random());
  } while (text.length() == 0);

  tokenizer.setReader(new StringReader(text));
  tokenizer.reset();

  boolean hasToken = tokenizer.incrementToken();
  assertTrue(hasToken);
  assertEquals(text, term.toString());
}
Example 14
Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Uses {@code .*} as the split pattern on a random, non-empty Unicode string
 * and expects the tokenizer to produce no tokens at all.
 */
public void testNoTokens() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer(".*");

  // Draw random strings until a non-empty one comes up.
  String s;
  do {
    s = TestUtil.randomUnicodeString(random());
  } while (s.length() == 0);

  t.setReader(new StringReader(s));
  t.reset();
  assertFalse(t.incrementToken());
}
Example 15
Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Uses the split pattern {@code a*}, which can match the empty string, on the
 * input {@code "bbb"} and expects the whole input back as the only token.
 */
public void testEmptyStringPatternNoMatch() throws Exception {
  Tokenizer tokenizer = new SimplePatternSplitTokenizer("a*");
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);

  StringReader input = new StringReader("bbb");
  tokenizer.setReader(input);
  tokenizer.reset();

  boolean hasFirst = tokenizer.incrementToken();
  assertTrue(hasFirst);
  assertEquals("bbb", term.toString());

  boolean hasSecond = tokenizer.incrementToken();
  assertFalse(hasSecond);
}
Example 16
Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Splits {@code "aaabbb"} on the pattern {@code a+}, expects the single token
 * {@code "bbb"}, and then checks that after {@code end()} the end offset is
 * 6, the length of the input.
 */
public void testEndOffset() throws Exception {
  Tokenizer tokenizer = new SimplePatternSplitTokenizer("a+");
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsets = tokenizer.getAttribute(OffsetAttribute.class);

  StringReader input = new StringReader("aaabbb");
  tokenizer.setReader(input);
  tokenizer.reset();

  boolean hasFirst = tokenizer.incrementToken();
  assertTrue(hasFirst);
  assertEquals("bbb", term.toString());

  boolean hasSecond = tokenizer.incrementToken();
  assertFalse(hasSecond);

  tokenizer.end();
  assertEquals(6, offsets.endOffset());
}
Example 17
Source File: HanLPTokenizerFactoryTest.java From hanlp-lucene-plugin with Apache License 2.0 | 5 votes |
public void testCreate() throws Exception { Map<String, String> args = new TreeMap<>(); args.put("enableTraditionalChineseMode", "true"); TokenizerFactory factory = new HanLPTokenizerFactory(args); Tokenizer tokenizer = factory.create(null); tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" + "突出外表、百變髮型及正面的形象,以至自己" + "品牌的男士香水等商品,及長期擔任運動品牌" + "Adidas的代言人,因此對大眾傳播媒介和時尚界" + "等方面都具很大的影響力,在足球圈外所獲得的" + "認受程度可謂前所未見。")); tokenizer.reset(); while (tokenizer.incrementToken()) { CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class); // 偏移量 OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class); // 距离 PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class); // 词性 TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class); System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type()); } }
Example 18
Source File: URLTokenizer.java From elasticsearch-analysis-url with Apache License 2.0 | 5 votes |
/**
 * Get a list of {@link Token}s from the given {@link Tokenizer}.
 * The tokenizer is reset and read to exhaustion; this method does not call
 * {@code end()} or {@code close()} on it.
 *
 * @param part the url part which should be used in {@link Token} creation
 * @param tokenizer the tokenizer from which tokens will be gleaned
 * @param start value added to each token's start and end offset
 * @return a list of tokens, in the order the tokenizer produced them
 * @throws IOException if the tokenizer fails while resetting or advancing
 */
private List<Token> tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
  tokenizer.reset();

  // The attribute instances are updated in place for every token, so a single
  // lookup each is enough.
  final CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
  final OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);

  final List<Token> tokens = new ArrayList<>();
  while (tokenizer.incrementToken()) {
    tokens.add(new Token(term.toString(), part, start + offset.startOffset(), start + offset.endOffset()));
  }
  return tokens;
}
Example 19
Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Uses the pattern {@code a*}, which can match the empty string, on the input
 * {@code "bbb"} and expects no tokens.
 */
public void testEmptyStringPatternNoMatch() throws Exception {
  Tokenizer tokenizer = new SimplePatternTokenizer("a*");
  StringReader input = new StringReader("bbb");
  tokenizer.setReader(input);
  tokenizer.reset();

  boolean hasToken = tokenizer.incrementToken();
  assertFalse(hasToken);
}
Example 20
Source File: NGramTokenizerTest.java From lucene-solr with Apache License 2.0 | 4 votes |
static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException { // convert the string to code points final int[] codePoints = toCodePoints(s); final int[] offsets = new int[codePoints.length + 1]; for (int i = 0; i < codePoints.length; ++i) { offsets[i+1] = offsets[i] + Character.charCount(codePoints[i]); } final Tokenizer grams = new NGramTokenizer(minGram, maxGram, edgesOnly) { @Override protected boolean isTokenChar(int chr) { return nonTokenChars.indexOf(chr) < 0; } }; grams.setReader(new StringReader(s)); final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class); final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class); final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class); grams.reset(); for (int start = 0; start < codePoints.length; ++start) { nextGram: for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) { if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) { // not on an edge continue nextGram; } for (int j = start; j < end; ++j) { if (!isTokenChar(nonTokenChars, codePoints[j])) { continue nextGram; } } assertTrue(grams.incrementToken()); assertArrayEquals(ArrayUtil.copyOfSubArray(codePoints, start, end), toCodePoints(termAtt)); assertEquals(1, posIncAtt.getPositionIncrement()); assertEquals(1, posLenAtt.getPositionLength()); assertEquals(offsets[start], offsetAtt.startOffset()); assertEquals(offsets[end], offsetAtt.endOffset()); } } assertFalse(grams.incrementToken()); grams.end(); assertEquals(s.length(), offsetAtt.startOffset()); assertEquals(s.length(), offsetAtt.endOffset()); }