Java Code Examples for org.apache.lucene.analysis.Tokenizer#getAttribute()
The following examples show how to use
org.apache.lucene.analysis.Tokenizer#getAttribute().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: HanLpTokenizerFactoryTestCase.java From jstarcraft-nlp with Apache License 2.0 | 6 votes |
@Test public void testCreate() throws Exception { Map<String, String> args = new TreeMap<>(); args.put("enableTraditionalChineseMode", "true"); TokenizerFactory factory = new HanLpTokenizerFactory(args); Tokenizer tokenizer = factory.create(null); tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" + "突出外表、百變髮型及正面的形象,以至自己" + "品牌的男士香水等商品,及長期擔任運動品牌" + "Adidas的代言人,因此對大眾傳播媒介和時尚界" + "等方面都具很大的影響力,在足球圈外所獲得的" + "認受程度可謂前所未見。")); tokenizer.reset(); while (tokenizer.incrementToken()) { CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class); // 偏移量 OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class); // 距离 PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class); // 词性 TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class); System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type()); } }
Example 2
Source File: HanLpQueryAnalyzerTestCase.java From jstarcraft-nlp with Apache License 2.0 | 6 votes |
@Test public void testIssue() throws Exception { Map<String, String> args = new TreeMap<>(); args.put("enableTraditionalChineseMode", "true"); args.put("enableNormalization", "true"); HanLpTokenizerFactory factory = new HanLpTokenizerFactory(args); Tokenizer tokenizer = factory.create(); String text = "會辦台星保證最低價的原因?"; tokenizer.setReader(new StringReader(text)); tokenizer.reset(); while (tokenizer.incrementToken()) { CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class); // 偏移量 OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class); // 距离 PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class); // 词性 TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class); System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type()); } }
Example 3
Source File: NlpSegmenterTestCase.java From jstarcraft-nlp with Apache License 2.0 | 6 votes |
@Test public void testSegmenter() throws Exception { Tokenizer segmenter = getSegmenter(); String text = "中华人民共和国(People's Republic of China),简称'中国'"; segmenter.setReader(new StringReader(text)); segmenter.reset(); while (segmenter.incrementToken()) { // 词元 CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class); // 偏移量 OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class); // 距离 PositionIncrementAttribute position = segmenter.getAttribute(PositionIncrementAttribute.class); // 词性 TypeAttribute type = segmenter.getAttribute(TypeAttribute.class); LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset())); Assert.assertEquals(term.toString().toLowerCase(), text.substring(offset.startOffset(), offset.endOffset()).toLowerCase()); } }
Example 4
Source File: HanLPAnalyzerTest.java From hanlp-lucene-plugin with Apache License 2.0 | 6 votes |
public void testIssue() throws Exception { Map<String, String> args = new TreeMap<>(); args.put("enableTraditionalChineseMode", "true"); args.put("enableNormalization", "true"); HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args); Tokenizer tokenizer = factory.create(); String text = "會辦台星保證最低價的原因?"; tokenizer.setReader(new StringReader(text)); tokenizer.reset(); while (tokenizer.incrementToken()) { CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class); // 偏移量 OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class); // 距离 PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class); // 词性 TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class); System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type()); } }
Example 5
Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Uses a split pattern of 100 'a' characters followed by 'b' against an input
 * of 200 'a' characters. The input contains no 'b', and the test expects the
 * whole input to come back as a single token.
 *
 * @throws Exception if the tokenizer fails while reading or tokenizing the input
 */
public void testBigLookahead() throws Exception {
    StringBuilder b = new StringBuilder();
    for (int i = 0; i < 100; i++) {
        b.append('a');
    }
    b.append('b');

    // try-with-resources guarantees close() even when an assertion fails.
    try (Tokenizer t = new SimplePatternSplitTokenizer(b.toString())) {
        CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

        b = new StringBuilder();
        for (int i = 0; i < 200; i++) {
            b.append('a');
        }
        t.setReader(new StringReader(b.toString()));
        t.reset();
        assertTrue(t.incrementToken());
        assertEquals(b.toString(), termAtt.toString());
        assertFalse(t.incrementToken());
        // The stream is exhausted; finish the consumer workflow before closing.
        t.end();
    }
}
Example 6
Source File: URLTokenizer.java From elasticsearch-analysis-url with Apache License 2.0 | 5 votes |
/**
 * Resets the given {@link Tokenizer}, drains it, and turns each emitted term
 * into a {@link Token} whose offsets are shifted by {@code start}.
 *
 * @param part the url part which should be used in {@link Token} creation
 * @param tokenizer the tokenizer from which tokens will be gleaned
 * @param start value added to each token's start and end offset
 * @return a list of tokens, in the order the tokenizer produced them
 * @throws IOException propagated from the tokenizer's {@code reset()} or {@code incrementToken()}
 */
private List<Token> tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
    tokenizer.reset();
    List<Token> result = new ArrayList<>();
    while (tokenizer.incrementToken()) {
        String term = tokenizer.getAttribute(CharTermAttribute.class).toString();
        OffsetAttribute offsets = tokenizer.getAttribute(OffsetAttribute.class);
        int begin = start + offsets.startOffset();
        int finish = start + offsets.endOffset();
        result.add(new Token(term, part, begin, finish));
    }
    return result;
}
Example 7
Source File: HanLPTokenizerFactoryTest.java From hanlp-lucene-plugin with Apache License 2.0 | 5 votes |
public void testCreate() throws Exception { Map<String, String> args = new TreeMap<>(); args.put("enableTraditionalChineseMode", "true"); TokenizerFactory factory = new HanLPTokenizerFactory(args); Tokenizer tokenizer = factory.create(null); tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前" + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有" + "突出外表、百變髮型及正面的形象,以至自己" + "品牌的男士香水等商品,及長期擔任運動品牌" + "Adidas的代言人,因此對大眾傳播媒介和時尚界" + "等方面都具很大的影響力,在足球圈外所獲得的" + "認受程度可謂前所未見。")); tokenizer.reset(); while (tokenizer.incrementToken()) { CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class); // 偏移量 OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class); // 距离 PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class); // 词性 TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class); System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type()); } }
Example 8
Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Splits "aaabbb" on the pattern {@code a+}, expects the single token "bbb",
 * and checks that the end offset reported after {@code end()} is 6.
 *
 * @throws Exception if the tokenizer fails while reading or tokenizing the input
 */
public void testEndOffset() throws Exception {
    // try-with-resources adds the close() the original omitted after end().
    try (Tokenizer t = new SimplePatternSplitTokenizer("a+")) {
        CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class);
        t.setReader(new StringReader("aaabbb"));
        t.reset();
        assertTrue(t.incrementToken());
        assertEquals("bbb", termAtt.toString());
        assertFalse(t.incrementToken());
        t.end();
        assertEquals(6, offsetAtt.endOffset());
    }
}
Example 9
Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Splits "bbab" on the pattern {@code a*} and expects the tokens "bb" with
 * offsets 0-2 and "b" with offsets 3-4.
 *
 * @throws Exception if the tokenizer fails while reading or tokenizing the input
 */
public void testEmptyStringPatternOneMatch() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("a*");
    t.setReader(new StringReader("bbab"));
    assertTokenStreamContents(t,
            new String[] {"bb", "b"},
            new int[] {0, 3},
            new int[] {2, 4});
}
Example 10
Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Splits an input that ends in whitespace on a whitespace pattern and expects
 * only the tokens "a" (offsets 0-1) and "c" (offsets 2-3).
 *
 * @throws Exception if the tokenizer fails while reading or tokenizing the input
 */
public void testTrailingNonToken() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
    t.setReader(new StringReader("a c "));
    assertTokenStreamContents(t,
            new String[] {"a", "c"},
            new int[] {0, 2},
            new int[] {1, 3});
}
Example 11
Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Splits an input that starts with whitespace on a whitespace pattern and
 * expects the tokens "a" (offsets 4-5) and "c" (offsets 6-7).
 *
 * @throws Exception if the tokenizer fails while reading or tokenizing the input
 */
public void testLeadingNonToken() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
    // The input must start with exactly FOUR spaces: the expected offsets place
    // "a" at index 4 and "c" at index 6. With a single leading space the
    // offsets asserted below could never match.
    t.setReader(new StringReader("    a c"));
    assertTokenStreamContents(t,
            new String[] {"a", "c"},
            new int[] {4, 6},
            new int[] {5, 7});
}
Example 12
Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Splits on runs of whitespace ({@code [ \t\r\n]*}) and expects the tokens
 * "a" (offsets 0-1), "b" (offsets 3-4) and "c" (offsets 7-8).
 *
 * @throws Exception if the tokenizer fails while reading or tokenizing the input
 */
public void testSplitMultiCharWhitespace() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
    // Layout: 'a', space, tab, 'b', THREE spaces, 'c'. The three spaces are
    // required for "c" to sit at index 7 as asserted below; a single space
    // would put it at index 5.
    t.setReader(new StringReader("a \tb   c"));
    assertTokenStreamContents(t,
            new String[] {"a", "b", "c"},
            new int[] {0, 3, 7},
            new int[] {1, 4, 8});
}
Example 13
Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Splits on a single whitespace character ({@code [ \t\r\n]}) and expects the
 * tokens "a" (offsets 0-1), "b" (offsets 3-4) and "c" (offsets 7-8).
 *
 * @throws Exception if the tokenizer fails while reading or tokenizing the input
 */
public void testSplitSingleCharWhitespace() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]");
    // Layout: 'a', space, tab, 'b', THREE spaces, 'c'. The three spaces are
    // required for "c" to sit at index 7 as asserted below; a single space
    // would put it at index 5.
    t.setReader(new StringReader("a \tb   c"));
    assertTokenStreamContents(t,
            new String[] {"a", "b", "c"},
            new int[] {0, 3, 7},
            new int[] {1, 4, 8});
}
Example 14
Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Splits "bbb" on the pattern {@code a*} and expects the whole input back as
 * the single token "bbb".
 *
 * @throws Exception if the tokenizer fails while reading or tokenizing the input
 */
public void testEmptyStringPatternNoMatch() throws Exception {
    // try-with-resources guarantees close() even when an assertion fails.
    try (Tokenizer t = new SimplePatternSplitTokenizer("a*")) {
        CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
        t.setReader(new StringReader("bbb"));
        t.reset();
        assertTrue(t.incrementToken());
        assertEquals("bbb", termAtt.toString());
        assertFalse(t.incrementToken());
        // The stream is exhausted; finish the consumer workflow before closing.
        t.end();
    }
}
Example 15
Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Splits a random non-empty Unicode string on the pattern {@code .*} and
 * expects the tokenizer to produce no tokens at all.
 *
 * @throws Exception if the tokenizer fails while reading or tokenizing the input
 */
public void testNoTokens() throws Exception {
    // Draw random strings until a non-empty one comes up.
    String s;
    do {
        s = TestUtil.randomUnicodeString(random());
    } while (s.length() == 0);

    // try-with-resources guarantees close() even when the assertion fails.
    try (Tokenizer t = new SimplePatternSplitTokenizer(".*")) {
        t.setReader(new StringReader(s));
        t.reset();
        assertFalse(t.incrementToken());
        // The stream is exhausted; finish the consumer workflow before closing.
        t.end();
    }
}
Example 16
Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Tokenizes "aaabbb" with the pattern {@code a+}, expects the single token
 * "aaa", and checks that the end offset reported after {@code end()} is 6.
 *
 * @throws Exception if the tokenizer fails while reading or tokenizing the input
 */
public void testEndOffset() throws Exception {
    // try-with-resources adds the close() the original omitted after end().
    try (Tokenizer t = new SimplePatternTokenizer("a+")) {
        CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class);
        t.setReader(new StringReader("aaabbb"));
        t.reset();
        assertTrue(t.incrementToken());
        assertEquals("aaa", termAtt.toString());
        assertFalse(t.incrementToken());
        t.end();
        assertEquals(6, offsetAtt.endOffset());
    }
}
Example 17
Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Tokenizes "bbab" with the pattern {@code a*} and expects exactly one token,
 * "a".
 *
 * @throws Exception if the tokenizer fails while reading or tokenizing the input
 */
public void testEmptyStringPatternOneMatch() throws Exception {
    // try-with-resources guarantees close() even when an assertion fails.
    try (Tokenizer t = new SimplePatternTokenizer("a*")) {
        CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
        t.setReader(new StringReader("bbab"));
        t.reset();
        assertTrue(t.incrementToken());
        assertEquals("a", termAtt.toString());
        assertFalse(t.incrementToken());
        // The stream is exhausted; finish the consumer workflow before closing.
        t.end();
    }
}
Example 18
Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Tokenizes a random non-empty Unicode string with the pattern {@code .*} and
 * expects the first token to equal the entire input.
 *
 * @throws Exception if the tokenizer fails while reading or tokenizing the input
 */
public void testOneToken() throws Exception {
    Tokenizer t = new SimplePatternTokenizer(".*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

    // Draw random strings until a non-empty one comes up.
    String s;
    do {
        s = TestUtil.randomUnicodeString(random());
    } while (s.length() == 0);

    t.setReader(new StringReader(s));
    t.reset();

    boolean produced = t.incrementToken();
    assertTrue(produced);
    assertEquals(s, termAtt.toString());
}