org.apache.lucene.analysis.tokenattributes.TypeAttribute Java Examples
The following examples show how to use
org.apache.lucene.analysis.tokenattributes.TypeAttribute.
Each example is taken from an open-source project; the originating project, source file, and license are noted above each snippet.
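Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the pattern most of them share: request the TypeAttribute from a TokenStream before consuming it, then read type() for each token while iterating. The analyzer choice and the field name "field" are illustrative assumptions; StandardTokenizer reports types such as "<ALPHANUM>" and "<NUM>".

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class TypeAttributeDemo {
    public static void main(String[] args) throws IOException {
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("field", "Lucene 9 rocks")) {
            // Attributes must be requested before the stream is consumed.
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // StandardTokenizer assigns token types such as "<ALPHANUM>" and "<NUM>".
                System.out.println(termAtt + " -> " + typeAtt.type());
            }
            stream.end();
        }
    }
}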
Example #1
Source File: TestNumericTokenStream.java From lucene-solr with Apache License 2.0
public void testIntStream() throws Exception {
    @SuppressWarnings("resource")
    final LegacyNumericTokenStream stream = new LegacyNumericTokenStream().setIntValue(ivalue);
    final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    assertNotNull(bytesAtt);
    final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
    assertNotNull(typeAtt);
    final LegacyNumericTokenStream.LegacyNumericTermAttribute numericAtt = stream.getAttribute(LegacyNumericTokenStream.LegacyNumericTermAttribute.class);
    assertNotNull(numericAtt);
    stream.reset();
    assertEquals(32, numericAtt.getValueSize());
    for (int shift = 0; shift < 32; shift += LegacyNumericUtils.PRECISION_STEP_DEFAULT) {
        assertTrue("New token is available", stream.incrementToken());
        assertEquals("Shift value wrong", shift, numericAtt.getShift());
        assertEquals("Term is incorrectly encoded", ivalue & ~((1 << shift) - 1), LegacyNumericUtils.prefixCodedToInt(bytesAtt.getBytesRef()));
        assertEquals("Term raw value is incorrectly encoded", ((long) ivalue) & ~((1L << shift) - 1L), numericAtt.getRawValue());
        assertEquals("Type incorrect", (shift == 0) ? LegacyNumericTokenStream.TOKEN_TYPE_FULL_PREC : LegacyNumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
    }
    assertFalse("More tokens available", stream.incrementToken());
    stream.end();
    stream.close();
}
Example #2
Source File: HanLPAnalyzerTest.java From hanlp-lucene-plugin with Apache License 2.0
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";
    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #3
Source File: HanLPAnalyzerTest.java From hanlp-lucene-plugin with Apache License 2.0
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #4
Source File: PathTokenFilterTest.java From SearchServices with GNU Lesser General Public License v3.0
public void testAttributesAfterStreamEnd() throws IOException {
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
    ts.setReader(reader);

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);

    // PathTokenFilter.end() will be called after all tokens consumed.
    tokenise(ts, new String[]{"uri1", "one"});

    // Check attributes cleaned up
    assertEquals("", termAtt.toString());
    assertEquals("word", typeAtt.type()); // the default
    assertEquals(0, posIncAtt.getPositionIncrement());
    // Final offset...
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());
}
Example #5
Source File: HanLPTokenizerTest.java From hanlp-lucene-plugin with Apache License 2.0
public void testPinyinTokenFilter() throws Exception {
    Map<String, String> args = new HashMap<>();
    args.put("original", "true");
    args.put("pinyin", "false");
    args.put("pinyinFirstChar", "true");
    HanLPPinyinTokenFilterFactory factory = new HanLPPinyinTokenFilterFactory(args);
    TokenStream tokenStream = factory.create(tokenizer);
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #6
Source File: HanLPIndexAnalyzerTest.java From hanlp-lucene-plugin with Apache License 2.0
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPIndexAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #7
Source File: URLTokenFilter.java From elasticsearch-analysis-url with Apache License 2.0
/**
 * Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
 * will be passed along to the tokenizer.
 * @param input a string to be tokenized
 * @return a list of tokens extracted from the input string
 * @throws IOException
 */
private List<Token> tokenize(String input) throws IOException {
    List<Token> tokens = new ArrayList<>();
    URLTokenizer tokenizer = new URLTokenizer();
    // create a copy of the parts list to avoid ConcurrentModificationException when sorting
    tokenizer.setParts(new ArrayList<>(parts));
    tokenizer.setUrlDecode(urlDeocde);
    tokenizer.setTokenizeHost(tokenizeHost);
    tokenizer.setTokenizePath(tokenizePath);
    tokenizer.setTokenizeQuery(tokenizeQuery);
    tokenizer.setAllowMalformed(allowMalformed || passthrough);
    tokenizer.setTokenizeMalformed(tokenizeMalformed);
    tokenizer.setReader(new StringReader(input));
    tokenizer.reset();

    String term;
    URLPart part;
    OffsetAttribute offset;
    while (tokenizer.incrementToken()) {
        term = tokenizer.getAttribute(CharTermAttribute.class).toString();
        part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
        offset = tokenizer.getAttribute(OffsetAttribute.class);
        tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
    }
    return tokens;
}
Example #8
Source File: ShingleFilterTest.java From lucene-solr with Apache License 2.0
public void testReset() throws Exception {
    Tokenizer wsTokenizer = new WhitespaceTokenizer();
    wsTokenizer.setReader(new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
    assertTokenStreamContents(filter,
        new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
        new int[]{0,0,7,7,14,14,19},
        new int[]{6,13,13,18,18,27,27},
        new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
        new int[]{1,0,1,0,1,0,1}
    );
    wsTokenizer.setReader(new StringReader("please divide this sentence"));
    assertTokenStreamContents(filter,
        new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
        new int[]{0,0,7,7,14,14,19},
        new int[]{6,13,13,18,18,27,27},
        new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
        new int[]{1,0,1,0,1,0,1}
    );
}
Example #9
Source File: TestSnowball.java From lucene-solr with Apache License 2.0
public void testFilterTokens() throws Exception {
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
    FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);

    filter.incrementToken();

    assertEquals("accent", termAtt.toString());
    assertEquals(2, offsetAtt.startOffset());
    assertEquals(7, offsetAtt.endOffset());
    assertEquals("wrd", typeAtt.type());
    assertEquals(3, posIncAtt.getPositionIncrement());
    assertEquals(77, flagsAtt.getFlags());
    assertEquals(new BytesRef(new byte[]{0,1,2,3}), payloadAtt.getPayload());
}
Example #10
Source File: AnalysisRequestHandlerBase.java From lucene-solr with Apache License 2.0
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
    final List<AttributeSource> tokens = new ArrayList<>();
    final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
    // for backwards compatibility, add all "common" attributes
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(TypeAttribute.class);
    try {
        tokenStream.reset();
        int position = 0;
        while (tokenStream.incrementToken()) {
            position += posIncrAtt.getPositionIncrement();
            trackerAtt.setActPosition(position);
            tokens.add(tokenStream.cloneAttributes());
        }
        tokenStream.end(); // TODO should we capture?
    } catch (IOException ioe) {
        throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
    } finally {
        IOUtils.closeWhileHandlingException(tokenStream);
    }
    return tokens;
}
Example #11
Source File: SpellingQueryConverter.java From lucene-solr with Apache License 2.0
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); // overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}
Example #12
Source File: TokenStreamAssertions.java From elasticsearch-analysis-openkoreantext with Apache License 2.0
public static void assertTokenStream(TokenStream tokenStream, String[] expectedCharTerms, String[] expectedTypes, int[] expectedStartOffsets, int[] expectedEndOffsets) throws IOException {
    tokenStream.reset();
    int index = 0;
    while (tokenStream.incrementToken() == true) {
        assertEquals(expectedCharTerms[index], tokenStream.getAttribute(CharTermAttribute.class).toString());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[index], tokenStream.getAttribute(TypeAttribute.class).type());
        }
        OffsetAttribute offsets = tokenStream.getAttribute(OffsetAttribute.class);
        if (expectedStartOffsets != null) {
            assertEquals(expectedStartOffsets[index], offsets.startOffset());
        }
        if (expectedEndOffsets != null) {
            assertEquals(expectedEndOffsets[index], offsets.endOffset());
        }
        index++;
    }
    tokenStream.end();
}
Example #13
Source File: TransportExtendedAnalyzeAction.java From elasticsearch-extended-analyze with Apache License 2.0
private List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> processAnalysis(TokenStream stream, Set<String> includeAttributes, boolean shortAttrName, int lastPosition, int lastOffset) throws IOException {
    List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> tokens = new ArrayList<>();
    stream.reset();

    // and each token's output
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            lastPosition = lastPosition + increment;
        }
        tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
                lastOffset + offset.endOffset(), type.type(), extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
    }
    stream.end();
    return tokens;
}
Example #14
Source File: HanLpTokenizerTestCase.java From jstarcraft-nlp with Apache License 2.0
@Test
public void testPinyinTokenFilter() throws Exception {
    Map<String, String> args = new HashMap<>();
    args.put("original", "true");
    args.put("pinyin", "false");
    args.put("pinyinFirstChar", "true");
    HanLpPinyinTokenFilterFactory factory = new HanLpPinyinTokenFilterFactory(args);
    TokenStream tokenStream = factory.create(tokenizer);
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #15
Source File: TestNumericTokenStream.java From lucene-solr with Apache License 2.0
public void testLongStream() throws Exception {
    @SuppressWarnings("resource")
    final LegacyNumericTokenStream stream = new LegacyNumericTokenStream().setLongValue(lvalue);
    final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    assertNotNull(bytesAtt);
    final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
    assertNotNull(typeAtt);
    final LegacyNumericTokenStream.LegacyNumericTermAttribute numericAtt = stream.getAttribute(LegacyNumericTokenStream.LegacyNumericTermAttribute.class);
    assertNotNull(numericAtt);
    stream.reset();
    assertEquals(64, numericAtt.getValueSize());
    for (int shift = 0; shift < 64; shift += LegacyNumericUtils.PRECISION_STEP_DEFAULT) {
        assertTrue("New token is available", stream.incrementToken());
        assertEquals("Shift value wrong", shift, numericAtt.getShift());
        assertEquals("Term is incorrectly encoded", lvalue & ~((1L << shift) - 1L), LegacyNumericUtils.prefixCodedToLong(bytesAtt.getBytesRef()));
        assertEquals("Term raw value is incorrectly encoded", lvalue & ~((1L << shift) - 1L), numericAtt.getRawValue());
        assertEquals("Type incorrect", (shift == 0) ? LegacyNumericTokenStream.TOKEN_TYPE_FULL_PREC : LegacyNumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
    }
    assertFalse("More tokens available", stream.incrementToken());
    stream.end();
    stream.close();
}
Example #16
Source File: HanLpQueryAnalyzerTestCase.java From jstarcraft-nlp with Apache License 2.0
@Test
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLpTokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";
    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #17
Source File: HanLpQueryAnalyzerTestCase.java From jstarcraft-nlp with Apache License 2.0
@Test
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    try (Analyzer analyzer = new HanLpQueryAnalyzer("viterbi")) {
        TokenStream tokenStream = analyzer.tokenStream("field", text);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
            // offset
            OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
            // position increment
            PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
            // part of speech (token type)
            TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
    }
}
Example #18
Source File: HanLpTokenizerFactoryTestCase.java From jstarcraft-nlp with Apache License 2.0
@Test
public void testCreate() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);

    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前"
            + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有"
            + "突出外表、百變髮型及正面的形象,以至自己"
            + "品牌的男士香水等商品,及長期擔任運動品牌"
            + "Adidas的代言人,因此對大眾傳播媒介和時尚界"
            + "等方面都具很大的影響力,在足球圈外所獲得的"
            + "認受程度可謂前所未見。"));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #19
Source File: HanLpIndexAnalyzerTestCase.java From jstarcraft-nlp with Apache License 2.0
@Test
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    try (Analyzer analyzer = new HanLpIndexAnalyzer("viterbi")) {
        TokenStream tokenStream = analyzer.tokenStream("field", text);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
            // offset
            OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
            // position increment
            PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
            // part of speech (token type)
            TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
    }
}
Example #20
Source File: NlpSegmenterTestCase.java From jstarcraft-nlp with Apache License 2.0
@Test
public void testSegmenter() throws Exception {
    Tokenizer segmenter = getSegmenter();
    String text = "中华人民共和国(People's Republic of China),简称'中国'";
    segmenter.setReader(new StringReader(text));
    segmenter.reset();
    while (segmenter.incrementToken()) {
        // term
        CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute position = segmenter.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute type = segmenter.getAttribute(TypeAttribute.class);
        LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset()));
        Assert.assertEquals(term.toString().toLowerCase(), text.substring(offset.startOffset(), offset.endOffset()).toLowerCase());
    }
}
Example #21
Source File: LuceneUtil.java From antsdb with GNU Lesser General Public License v3.0
static void tokenize(String text, BiConsumer<String, String> lambda) {
    try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
        TokenStream stream = analyzer.tokenStream("", text);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        TypeAttribute type = stream.getAttribute(TypeAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            lambda.accept(type.type(), term.toString());
        }
    } catch (IOException x) {
        throw new RuntimeException(x);
    }
}
Example #22
Source File: BosonNLPTokenizer.java From elasticsearch-analysis-bosonnlp with Apache License 2.0
/**
 * Lucene constructor
 *
 * @throws UnirestException
 * @throws JSONException
 * @throws IOException
 */
public BosonNLPTokenizer(String URL, String BAT, int spaceMode, int oovLevel, int t2s, int specialCharConv)
        throws IOException, JSONException, UnirestException {
    super();
    // Add token offset attribute
    offsetAttr = addAttribute(OffsetAttribute.class);
    // Add token content attribute
    charTermAttr = addAttribute(CharTermAttribute.class);
    // Add token type attribute
    typeAttr = addAttribute(TypeAttribute.class);
    // Add token position attribute
    piAttr = addAttribute(PositionIncrementAttribute.class);
    // Create a new word segmenter to get tokens
    BosonSeg = new BosonNLPWordSegmenter(input, URL, BAT, spaceMode, oovLevel, t2s, specialCharConv);
}
Example #23
Source File: SimpleQueryConverter.java From lucene-solr with Apache License 2.0
@Override
public Collection<Token> convert(String origQuery) {
    Collection<Token> result = new HashSet<>();
    try (WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
         TokenStream ts = analyzer.tokenStream("", origQuery)) {
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            Token tok = new Token();
            tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            tok.setFlags(flagsAtt.getFlags());
            tok.setPayload(payloadAtt.getPayload());
            tok.setPositionIncrement(posIncAtt.getPositionIncrement());
            tok.setType(typeAtt.type());
            result.add(tok);
        }
        ts.end();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Example #24
Source File: HanLPTokenizerFactoryTest.java From hanlp-lucene-plugin with Apache License 2.0
public void testCreate() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);

    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前"
            + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有"
            + "突出外表、百變髮型及正面的形象,以至自己"
            + "品牌的男士香水等商品,及長期擔任運動品牌"
            + "Adidas的代言人,因此對大眾傳播媒介和時尚界"
            + "等方面都具很大的影響力,在足球圈外所獲得的"
            + "認受程度可謂前所未見。"));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #25
Source File: TokenizerRunTest.java From KOMORAN with Apache License 2.0
@Test
public void analyzeTest() throws Exception {
    String testSource = "저는 이번에 바람과 함께 사라지다를 봤어요";
    KomoranTokenizer tokenStream = new KomoranTokenizer(new Komoran(DEFAULT_MODEL.STABLE));
    tokenStream.setReader(new StringReader(testSource));
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        System.out.println(tokenStream.getAttribute(CharTermAttribute.class));
        System.out.println(tokenStream.getAttribute(TypeAttribute.class).type());
        System.out.println(tokenStream.getAttribute(OffsetAttribute.class).startOffset() + " : " + tokenStream.getAttribute(OffsetAttribute.class).endOffset());
    }
    tokenStream.end();
    tokenStream.close();
}
Example #26
Source File: HanLPTokenizerTest.java From hanlp-lucene-plugin with Apache License 2.0
public void testIncrementToken() throws Exception {
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #27
Source File: AnsjTokenizer.java From ansj4solr with Apache License 2.0
public AnsjTokenizer(Reader input, int analysisType, boolean removePunc) {
    super(input);
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(CharTermAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    positionIncrementAtt = addAttribute(PositionIncrementAttribute.class);
    this.analysisType = analysisType;
    this.removePunc = removePunc;
}
Example #28
Source File: IKTokenizer.java From es-ik with Apache License 2.0
public IKTokenizer(Reader in, DictionaryConfiguration configuration) {
    super(in);
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(CharTermAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    _IKImplement = new IKSegmenter(input, configuration);
}
Example #29
Source File: MMSegTokenizer.java From mmseg4j-solr with Apache License 2.0
public MMSegTokenizer(Seg seg) {
    this.seg = seg;
    termAtt = addAttribute(CharTermAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
}
Example #30
Source File: CutLetterDigitFilter.java From mmseg4j-solr with Apache License 2.0
public CutLetterDigitFilter(TokenStream input) {
    super(input);
    reusableToken = new PackedTokenAttributeImpl();
    termAtt = addAttribute(CharTermAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
}