org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute Java Examples
The following examples show how to use org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute. Each example is taken from an open-source project; the source file and license are noted above it.
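PositionLengthAttribute records how many positions a token spans in the token graph: 1 for an ordinary token, more for a token such as a multi-word synonym that stacks on top of several others (e.g. "wifi" spanning "wi fi"). As a warm-up, here is a minimal, self-contained sketch of reading the attribute from an analyzer; the field name and sample text are arbitrary placeholders:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

public class PosLengthDemo {
  public static void main(String[] args) throws IOException {
    try (StandardAnalyzer analyzer = new StandardAnalyzer();
         TokenStream ts = analyzer.tokenStream("body", "fast wifi network")) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // a plain analyzer emits no graph tokens, so every length is 1
        System.out.println(termAtt + " posLength=" + posLenAtt.getPositionLength());
      }
      ts.end();
    }
  }
}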
Example #1
Source File: TokenStreamToDot.java, from lucene-solr (Apache License 2.0)
/** If inputText is non-null, and the TokenStream has
 *  offsets, we include the surface form in each arc's
 *  label. */
public TokenStreamToDot(String inputText, TokenStream in, PrintWriter out) {
  this.in = in;
  this.out = out;
  this.inputText = inputText;
  termAtt = in.addAttribute(CharTermAttribute.class);
  posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
  posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
  if (in.hasAttribute(OffsetAttribute.class)) {
    offsetAtt = in.addAttribute(OffsetAttribute.class);
  } else {
    offsetAtt = null;
  }
}
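TokenStreamToDot renders a token graph in GraphViz DOT format, labeling each arc with the term and its position increment/length. A hedged usage sketch, assuming the toDot() entry point this lucene-solr test-framework class provides:

// Hypothetical driver: dump the token graph of an analyzed string as DOT.
String text = "fast wifi network";                      // placeholder text
try (Analyzer analyzer = new StandardAnalyzer();
     TokenStream ts = analyzer.tokenStream("f", text)) {
  new TokenStreamToDot(text, ts, new PrintWriter(System.out)).toDot();
}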
Example #2
Source File: MeCabKoTokenizer.java, from mecab-ko-lucene-analyzer (Apache License 2.0)
private void setAttributes() {
  charTermAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  posLenAtt = addAttribute(PositionLengthAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
  typeAtt = addAttribute(TypeAttribute.class);
  posAtt = addAttribute(PartOfSpeechAttribute.class);
  semanticClassAtt = addAttribute(SemanticClassAttribute.class);
}
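setAttributes() is only the registration step; the tokenizer then fills these fields for each token inside incrementToken(). A sketch of that typical shape (the hasMoreTokens/nextSurfaceForm/tokenStart/tokenEnd helpers are hypothetical, not MeCabKoTokenizer's real internals):

@Override
public final boolean incrementToken() throws IOException {
  if (!hasMoreTokens()) {                           // hypothetical helper
    return false;
  }
  clearAttributes();                                // reset all registered attributes for the new token
  charTermAtt.append(nextSurfaceForm());            // hypothetical helper
  posIncrAtt.setPositionIncrement(1);               // 0 would stack this token on the previous one
  posLenAtt.setPositionLength(1);                   // >1 for a token spanning several positions
  offsetAtt.setOffset(tokenStart(), tokenEnd());    // hypothetical helpers
  return true;
}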
Example #3
Source File: NGramTokenizerTest.java, from lucene-solr (Apache License 2.0)
static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException {
  // convert the string to code points
  final int[] codePoints = toCodePoints(s);
  final int[] offsets = new int[codePoints.length + 1];
  for (int i = 0; i < codePoints.length; ++i) {
    offsets[i+1] = offsets[i] + Character.charCount(codePoints[i]);
  }
  final Tokenizer grams = new NGramTokenizer(minGram, maxGram, edgesOnly) {
    @Override
    protected boolean isTokenChar(int chr) {
      return nonTokenChars.indexOf(chr) < 0;
    }
  };
  grams.setReader(new StringReader(s));
  final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
  final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
  final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
  grams.reset();
  for (int start = 0; start < codePoints.length; ++start) {
    nextGram:
    for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
      if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
        // not on an edge
        continue nextGram;
      }
      for (int j = start; j < end; ++j) {
        if (!isTokenChar(nonTokenChars, codePoints[j])) {
          continue nextGram;
        }
      }
      assertTrue(grams.incrementToken());
      assertArrayEquals(ArrayUtil.copyOfSubArray(codePoints, start, end), toCodePoints(termAtt));
      assertEquals(1, posIncAtt.getPositionIncrement());
      assertEquals(1, posLenAtt.getPositionLength());
      assertEquals(offsets[start], offsetAtt.startOffset());
      assertEquals(offsets[end], offsetAtt.endOffset());
    }
  }
  assertFalse(grams.incrementToken());
  grams.end();
  assertEquals(s.length(), offsetAtt.startOffset());
  assertEquals(s.length(), offsetAtt.endOffset());
}
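The assertions above pin down the contract that n-grams never form a graph: every gram advances the position by 1 and spans exactly 1 position. A small stand-alone sketch of the same behavior (imports as in the test, values illustrative):

Tokenizer grams = new NGramTokenizer(2, 3);          // bigrams and trigrams
grams.setReader(new StringReader("abcd"));
CharTermAttribute term = grams.addAttribute(CharTermAttribute.class);
PositionLengthAttribute posLen = grams.addAttribute(PositionLengthAttribute.class);
grams.reset();
while (grams.incrementToken()) {
  System.out.println(term + " posLength=" + posLen.getPositionLength());  // always 1
}
grams.end();
grams.close();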
Example #4
Source File: GraphTokenFilter.java, from lucene-solr (Apache License 2.0)
Token(AttributeSource attSource) {
  this.attSource = attSource;
  this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
  boolean hasLengthAtt = attSource.hasAttribute(PositionLengthAttribute.class);
  this.lengthAtt = hasLengthAtt ? attSource.addAttribute(PositionLengthAttribute.class) : null;
}
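Because the stream may never have registered a PositionLengthAttribute, lengthAtt can be null, and callers presumably fall back to the default span of 1. A sketch of such an accessor (the method name is our own choice):

// Hypothetical accessor: every token spans at least one position,
// so a missing attribute is read as length 1.
int length() {
  return lengthAtt == null ? 1 : lengthAtt.getPositionLength();
}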
Example #5
Source File: GraphTokenStreamFiniteStrings.java, from lucene-solr (Apache License 2.0)
/**
 * Build an automaton from the provided {@link TokenStream}.
 */
private Automaton build(final TokenStream in) throws IOException {
  Automaton.Builder builder = new Automaton.Builder();
  final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
  in.reset();
  int pos = -1;
  int prevIncr = 1;
  int state = -1;
  int id = -1;
  int gap = 0;
  while (in.incrementToken()) {
    int currentIncr = posIncAtt.getPositionIncrement();
    if (pos == -1 && currentIncr < 1) {
      throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
    }
    if (currentIncr == 0) {
      if (gap > 0) {
        pos -= gap;
      }
    } else {
      pos++;
      gap = currentIncr - 1;
    }
    int endPos = pos + posLengthAtt.getPositionLength() + gap;
    while (state < endPos) {
      state = builder.createState();
    }
    id++;
    if (tokens.length < id + 1) {
      tokens = ArrayUtil.grow(tokens, id + 1);
    }
    tokens[id] = in.cloneAttributes();
    builder.addTransition(pos, endPos, id);
    pos += gap;
    // we always produce linear token graphs from getFiniteStrings(), so we need to adjust
    // posLength and posIncrement accordingly
    tokens[id].addAttribute(PositionLengthAttribute.class).setPositionLength(1);
    if (currentIncr == 0) {
      // stacked token should have the same increment as original token at this position
      tokens[id].addAttribute(PositionIncrementAttribute.class).setPositionIncrement(prevIncr);
    }
    // only save last increment on non-zero increment in case we have multiple stacked tokens
    if (currentIncr > 0) {
      prevIncr = currentIncr;
    }
  }
  in.end();
  if (state != -1) {
    builder.setAccept(state, true);
  }
  return builder.finish();
}
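For context, this builder backs GraphTokenStreamFiniteStrings, which expands a token graph into its individual linear paths; that is why the loop rewrites each cloned token's posLength to 1. A hedged usage sketch (analyzer and text are placeholders, and the getFiniteStrings() signature is assumed from lucene-solr):

TokenStream in = analyzer.tokenStream("f", "wifi network");     // placeholder analyzer/text
GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(in);
Iterator<TokenStream> paths = graph.getFiniteStrings();
while (paths.hasNext()) {
  TokenStream path = paths.next();  // each path is linear: all position lengths are 1
  // consume the path like any ordinary TokenStream ...
}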