Java Code Examples for org.apache.lucene.analysis.tokenattributes.OffsetAttribute#endOffset()
The following examples show how to use org.apache.lucene.analysis.tokenattributes.OffsetAttribute#endOffset().
The examples are drawn from open-source projects; the source file and license are noted above each listing.
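Before the project examples, a minimal, self-contained sketch of the attribute itself may help (this is an illustrative example, not taken from any of the projects below; it assumes lucene-core and the common analyzers module on the classpath):

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    public class EndOffsetDemo {
        public static void main(String[] args) throws IOException {
            try (Analyzer analyzer = new WhitespaceAnalyzer();
                 TokenStream stream = analyzer.tokenStream("body", "Hello offset world")) {
                CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
                OffsetAttribute offsets = stream.addAttribute(OffsetAttribute.class);
                stream.reset();
                while (stream.incrementToken()) {
                    // endOffset() is the index one past the token's last character
                    System.out.println(term + " -> [" + offsets.startOffset() + "," + offsets.endOffset() + ")");
                }
                stream.end();   // records the final offset of the whole input
            }
        }
    }

Because [startOffset(), endOffset()) is a half-open interval over the original text, it can be passed directly to String.substring() to recover the token's source characters.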
Example 1
Source File: PlainHighlighter.java From Elasticsearch with Apache License 2.0
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer,
                                                    String fieldName, String contents) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
Example 2
Source File: ConcatenatingTokenStream.java From lucene-solr with Apache License 2.0
@Override
public boolean incrementToken() throws IOException {
    boolean newSource = false;
    while (sources[currentSource].incrementToken() == false) {
        if (currentSource >= sources.length - 1)
            return false;
        sources[currentSource].end();
        initialPositionIncrement = sourceIncrements[currentSource].getPositionIncrement();
        OffsetAttribute att = sourceOffsets[currentSource];
        if (att != null)
            offsetIncrement += att.endOffset();
        currentSource++;
        newSource = true;
    }
    clearAttributes();
    sources[currentSource].copyTo(this);
    offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement,
                        offsetAtt.endOffset() + offsetIncrement);
    if (newSource) {
        int posInc = posIncAtt.getPositionIncrement();
        posIncAtt.setPositionIncrement(posInc + initialPositionIncrement);
    }
    return true;
}
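A hedged usage sketch of the filter above (the input strings are illustrative): tokens coming from the second source should appear with their offsets shifted by the first source's final endOffset(), which is exactly what the offsetIncrement bookkeeping implements.

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.ConcatenatingTokenStream;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    public class ConcatDemo {
        public static void main(String[] args) throws IOException {
            Tokenizer first = new WhitespaceTokenizer();
            first.setReader(new StringReader("first part"));
            Tokenizer second = new WhitespaceTokenizer();
            second.setReader(new StringReader("second part"));
            try (TokenStream cat = new ConcatenatingTokenStream(first, second)) {
                OffsetAttribute off = cat.addAttribute(OffsetAttribute.class);
                cat.reset();
                while (cat.incrementToken()) {
                    // tokens from "second part" print shifted by 10,
                    // the final endOffset() of "first part"
                    System.out.println(off.startOffset() + "-" + off.endOffset());
                }
                cat.end();
            }
        }
    }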
Example 3
Source File: LuceneUtil.java From jasperreports with GNU Lesser General Public License v3.0
protected String displayTokens(String text, String elementId) throws IOException {
    Analyzer analyzer = new LuceneSimpleAnalyzer(isCaseSensitive, removeAccents);
    StringBuilder sb = new StringBuilder();
    sb.append(elementId).append(": ").append(text).append(": ");

    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);

    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String term = charTermAttribute.toString();
        sb.append("[" + term + "](" + startOffset + "," + endOffset + ") ");
    }
    return sb.toString();
}
Example 4
Source File: AutoPhrasingTokenFilter.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private void emit(char[] tokenChars) {
    char[] token = tokenChars;
    if (replaceWhitespaceWith != null) {
        token = replaceWhiteSpace(token);
    }
    CharTermAttribute termAttr = getTermAttribute();
    if (termAttr != null) {
        termAttr.setEmpty();
        termAttr.append(new StringBuilder().append(token));
    }
    OffsetAttribute offAttr = getOffsetAttribute();
    if (offAttr != null && offAttr.endOffset() >= token.length) {
        int start = offAttr.endOffset() - token.length;
        offAttr.setOffset(start, offAttr.endOffset());
    }
    PositionIncrementAttribute pia = getPositionIncrementAttribute();
    if (pia != null) {
        pia.setPositionIncrement(++positionIncr);
    }
    lastEmitted = token;
}
Example 5
Source File: TransportAnalyzeAction.java From Elasticsearch with Apache License 2.0
private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request,
        Analyzer analyzer, String field) {
    List<AnalyzeResponse.AnalyzeToken> tokens = new ArrayList<>();
    int lastPosition = -1;
    int lastOffset = 0;
    for (String text : request.text()) {
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            stream.reset();
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            TypeAttribute type = stream.addAttribute(TypeAttribute.class);

            while (stream.incrementToken()) {
                int increment = posIncr.getPositionIncrement();
                if (increment > 0) {
                    lastPosition = lastPosition + increment;
                }
                tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition,
                        lastOffset + offset.startOffset(), lastOffset + offset.endOffset(),
                        type.type(), null));
            }
            stream.end();
            lastOffset += offset.endOffset();
            lastPosition += posIncr.getPositionIncrement();

            lastPosition += analyzer.getPositionIncrementGap(field);
            lastOffset += analyzer.getOffsetGap(field);
        } catch (IOException e) {
            throw new ElasticsearchException("failed to analyze", e);
        }
    }
    return tokens;
}
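One subtlety in this method: after stream.end(), the OffsetAttribute reports the final offset of the entire input, which is why lastOffset is advanced by offset.endOffset() rather than by the last token's end. A minimal sketch of that behavior (illustrative field name and text, assuming lucene-core and the common analyzers module):

    try (Analyzer analyzer = new WhitespaceAnalyzer();
         TokenStream stream = analyzer.tokenStream("f", "foo bar")) {
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // consume every token
        }
        stream.end();
        System.out.println(offset.endOffset());   // 7, the length of "foo bar"
    }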
Example 6
Source File: TransportAnalyzeAction.java From Elasticsearch with Apache License 2.0
private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes) {
    try {
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                lastPosition = lastPosition + increment;
            }
            tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition,
                    lastOffset + offset.startOffset(), lastOffset + offset.endOffset(),
                    type.type(), extractExtendedAttributes(stream, includeAttributes)));
        }
        stream.end();
        lastOffset += offset.endOffset();
        lastPosition += posIncr.getPositionIncrement();

        lastPosition += analyzer.getPositionIncrementGap(field);
        lastOffset += analyzer.getOffsetGap(field);
    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
}
Example 7
Source File: XmlInterpolationTest.java From lucene-solr with Apache License 2.0
private int[] analyzeTagOne(String docText, String start, String end) {
    int[] result = {-1, -1};

    Reader filter = new HTMLStripCharFilter(new StringReader(docText));

    WhitespaceTokenizer ts = new WhitespaceTokenizer();
    final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
    try {
        ts.setReader(filter);
        ts.reset();
        while (ts.incrementToken()) {
            final String termString = termAttribute.toString();
            if (termString.equals(start))
                result[0] = offsetAttribute.startOffset();
            if (termString.equals(end)) {
                result[1] = offsetAttribute.endOffset();
                return result;
            }
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeQuietly(ts);
    }
    return result;
}
Example 8
Source File: DecompoundTokenFilter.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
DecompoundToken(CharSequence value, CharTermAttribute termAttribute, OffsetAttribute offsetAttribute) {
    this.value = value;
    if (offsetAttribute.endOffset() - offsetAttribute.startOffset() != termAttribute.length()) {
        this.startOffset = offsetAttribute.startOffset();
        this.endOffset = offsetAttribute.endOffset();
    } else {
        this.startOffset = offsetAttribute.startOffset();
        this.endOffset = offsetAttribute.startOffset() + termAttribute.length();
    }
}
Example 9
Source File: TransportExtendedAnalyzeAction.java From elasticsearch-extended-analyze with Apache License 2.0
private void analyze(TokenStream stream, Analyzer analyzer, String field,
        Set<String> includeAttributes, boolean shortAttrName) {
    try {
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                lastPosition = lastPosition + increment;
            }
            tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition,
                    lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(),
                    extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
        }
        stream.end();
        lastOffset += offset.endOffset();
        lastPosition += posIncr.getPositionIncrement();

        lastPosition += analyzer.getPositionIncrementGap(field);
        lastOffset += analyzer.getOffsetGap(field);
    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
}
Example 10
Source File: XmlInterpolationTest.java From SolrTextTagger with Apache License 2.0
private int[] analyzeTagOne(String docText, String start, String end) {
    int[] result = {-1, -1};

    Reader filter = new HTMLStripCharFilter(new StringReader(docText));

    WhitespaceTokenizer ts = new WhitespaceTokenizer();
    final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
    try {
        ts.setReader(filter);
        ts.reset();
        while (ts.incrementToken()) {
            final String termString = termAttribute.toString();
            if (termString.equals(start))
                result[0] = offsetAttribute.startOffset();
            if (termString.equals(end)) {
                result[1] = offsetAttribute.endOffset();
                return result;
            }
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeQuietly(ts);
    }
    return result;
}
Example 11
Source File: MemoryIndex.java From lucene-solr with Apache License 2.0
private void storeTerms(Info info, TokenStream tokenStream, int positionIncrementGap, int offsetGap) {

    int pos = -1;
    int offset = 0;
    if (info.numTokens > 0) {
        pos = info.lastPosition + positionIncrementGap;
        offset = info.lastOffset + offsetGap;
    }

    try (TokenStream stream = tokenStream) {
        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        PayloadAttribute payloadAtt = storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
        stream.reset();

        while (stream.incrementToken()) {
            // if (DEBUG) System.err.println("token='" + term + "'");
            info.numTokens++;
            final int posIncr = posIncrAttribute.getPositionIncrement();
            if (posIncr == 0) {
                info.numOverlapTokens++;
            }
            pos += posIncr;
            int ord = info.terms.add(termAtt.getBytesRef());
            if (ord < 0) {
                ord = (-ord) - 1;
                postingsWriter.reset(info.sliceArray.end[ord]);
            } else {
                info.sliceArray.start[ord] = postingsWriter.startNewSlice();
            }
            info.sliceArray.freq[ord]++;
            info.maxTermFrequency = Math.max(info.maxTermFrequency, info.sliceArray.freq[ord]);
            info.sumTotalTermFreq++;
            postingsWriter.writeInt(pos);
            if (storeOffsets) {
                postingsWriter.writeInt(offsetAtt.startOffset() + offset);
                postingsWriter.writeInt(offsetAtt.endOffset() + offset);
            }
            if (storePayloads) {
                final BytesRef payload = payloadAtt.getPayload();
                final int pIndex;
                if (payload == null || payload.length == 0) {
                    pIndex = -1;
                } else {
                    pIndex = payloadsBytesRefs.append(payload);
                }
                postingsWriter.writeInt(pIndex);
            }
            info.sliceArray.end[ord] = postingsWriter.getCurrentOffset();
        }
        stream.end();
        if (info.numTokens > 0) {
            info.lastPosition = pos;
            info.lastOffset = offsetAtt.endOffset() + offset;
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
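For context, a hedged sketch of how storeTerms is typically reached from the public API (the field name, text, and query here are illustrative; MemoryIndex lives in Lucene's memory module): offsets are only written into the postings when the index is constructed with storeOffsets = true.

    MemoryIndex index = new MemoryIndex(true);    // true => store offsets
    index.addField("body", "quick brown fox", new WhitespaceAnalyzer());
    float score = index.search(new TermQuery(new Term("body", "fox")));   // > 0 on a match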