Java Code Examples for org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement()
The following examples show how to use org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement().
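Most of the examples below follow the same skeleton: reset the stream, add the attribute, accumulate getPositionIncrement() into an absolute position while iterating, then call end(). Here is a minimal, self-contained sketch of that loop; the StandardAnalyzer, the "body" field name, and the class name are illustrative assumptions, not taken from any of the projects below.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class PositionIncrementDemo {
  public static void main(String[] args) throws IOException {
    // StandardAnalyzer and the "body" field are illustrative choices for this sketch
    try (Analyzer analyzer = new StandardAnalyzer();
         TokenStream ts = analyzer.tokenStream("body", "a quick brown fox")) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();          // mandatory before the first incrementToken()
      int position = -1;   // start at -1 so the first increment lands on position 0
      while (ts.incrementToken()) {
        position += posIncAtt.getPositionIncrement();
        System.out.println(position + ": " + termAtt);
      }
      ts.end();            // publishes the trailing position increment, if any
    }
  }
}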
Example 1
Source File: SimpleSynonymMap.java From elasticsearch-dynamic-synonym with Apache License 2.0
private Set<String> analyze(String text) throws IOException {
  Set<String> result = new HashSet<String>();
  Analyzer analyzer = configuration.getAnalyzer();
  try (TokenStream ts = analyzer.tokenStream("", text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      int length = termAtt.length();
      if (length == 0) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
      }
      if (posIncAtt.getPositionIncrement() != 1) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
      }
      result.add(new String(termAtt.buffer(), 0, termAtt.length()));
    }
    ts.end();
    return result;
  }
}
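This helper treats any token with a position increment other than 1 as an error: synonym source text must analyze to a plain one-to-one token sequence, so stacked tokens (increment 0) and gaps left by removed tokens (increment greater than 1) are both rejected.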
Example 2
Source File: StringFieldType.java From crate with Apache License 2.0
@Override
public Query phraseQuery(String field, TokenStream stream, int slop, boolean enablePosIncrements) throws IOException {
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  builder.setSlop(slop);
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  int position = -1;
  stream.reset();
  while (stream.incrementToken()) {
    if (enablePosIncrements) {
      position += posIncrAtt.getPositionIncrement();
    } else {
      position += 1;
    }
    builder.add(new Term(field, termAtt.getBytesRef()), position);
  }
  return builder.build();
}
Example 3
Source File: ShingleAnalyzerWrapperTest.java From lucene-solr with Apache License 2.0
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  try (TokenStream ts = analyzer.tokenStream("content", "this sentence")) {
    int j = -1;
    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      j += posIncrAtt.getPositionIncrement();
      String termText = termAtt.toString();
      builder.add(new Term("content", termText), j);
    }
    ts.end();
  }
  PhraseQuery q = builder.build();
  ScoreDoc[] hits = searcher.search(q, 1000).scoreDocs;
  int[] ranks = new int[] { 0 };
  compareRanks(hits, ranks);
}
Example 4
Source File: QueryBuilder.java From lucene-solr with Apache License 2.0
/**
 * Creates complex boolean query from the cached tokenstream contents
 */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
  BooleanQuery.Builder q = newBooleanQuery();
  List<TermAndBoost> currentQuery = new ArrayList<>();
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);

  stream.reset();
  while (stream.incrementToken()) {
    if (posIncrAtt.getPositionIncrement() != 0) {
      add(q, currentQuery, operator);
      currentQuery.clear();
    }
    currentQuery.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
  }
  add(q, currentQuery, operator);

  return q.build();
}
Example 5
Source File: QueryBuilder.java From lucene-solr with Apache License 2.0
/**
 * Creates simple phrase query from the cached tokenstream contents
 */
protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException {
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  builder.setSlop(slop);
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  int position = -1;
  float phraseBoost = DEFAULT_BOOST;
  stream.reset();
  while (stream.incrementToken()) {
    if (enablePositionIncrements) {
      position += posIncrAtt.getPositionIncrement();
    } else {
      position += 1;
    }
    builder.add(new Term(field, termAtt.getBytesRef()), position);
    phraseBoost *= boostAtt.getBoost();
  }
  PhraseQuery query = builder.build();
  if (phraseBoost == DEFAULT_BOOST) {
    return query;
  }
  return new BoostQuery(query, phraseBoost);
}
Example 6
Source File: TransportExtendedAnalyzeAction.java From elasticsearch-extended-analyze with Apache License 2.0
private List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> processAnalysis(TokenStream stream, Set<String> includeAttributes, boolean shortAttrName, int lastPosition, int lastOffset) throws IOException {
  List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> tokens = new ArrayList<>();
  stream.reset();

  // and each token's output
  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
  TypeAttribute type = stream.addAttribute(TypeAttribute.class);

  while (stream.incrementToken()) {
    int increment = posIncr.getPositionIncrement();
    if (increment > 0) {
      lastPosition = lastPosition + increment;
    }
    tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition,
        lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(),
        extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
  }
  stream.end();
  return tokens;
}
Example 7
Source File: AnalysisRequestHandlerBase.java From lucene-solr with Apache License 2.0
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> tokens = new ArrayList<>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(position);
      tokens.add(tokenStream.cloneAttributes());
    }
    tokenStream.end(); // TODO should we capture?
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return tokens;
}
Example 8
Source File: QueryBuilder.java From lucene-solr with Apache License 2.0
/**
 * Creates complex phrase query from the cached tokenstream contents
 */
protected Query analyzeMultiPhrase(String field, TokenStream stream, int slop) throws IOException {
  MultiPhraseQuery.Builder mpqb = newMultiPhraseQueryBuilder();
  mpqb.setSlop(slop);

  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  int position = -1;

  List<Term> multiTerms = new ArrayList<>();
  stream.reset();
  while (stream.incrementToken()) {
    int positionIncrement = posIncrAtt.getPositionIncrement();
    if (positionIncrement > 0 && multiTerms.size() > 0) {
      if (enablePositionIncrements) {
        mpqb.add(multiTerms.toArray(new Term[0]), position);
      } else {
        mpqb.add(multiTerms.toArray(new Term[0]));
      }
      multiTerms.clear();
    }
    position += positionIncrement;
    multiTerms.add(new Term(field, termAtt.getBytesRef()));
  }

  if (enablePositionIncrements) {
    mpqb.add(multiTerms.toArray(new Term[0]), position);
  } else {
    mpqb.add(multiTerms.toArray(new Term[0]));
  }
  return mpqb.build();
}
Example 9
Source File: TextFieldMapper.java From crate with Apache License 2.0
private static boolean hasGaps(CachingTokenFilter stream) throws IOException {
  PositionIncrementAttribute posIncAtt = stream.getAttribute(PositionIncrementAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    if (posIncAtt.getPositionIncrement() > 1) {
      return true;
    }
  }
  return false;
}
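A position increment greater than 1 means the analysis chain left a positional gap, typically where a filter such as a stop filter removed a token; this check lets the caller detect such holes before deciding how to build a positional query.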
Example 10
Source File: StringFieldType.java From crate with Apache License 2.0
@Override
public Query multiPhraseQuery(String field, TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException {
  MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
  mpqb.setSlop(slop);

  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  int position = -1;

  List<Term> multiTerms = new ArrayList<>();
  stream.reset();
  while (stream.incrementToken()) {
    int positionIncrement = posIncrAtt.getPositionIncrement();
    if (positionIncrement > 0 && multiTerms.size() > 0) {
      if (enablePositionIncrements) {
        mpqb.add(multiTerms.toArray(new Term[0]), position);
      } else {
        mpqb.add(multiTerms.toArray(new Term[0]));
      }
      multiTerms.clear();
    }
    position += positionIncrement;
    multiTerms.add(new Term(field, termAtt.getBytesRef()));
  }

  if (enablePositionIncrements) {
    mpqb.add(multiTerms.toArray(new Term[0]), position);
  } else {
    mpqb.add(multiTerms.toArray(new Term[0]));
  }
  return mpqb.build();
}
Example 11
Source File: TransportExtendedAnalyzeAction.java From elasticsearch-extended-analyze with Apache License 2.0
private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes, boolean shortAttrName) {
  try {
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
      int increment = posIncr.getPositionIncrement();
      if (increment > 0) {
        lastPosition = lastPosition + increment;
      }
      tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition,
          lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(),
          extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
    }
    stream.end();
    lastOffset += offset.endOffset();
    lastPosition += posIncr.getPositionIncrement();

    lastPosition += analyzer.getPositionIncrementGap(field);
    lastOffset += analyzer.getOffsetGap(field);
  } catch (IOException e) {
    throw new ElasticsearchException("failed to analyze", e);
  } finally {
    IOUtils.closeWhileHandlingException(stream);
  }
}
Example 12
Source File: TokenCountFieldMapper.java From Elasticsearch with Apache License 2.0
/**
 * Count position increments in a token stream. Package private for testing.
 * @param analyzer analyzer to create token stream
 * @param fieldName field name to pass to analyzer
 * @param fieldValue field value to pass to analyzer
 * @return number of position increments in a token stream
 * @throws IOException if tokenStream throws it
 */
static int countPositions(Analyzer analyzer, String fieldName, String fieldValue) throws IOException {
  try (TokenStream tokenStream = analyzer.tokenStream(fieldName, fieldValue)) {
    int count = 0;
    PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      count += position.getPositionIncrement();
    }
    tokenStream.end();
    count += position.getPositionIncrement();
    return count;
  }
}
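Note the second getPositionIncrement() call after tokenStream.end(): end() publishes the position increment of the end-of-stream state, so positional gaps behind the last returned token (for example, trailing stopwords removed by a stop filter) are still included in the count.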
Example 13
Source File: SynonymMap.java From lucene-solr with Apache License 2.0
/** Sugar: analyzes the text with the analyzer and
 *  separates by {@link SynonymMap#WORD_SEPARATOR}.
 *  reuse and its chars must not be null. */
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("", text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.clear();
    while (ts.incrementToken()) {
      int length = termAtt.length();
      if (length == 0) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
      }
      if (posIncAtt.getPositionIncrement() != 1) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a token (" + termAtt +
            ") with position increment != 1 (got: " + posIncAtt.getPositionIncrement() + ")");
      }
      reuse.grow(reuse.length() + length + 1); /* current + word + separator */
      int end = reuse.length();
      if (reuse.length() > 0) {
        reuse.setCharAt(end++, SynonymMap.WORD_SEPARATOR);
        reuse.setLength(reuse.length() + 1);
      }
      System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length);
      reuse.setLength(reuse.length() + length);
    }
    ts.end();
  }
  if (reuse.length() == 0) {
    throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
  }
  return reuse.get();
}
Example 14
Source File: TransportAnalyzeAction.java From Elasticsearch with Apache License 2.0
private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes) {
  try {
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
      int increment = posIncr.getPositionIncrement();
      if (increment > 0) {
        lastPosition = lastPosition + increment;
      }
      tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition,
          lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(),
          extractExtendedAttributes(stream, includeAttributes)));
    }
    stream.end();
    lastOffset += offset.endOffset();
    lastPosition += posIncr.getPositionIncrement();

    lastPosition += analyzer.getPositionIncrementGap(field);
    lastOffset += analyzer.getOffsetGap(field);
  } catch (IOException e) {
    throw new ElasticsearchException("failed to analyze", e);
  } finally {
    IOUtils.closeWhileHandlingException(stream);
  }
}
Example 15
Source File: TransportAnalyzeAction.java From Elasticsearch with Apache License 2.0
private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
  List<AnalyzeResponse.AnalyzeToken> tokens = new ArrayList<>();
  int lastPosition = -1;
  int lastOffset = 0;
  for (String text : request.text()) {
    try (TokenStream stream = analyzer.tokenStream(field, text)) {
      stream.reset();
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
      TypeAttribute type = stream.addAttribute(TypeAttribute.class);

      while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
          lastPosition = lastPosition + increment;
        }
        tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition,
            lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(), null));
      }
      stream.end();
      lastOffset += offset.endOffset();
      lastPosition += posIncr.getPositionIncrement();

      lastPosition += analyzer.getPositionIncrementGap(field);
      lastOffset += analyzer.getOffsetGap(field);
    } catch (IOException e) {
      throw new ElasticsearchException("failed to analyze", e);
    }
  }
  return tokens;
}
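Because lastPosition and lastOffset carry over between loop iterations, each value in request.text() is positioned after the previous one, separated by the analyzer's getPositionIncrementGap(field) and getOffsetGap(field), mirroring how multi-valued fields are laid out at index time.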
Example 16
Source File: GraphTokenStreamFiniteStrings.java From lucene-solr with Apache License 2.0
/**
 * Build an automaton from the provided {@link TokenStream}.
 */
private Automaton build(final TokenStream in) throws IOException {
  Automaton.Builder builder = new Automaton.Builder();

  final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);

  in.reset();

  int pos = -1;
  int prevIncr = 1;
  int state = -1;
  int id = -1;
  int gap = 0;
  while (in.incrementToken()) {
    int currentIncr = posIncAtt.getPositionIncrement();
    if (pos == -1 && currentIncr < 1) {
      throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
    }

    if (currentIncr == 0) {
      if (gap > 0) {
        pos -= gap;
      }
    } else {
      pos++;
      gap = currentIncr - 1;
    }

    int endPos = pos + posLengthAtt.getPositionLength() + gap;
    while (state < endPos) {
      state = builder.createState();
    }

    id++;
    if (tokens.length < id + 1) {
      tokens = ArrayUtil.grow(tokens, id + 1);
    }

    tokens[id] = in.cloneAttributes();
    builder.addTransition(pos, endPos, id);
    pos += gap;

    // we always produce linear token graphs from getFiniteStrings(), so we need to adjust
    // posLength and posIncrement accordingly
    tokens[id].addAttribute(PositionLengthAttribute.class).setPositionLength(1);
    if (currentIncr == 0) {
      // stacked token should have the same increment as original token at this position
      tokens[id].addAttribute(PositionIncrementAttribute.class).setPositionIncrement(prevIncr);
    }

    // only save last increment on non-zero increment in case we have multiple stacked tokens
    if (currentIncr > 0) {
      prevIncr = currentIncr;
    }
  }

  in.end();
  if (state != -1) {
    builder.setAccept(state, true);
  }
  return builder.finish();
}
Example 17
Source File: TermSubQueryBuilder.java From querqy with Apache License 2.0
public TermSubQueryFactory termToFactory(final String fieldname, final Term sourceTerm, final FieldBoost boost) throws IOException {
  final CacheKey cacheKey;

  if (termQueryCache != null) {
    cacheKey = new CacheKey(fieldname, sourceTerm);

    TermQueryCacheValue cacheValue = termQueryCache.get(cacheKey);
    if (cacheValue != null) {
      // The cache references factories with pre-analyzed terms, or cache entries without a
      // query factory if the term does not exist in the index. cacheValue.hasQuery() returns
      // true/false correspondingly.
      // Cache entries don't have a boost factor, it is only added later via the queryFactory.
      return (cacheValue.hasQuery()) ? new TermSubQueryFactory(cacheValue, boost) : null;
    }
  } else {
    cacheKey = null;
  }

  final LuceneQueryFactoryAndPRMSQuery root;
  TokenStream ts = null;
  try {
    ts = analyzer.tokenStream(fieldname, new CharSequenceReader(sourceTerm));
    final CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncAttr = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();

    final PositionSequence<org.apache.lucene.index.Term> sequence = new PositionSequence<>();
    while (ts.incrementToken()) {
      final int inc = posIncAttr.getPositionIncrement();
      if (inc > 0 || sequence.isEmpty()) {
        sequence.nextPosition();
      }
      sequence.addElement(new org.apache.lucene.index.Term(fieldname, new BytesRef(termAttr)));
    }

    root = positionSequenceToQueryFactoryAndPRMS(sequence);
  } finally {
    if (ts != null) {
      try {
        ts.close();
      } catch (IOException e) {
      }
    }
  }

  putQueryFactoryAndPRMSQueryIntoCache(cacheKey, root);

  return root == null ? null : new TermSubQueryFactory(root, boost);
}
Example 18
Source File: MemoryIndex.java From lucene-solr with Apache License 2.0
private void storeTerms(Info info, TokenStream tokenStream, int positionIncrementGap, int offsetGap) {
  int pos = -1;
  int offset = 0;
  if (info.numTokens > 0) {
    pos = info.lastPosition + positionIncrementGap;
    offset = info.lastOffset + offsetGap;
  }

  try (TokenStream stream = tokenStream) {
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    PayloadAttribute payloadAtt = storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
    stream.reset();

    while (stream.incrementToken()) {
      // if (DEBUG) System.err.println("token='" + term + "'");
      info.numTokens++;
      final int posIncr = posIncrAttribute.getPositionIncrement();
      if (posIncr == 0) {
        info.numOverlapTokens++;
      }
      pos += posIncr;
      int ord = info.terms.add(termAtt.getBytesRef());
      if (ord < 0) {
        ord = (-ord) - 1;
        postingsWriter.reset(info.sliceArray.end[ord]);
      } else {
        info.sliceArray.start[ord] = postingsWriter.startNewSlice();
      }
      info.sliceArray.freq[ord]++;
      info.maxTermFrequency = Math.max(info.maxTermFrequency, info.sliceArray.freq[ord]);
      info.sumTotalTermFreq++;
      postingsWriter.writeInt(pos);
      if (storeOffsets) {
        postingsWriter.writeInt(offsetAtt.startOffset() + offset);
        postingsWriter.writeInt(offsetAtt.endOffset() + offset);
      }
      if (storePayloads) {
        final BytesRef payload = payloadAtt.getPayload();
        final int pIndex;
        if (payload == null || payload.length == 0) {
          pIndex = -1;
        } else {
          pIndex = payloadsBytesRefs.append(payload);
        }
        postingsWriter.writeInt(pIndex);
      }
      info.sliceArray.end[ord] = postingsWriter.getCurrentOffset();
    }
    stream.end();
    if (info.numTokens > 0) {
      info.lastPosition = pos;
      info.lastOffset = offsetAtt.endOffset() + offset;
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}