Java Code Examples for org.apache.lucene.index.PostingsEnum#nextPosition()
The following examples show how to use org.apache.lucene.index.PostingsEnum#nextPosition().
You can go to the original project or source file by following the links above each example.
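Before the examples, here is a minimal, self-contained sketch of the typical call pattern (assuming Lucene 8.x; the ByteBuffersDirectory/StandardAnalyzer choices and the field name "body" are illustrative, not taken from any example below). A PostingsEnum obtained from a term vector exposes exactly one document, and nextPosition() must be called freq() times, in order, to step through that term's occurrences.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

public class NextPositionDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = new ByteBuffersDirectory();
        try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            // store positions and offsets in the term vector so the PostingsEnum can report them
            FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
            ft.setStoreTermVectors(true);
            ft.setStoreTermVectorPositions(true);
            ft.setStoreTermVectorOffsets(true);
            Document doc = new Document();
            doc.add(new Field("body", "foo bar foo", ft));
            w.addDocument(doc);
        }
        try (IndexReader r = DirectoryReader.open(dir)) {
            Terms tv = r.getTermVectors(0).terms("body");
            TermsEnum te = tv.iterator();
            BytesRef term;
            while ((term = te.next()) != null) {
                PostingsEnum pe = te.postings(null, PostingsEnum.ALL);
                pe.nextDoc(); // a term-vector PostingsEnum holds a single (virtual) document
                for (int i = 0; i < pe.freq(); i++) {
                    int pos = pe.nextPosition(); // advance to the next occurrence of this term
                    System.out.println(term.utf8ToString() + " @ position " + pos
                            + " [" + pe.startOffset() + "," + pe.endOffset() + ")");
                }
            }
        }
    }
}

The examples that follow vary this pattern: some advance to a specific document instead of calling nextDoc(), and some read payloads or offsets alongside each position.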
Example 1
Source File: TermVectorsResponse.java From Elasticsearch with Apache License 2.0
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            currentPositions[j] = nextPos;
        }
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}
Example 2
Source File: TermVectorEntry.java From lucene-solr with Apache License 2.0
/**
 * Returns a new term vector entry representing the specified term, and optionally, positions.
 *
 * @param te - positioned terms iterator
 * @return term vector entry
 * @throws IOException - if there is a low level IO error.
 */
static TermVectorEntry of(TermsEnum te) throws IOException {
    Objects.requireNonNull(te);

    String termText = BytesRefUtils.decode(te.term());

    List<TermVectorEntry.TermVectorPosition> tvPositions = new ArrayList<>();
    PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
    pe.nextDoc();
    int freq = pe.freq();
    for (int i = 0; i < freq; i++) {
        int pos = pe.nextPosition();
        if (pos < 0) {
            // no position information available
            continue;
        }
        TermVectorPosition tvPos = TermVectorPosition.of(pos, pe);
        tvPositions.add(tvPos);
    }

    return new TermVectorEntry(termText, te.totalTermFreq(), tvPositions);
}
Example 3
Source File: BlendedInfixSuggester.java From lucene-solr with Apache License 2.0
/**
 * Create the coefficient to transform the weight.
 *
 * @param doc id of the document
 * @param matchedTokens tokens found in the query
 * @param prefixToken unfinished token in the query
 * @return the coefficient
 * @throws IOException If there are problems reading term vectors from the underlying Lucene index.
 */
private double createCoefficient(IndexSearcher searcher, int doc, Set<String> matchedTokens, String prefixToken) throws IOException {
    Terms tv = searcher.getIndexReader().getTermVector(doc, TEXT_FIELD_NAME);
    TermsEnum it = tv.iterator();

    Integer position = Integer.MAX_VALUE;
    BytesRef term;
    // find the closest token position
    while ((term = it.next()) != null) {
        String docTerm = term.utf8ToString();
        if (matchedTokens.contains(docTerm) || (prefixToken != null && docTerm.startsWith(prefixToken))) {
            PostingsEnum docPosEnum = it.postings(null, PostingsEnum.OFFSETS);
            docPosEnum.nextDoc();
            // use the first occurrence of the term
            int p = docPosEnum.nextPosition();
            if (p < position) {
                position = p;
            }
        }
    }

    // create corresponding coefficient based on position
    return calculateCoefficient(position);
}
Example 4
Source File: TaxonomyIndexArrays.java From lucene-solr with Apache License 2.0
private void initParents(IndexReader reader, int first) throws IOException {
    if (reader.maxDoc() == first) {
        return;
    }

    // it's ok to use MultiTerms because we only iterate on one posting list.
    // breaking it to loop over the leaves() only complicates code for no
    // apparent gain.
    PostingsEnum positions = MultiTerms.getTermPostingsEnum(reader,
        Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF,
        PostingsEnum.PAYLOADS);

    // shouldn't really happen, if it does, something's wrong
    if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
        throw new CorruptIndexException("Missing parent data for category " + first, reader.toString());
    }

    int num = reader.maxDoc();
    for (int i = first; i < num; i++) {
        if (positions.docID() == i) {
            if (positions.freq() == 0) { // shouldn't happen
                throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
            }

            parents[i] = positions.nextPosition();

            if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                if (i + 1 < num) {
                    throw new CorruptIndexException("Missing parent data for category " + (i + 1), reader.toString());
                }
                break;
            }
        } else { // this shouldn't happen
            throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
        }
    }
}
Example 5
Source File: TestTeeSinkTokenFilter.java From lucene-solr with Apache License 2.0
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
    Document doc = new Document();
    // trailing whitespace matters: the 7-char text plus the default offset gap
    // makes the second field instance's offsets start at 8, as asserted below
    TokenStream tokenStream = analyzer.tokenStream("field", "abcd   ");
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
    TokenStream sink = tee.newSinkTokenStream();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);
    Field f1 = new Field("field", tee, ft);
    Field f2 = new Field("field", sink, ft);
    doc.add(f1);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = DirectoryReader.open(dir);
    Terms vector = r.getTermVectors(0).terms("field");
    assertEquals(1, vector.size());
    TermsEnum termsEnum = vector.iterator();
    termsEnum.next();
    assertEquals(2, termsEnum.totalTermFreq());
    PostingsEnum positions = termsEnum.postings(null, PostingsEnum.ALL);
    assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(2, positions.freq());
    positions.nextPosition();
    assertEquals(0, positions.startOffset());
    assertEquals(4, positions.endOffset());
    positions.nextPosition();
    assertEquals(8, positions.startOffset());
    assertEquals(12, positions.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
    r.close();
    dir.close();
    analyzer.close();
}
Example 6
Source File: ESIndex.java From pyramid with Apache License 2.0
private Map<Integer, String> getTermVectorWithException(String field, String id) throws IOException {
    TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false).setPositions(true).setFieldStatistics(false)
            .setTermStatistics(false)
            .setSelectedFields(field)
            .execute().actionGet();

    Map<Integer, String> map = new HashMap<>();
    Terms terms = response.getFields().terms(field);
    if (terms == null) {
        return map;
    }
    TermsEnum iterator = terms.iterator();
    PostingsEnum postings = null;
    for (BytesRef termBytes = null; (termBytes = iterator.next()) != null; ) {
        String term = termBytes.utf8ToString();
        postings = iterator.postings(postings, PostingsEnum.ALL);
        // there can only be one doc since we are getting with id; get the doc and the positions
        postings.nextDoc();
        int tf = postings.freq();
        for (int i = 0; i < tf; i++) {
            int pos = postings.nextPosition();
            map.put(pos, term);
        }
    }
    return map;
}
Example 7
Source File: FrequencyCtxSentenceBasedFBWorker.java From jate with GNU Lesser General Public License v3.0
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
    List<MWESentenceContext> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

        int doc = postingsEnum.nextDoc(); // this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                int sentenceId = -1;
                if (payload != null) {
                    sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
                }
                result.add(new MWESentenceContext(tString, sentenceId, start, end));
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
Example 8
Source File: WindowBuildingTVM.java From wiseowl with MIT License
public void map(Terms terms, Spans spans) throws IOException {
    int primStart = spanStart - primaryWS;
    int primEnd = spanEnd + primaryWS;
    // stores the start and end of the adjacent previous and following
    int adjLBStart = primStart - adjWS;
    int adjLBEnd = primStart - 1;  // don't overlap
    int adjUBStart = primEnd + 1;  // don't overlap
    int adjUBEnd = primEnd + adjWS;
    // stores the start and end of the secondary previous and the secondary following
    int secLBStart = adjLBStart - secWS;
    int secLBEnd = adjLBStart - 1; // don't overlap the adjacent window
    int secUBStart = adjUBEnd + 1;
    int secUBEnd = adjUBEnd + secWS;
    WindowTerm lastWT = null;
    if (terms == null) { // no term vector for this field/document
        return;
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef termref = null;
    String term = null;
    while ((termref = termsEnum.next()) != null) {
        term = termsEnum.term().utf8ToString();
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.PAYLOADS | PostingsEnum.OFFSETS);
        postings.nextDoc();
        if (term.startsWith(NameFilter.NE_PREFIX) == false
                && term.startsWith(PassageRankingComponent.NE_PREFIX_LOWER) == false) {
            // filter out the types, as we don't need them here.
            // construct the windows, which means we need a bunch of
            // bracketing variables to know what window we are in.
            // unfortunately, we still have to loop over the positions.
            // we'll make this inclusive of the boundaries, do an upfront check here so
            // we can skip over anything that is outside of all windows
            //int position = spans.nextStartPosition();
            int position = postings.nextPosition();
            if (position >= secLBStart && position <= secUBEnd) {
                // fill in the windows
                WindowTerm wt;
                // offsets aren't required, but they are nice to have
                if (postings != null) {
                    wt = new WindowTerm(term, position, postings.startOffset(), postings.endOffset());
                } else {
                    wt = new WindowTerm(term, position);
                }
                if (position >= primStart && position <= primEnd) { // are we in the primary window
                    passage.terms.add(wt);
                    // we are only going to keep bigrams for the primary window. You could do it for the other windows, too
                    if (lastWT != null) {
                        WindowTerm bigramWT = new WindowTerm(lastWT.term + "," + term, lastWT.position); // we don't care about offsets for bigrams
                        passage.bigrams.add(bigramWT);
                    }
                    lastWT = wt;
                } else if (position >= secLBStart && position <= secLBEnd) { // are we in the secondary previous window?
                    passage.secPrevTerms.add(wt);
                } else if (position >= secUBStart && position <= secUBEnd) { // are we in the secondary following window?
                    passage.secFollowTerms.add(wt);
                } else if (position >= adjLBStart && position <= adjLBEnd) { // are we in the adjacent previous window?
                    passage.prevTerms.add(wt);
                } else if (position >= adjUBStart && position <= adjUBEnd) { // are we in the adjacent following window?
                    passage.followTerms.add(wt);
                }
            }
        }
    }
}
Example 9
Source File: VectorScoreQuery.java From solr-vector-scoring with Apache License 2.0
@Override
protected CustomScoreProvider getCustomScoreProvider(LeafReaderContext context) throws IOException {
    return new CustomScoreProvider(context) {
        @Override
        public float customScore(int docID, float subQueryScore, float valSrcScore) throws IOException {
            float score = 0;
            double docVectorNorm = 0;
            LeafReader reader = context.reader();
            Terms terms = reader.getTermVector(docID, field);
            if (vector.size() != terms.size()) {
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                        "indexed and input vector array must have same length");
            }
            TermsEnum iter = terms.iterator();
            BytesRef text;
            while ((text = iter.next()) != null) {
                String term = text.utf8ToString();
                float payloadValue = 0f;
                PostingsEnum postings = iter.postings(null, PostingsEnum.ALL);
                while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                    int freq = postings.freq();
                    while (freq-- > 0) postings.nextPosition();

                    BytesRef payload = postings.getPayload();
                    payloadValue = PayloadHelper.decodeFloat(payload.bytes, payload.offset);

                    if (cosine)
                        docVectorNorm += Math.pow(payloadValue, 2.0);
                }
                score = (float) (score + payloadValue * (vector.get(Integer.parseInt(term))));
            }
            if (cosine) {
                if ((docVectorNorm == 0) || (queryVectorNorm == 0)) return 0f;
                return (float) (score / (Math.sqrt(docVectorNorm) * Math.sqrt(queryVectorNorm)));
            }
            return score;
        }
    };
}
Example 10
Source File: PayloadFilteredTermIntervalsSource.java From lucene-solr with Apache License 2.0
private IntervalMatchesIterator matches(TermsEnum te, int doc) throws IOException {
    PostingsEnum pe = te.postings(null, PostingsEnum.ALL);
    if (pe.advance(doc) != doc) {
        return null;
    }
    return new IntervalMatchesIterator() {

        @Override
        public int gaps() {
            return 0;
        }

        @Override
        public int width() {
            return 1;
        }

        int upto = pe.freq();
        int pos = -1;

        @Override
        public boolean next() throws IOException {
            do {
                if (upto <= 0) {
                    pos = IntervalIterator.NO_MORE_INTERVALS;
                    return false;
                }
                upto--;
                pos = pe.nextPosition();
            } while (filter.test(pe.getPayload()) == false);
            return true;
        }

        @Override
        public int startPosition() {
            return pos;
        }

        @Override
        public int endPosition() {
            return pos;
        }

        @Override
        public int startOffset() throws IOException {
            return pe.startOffset();
        }

        @Override
        public int endOffset() throws IOException {
            return pe.endOffset();
        }

        @Override
        public MatchesIterator getSubMatches() {
            return null;
        }

        @Override
        public Query getQuery() {
            throw new UnsupportedOperationException();
        }
    };
}
Example 11
Source File: TermIntervalsSource.java From lucene-solr with Apache License 2.0
static IntervalMatchesIterator matches(TermsEnum te, int doc, String field) throws IOException {
    TermQuery query = new TermQuery(new Term(field, te.term()));
    PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
    if (pe.advance(doc) != doc) {
        return null;
    }
    return new IntervalMatchesIterator() {

        @Override
        public int gaps() {
            return 0;
        }

        @Override
        public int width() {
            return 1;
        }

        int upto = pe.freq();
        int pos = -1;

        @Override
        public boolean next() throws IOException {
            if (upto <= 0) {
                pos = IntervalIterator.NO_MORE_INTERVALS;
                return false;
            }
            upto--;
            pos = pe.nextPosition();
            return true;
        }

        @Override
        public int startPosition() {
            return pos;
        }

        @Override
        public int endPosition() {
            return pos;
        }

        @Override
        public int startOffset() throws IOException {
            return pe.startOffset();
        }

        @Override
        public int endOffset() throws IOException {
            return pe.endOffset();
        }

        @Override
        public MatchesIterator getSubMatches() {
            return null;
        }

        @Override
        public Query getQuery() {
            return query;
        }
    };
}
Example 12
Source File: TermVectorComponent.java From lucene-solr with Apache License 2.0
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
    NamedList<Object> fieldNL = new NamedList<>();
    docNL.add(field, fieldNL);

    BytesRef text;
    PostingsEnum dpEnum = null;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        NamedList<Object> termInfo = new NamedList<>();
        fieldNL.add(term, termInfo);
        final int freq = (int) termsEnum.totalTermFreq();
        if (fieldOptions.termFreq == true) {
            termInfo.add("tf", freq);
        }

        int dpEnumFlags = 0;
        dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0;
        // payloads require offsets
        dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0;
        dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0;
        dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);

        boolean atNextDoc = false;
        if (dpEnum != null) {
            dpEnum.nextDoc();
            atNextDoc = true;
        }

        if (atNextDoc && dpEnumFlags != 0) {
            NamedList<Integer> positionsNL = null;
            NamedList<Number> theOffsets = null;
            NamedList<String> thePayloads = null;

            for (int i = 0; i < freq; i++) {
                final int pos = dpEnum.nextPosition();
                if (fieldOptions.positions && pos >= 0) {
                    if (positionsNL == null) {
                        positionsNL = new NamedList<>();
                        termInfo.add("positions", positionsNL);
                    }
                    positionsNL.add("position", pos);
                }

                int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1;
                if (startOffset >= 0) {
                    if (theOffsets == null) {
                        theOffsets = new NamedList<>();
                        termInfo.add("offsets", theOffsets);
                    }
                    theOffsets.add("start", dpEnum.startOffset());
                    theOffsets.add("end", dpEnum.endOffset());
                }

                BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null;
                if (payload != null) {
                    if (thePayloads == null) {
                        thePayloads = new NamedList<>();
                        termInfo.add("payloads", thePayloads);
                    }
                    thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
                }
            }
        }

        int df = 0;
        if (fieldOptions.docFreq || fieldOptions.tfIdf) {
            df = reader.docFreq(new Term(field, text));
        }

        if (fieldOptions.docFreq) {
            termInfo.add("df", df);
        }

        // TODO: this is not TF/IDF by anyone's definition!
        if (fieldOptions.tfIdf) {
            double tfIdfVal = ((double) freq) / df;
            termInfo.add("tf-idf", tfIdfVal);
        }
    }
}
Example 13
Source File: FrequencyCtxWindowBasedFBWorker.java From jate with GNU Lesser General Public License v3.0
private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
                                                       Map<Integer, Integer> sentenceBoundaries) throws IOException {
    List<MWEInSentence> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

        int doc = postingsEnum.nextDoc(); // this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                SentenceContext sentenceContextInfo = null;
                if (payload != null) {
                    sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
                }
                if (sentenceContextInfo == null) {
                    result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
                } else {
                    result.add(new MWEInSentence(tString, start, end,
                            sentenceContextInfo.getFirstTokenIdx(),
                            sentenceContextInfo.getLastTokenIdx(),
                            sentenceContextInfo.getSentenceId()));

                    Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
                    if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx()) {
                        sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
                                sentenceContextInfo.getLastTokenIdx());
                    }
                }
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}