org.apache.lucene.index.TermsEnum#totalTermFreq

Source File: ReadmeSimilarityCalculator.java From scava with Eclipse Public License 2.0

6 votes

/**
 * Builds one normalized term-frequency vector per document in the index.
 *
 * <p>For every document id in {@code [0, getTotalDocumentInIndex())} the term vector stored for
 * {@code FIELD_CONTENT} is read and each term's frequency is written into a {@code DocVector}
 * created over {@code getAllTerms()}. The index reader is closed before this method returns,
 * whether it completes normally or throws.
 *
 * @return the document vectors, indexed by document id
 * @throws IOException if reading the term vectors or closing the reader fails
 */
private DocVector[] getDocumentVectors() throws IOException {
	try {
		// The document count is loop-invariant, so it is read once.
		int totalDocs = getTotalDocumentInIndex();
		DocVector[] docVector = new DocVector[totalDocs];
		for (int docId = 0; docId < totalDocs; docId++) {
			Terms vector = getIndexReader().getTermVector(docId, FIELD_CONTENT);
			docVector[docId] = new DocVector(getAllTerms());
			// getTermVector returns null when no term vector was stored for this document/field;
			// such a document keeps a vector with no entries set instead of triggering an NPE.
			if (vector != null) {
				TermsEnum termsEnum = vector.iterator();
				BytesRef text;
				while ((text = termsEnum.next()) != null) {
					String term = text.utf8ToString();
					// On a single-document term vector this is the in-document frequency.
					int freq = (int) termsEnum.totalTermFreq();
					docVector[docId].setEntry(term, freq);
				}
			}
			// NOTE(review): normalize() is also applied to vectors with no entries set — confirm
			// DocVector handles that case.
			docVector[docId].normalize();
		}
		return docVector;
	} finally {
		// Release the reader even when building the vectors fails.
		getIndexReader().close();
	}
}

Source File: TermVectorEntry.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Builds a term vector entry for the term the given iterator currently points at, attaching
 * position entries for every occurrence that reports a non-negative position.
 *
 * @param te - positioned terms iterator
 * @return term vector entry
 * @throws IOException - if there is a low level IO error.
 */
static TermVectorEntry of(TermsEnum te) throws IOException {
  Objects.requireNonNull(te);

  String termText = BytesRefUtils.decode(te.term());

  List<TermVectorEntry.TermVectorPosition> collected = new ArrayList<>();
  PostingsEnum postings = te.postings(null, PostingsEnum.OFFSETS);
  postings.nextDoc();

  int remaining = postings.freq();
  while (remaining-- > 0) {
    int position = postings.nextPosition();
    // a negative position means no position information is available for this occurrence
    if (position >= 0) {
      collected.add(TermVectorPosition.of(position, postings));
    }
  }

  return new TermVectorEntry(termText, te.totalTermFreq(), collected);
}

Source File: MoreLikeThis.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Accumulates the term frequencies of one term vector into the per-field frequency map.
 * Terms for which {@code isNoiseWord} returns true are skipped.
 *
 * @param field2termFreqMap a Map of terms and their frequencies per field; an inner map is
 *                          created for {@code fieldName} if none exists yet
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName name of the field whose inner map receives the frequencies
 */
private void addTermFrequencies(Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName) throws IOException {
  Map<String, Int> freqsForField = field2termFreqMap.computeIfAbsent(fieldName, k -> new HashMap<>());
  final TermsEnum termIterator = vector.iterator();
  final CharsRefBuilder scratch = new CharsRefBuilder();
  for (BytesRef bytes = termIterator.next(); bytes != null; bytes = termIterator.next()) {
    scratch.copyUTF8Bytes(bytes);
    final String term = scratch.toString();
    if (!isNoiseWord(term)) {
      final int freq = (int) termIterator.totalTermFreq();

      Int counter = freqsForField.get(term);
      if (counter != null) {
        // term already seen: add to the running count
        counter.x += freq;
      } else {
        // first occurrence: start the count at this frequency
        counter = new Int();
        counter.x = freq;
        freqsForField.put(term, counter);
      }
    }
  }
}

Source File: DocToDoubleVectorUtils.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Builds a <code>Double</code> vector with one slot per term of the field, holding the frequency
 * of that term inside the given document, or <code>0</code> when the document does not contain it.
 *
 * @param docTerms   term vectors for a given document
 * @param fieldTerms field term vectors
 * @return the vector as an array, or <code>null</code> when <code>docTerms</code> is
 *         <code>null</code> or the number of field terms is not known
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException {
  TermsEnum fieldTermsEnum = fieldTerms.iterator();
  if (docTerms == null || fieldTerms.size() <= -1) {
    return null;
  }

  Double[] freqVector = new Double[(int) fieldTerms.size()];
  TermsEnum docTermsEnum = docTerms.iterator();
  int slot = 0;
  for (BytesRef fieldTerm = fieldTermsEnum.next(); fieldTerm != null; fieldTerm = fieldTermsEnum.next()) {
    TermsEnum.SeekStatus status = docTermsEnum.seekCeil(fieldTerm);
    switch (status) {
      case FOUND:
        // the total number of occurrences of this term in the given document
        freqVector[slot] = (double) docTermsEnum.totalTermFreq();
        break;
      case END:
        // the document enum ran off its end; start over with a fresh one for the next seek
        docTermsEnum = docTerms.iterator();
        freqVector[slot] = 0d;
        break;
      default:
        freqVector[slot] = 0d;
        break;
    }
    slot++;
  }
  return freqVector;
}

Source File: DocToDoubleVectorUtils.java From lucene-solr with Apache License 2.0

6 votes

/**
 * create a dense <code>Double</code> vector given doc and field term vectors using local frequency of the terms in the doc
 *
 * <p>When the term count is reported as unknown (<code>Terms.size()</code> returns a negative
 * value) the frequencies are collected while iterating instead of failing on a negative array
 * size.
 *
 * @param docTerms term vectors for a given document
 * @return a dense vector of <code>Double</code>s as an array, or <code>null</code> when
 *         <code>docTerms</code> is <code>null</code>
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toDenseLocalFreqDoubleArray(Terms docTerms) throws IOException {
  if (docTerms == null) {
    return null;
  }

  long termCount = docTerms.size();
  TermsEnum docTermsEnum = docTerms.iterator();

  if (termCount < 0) {
    // Size is not available up front: grow a list and convert at the end.
    java.util.List<Double> collected = new java.util.ArrayList<>();
    while (docTermsEnum.next() != null) {
      collected.add((double) docTermsEnum.totalTermFreq());
    }
    return collected.toArray(new Double[0]);
  }

  Double[] freqVector = new Double[(int) termCount];
  int i = 0;
  while (docTermsEnum.next() != null) {
    // the total number of occurrences of this term in the given document
    freqVector[i] = (double) docTermsEnum.totalTermFreq();
    i++;
  }
  return freqVector;
}

Source File: QERetrievalApp.java From lucene4ir with Apache License 2.0

6 votes

/**
 * Combines the individual term vectors of each document into a single map keyed by term text.
 * A term seen in more than one vector is merged through {@code QETerm.combine}. An
 * {@code IOException} raised while reading one vector is printed and the remaining vectors are
 * still processed.
 *
 * @param terms the term vectors to merge; its size is passed to every {@code QETerm} as the
 *              document count
 * @return the merged terms, keyed by their UTF-8 decoded text
 */
public HashMap<String, QETerm> combineTerms(Vector<Terms> terms){
    final int numDocs = terms.size();
    HashMap<String, QETerm> combinedTerms = new HashMap<String, QETerm>();
    for (Terms docTerms : terms) {
        try {
            TermsEnum termIterator = docTerms.iterator();
            for (BytesRef bytes = termIterator.next(); bytes != null; bytes = termIterator.next()) {
                String text = bytes.utf8ToString();
                QETerm current = new QETerm(text, termIterator.totalTermFreq(), termIterator.docFreq(), numDocs);
                if (!combinedTerms.containsKey(text)) {
                    combinedTerms.put(text, current);
                } else {
                    QETerm merged = current.combine(combinedTerms.get(text));
                    combinedTerms.put(text, merged);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return combinedTerms;
}

Source File: SimpleTextTermVectorsReader.java From lucene-solr with Apache License 2.0

5 votes

@Override
public long getSumTotalTermFreq() throws IOException {
  // TODO: make it constant-time
  // Walks every term and adds up its total term frequency.
  TermsEnum termsEnum = iterator();
  long sum = 0;
  while (termsEnum.next() != null) {
    sum += termsEnum.totalTermFreq();
  }
  return sum;
}

Source File: BooleanPerceptronClassifier.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Recomputes the weight of every term in a document's term vector and optionally rebuilds the FST.
 *
 * <p>For each term of the document, when {@code assignedClass} is non-null, the weight is set to
 * {@code 0} if the FST holds no value for the term, otherwise to
 * {@code max(0, previous + modifier * termFrequencyInDoc)}.
 *
 * @param indexReader   reader used to fetch the document's term vector
 * @param docId         id of the document whose terms are processed
 * @param assignedClass when {@code null}, no weight is written
 * @param weights       map receiving the updated weights, keyed by term text
 * @param modifier      factor applied to the in-document term frequency
 * @param updateFST     whether to call {@code updateFST(weights)} after the loop
 * @throws IOException if no term vector is stored for the text field, or on index access failure
 */
private void updateWeights(IndexReader indexReader,
                           int docId, Boolean assignedClass, SortedMap<String, Double> weights,
                           double modifier, boolean updateFST) throws IOException {
  TermsEnum classTermsEnum = textTerms.iterator();

  // get the doc term vectors
  Terms docTermVector = indexReader.getTermVector(docId, textFieldName);
  if (docTermVector == null) {
    throw new IOException("term vectors must be stored for field "
            + textFieldName);
  }

  TermsEnum docTermsEnum = docTermVector.iterator();
  for (BytesRef term = docTermsEnum.next(); term != null; term = docTermsEnum.next()) {
    classTermsEnum.seekExact(term);
    if (assignedClass == null) {
      continue;
    }
    long localFreq = docTermsEnum.totalTermFreq();
    // update weights
    Long previous = Util.get(fst, term);
    String termString = term.utf8ToString();
    double updated;
    if (previous == null) {
      updated = 0;
    } else {
      updated = Math.max(0, previous + modifier * localFreq);
    }
    weights.put(termString, updated);
  }

  if (updateFST) {
    updateFST(weights);
  }
}

Source File: TermInSetQuery.java From lucene-solr with Apache License 2.0

5 votes

TermAndState(String field, TermsEnum termsEnum) throws IOException {
  this.field = field;
  this.termsEnum = termsEnum;
  this.term = BytesRef.deepCopyOf(termsEnum.term());
  this.state = termsEnum.termState();
  this.docFreq = termsEnum.docFreq();
  this.totalTermFreq = termsEnum.totalTermFreq();
}

Source File: GraphTermsQParserPlugin.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Looks up every query term in every leaf and records its term state and statistics.
 *
 * <p>For a term found in a leaf, the matching slot of {@code contextArray} is created on first
 * sight and extended through {@code register} on later leaves. Slots of terms found in no leaf
 * are left untouched.
 *
 * @param reader       top-level reader whose context is used for newly created {@code TermStates}
 * @param leaves       leaves to search
 * @param contextArray output array, parallel to {@code queryTerms}
 * @param queryTerms   terms to look up
 * @throws IOException on index access failure
 */
private void collectTermStates(IndexReader reader,
                               List<LeafReaderContext> leaves,
                               TermStates[] contextArray,
                               Term[] queryTerms) throws IOException {
  for (LeafReaderContext leaf : leaves) {
    Terms leafTerms = leaf.reader().terms(this.field);
    if (leafTerms == null) {
      // field does not exist
      continue;
    }

    TermsEnum leafEnum = leafTerms.iterator();
    if (leafEnum == TermsEnum.EMPTY) {
      continue;
    }

    for (int idx = 0; idx < queryTerms.length; idx++) {
      Term queryTerm = queryTerms[idx];
      TermStates existing = contextArray[idx];

      if (!leafEnum.seekExact(queryTerm.bytes())) {
        continue;
      }

      if (existing != null) {
        existing.register(leafEnum.termState(), leaf.ord,
            leafEnum.docFreq(), leafEnum.totalTermFreq());
      } else {
        contextArray[idx] = new TermStates(reader.getContext(),
            leafEnum.termState(), leaf.ord, leafEnum.docFreq(),
            leafEnum.totalTermFreq());
      }
    }
  }
}

Source File: CodecCollector.java From mtas with Apache License 2.0

5 votes

/**
 * Reads the basic term statistics (total term frequency and document frequency) straight from
 * the terms enum. This is only possible when the reader has no deleted documents and the total
 * term frequency is available.
 *
 * @param termsEnum
 *          the terms enum, positioned on the term to measure
 * @param r
 *          the leaf reader, used to check for deleted documents
 * @return the termvector number basic
 * @throws IOException
 *           if the reader has deleted documents or the total term frequency is negative
 */
private static TermvectorNumberBasic computeTermvectorNumberBasic(
    TermsEnum termsEnum, LeafReader r) throws IOException {
  TermvectorNumberBasic result = new TermvectorNumberBasic();
  if (r.getLiveDocs() != null) {
    // deleted documents present
    throw new IOException("should not call this");
  }
  result.valueSum[0] = termsEnum.totalTermFreq();
  result.docNumber = termsEnum.docFreq();
  if (result.valueSum[0] < 0) {
    throw new IOException("should not call this");
  }
  return result;
}

Source File: TermVectorComponent.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Writes the contents of one field's term vector into the response structure for a document.
 *
 * <p>A new named list is added to {@code docNL} under {@code field}; for each term of
 * {@code termsEnum} it receives a nested list that, depending on {@code fieldOptions}, holds
 * "tf", "positions", "offsets", "payloads", "df" and "tf-idf" entries, in that order.
 *
 * @param docNL        per-document output list that receives the field entry
 * @param fieldOptions flags selecting which pieces of information are emitted
 * @param reader       index reader used to look up document frequencies
 * @param docID        not read by this method
 * @param termsEnum    iterator over the terms of the field's term vector
 * @param field        field name, used as output key and for the document-frequency lookup
 * @throws IOException on index access failure
 */
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
  NamedList<Object> fieldNL = new NamedList<>();
  docNL.add(field, fieldNL);

  BytesRef text;
  // reused across terms: passed back into postings() on every iteration
  PostingsEnum dpEnum = null;
  while((text = termsEnum.next()) != null) {
    String term = text.utf8ToString();
    NamedList<Object> termInfo = new NamedList<>();
    fieldNL.add(term, termInfo);
    // also bounds the per-occurrence loop below
    final int freq = (int) termsEnum.totalTermFreq();
    if (fieldOptions.termFreq == true) {
      termInfo.add("tf", freq);
    }

    // build the postings flags from the requested options
    int dpEnumFlags = 0;
    dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0;
    //payloads require offsets
    dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0;
    dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0;
    dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);

    boolean atNextDoc = false;
    if (dpEnum != null) {
      dpEnum.nextDoc();
      atNextDoc = true;
    }

    // only walk the occurrences when positions, offsets or payloads were requested
    if (atNextDoc && dpEnumFlags != 0) {
      // each sub-list is created lazily, on the first occurrence that has a value for it
      NamedList<Integer> positionsNL = null;
      NamedList<Number> theOffsets = null;
      NamedList<String> thePayloads = null;

      for (int i = 0; i < freq; i++) {
        final int pos = dpEnum.nextPosition();
        if (fieldOptions.positions && pos >= 0) {
          if (positionsNL == null) {
            positionsNL = new NamedList<>();
            termInfo.add("positions", positionsNL);
          }
          positionsNL.add("position", pos);
        }

        // -1 when offsets were not requested; negative start offsets are not emitted
        int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1;
        if (startOffset >= 0) {
          if (theOffsets == null) {
            theOffsets = new NamedList<>();
            termInfo.add("offsets", theOffsets);
          }
          theOffsets.add("start", dpEnum.startOffset());
          theOffsets.add("end", dpEnum.endOffset());
        }

        BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null;
        if (payload != null) {
          if (thePayloads == null) {
            thePayloads = new NamedList<>();
            termInfo.add("payloads", thePayloads);
          }
          // payload bytes are emitted Base64-encoded
          thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
        }
      }
    }
    
    // the document frequency is only looked up when one of the two options below needs it
    int df = 0;
    if (fieldOptions.docFreq || fieldOptions.tfIdf) {
      df = reader.docFreq(new Term(field, text));
    }

    if (fieldOptions.docFreq) {
      termInfo.add("df", df);
    }

    // TODO: this is not TF/IDF by anyone's definition!
    if (fieldOptions.tfIdf) {
      // plain ratio freq / df, computed in floating point
      // NOTE(review): a df of 0 would yield Infinity or NaN here — confirm it cannot occur
      double tfIdfVal = ((double) freq) / df;
      termInfo.add("tf-idf", tfIdfVal);
    }
  }
}

Source File: TermFreqAnalyser.java From Siamese with GNU General Public License v3.0

4 votes

/**
 * Dumps the frequency of every term of one field of an Elasticsearch-managed Lucene index to a
 * file, one value per line, preceded by a "freq" header line.
 *
 * <p>An existing output file is deleted first. Progress is printed to standard output. An
 * {@code IOException} while reading the index is printed and otherwise ignored.
 *
 * @param indexName      name of the index directory below the Elasticsearch data location
 * @param field          field whose terms are analysed
 * @param freqType       {@code "tf"} for total term frequency or {@code "df"} for document
 *                       frequency; any other value terminates the JVM once a term is processed
 * @param outputFileName name of the file the frequencies are written to
 */
private static void analyseTermFreq(String indexName, String field, String freqType, String outputFileName) {
        String indexFile = elasticsearchLoc + "/data/stackoverflow/nodes/0/indices/"
                + indexName + "/0/index";
        DecimalFormat df = new DecimalFormat("#.00");
        int printEvery = 100000;
        File outputFile = new File(outputFileName);
        if (outputFile.exists() && !outputFile.delete()) {
            System.out.println("ERROR: cannot delete the output file.");
            System.exit(0);
        }
        /* adapted from
        https://stackoverflow.com/questions/28244961/lucene-4-10-2-calculate-tf-idf-for-all-terms-in-index
         */
        int count = 0;
        // try-with-resources so the reader is closed on every exit path
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexFile)))) {
            Fields fields = MultiFields.getFields(reader);
            Terms terms = (fields == null) ? null : fields.terms(field);
            if (terms == null) {
                // the field is not present in the index: nothing to analyse
                System.out.println("ERROR: no terms found for field: " + field);
                return;
            }
            TermsEnum termsEnum = terms.iterator();
            int size = 0;
            // TODO: is there a better solution?
            // first pass: iterate only to get the number of terms
            while (termsEnum.next() != null) {
                size++;
            }
            long[] freqArr = new long[size];
            // second pass: do the real work
            termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
                long tfreq = 0;
                if (freqType.equals("tf")) {
                    tfreq = termsEnum.totalTermFreq();
                } else if (freqType.equals("df")) {
                    tfreq = termsEnum.docFreq();
                } else {
                    System.out.println("Wrong frequency. Quit!");
                    System.exit(0);
                }
                freqArr[count] = tfreq;
                if (count % printEvery == 0) {
                    System.out.println("processed: " + count + " terms "
                            + " [" + df.format(((long)count * 100)/size) + "%]");
                }
                count++;
            }
            System.out.println(field + ": total = " + count);
            // buffer the output and flush it every printEvery terms
            StringBuilder output = new StringBuilder("freq\n");
            for (int i = 0; i < freqArr.length; i++) {
                output.append(freqArr[i]).append("\n");
                if (i > 0 && i % printEvery == 0) {
                    MyUtils.writeToFile("./", outputFileName, output.toString(), true);
                    System.out.println("written: " + i + " terms "
                            + " [" + df.format(((long)i * 100)/size) + "%]");
                    output.setLength(0);
                }
            }
            // write the rest to the file
            MyUtils.writeToFile("./", outputFileName, output.toString(), true);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

Java Code Examples for org.apache.lucene.index.TermsEnum#totalTermFreq()