Java Code Examples for org.apache.lucene.index.TermsEnum#totalTermFreq()
The following examples show how to use
org.apache.lucene.index.TermsEnum#totalTermFreq().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ReadmeSimilarityCalculator.java From scava with Eclipse Public License 2.0 | 6 votes |
/**
 * Builds one term-frequency vector per document, using the term vector stored for
 * {@code FIELD_CONTENT}.
 *
 * <p>For every term in a document's term vector, the term's total frequency is written into the
 * document's {@code DocVector}, which is then normalized. A document for which the reader returns
 * no term vector ({@code null}) keeps a freshly constructed {@code DocVector} with no entries set;
 * normalization is skipped for it because nothing was written.
 *
 * <p>The index reader is closed when this method finishes, whether it returns normally or throws.
 *
 * @return an array with one {@code DocVector} per document id
 * @throws IOException if reading the term vectors or closing the reader fails
 */
private DocVector[] getDocumentVectors() throws IOException {
  // Evaluate the document count once instead of on every loop iteration.
  final int totalDocs = getTotalDocumentInIndex();
  final DocVector[] docVectors = new DocVector[totalDocs];
  try {
    for (int docId = 0; docId < totalDocs; docId++) {
      Terms vector = getIndexReader().getTermVector(docId, FIELD_CONTENT);
      docVectors[docId] = new DocVector(getAllTerms());
      if (vector == null) {
        // No term vector stored for this document/field: nothing to count.
        continue;
      }
      TermsEnum termsEnum = vector.iterator();
      BytesRef text;
      while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        int freq = (int) termsEnum.totalTermFreq();
        docVectors[docId].setEntry(term, freq);
      }
      docVectors[docId].normalize();
    }
  } finally {
    // Release the reader even when iteration fails part-way through.
    getIndexReader().close();
  }
  return docVectors;
}
Example 2
Source File: TermVectorEntry.java From lucene-solr with Apache License 2.0 | 6 votes |
/** * Returns a new term vector entry representing the specified term, and optionally, positions. * * @param te - positioned terms iterator * @return term vector entry * @throws IOException - if there is a low level IO error. */ static TermVectorEntry of(TermsEnum te) throws IOException { Objects.requireNonNull(te); String termText = BytesRefUtils.decode(te.term()); List<TermVectorEntry.TermVectorPosition> tvPositions = new ArrayList<>(); PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS); pe.nextDoc(); int freq = pe.freq(); for (int i = 0; i < freq; i++) { int pos = pe.nextPosition(); if (pos < 0) { // no position information available continue; } TermVectorPosition tvPos = TermVectorPosition.of(pos, pe); tvPositions.add(tvPos); } return new TermVectorEntry(termText, te.totalTermFreq(), tvPositions); }
Example 3
Source File: MoreLikeThis.java From lucene-solr with Apache License 2.0 | 6 votes |
/** * Adds terms and frequencies found in vector into the Map termFreqMap * * @param field2termFreqMap a Map of terms and their frequencies per field * @param vector List of terms and their frequencies for a doc/field */ private void addTermFrequencies(Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName) throws IOException { Map<String, Int> termFreqMap = field2termFreqMap.computeIfAbsent(fieldName, k -> new HashMap<>()); final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } final int freq = (int) termsEnum.totalTermFreq(); // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }
Example 4
Source File: DocToDoubleVectorUtils.java From lucene-solr with Apache License 2.0 | 6 votes |
/** * create a sparse <code>Double</code> vector given doc and field term vectors using local frequency of the terms in the doc * * @param docTerms term vectors for a given document * @param fieldTerms field term vectors * @return a sparse vector of <code>Double</code>s as an array * @throws IOException in case accessing the underlying index fails */ public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException { TermsEnum fieldTermsEnum = fieldTerms.iterator(); Double[] freqVector = null; if (docTerms != null && fieldTerms.size() > -1) { freqVector = new Double[(int) fieldTerms.size()]; int i = 0; TermsEnum docTermsEnum = docTerms.iterator(); BytesRef term; while ((term = fieldTermsEnum.next()) != null) { TermsEnum.SeekStatus seekStatus = docTermsEnum.seekCeil(term); if (seekStatus.equals(TermsEnum.SeekStatus.END)) { docTermsEnum = docTerms.iterator(); } if (seekStatus.equals(TermsEnum.SeekStatus.FOUND)) { long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document freqVector[i] = Long.valueOf(termFreqLocal).doubleValue(); } else { freqVector[i] = 0d; } i++; } } return freqVector; }
Example 5
Source File: DocToDoubleVectorUtils.java From lucene-solr with Apache License 2.0 | 6 votes |
/** * create a dense <code>Double</code> vector given doc and field term vectors using local frequency of the terms in the doc * * @param docTerms term vectors for a given document * @return a dense vector of <code>Double</code>s as an array * @throws IOException in case accessing the underlying index fails */ public static Double[] toDenseLocalFreqDoubleArray(Terms docTerms) throws IOException { Double[] freqVector = null; if (docTerms != null) { freqVector = new Double[(int) docTerms.size()]; int i = 0; TermsEnum docTermsEnum = docTerms.iterator(); while (docTermsEnum.next() != null) { long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document freqVector[i] = Long.valueOf(termFreqLocal).doubleValue(); i++; } } return freqVector; }
Example 6
Source File: QERetrievalApp.java From lucene4ir with Apache License 2.0 | 6 votes |
/**
 * Merges the term vectors of several documents into one map keyed by term text.
 * A term that occurs in more than one vector is folded into a single entry through
 * {@code QETerm.combine}. An {@code IOException} raised while reading one vector is printed
 * and processing continues with the next vector.
 *
 * @param terms the term vectors to merge, one per document
 * @return map from term text to its (possibly combined) {@code QETerm}
 */
public HashMap<String, QETerm> combineTerms(Vector<Terms> terms){
  final HashMap<String, QETerm> merged = new HashMap<String, QETerm>();
  final int docCount = terms.size();

  for (Terms docTerms : terms) {
    try {
      final TermsEnum cursor = docTerms.iterator();
      for (BytesRef raw = cursor.next(); raw != null; raw = cursor.next()) {
        final String key = raw.utf8ToString();
        final QETerm candidate = new QETerm(key, cursor.totalTermFreq(), cursor.docFreq(), docCount);
        final QETerm toStore = merged.containsKey(key)
            ? candidate.combine(merged.get(key))
            : candidate;
        merged.put(key, toStore);
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
  return merged;
}
Example 7
Source File: SimpleTextTermVectorsReader.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public long getSumTotalTermFreq() throws IOException { // TODO: make it constant-time long ttf = 0; TermsEnum iterator = iterator(); for (BytesRef b = iterator.next(); b != null; b = iterator.next()) { ttf += iterator.totalTermFreq(); } return ttf; }
Example 8
Source File: BooleanPerceptronClassifier.java From lucene-solr with Apache License 2.0 | 5 votes |
private void updateWeights(IndexReader indexReader, int docId, Boolean assignedClass, SortedMap<String, Double> weights, double modifier, boolean updateFST) throws IOException { TermsEnum cte = textTerms.iterator(); // get the doc term vectors Terms terms = indexReader.getTermVector(docId, textFieldName); if (terms == null) { throw new IOException("term vectors must be stored for field " + textFieldName); } TermsEnum termsEnum = terms.iterator(); BytesRef term; while ((term = termsEnum.next()) != null) { cte.seekExact(term); if (assignedClass != null) { long termFreqLocal = termsEnum.totalTermFreq(); // update weights Long previousValue = Util.get(fst, term); String termString = term.utf8ToString(); weights.put(termString, previousValue == null ? 0 : Math.max(0, previousValue + modifier * termFreqLocal)); } } if (updateFST) { updateFST(weights); } }
Example 9
Source File: TermInSetQuery.java From lucene-solr with Apache License 2.0 | 5 votes |
TermAndState(String field, TermsEnum termsEnum) throws IOException { this.field = field; this.termsEnum = termsEnum; this.term = BytesRef.deepCopyOf(termsEnum.term()); this.state = termsEnum.termState(); this.docFreq = termsEnum.docFreq(); this.totalTermFreq = termsEnum.totalTermFreq(); }
Example 10
Source File: GraphTermsQParserPlugin.java From lucene-solr with Apache License 2.0 | 5 votes |
private void collectTermStates(IndexReader reader, List<LeafReaderContext> leaves, TermStates[] contextArray, Term[] queryTerms) throws IOException { TermsEnum termsEnum = null; for (LeafReaderContext context : leaves) { Terms terms = context.reader().terms(this.field); if (terms == null) { // field does not exist continue; } termsEnum = terms.iterator(); if (termsEnum == TermsEnum.EMPTY) continue; for (int i = 0; i < queryTerms.length; i++) { Term term = queryTerms[i]; TermStates termStates = contextArray[i]; if (termsEnum.seekExact(term.bytes())) { if (termStates == null) { contextArray[i] = new TermStates(reader.getContext(), termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } else { termStates.register(termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } } } } }
Example 11
Source File: CodecCollector.java From mtas with Apache License 2.0 | 5 votes |
/**
 * Computes the basic term vector numbers from the statistics of the current term.
 *
 * <p>Only succeeds when the leaf reader reports no live-docs bitset and the total term frequency
 * is greater than -1; every other case is rejected with an exception.
 *
 * @param termsEnum
 *          the terms enum, positioned on the term of interest
 * @param r
 *          the leaf reader the enum belongs to
 * @return the termvector number basic
 * @throws IOException
 *           if the reader has live docs, the total term frequency is not above -1, or an I/O
 *           error occurs
 */
private static TermvectorNumberBasic computeTermvectorNumberBasic(
    TermsEnum termsEnum, LeafReader r) throws IOException {
  final TermvectorNumberBasic numbers = new TermvectorNumberBasic();

  if (r.getLiveDocs() != null) {
    throw new IOException("should not call this");
  }

  numbers.valueSum[0] = termsEnum.totalTermFreq();
  numbers.docNumber = termsEnum.docFreq();
  if (numbers.valueSum[0] > -1) {
    return numbers;
  }
  throw new IOException("should not call this");
}
Example 12
Source File: TermVectorComponent.java From lucene-solr with Apache License 2.0 | 4 votes |
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException { NamedList<Object> fieldNL = new NamedList<>(); docNL.add(field, fieldNL); BytesRef text; PostingsEnum dpEnum = null; while((text = termsEnum.next()) != null) { String term = text.utf8ToString(); NamedList<Object> termInfo = new NamedList<>(); fieldNL.add(term, termInfo); final int freq = (int) termsEnum.totalTermFreq(); if (fieldOptions.termFreq == true) { termInfo.add("tf", freq); } int dpEnumFlags = 0; dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0; //payloads require offsets dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0; dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0; dpEnum = termsEnum.postings(dpEnum, dpEnumFlags); boolean atNextDoc = false; if (dpEnum != null) { dpEnum.nextDoc(); atNextDoc = true; } if (atNextDoc && dpEnumFlags != 0) { NamedList<Integer> positionsNL = null; NamedList<Number> theOffsets = null; NamedList<String> thePayloads = null; for (int i = 0; i < freq; i++) { final int pos = dpEnum.nextPosition(); if (fieldOptions.positions && pos >= 0) { if (positionsNL == null) { positionsNL = new NamedList<>(); termInfo.add("positions", positionsNL); } positionsNL.add("position", pos); } int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1; if (startOffset >= 0) { if (theOffsets == null) { theOffsets = new NamedList<>(); termInfo.add("offsets", theOffsets); } theOffsets.add("start", dpEnum.startOffset()); theOffsets.add("end", dpEnum.endOffset()); } BytesRef payload = fieldOptions.payloads ? 
dpEnum.getPayload() : null; if (payload != null) { if (thePayloads == null) { thePayloads = new NamedList<>(); termInfo.add("payloads", thePayloads); } thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length)); } } } int df = 0; if (fieldOptions.docFreq || fieldOptions.tfIdf) { df = reader.docFreq(new Term(field, text)); } if (fieldOptions.docFreq) { termInfo.add("df", df); } // TODO: this is not TF/IDF by anyone's definition! if (fieldOptions.tfIdf) { double tfIdfVal = ((double) freq) / df; termInfo.add("tf-idf", tfIdfVal); } } }
Example 13
Source File: TermFreqAnalyser.java From Siamese with GNU General Public License v3.0 | 4 votes |
private static void analyseTermFreq(String indexName, String field, String freqType, String outputFileName) { String indexFile = elasticsearchLoc + "/data/stackoverflow/nodes/0/indices/" + indexName + "/0/index"; DecimalFormat df = new DecimalFormat("#.00"); int printEvery = 100000; File outputFile = new File(outputFileName); if (outputFile.exists()) { if (!outputFile.delete()) { System.out.println("ERROR: cannot delete the output file."); System.exit(0); } } /* adapted from https://stackoverflow.com/questions/28244961/lucene-4-10-2-calculate-tf-idf-for-all-terms-in-index */ int count = 0; try { IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexFile))); Fields fields = MultiFields.getFields(reader); Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); int size = 0; // TODO: is there a better solution? // iterate to get the size while (termsEnum.next() != null) { size++; } // String[] termArr = new String[size]; long[] freqArr = new long[size]; // do the real work termsEnum = terms.iterator(); while (termsEnum.next() != null) { // String term = termsEnum.term().utf8ToString(); long tfreq = 0; if (freqType.equals("tf")) tfreq = termsEnum.totalTermFreq(); else if (freqType.equals("df")) tfreq = termsEnum.docFreq(); else { System.out.println("Wrong frequency. 
Quit!"); System.exit(0); } // termArr[count] = term; freqArr[count] = tfreq; if (count % printEvery == 0) { System.out.println("processed: " + count + " terms " + " [" + df.format(((long)count * 100)/size) + "%]"); } count++; } System.out.println(field + ": total = " + count); double[] data = new double[size]; String output = "freq\n"; for (int i = 0; i < freqArr.length; i++) { data[i] = freqArr[i]; output += freqArr[i] + "\n"; if (i > 0 && i % printEvery == 0) { MyUtils.writeToFile("./", outputFileName, output, true); System.out.println("written: " + i + " terms " + " [" + df.format(((long)i * 100)/size) + "%]"); output = ""; } } // write the rest to the file MyUtils.writeToFile("./",outputFileName, output, true); } catch (IOException e) { e.printStackTrace(); } }