Java Code Examples for org.apache.lucene.index.TermsEnum#term()
The following examples show how to use
org.apache.lucene.index.TermsEnum#term().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: AbstractFeatureBuilder.java From jate with GNU Lesser General Public License v3.0 | 6 votes |
/**
 * Collects the unique single-token words (uni-grams) stored in the
 * configured JATE n-gram term-vector field.
 *
 * @return the set of single-token surface forms found in the field
 * @throws JATEException if the field contains no single-token lexical units
 * @throws IOException   on index access failure
 */
protected Set<String> getUniqueWords() throws JATEException, IOException {
    Terms ngramInfo = SolrUtil.getTermVector(properties.getSolrFieldNameJATENGramInfo(), solrIndexSearcher);

    TermsEnum termsEnum = ngramInfo.iterator();
    Set<String> allWords = new HashSet<>();

    while (termsEnum.next() != null) {
        BytesRef t = termsEnum.term();
        if (t.length == 0) {
            continue; // skip empty terms
        }
        String termStr = t.utf8ToString();
        // keep uni-grams only: multi-word units contain a space
        if (!termStr.contains(" ")) {
            allWords.add(termStr);
        }
    }
    if (allWords.isEmpty()) {
        throw new JATEException("MWEMetadata are required on 'Words', however there are no single-token lexical units in the " +
                properties.getSolrFieldNameJATENGramInfo() + " field. Check to see if your analyzer pipeline outputs uni-grams");
    }
    return allWords;
}
Example 2
Source File: LuceneIndexCorpus.java From word2vec-lucene with Apache License 2.0 | 6 votes |
/**
 * Builds the word2vec vocabulary from the terms of the configured Lucene field.
 * Also runs a whole-field range query so {@code topDocs} covers every document
 * containing the field.
 */
@Override
public void learnVocab() throws IOException {
    super.learnVocab();
    final String field = ((LuceneIndexConfig)config).getField();
    final Terms terms = MultiFields.getTerms(reader, field);
    // [minTerm, maxTerm] inclusive range matches every document that has this field.
    final BytesRef maxTerm = terms.getMax();
    final BytesRef minTerm = terms.getMin();
    Query q = new TermRangeQuery(field, minTerm, maxTerm, true, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    // NOTE(review): Integer.MAX_VALUE collects ALL hits; fine for corpus building,
    // but memory-heavy on very large indexes.
    topDocs = searcher.search(q, Integer.MAX_VALUE);
    TermsEnum termsEnum = null;
    // Old-style reuse API: pass the previous enum back in for reuse.
    termsEnum = terms.iterator(termsEnum);
    // Seek to the smallest term, then walk the entire enum.
    termsEnum.seekCeil(new BytesRef());
    BytesRef term = termsEnum.term();
    while(term != null){
        int p = addWordToVocab(term.utf8ToString());
        // Use the term's total frequency across the corpus as its count.
        vocab[p].setCn((int)termsEnum.totalTermFreq());
        term = termsEnum.next();
    }
}
Example 3
Source File: TermGroupFacetCollector.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Per-segment grouped-facet result.
 * The super call subtracts the "missing" bucket from the total and, when the
 * requested end ordinal is exactly one past the missing slot, clamps it to the
 * missing slot so the missing bucket is excluded from merging.
 */
SegmentResult(int[] counts, int total, int missingCountIndex, TermsEnum tenum, int startFacetOrd, int endFacetOrd) throws IOException {
    super(counts, total - counts[missingCountIndex], counts[missingCountIndex],
          endFacetOrd == missingCountIndex + 1 ? missingCountIndex : endFacetOrd);
    this.tenum = tenum;
    this.mergePos = startFacetOrd;
    // tenum can be null (guarded here) — presumably segments with no facet terms; verify at call site.
    if (tenum != null) {
        // Position the enum on the first ordinal to merge so mergeTerm is the current term.
        tenum.seekExact(mergePos);
        mergeTerm = tenum.term();
    }
}
Example 4
Source File: SrndTruncQuery.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Visits every indexed term in {@code fieldName} that starts with the query
 * prefix and whose suffix matches the truncation pattern.
 *
 * @param reader    index to enumerate terms from
 * @param fieldName field whose terms are inspected
 * @param mtv       callback invoked for each matching term
 * @throws IOException on index access failure
 */
@Override
public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv) throws IOException {
    int prefixLength = prefix.length();
    Terms terms = MultiTerms.getTerms(reader, fieldName);
    if (terms != null) {
        Matcher matcher = pattern.matcher("");
        try {
            TermsEnum termsEnum = terms.iterator();
            // Seek to the first term >= prefix; END means the enum is exhausted.
            TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);
            BytesRef text;
            if (status == TermsEnum.SeekStatus.FOUND) {
                text = prefixRef;
            } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
                text = termsEnum.term();
            } else {
                text = null;
            }
            while (text != null) {
                // Terms are sorted, so once the prefix no longer matches we are done.
                // (Removed the original redundant `text != null` re-check here.)
                if (!StringHelper.startsWith(text, prefixRef)) {
                    break;
                }
                String textString = text.utf8ToString();
                matcher.reset(textString.substring(prefixLength));
                if (matcher.matches()) {
                    mtv.visitMatchingTerm(new Term(fieldName, textString));
                }
                text = termsEnum.next();
            }
        } finally {
            // Release the (possibly large) last-matched input held by the matcher.
            matcher.reset();
        }
    }
}
Example 5
Source File: TestFSTs.java From lucene-solr with Apache License 2.0 | 5 votes |
private void assertSame(TermsEnum termsEnum, BytesRefFSTEnum<?> fstEnum, boolean storeOrd) throws Exception { if (termsEnum.term() == null) { assertNull(fstEnum.current()); } else { assertNotNull(fstEnum.current()); assertEquals(termsEnum.term().utf8ToString() + " != " + fstEnum.current().input.utf8ToString(), termsEnum.term(), fstEnum.current().input); if (storeOrd) { // fst stored the ord assertEquals("term=" + termsEnum.term().utf8ToString() + " " + termsEnum.term(), termsEnum.ord(), ((Long) fstEnum.current().output).longValue()); } else { // fst stored the docFreq assertEquals("term=" + termsEnum.term().utf8ToString() + " " + termsEnum.term(), termsEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue())); } } }
Example 6
Source File: UnInvertedField.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Called for each term in the field being uninverted.
 * Collects {@link #maxTermCounts} for all bigTerms as well as storing them in {@link #bigTerms}.
 * @param te positioned at the current term.
 * @param termNum the ID/pointer/ordinal of the current term. Monotonically increasing between calls.
 */
@Override
protected void visitTerm(TermsEnum te, int termNum) throws IOException {
    if (termNum >= maxTermCounts.length) {
        // resize by doubling - for very large number of unique terms, expanding
        // by 4K and resultant GC will dominate uninvert times.  Resize at end if material
        int[] newMaxTermCounts = new int[ Math.min(Integer.MAX_VALUE-16, maxTermCounts.length*2) ];
        System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, termNum);
        maxTermCounts = newMaxTermCounts;
    }

    final BytesRef term = te.term();

    // Terms in more than maxTermDocFreq docs are treated as "big": their doc set is
    // materialized now and recorded in bigTerms instead of being uninverted per doc.
    if (te.docFreq() > maxTermDocFreq) {
        Term t = new Term(field, term);  // this makes a deep copy of the term bytes
        TopTerm topTerm = new TopTerm();
        topTerm.term = t.bytes();
        topTerm.termNum = termNum;
        topTerm.termQuery = new TermQuery(t);

        bigTerms.put(topTerm.termNum, topTerm);

        if (deState == null) {
            // Lazily create the reusable state object used by getDocSet below.
            deState = new SolrIndexSearcher.DocsEnumState();
            deState.fieldName = field;
            deState.liveDocs = searcher.getLiveDocsBits();
            deState.termsEnum = te;  // TODO: check for MultiTermsEnum in SolrIndexSearcher could now fail?
            deState.postingsEnum = postingsEnum;
            deState.minSetSizeCached = maxTermDocFreq;
        }

        // getDocSet may replace the postings enum inside deState; keep ours in sync for reuse.
        postingsEnum = deState.postingsEnum;
        DocSet set = searcher.getDocSet(deState);
        maxTermCounts[termNum] = set.size();
    }
}
Example 7
Source File: QueryAutoFilteringComponent.java From query-autofiltering-component with Apache License 2.0 | 5 votes |
/**
 * Builds the field-value and term synonym maps from the doc values of every
 * string field, then merges terms gathered from other shards.
 */
private void buildFieldMap( ResponseBuilder rb ) throws IOException {
    Log.debug( "buildFieldMap" );
    SolrIndexSearcher searcher = rb.req.getSearcher();
    // build a synonym map from the SortedDocValues -
    // for each field value: lower case, stemmed, lookup synonyms from synonyms.txt - map to fieldValue
    SynonymMap.Builder fieldBuilder = new SynonymMap.Builder( true );
    SynonymMap.Builder termBuilder = new SynonymMap.Builder( true );

    ArrayList<String> searchFields = getStringFields( searcher );

    for (String searchField : searchFields ) {
        Log.debug( "adding searchField " + searchField );
        CharsRef fieldChars = new CharsRef( searchField );
        // Doc-term ords expose every distinct indexed value of the field.
        SortedSetDocValues sdv = FieldCache.DEFAULT.getDocTermOrds( searcher.getAtomicReader( ), searchField );
        if (sdv == null) continue;  // no doc values for this field; skip it
        Log.debug( "got SortedSetDocValues for " + searchField );
        TermsEnum te = sdv.termsEnum();
        while (te.next() != null) {
            BytesRef term = te.term();
            String fieldValue = term.utf8ToString( );
            addTerm ( fieldChars, fieldValue, fieldBuilder, termBuilder );
        }
    }

    // NOTE(review): presumably folds in terms from other shards — verify in addDistributedTerms.
    addDistributedTerms( rb, fieldBuilder, termBuilder, searchFields );

    fieldMap = fieldBuilder.build( );
    termMap = termBuilder.build( );
}
Example 8
Source File: AbstractFeatureBuilder.java From jate with GNU Lesser General Public License v3.0 | 5 votes |
/** * Retrieve term candidates from solr field * see @code {uk.ac.shef.dcs.jate.JATEProperties.PROPERTY_SOLR_FIELD_CONTENT_TERMS} * * The method assumes that the term candidates are extracted at index-time and stored in pre-configured field * * @return Set, a set of term candidate surface form * @throws JATEException * @throws IOException */ protected Set<String> getUniqueTerms() throws JATEException, IOException { Terms terms =SolrUtil.getTermVector(properties.getSolrFieldNameJATECTerms(),solrIndexSearcher); //>>>>>>>>> /*TermsEnum source = terms.iterator(); String term = //"thrownawayorusedjustforelementarystatistical profile"; "l hierar hy"; //"ordertoavoidadependencyofthebaselineresultontherandom"; if (source.seekExact(new BytesRef(term.getBytes("UTF-8")))) { PostingsEnum docEnum = source.postings(null); int doc = 0; while ((doc = docEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) { int tfid = docEnum.freq(); //tf in document } } else { }*/ //>>>>>>>>> TermsEnum termsEnum = terms.iterator(); Set<String> allTermCandidates = new HashSet<>(); while (termsEnum.next() != null) { BytesRef t = termsEnum.term(); if (t.length == 0) continue; allTermCandidates.add(t.utf8ToString()); } return allTermCandidates; }
Example 9
Source File: MultiPhrasePrefixQuery.java From crate with Apache License 2.0 | 5 votes |
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException { // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually. List<LeafReaderContext> leaves = reader.leaves(); for (LeafReaderContext leaf : leaves) { Terms _terms = leaf.reader().terms(field); if (_terms == null) { continue; } TermsEnum termsEnum = _terms.iterator(); TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes()); if (TermsEnum.SeekStatus.END == seekStatus) { continue; } for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) { if (!StringHelper.startsWith(term, prefix.bytes())) { break; } terms.add(new Term(field, BytesRef.deepCopyOf(term))); if (terms.size() >= maxExpansions) { return; } } } }
Example 10
Source File: TermIntervalsSource.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Builds an IntervalMatchesIterator over the positions/offsets of {@code te}'s
 * current term within document {@code doc}, or returns null when the term does
 * not occur in that document.
 */
static IntervalMatchesIterator matches(TermsEnum te, int doc, String field) throws IOException {
    TermQuery query = new TermQuery(new Term(field, te.term()));
    PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
    if (pe.advance(doc) != doc) {
        // advance skipped past doc: the term is absent from this document
        return null;
    }
    return new IntervalMatchesIterator() {
        @Override
        public int gaps() {
            return 0; // a single term has no internal gaps
        }

        @Override
        public int width() {
            return 1; // a single term always spans one position
        }

        int upto = pe.freq();  // number of positions left to visit in this doc
        int pos = -1;          // current position; -1 before the first next()

        @Override
        public boolean next() throws IOException {
            if (upto <= 0) {
                pos = IntervalIterator.NO_MORE_INTERVALS;
                return false;
            }
            upto--;
            pos = pe.nextPosition();
            return true;
        }

        @Override
        public int startPosition() {
            return pos;
        }

        @Override
        public int endPosition() {
            return pos; // start == end for a single-term interval
        }

        @Override
        public int startOffset() throws IOException {
            return pe.startOffset();
        }

        @Override
        public int endOffset() throws IOException {
            return pe.endOffset();
        }

        @Override
        public MatchesIterator getSubMatches() {
            return null; // a term match has no sub-matches
        }

        @Override
        public Query getQuery() {
            return query;
        }
    };
}
Example 11
Source File: DocTermOrds.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Returns the term ({@link BytesRef}) corresponding to
 * the provided ordinal.
 *
 * @param termsEnum enum to position; left positioned on {@code ord}
 * @param ord       ordinal of the term to look up
 * @return the term at {@code ord}
 * @throws IOException on index access failure
 */
public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException {
    // Seek directly to the ordinal, then read the term the enum now points at.
    termsEnum.seekExact(ord);
    final BytesRef resolved = termsEnum.term();
    return resolved;
}