org.apache.lucene.index.MultiFields Java Examples
The following examples show how to use
org.apache.lucene.index.MultiFields.
Each example notes the project it comes from and that project's license.
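As a quick orientation before the examples: MultiFields exposes static helpers that present a composite IndexReader as a single merged view across all its segments. Below is a minimal sketch of the most common entry points — getTerms, getLiveDocs, and getMergedFieldInfos — written against the Lucene 5.x–7.x API that most of these examples use; the index path argument and the "body" field name are placeholders, and note that later Lucene releases (8+) move these helpers to MultiTerms, MultiBits, and FieldInfos.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

public class MultiFieldsTour {
    public static void main(String[] args) throws Exception {
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(args[0])))) {
            // Merged view of one field's terms across all segments;
            // null if no document has indexed terms for that field.
            Terms terms = MultiFields.getTerms(reader, "body"); // "body" is a placeholder field name
            if (terms != null) {
                TermsEnum te = terms.iterator();
                BytesRef term;
                while ((term = te.next()) != null) {
                    System.out.println(term.utf8ToString() + " docFreq=" + te.docFreq());
                }
            }

            // Merged live-docs bitset; null means the index contains no deletions.
            Bits liveDocs = MultiFields.getLiveDocs(reader);

            // Field-level metadata merged across all segments.
            FieldInfos fieldInfos = MultiFields.getMergedFieldInfos(reader);
            System.out.println("fields: " + fieldInfos.size() + ", has deletions: " + (liveDocs != null));
        }
    }
}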
Example #1
Source File: TransportFieldStatsTransportAction.java From Elasticsearch with Apache License 2.0
@Override
protected FieldStatsShardResponse shardOperation(FieldStatsShardRequest request) {
    ShardId shardId = request.shardId();
    Map<String, FieldStats> fieldStats = new HashMap<>();
    IndexService indexServices = indicesService.indexServiceSafe(shardId.getIndex());
    MapperService mapperService = indexServices.mapperService();
    IndexShard shard = indexServices.shardSafe(shardId.id());
    try (Engine.Searcher searcher = shard.acquireSearcher("fieldstats")) {
        for (String field : request.getFields()) {
            MappedFieldType fieldType = mapperService.fullName(field);
            if (fieldType != null) {
                IndexReader reader = searcher.reader();
                Terms terms = MultiFields.getTerms(reader, field);
                if (terms != null) {
                    fieldStats.put(field, fieldType.stats(terms, reader.maxDoc()));
                }
            } else {
                throw new IllegalArgumentException("field [" + field + "] doesn't exist");
            }
        }
    } catch (IOException e) {
        throw ExceptionsHelper.convertToElastic(e);
    }
    return new FieldStatsShardResponse(shardId, fieldStats);
}
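MultiFields.getTerms returns a single Terms instance merged across all segments of the reader, or null when no document has indexed terms for that field — hence the null check before computing the per-field stats.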
Example #2
Source File: FieldsConsumer.java From lucene-solr with Apache License 2.0
/**
 * Merges in the fields from the readers in <code>mergeState</code>.
 * The default implementation skips and maps around deleted documents,
 * and calls {@link #write(Fields,NormsProducer)}.
 * Implementations can override this method for more sophisticated
 * merging (bulk-byte copying, etc).
 */
public void merge(MergeState mergeState, NormsProducer norms) throws IOException {
    final List<Fields> fields = new ArrayList<>();
    final List<ReaderSlice> slices = new ArrayList<>();

    int docBase = 0;

    for (int readerIndex = 0; readerIndex < mergeState.fieldsProducers.length; readerIndex++) {
        final FieldsProducer f = mergeState.fieldsProducers[readerIndex];

        final int maxDoc = mergeState.maxDocs[readerIndex];
        f.checkIntegrity();
        slices.add(new ReaderSlice(docBase, maxDoc, readerIndex));
        fields.add(f);
        docBase += maxDoc;
    }

    Fields mergedFields = new MappedMultiFields(mergeState,
            new MultiFields(fields.toArray(Fields.EMPTY_ARRAY),
                    slices.toArray(ReaderSlice.EMPTY_ARRAY)));
    write(mergedFields, norms);
}
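The slicing here is the heart of the merged view: each ReaderSlice records a sub-reader's doc base and index, so the composite MultiFields can rebase per-segment document IDs into the merged ID space, while the MappedMultiFields wrapper consults the MergeState doc maps to skip documents deleted in the source segments. Example #3 below applies the same pattern in a custom codec's FieldsConsumer.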
Example #3
Source File: MtasFieldsConsumer.java From mtas with Apache License 2.0
@Override
public void merge(MergeState mergeState) throws IOException {
    final List<Fields> fields = new ArrayList<>();
    final List<ReaderSlice> slices = new ArrayList<>();

    int docBase = 0;

    for (int readerIndex = 0; readerIndex < mergeState.fieldsProducers.length; readerIndex++) {
        final FieldsProducer f = mergeState.fieldsProducers[readerIndex];
        final int maxDoc = mergeState.maxDocs[readerIndex];
        f.checkIntegrity();
        slices.add(new ReaderSlice(docBase, maxDoc, readerIndex));
        fields.add(f);
        docBase += maxDoc;
    }

    Fields mergedFields = new MappedMultiFields(mergeState,
            new MultiFields(fields.toArray(Fields.EMPTY_ARRAY),
                    slices.toArray(ReaderSlice.EMPTY_ARRAY)));
    write(mergedFields);
}
Example #4
Source File: LuceneUtils.java From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * @param flagConfig Contains all information necessary for configuring LuceneUtils.
 *        {@link FlagConfig#luceneindexpath()} must be non-empty.
 */
public LuceneUtils(FlagConfig flagConfig) throws IOException {
    if (flagConfig.luceneindexpath().isEmpty()) {
        throw new IllegalArgumentException(
                "-luceneindexpath is a required argument for initializing LuceneUtils instance.");
    }
    this.compositeReader = DirectoryReader.open(
            FSDirectory.open(FileSystems.getDefault().getPath(flagConfig.luceneindexpath())));
    this.leafReader = SlowCompositeReaderWrapper.wrap(compositeReader);
    // Note: the return value is discarded here, so this call appears to have no lasting effect.
    MultiFields.getFields(compositeReader);
    this.flagConfig = flagConfig;
    if (!flagConfig.stoplistfile().isEmpty())
        loadStopWords(flagConfig.stoplistfile());
    if (!flagConfig.startlistfile().isEmpty())
        loadStartWords(flagConfig.startlistfile());
    VerbatimLogger.info("Initialized LuceneUtils from Lucene index in directory: "
            + flagConfig.luceneindexpath() + "\n");
    VerbatimLogger.info("Fields in index are: " + String.join(", ", this.getFieldNames()) + "\n");
}
Example #5
Source File: TermDocIterable.java From incubator-retired-blur with Apache License 2.0
private boolean getNext() {
    try {
        int next = docsEnum.nextDoc();
        if (next == DocIdSetIterator.NO_MORE_DOCS) {
            return false;
        }
        Bits liveDocs = MultiFields.getLiveDocs(reader);
        if (liveDocs != null) {
            // Skip deleted documents, stopping if the enum is exhausted
            // (otherwise docID() would be NO_MORE_DOCS and the bitset lookup would fail).
            while (next != DocIdSetIterator.NO_MORE_DOCS && !liveDocs.get(docsEnum.docID())) {
                next = docsEnum.nextDoc();
            }
        }
        return next != DocIdSetIterator.NO_MORE_DOCS;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
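MultiFields.getLiveDocs returns the reader's merged live-documents bitset, or null when the index contains no deletions, so the null check above is the standard idiom before testing individual documents.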
Example #6
Source File: LuceneIndexCorpus.java From word2vec-lucene with Apache License 2.0
@Override
public void learnVocab() throws IOException {
    super.learnVocab();

    final String field = ((LuceneIndexConfig)config).getField();
    final Terms terms = MultiFields.getTerms(reader, field);
    final BytesRef maxTerm = terms.getMax();
    final BytesRef minTerm = terms.getMin();
    Query q = new TermRangeQuery(field, minTerm, maxTerm, true, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    topDocs = searcher.search(q, Integer.MAX_VALUE);

    TermsEnum termsEnum = null;
    termsEnum = terms.iterator(termsEnum);
    termsEnum.seekCeil(new BytesRef());
    BytesRef term = termsEnum.term();
    while (term != null) {
        int p = addWordToVocab(term.utf8ToString());
        vocab[p].setCn((int) termsEnum.totalTermFreq());
        term = termsEnum.next();
    }
}
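Here seekCeil(new BytesRef()) positions the enum at the smallest term in the field (the first term at or after the empty byte string). Note that terms.iterator(termsEnum) is the old reuse-style API from Lucene 4.x; newer Lucene versions expose a no-argument Terms.iterator() instead.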
Example #7
Source File: CustomSpellCheckListner.java From customized-symspell with MIT License
/**
 * Reload method of the spellcheck listener.
 * @param newSearcher
 * @param checker
 * @throws IOException
 * @throws SpellCheckException
 */
public void reload(SolrIndexSearcher newSearcher, SpellChecker checker)
        throws IOException, SpellCheckException {

    DirectoryReader productsIndexReader = newSearcher.getIndexReader();
    Fields fields = MultiFields.getFields(productsIndexReader);
    IndexSchema schema = newSearcher.getCore().getLatestSchema();
    long time = System.currentTimeMillis();
    for (String field : fields) {
        if (!fieldArr.contains(field)) {
            continue;
        }
        FieldType type = schema.getField(field).getType();
        int insertionsCount = 0;
        for (TermsEnum iterator = fields.terms(field).iterator(); iterator.next() != null; ) {
            BytesRef term = iterator.term();
            CharsRefBuilder charsRefBuilder = new CharsRefBuilder();
            type.indexedToReadable(term, charsRefBuilder);
            insertionsCount++;
            checker.getDataHolder().addItem(
                    new DictionaryItem(charsRefBuilder.toString().trim(),
                            (double) iterator.totalTermFreq(), 0.0));
        }
        log.info("Spellcheck Dictionary populated for Field Name {}, Count {}", field, insertionsCount);
    }
    log.info("Data for SpellChecker was populated. Time={} ms", (System.currentTimeMillis() - time));
}
Example #8
Source File: SORecommender.java From scava with Eclipse Public License 2.0
private List<String> getAllIndexTags(String INDEX_DIRECTORY) {
    Collection<String> result = new HashSet<String>();
    try {
        IndexReader luceneIndexReader = DirectoryReader.open(
                FSDirectory.open(Paths.get(INDEX_DIRECTORY)));
        result = MultiFields.getIndexedFields(luceneIndexReader);
    } catch (IOException e) {
        logger.error(e.getMessage());
    }
    List<String> sortedList = new ArrayList<String>(result);
    Collections.sort(sortedList);
    return sortedList;
}
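MultiFields.getIndexedFields returns the names of every field that carries indexed terms in any segment, derived from the merged FieldInfos; sorting the result, as here, gives a stable list for display.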
Example #9
Source File: TermFreq.java From SourcererCC with GNU General Public License v3.0
private void dummy() throws IOException {
    Fields fields = MultiFields.getFields(this.reader);
    Terms terms = fields.terms("field");
    TermsEnum iterator = terms.iterator(null);
    BytesRef byteRef = null;
    while ((byteRef = iterator.next()) != null) {
        // Decode as UTF-8 rather than the platform default charset.
        String term = byteRef.utf8ToString();
        // Note: terms are read from "field" but the frequency lookup uses "tokens".
        Term termInstance = new Term("tokens", term);
        long termFreq = this.reader.totalTermFreq(termInstance);
        this.TermFreqMap.put(term, termFreq);
        System.out.println(termFreq);
    }
}
Example #10
Source File: WordScorer.java From Elasticsearch with Apache License 2.0
public WordScorer(IndexReader reader, String field, double realWordLikelyHood, BytesRef separator)
        throws IOException {
    this(reader, MultiFields.getTerms(reader, field), field, realWordLikelyHood, separator);
}
Example #11
Source File: AlfrescoLukeRequestHandler.java From SearchServices with GNU Lesser General Public License v3.0
@SuppressWarnings("unchecked") private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap) throws IOException { SolrParams params = req.getParams(); final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT); TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to // collect the top N // terms in. final CharsRefBuilder spare = new CharsRefBuilder(); Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(), field); if (terms == null) { // field does not exist return; } TermsEnum termsEnum = terms.iterator(); BytesRef text; int[] buckets = new int[HIST_ARRAY_SIZE]; while ((text = termsEnum.next()) != null) { ++tiq.distinctTerms; int freq = termsEnum.docFreq(); // This calculation seems odd, but // it gives the same results as it // used to. int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1)); buckets[slot] = buckets[slot] + 1; if (numTerms > 0 && freq > tiq.minFreq) { spare.copyUTF8Bytes(text); String t = spare.toString(); tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum .docFreq())); if (tiq.size() > numTerms) { // if tiq full tiq.pop(); // remove lowest in tiq tiq.minFreq = tiq.getTopTermInfo().docFreq; } } } tiq.histogram.add(buckets); fieldMap.add("distinct", tiq.distinctTerms); // Include top terms fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema())); // Add a histogram fieldMap.add("histogram", tiq.histogram.toNamedList()); }
Example #12
Source File: IndexLoader.java From solr-autocomplete with Apache License 2.0
public static void main(String[] args) throws CorruptIndexException, IOException, SolrServerException {

    if (args.length < 3) {
        System.err.println("Usage: java -Dfile.encoding=UTF8 -Dclient.encoding.override=UTF-8 -Xmx256m -Xms256m -server "
                + IndexLoader.class.getName()
                + " </path/to/index> <AutoCompleteSolrUrl> <indexField1,acField1> [indexField2,acField2 ... ]");
        System.exit(0);
    }

    Map<String, String> fieldMap = getFieldMapping(args, 2);

    DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(args[0])));
    int docs = reader.maxDoc();

    SolrClient solr = new ConcurrentUpdateSolrClient.Builder(args[1])
            .withQueueSize(10000).withThreadCount(2).build();
    // Note: this batch set is never populated; documents are sent individually below.
    Set<SolrInputDocument> batch = new HashSet<SolrInputDocument>(1000);

    Bits liveDocs = MultiFields.getLiveDocs(reader);

    // go through all docs in the index
    for (int i = 0; i < docs; i++) {
        // process doc only if not deleted
        if (liveDocs == null || liveDocs.get(i)) {
            // loop through all fields to be looked at
            SolrInputDocument doc = new SolrInputDocument();
            Iterator<String> iter = fieldMap.keySet().iterator();

            boolean phraseFieldEmpty = false;

            while (iter.hasNext()) {
                String indexField = iter.next();
                String acField = fieldMap.get(indexField);
                IndexableField field = reader.document(i).getField(indexField);
                String value = field != null ? reader.document(i).getField(indexField).stringValue() : null;

                if (field != null && value != null && !value.isEmpty()) {
                    doc.addField(acField, value);
                } else {
                    // not very relevant piece of info
                    // System.err.println("Field is null or empty, skipping: " + indexField);
                    if (acField.equalsIgnoreCase("phrase")) {
                        System.err.println("Since AC phrase field would be null, this doc will not be created: "
                                + reader.document(i));
                        phraseFieldEmpty = true;
                        break;
                    }
                }
            }

            if (!phraseFieldEmpty) {
                solr.add(doc);
                // report progress on the loop counter, not the constant doc count
                if (i % 1000 == 0) {
                    System.out.println("Docs: " + i);
                }
            }
        }
    }

    if (!batch.isEmpty())
        solr.add(batch);

    reader.close();
    System.out.println("Optimizing...");
    solr.optimize();
    solr.close();
}
Example #13
Source File: TermFreqAnalyser.java From Siamese with GNU General Public License v3.0
private static void analyseTermFreq(String indexName, String field, String freqType, String outputFileName) {

    String indexFile = elasticsearchLoc + "/data/stackoverflow/nodes/0/indices/" + indexName + "/0/index";
    DecimalFormat df = new DecimalFormat("#.00");
    int printEvery = 100000;

    File outputFile = new File(outputFileName);
    if (outputFile.exists()) {
        if (!outputFile.delete()) {
            System.out.println("ERROR: cannot delete the output file.");
            System.exit(0);
        }
    }

    /* adapted from
       https://stackoverflow.com/questions/28244961/lucene-4-10-2-calculate-tf-idf-for-all-terms-in-index */
    int count = 0;
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexFile)));
        Fields fields = MultiFields.getFields(reader);
        Terms terms = fields.terms(field);
        TermsEnum termsEnum = terms.iterator();
        int size = 0;
        // TODO: is there a better solution?
        // iterate to get the size
        while (termsEnum.next() != null) {
            size++;
        }
        // String[] termArr = new String[size];
        long[] freqArr = new long[size];

        // do the real work
        termsEnum = terms.iterator();
        while (termsEnum.next() != null) {
            // String term = termsEnum.term().utf8ToString();
            long tfreq = 0;
            if (freqType.equals("tf"))
                tfreq = termsEnum.totalTermFreq();
            else if (freqType.equals("df"))
                tfreq = termsEnum.docFreq();
            else {
                System.out.println("Wrong frequency. Quit!");
                System.exit(0);
            }
            // termArr[count] = term;
            freqArr[count] = tfreq;
            if (count % printEvery == 0) {
                System.out.println("processed: " + count + " terms "
                        + " [" + df.format(((long) count * 100) / size) + "%]");
            }
            count++;
        }
        System.out.println(field + ": total = " + count);

        double[] data = new double[size];
        String output = "freq\n";
        for (int i = 0; i < freqArr.length; i++) {
            data[i] = freqArr[i];
            output += freqArr[i] + "\n";
            if (i > 0 && i % printEvery == 0) {
                MyUtils.writeToFile("./", outputFileName, output, true);
                System.out.println("written: " + i + " terms "
                        + " [" + df.format(((long) i * 100) / size) + "%]");
                output = "";
            }
        }
        // write the rest to the file
        MyUtils.writeToFile("./", outputFileName, output, true);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Example #14
Source File: IndexRequestMasterListenerIT.java From development with Apache License 2.0
private void assertDocsInIndex(final Class<?> clazz, final String comment,
        final int expectedNumDocs, final int expectedNumIndexedAttributes,
        final List<String> expectedAttributes) throws Exception {

    Boolean evaluationTookPlace = runTX(new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
            boolean evaluatedIndex = false;
            Session session = dm.getSession();
            if (session != null) {
                FullTextSession fullTextSession = Search.getFullTextSession(session);
                SearchFactory searchFactory = fullTextSession.getSearchFactory();
                IndexReader reader = searchFactory.getIndexReaderAccessor().open(clazz);
                try {
                    assertEquals(comment, expectedNumDocs, reader.numDocs());
                    if (expectedNumDocs > 0) {
                        final FieldInfos indexedFieldNames = MultiFields.getMergedFieldInfos(reader);
                        for (String expectedAttr : expectedAttributes) {
                            assertNotNull("attribute " + expectedAttr
                                    + " does not exist in index: " + indexedFieldNames,
                                    indexedFieldNames.fieldInfo(expectedAttr));
                        }
                        assertNotNull("attribute \"key\" does not exist in index: "
                                + indexedFieldNames, indexedFieldNames.fieldInfo("key"));
                        assertNotNull("attribute \"_hibernate_class\" does not exist in index: "
                                + indexedFieldNames, indexedFieldNames.fieldInfo("_hibernate_class"));
                        assertEquals("More or less attributes indexed than expected, attributes retrieved from index: "
                                + indexedFieldNames,
                                expectedNumIndexedAttributes + 2, indexedFieldNames.size());
                        evaluatedIndex = true;
                    }
                } finally {
                    searchFactory.getIndexReaderAccessor().close(reader);
                }
            }
            return Boolean.valueOf(evaluatedIndex);
        }
    });

    if (expectedNumDocs > 0) {
        Assert.assertTrue("Index not found, no evaluation took place",
                evaluationTookPlace.booleanValue());
    }
}
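MultiFields.getMergedFieldInfos merges the FieldInfos of all leaf readers into a single view, which makes it a convenient way to assert on field-level index metadata (which attributes were indexed, and how many) in integration tests like this one.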
Example #15
Source File: TermSearcher.java From SourcererCC with GNU General Public License v3.0
public synchronized void searchWithPosition(int queryTermsSeen) {
    if (null != this.reader) {
        if (null != this.reader.getContext()) {
            if (null != this.reader.getContext().leaves()) {
                Term term = new Term("tokens", this.searchTerm);
                for (AtomicReaderContext ctx : this.reader.getContext().leaves()) {
                    int base = ctx.docBase;
                    // SpanTermQuery spanQ = new SpanTermQuery(term);
                    try {
                        DocsAndPositionsEnum docEnum = MultiFields.getTermPositionsEnum(
                                ctx.reader(), MultiFields.getLiveDocs(ctx.reader()),
                                "tokens", term.bytes());
                        if (null != docEnum) {
                            int doc = DocsEnum.NO_MORE_DOCS;
                            while ((doc = docEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
                                long docId = doc + base;
                                CandidateSimInfo simInfo = null;
                                if (this.simMap.containsKey(docId)) {
                                    simInfo = this.simMap.get(docId);
                                    simInfo.similarity = simInfo.similarity
                                            + Math.min(freqTerm, docEnum.freq());
                                } else {
                                    if (earlierDocs.contains(docId))
                                        continue;

                                    Document d = SearchManager.searcher.get(shard).getDocument(docId);
                                    long candidateId = Long.parseLong(d.get("id"));

                                    // Get rid of these early -- we're only looking for
                                    // candidates whose ids are smaller than the query
                                    if (candidateId >= this.queryId) {
                                        earlierDocs.add(docId);
                                        continue; // we reject the candidate
                                    }

                                    simInfo = new CandidateSimInfo();
                                    simInfo.doc = d;
                                    simInfo.candidateSize = Integer.parseInt(d.get("size"));
                                    simInfo.similarity = Math.min(freqTerm, docEnum.freq());
                                    this.simMap.put(docId, simInfo);
                                }
                                simInfo.queryMatchPosition = queryTermsSeen;
                                int candidatePos = docEnum.nextPosition();
                                simInfo.candidateMatchPosition = candidatePos + docEnum.freq();
                                if (!Util.isSatisfyPosFilter(
                                        this.simMap.get(docId).similarity,
                                        this.querySize, queryTermsSeen,
                                        simInfo.candidateSize,
                                        simInfo.candidateMatchPosition,
                                        this.computedThreshold)) {
                                    this.simMap.remove(docId);
                                }
                            }
                        } else {
                            logger.trace("docEnum is null, " + base + ", term: "
                                    + this.searchTerm + Util.debug_thread());
                        }
                    } catch (Exception e) {
                        e.printStackTrace();
                        logger.error("exception caught " + e.getMessage()
                                + Util.debug_thread() + " search term:" + this.searchTerm);
                    }
                }
            } else {
                logger.debug("leaves are null, " + this.searchTerm + Util.debug_thread());
            }
        } else {
            logger.debug("getContext is null, " + this.searchTerm + Util.debug_thread());
        }
    } else {
        logger.debug("this.reader is null, " + this.searchTerm + Util.debug_thread());
    }
}
Example #16
Source File: LuceneIndexTest.java From rdf4j with BSD 3-Clause "New" or "Revised" License
/**
 * NB: this is a convenient but very slow way of getting termDocs.
 * It is sufficient for testing purposes.
 *
 * @throws IOException
 */
private static PostingsEnum termDocs(IndexReader reader, Term term) throws IOException {
    return MultiFields.getTermDocsEnum(reader, term.field(), term.bytes());
}
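The slowness flagged in the comment comes from MultiFields building the merged postings view over a composite reader on every call; the method returns null when the term does not occur in the index, which callers should check before iterating.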