Java Code Examples for org.apache.lucene.index.IndexWriter#setMergeFactor()
The following examples show how to use
org.apache.lucene.index.IndexWriter#setMergeFactor().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: IndexWriterWorker.java From olat with Apache License 2.0 | 6 votes |
/** * @param id * Unique index ID. Is used to generate unique directory name. * @param tempIndexPath * Absolute directory-path where the temporary index can be generated. * @param fullIndexer * Reference to full-index */ public IndexWriterWorker(final int id, final File tempIndexDir, final OlatFullIndexer fullIndexer) { this.id = id; this.indexPartDir = new File(tempIndexDir, "part" + id); this.fullIndexer = fullIndexer; try { final Directory luceneIndexPartDir = FSDirectory.open(indexPartDir); indexWriter = new IndexWriter(luceneIndexPartDir, new StandardAnalyzer(Version.LUCENE_30), true, IndexWriter.MaxFieldLength.UNLIMITED); indexWriter.setMergeFactor(fullIndexer.getSearchModuleConfig().getIndexerWriterMergeFactor()); log.info("IndexWriter config MergeFactor=" + indexWriter.getMergeFactor()); indexWriter.setRAMBufferSizeMB(fullIndexer.getSearchModuleConfig().getIndexerWriterRamBuffer()); log.info("IndexWriter config RAMBufferSizeMB=" + indexWriter.getRAMBufferSizeMB()); indexWriter.setUseCompoundFile(false); } catch (final IOException e) { log.warn("Can not create IndexWriter"); } }
Example 2
Source File: TestMixedDirectory.java From RDFS with Apache License 2.0 | 5 votes |
/**
 * Appends {@code numDocs} documents, with ids starting at {@code base}, to the
 * given index directory.
 *
 * @param dir     index directory to append to (opened in non-create mode)
 * @param base    id of the first document to add
 * @param numDocs number of documents to add
 * @param policy  deletion policy handed to the writer
 * @throws IOException if the writer cannot be created or a document cannot be added
 */
public void updateIndex(Directory dir, int base, int numDocs,
    IndexDeletionPolicy policy) throws IOException {
  IndexWriter writer = new IndexWriter(dir, false, new StandardAnalyzer(), policy);
  try {
    writer.setMaxBufferedDocs(maxBufferedDocs);
    writer.setMergeFactor(1000);
    for (int i = 0; i < numDocs; i++) {
      addDoc(writer, base + i);
    }
  } finally {
    // FIX: close even when addDoc throws, so the index write lock is released.
    writer.close();
  }
}
Example 3
Source File: TestMixedDirectory.java From hadoop-gpu with Apache License 2.0 | 5 votes |
/**
 * Adds {@code numDocs} documents to an existing index, starting at id {@code base}.
 *
 * @param dir     index directory to append to (opened in non-create mode)
 * @param base    id of the first document added
 * @param numDocs how many documents to add
 * @param policy  deletion policy handed to the writer
 * @throws IOException if the writer cannot be created or a document cannot be added
 */
public void updateIndex(Directory dir, int base, int numDocs,
    IndexDeletionPolicy policy) throws IOException {
  IndexWriter writer = new IndexWriter(dir, false, new StandardAnalyzer(), policy);
  try {
    writer.setMaxBufferedDocs(maxBufferedDocs);
    writer.setMergeFactor(1000);
    for (int i = 0; i < numDocs; i++) {
      addDoc(writer, base + i);
    }
  } finally {
    // FIX: was only closed on the happy path; an exception from addDoc left
    // the writer (and the index lock) dangling.
    writer.close();
  }
}
Example 4
Source File: BuildIndexForEntityFragments.java From gAnswer with BSD 3-Clause "New" or "Revised" License | 4 votes |
public void indexforentity() throws Exception { if(EntityFragmentFields.entityId2Name == null) EntityFragmentFields.load(); long startTime = new Date().getTime(); //Try update KB index to DBpedia2015. by husen 2016-04-08 //Try update KB index to DBpedia2016. by husen 2018-8-22 File indexDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/entity_fragment_index"); File sourceDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt"); Analyzer luceneAnalyzer_en = new StandardAnalyzer(); IndexWriter indexWriter_en = new IndexWriter(indexDir_en, luceneAnalyzer_en,true); int mergeFactor = 100000; //default 10 int maxBufferedDoc = 1000; //default 10 int maxMergeDoc = Integer.MAX_VALUE; //INF //indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor; indexWriter_en.setMergeFactor(mergeFactor); indexWriter_en.setMaxBufferedDocs(maxBufferedDoc); indexWriter_en.setMaxMergeDocs(maxMergeDoc); FileInputStream file = new FileInputStream(sourceDir_en); InputStreamReader in = new InputStreamReader(file,"UTF-8"); BufferedReader br = new BufferedReader(in); int count = 0; while(true) { String _line = br.readLine(); { if(_line == null) break; } count++; if(count % 100000 == 0) System.out.println(count); String line = _line; String temp[] = line.split("\t"); if(temp.length != 2) continue; else { int entity_id = Integer.parseInt(temp[0]); if(!EntityFragmentFields.entityId2Name.containsKey(entity_id)) continue; String entity_name = EntityFragmentFields.entityId2Name.get(entity_id); String entity_fragment = temp[1]; entity_name = entity_name.replace("____", " "); entity_name = entity_name.replace("__", " "); entity_name = entity_name.replace("_", " "); Document document = new Document(); Field EntityName = new Field("EntityName", entity_name, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); Field EntityId = new Field("EntityId", String.valueOf(entity_id), Field.Store.YES, Field.Index.NO); Field EntityFragment 
= new Field("EntityFragment", entity_fragment, Field.Store.YES, Field.Index.NO); document.add(EntityName); document.add(EntityId); document.add(EntityFragment); indexWriter_en.addDocument(document); } } indexWriter_en.optimize(); indexWriter_en.close(); br.close(); // input the time of Build index long endTime = new Date().getTime(); System.out.println("entity_name index has build ->" + count + " " + "Time:" + (endTime - startTime)); }
Example 5
Source File: BuildIndexForTypeShortName.java From gAnswer with BSD 3-Clause "New" or "Revised" License | 4 votes |
public static void buildIndex(HashMap<String, ArrayList<Integer>> typeShortName2IdList) throws Exception { long startTime = new Date().getTime(); File indexDir_li = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/type_fragment_index"); Analyzer luceneAnalyzer_li = new StandardAnalyzer(); IndexWriter indexWriter_li = new IndexWriter(indexDir_li, luceneAnalyzer_li,true); int mergeFactor = 100000; int maxBufferedDoc = 1000; int maxMergeDoc = Integer.MAX_VALUE; //indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor; indexWriter_li.setMergeFactor(mergeFactor); indexWriter_li.setMaxBufferedDocs(maxBufferedDoc); indexWriter_li.setMaxMergeDocs(maxMergeDoc); int count = 0; Iterator<String> it = typeShortName2IdList.keySet().iterator(); while (it.hasNext()) { String sn = it.next(); if (sn.length() == 0) { continue; } count ++; StringBuilder splittedSn = new StringBuilder(""); if(sn.contains("_")) { String nsn = sn.replace("_", " "); splittedSn.append(nsn.toLowerCase()); } else { int last = 0, i = 0; for(i = 0; i < sn.length(); i ++) { // if it were not a small letter, then break it. 
if(!(sn.charAt(i)>='a' && sn.charAt(i)<='z')) { splittedSn.append(sn.substring(last, i).toLowerCase()); splittedSn.append(' '); last = i; } } splittedSn.append(sn.substring(last, i).toLowerCase()); while(splittedSn.charAt(0) == ' ') { splittedSn.deleteCharAt(0); } } System.out.println("SplitttedType: "+splittedSn); Document document = new Document(); Field SplittedTypeShortName = new Field("SplittedTypeShortName", splittedSn.toString(), Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); Field TypeShortName = new Field("TypeShortName", sn, Field.Store.YES, Field.Index.NO); document.add(SplittedTypeShortName); document.add(TypeShortName); indexWriter_li.addDocument(document); } indexWriter_li.optimize(); indexWriter_li.close(); // input the time of Build index long endTime = new Date().getTime(); System.out.println("TypeShortName index has build ->" + count + " " + "Time:" + (endTime - startTime)); }
Example 6
Source File: OlatFullIndexer.java From olat with Apache License 2.0 | 4 votes |
/** * Create index-writer object. In multi-threaded mode ctreates an array of index-workers. Start indexing with main-index as root object. Index recursive all elements. * At the end optimze and close new index. The new index is stored in [temporary-index-path]/main * * @throws InterruptedException */ private void doIndex() throws InterruptedException { try { final File tempIndexDir = new File(tempIndexPath); final Directory indexPath = FSDirectory.open(new File(tempIndexDir, "main")); final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); indexWriter = new IndexWriter(indexPath, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); indexWriter.deleteAll(); indexWriter.setMergeFactor(INDEX_MERGE_FACTOR); // for better performance indexWriter.setRAMBufferSizeMB(ramBufferSizeMB);// for better performance set to 48MB (see lucene docu 'how to make indexing faster") log.info("IndexWriter config RAMBufferSizeMB=" + indexWriter.getRAMBufferSizeMB()); indexWriter.setUseCompoundFile(useCompoundFile); // for better performance (see lucene docu 'how to make indexing faster") log.info("IndexWriter config UseCompoundFile=" + indexWriter.getUseCompoundFile()); // Create IndexWriterWorker log.info("Running with " + numberIndexWriter + " IndexerWriterWorker"); indexWriterWorkers = new IndexWriterWorker[numberIndexWriter]; final Directory[] partIndexDirs = new Directory[numberIndexWriter]; for (int i = 0; i < numberIndexWriter; i++) { final IndexWriterWorker indexWriterWorker = new IndexWriterWorker(i, tempIndexDir, this); indexWriterWorkers[i] = indexWriterWorker; indexWriterWorkers[i].start(); partIndexDirs[i] = indexWriterWorkers[i].getIndexDir(); } final SearchResourceContext searchResourceContext = new SearchResourceContext(); log.info("doIndex start. 
OlatFullIndexer with Debug output"); mainIndexer.doIndex(searchResourceContext, null /* no parent */, this); log.info("Wait until every folder indexer is finished"); DBFactory.getInstance().commitAndCloseSession(); // check if every folder indexer is finished max waiting-time 10Min (=waitingCount-limit = 60) int waitingCount = 0; final int MAX_WAITING_COUNT = 60;// = 10Min while (FolderIndexerWorkerPool.getInstance().isIndexerRunning() && (waitingCount++ < MAX_WAITING_COUNT)) { Thread.sleep(10000); } if (waitingCount >= MAX_WAITING_COUNT) { log.info("Finished with max waiting time!"); } log.info("Set Finish-flag for each indexWriterWorkers"); // Set Finish-flag for (int i = 0; i < numberIndexWriter; i++) { indexWriterWorkers[i].finishIndexing(); } log.info("Wait until every indexworker is finished"); // check if every indexworker is finished max waiting-time 10Min (=waitingCount-limit = 60) waitingCount = 0; while (!areIndexingDone() && (waitingCount++ < MAX_WAITING_COUNT)) { Thread.sleep(10000); } if (waitingCount >= MAX_WAITING_COUNT) { log.info("Finished with max waiting time!"); } // Merge all partIndex DBFactory.getInstance().commitAndCloseSession(); if (partIndexDirs.length > 0) { log.info("Start merging part Indexes"); indexWriter.addIndexesNoOptimize(partIndexDirs); log.info("Added all part Indexes"); } fullIndexerStatus.setIndexSize(indexWriter.maxDoc()); indexWriter.optimize(); indexWriter.close(); } catch (final IOException e) { e.printStackTrace(); log.warn("Can not create IndexWriter, indexname=" + tempIndexPath, e); } finally { DBFactory.getInstance().commitAndCloseSession(); log.debug("doIndex: commit & close session"); } }