Java Code Examples for org.apache.lucene.index.IndexWriterConfig#setCodec()
The following examples show how to use org.apache.lucene.index.IndexWriterConfig#setCodec(). Each example can be traced back to its original project and source file via the header above it.
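Before the examples, here is a minimal, self-contained sketch of the basic call. The codec name ("Lucene86"), the index path, and the class name are illustrative assumptions for this sketch, not taken from any project below.

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SetCodecSketch {
  public static void main(String[] args) throws Exception {
    // Index path and codec name are illustrative assumptions.
    try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"))) {
      IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
      // setCodec() selects the on-disk format used for newly flushed segments.
      iwc.setCodec(Codec.forName("Lucene86"));
      try (IndexWriter writer = new IndexWriter(dir, iwc)) {
        writer.commit();
      }
    }
  }
}

Note that the codec only applies to segments written after the config is used; existing segments keep the format they were written with, as Example 17 below demonstrates.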
Example 1
Source File: TestPerFieldPostingsFormat2.java From lucene-solr with Apache License 2.0
private void doTestMixedPostings(Codec codec) throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc.setCodec(codec);
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  // turn on vectors for the checkindex cross-check
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorOffsets(true);
  ft.setStoreTermVectorPositions(true);
  Field idField = new Field("id", "", ft);
  Field dateField = new Field("date", "", ft);
  doc.add(idField);
  doc.add(dateField);
  for (int i = 0; i < 100; i++) {
    idField.setStringValue(Integer.toString(random().nextInt(50)));
    dateField.setStringValue(Integer.toString(random().nextInt(100)));
    iw.addDocument(doc);
  }
  iw.close();
  dir.close(); // checkindex
}
Example 2
Source File: TestPointQueries.java From lucene-solr with Apache License 2.0
public void testWrongNumBytes() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig();
  iwc.setCodec(getCodec());
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  Document doc = new Document();
  doc.add(new LongPoint("value", Long.MIN_VALUE));
  w.addDocument(doc);

  IndexReader r = w.getReader();

  // no wrapping, else the exc might happen in executor thread:
  IndexSearcher s = new IndexSearcher(r);
  byte[][] point = new byte[1][];
  point[0] = new byte[10];
  IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    s.count(BinaryPoint.newRangeQuery("value", point, point));
  });
  assertEquals("field=\"value\" was indexed with bytesPerDim=8 but this query has bytesPerDim=10", expected.getMessage());

  IOUtils.close(r, w, dir);
}
Example 3
Source File: TestPointQueries.java From lucene-solr with Apache License 2.0
public void testEmptyPointInSetQuery() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig();
  iwc.setCodec(getCodec());
  IndexWriter w = new IndexWriter(dir, iwc);
  Document doc = new Document();
  doc.add(new IntPoint("int", 17));
  doc.add(new LongPoint("long", 17L));
  doc.add(new FloatPoint("float", 17.0f));
  doc.add(new DoublePoint("double", 17.0));
  doc.add(new BinaryPoint("bytes", new byte[] {0, 17}));
  w.addDocument(doc);

  IndexReader r = DirectoryReader.open(w);
  IndexSearcher s = newSearcher(r, false);
  assertEquals(0, s.count(IntPoint.newSetQuery("int")));
  assertEquals(0, s.count(LongPoint.newSetQuery("long")));
  assertEquals(0, s.count(FloatPoint.newSetQuery("float")));
  assertEquals(0, s.count(DoublePoint.newSetQuery("double")));
  assertEquals(0, s.count(BinaryPoint.newSetQuery("bytes")));

  w.close();
  r.close();
  dir.close();
}
Example 4
Source File: TestSuggestField.java From lucene-solr with Apache License 2.0
static IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, final Set<String> suggestFields) {
  IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer);
  iwc.setMergePolicy(newLogMergePolicy());
  Codec filterCodec = new Lucene86Codec() {
    CompletionPostingsFormat.FSTLoadMode fstLoadMode =
        RandomPicks.randomFrom(random(), CompletionPostingsFormat.FSTLoadMode.values());
    PostingsFormat postingsFormat = new Completion84PostingsFormat(fstLoadMode);

    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      if (suggestFields.contains(field)) {
        return postingsFormat;
      }
      return super.getPostingsFormatForField(field);
    }
  };
  iwc.setCodec(filterCodec);
  return iwc;
}
Example 5
Source File: Blur024CodecTest.java From incubator-retired-blur with Apache License 2.0
@Test
public void testDocValuesFormat() throws IOException {
  RAMDirectory directory = new RAMDirectory();
  IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
  conf.setCodec(new Blur024Codec());
  IndexWriter writer = new IndexWriter(directory, conf);

  Document doc = new Document();
  doc.add(new StringField("f", "v", Store.YES));
  doc.add(new SortedDocValuesField("f", new BytesRef("v")));
  writer.addDocument(doc);
  writer.close();

  DirectoryReader reader = DirectoryReader.open(directory);
  AtomicReaderContext context = reader.leaves().get(0);
  AtomicReader atomicReader = context.reader();
  SortedDocValues sortedDocValues = atomicReader.getSortedDocValues("f");
  assertTrue(sortedDocValues.getClass().getName().startsWith(DiskDocValuesProducer.class.getName()));

  reader.close();
}
Example 6
Source File: TestPointQueries.java From lucene-solr with Apache License 2.0
public void testBasicMultiValueMultiDimPointInSetQuery() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig();
  iwc.setCodec(getCodec());
  IndexWriter w = new IndexWriter(dir, iwc);
  Document doc = new Document();
  doc.add(new IntPoint("int", 17, 42));
  doc.add(new IntPoint("int", 34, 79));
  w.addDocument(doc);

  IndexReader r = DirectoryReader.open(w);
  IndexSearcher s = newSearcher(r, false);
  assertEquals(0, s.count(newMultiDimIntSetQuery("int", 2, 17, 41)));
  assertEquals(1, s.count(newMultiDimIntSetQuery("int", 2, 17, 42)));
  assertEquals(1, s.count(newMultiDimIntSetQuery("int", 2, 17, 42, 34, 79)));
  assertEquals(1, s.count(newMultiDimIntSetQuery("int", 2, -7, -7, 17, 42)));
  assertEquals(1, s.count(newMultiDimIntSetQuery("int", 2, -7, -7, 34, 79)));
  assertEquals(1, s.count(newMultiDimIntSetQuery("int", 2, 17, 42, -14, -14)));

  assertEquals("int:{-14,-14 17,42}", newMultiDimIntSetQuery("int", 2, 17, 42, -14, -14).toString());

  w.close();
  r.close();
  dir.close();
}
Example 7
Source File: TestIDVersionPostingsFormat.java From lucene-solr with Apache License 2.0
public void testInvalidVersions2() throws IOException {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat()));
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc, false);
  Document doc = new Document();
  // Long.MAX_VALUE:
  doc.add(new StringAndPayloadField("id", "id", new BytesRef(new byte[] {
      (byte) 0x7f, (byte) 0xff, (byte) 0xff, (byte) 0xff,
      (byte) 0xff, (byte) 0xff, (byte) 0xff, (byte) 0xff})));
  expectThrows(IllegalArgumentException.class, () -> {
    w.addDocument(doc);
    w.commit(false);
  });
  expectThrows(AlreadyClosedException.class, () -> {
    w.addDocument(doc);
  });
  dir.close();
}
Example 8
Source File: TestIDVersionPostingsFormat.java From lucene-solr with Apache License 2.0
public void testMoreThanOnceInSingleDoc() throws IOException {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat()));
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc, false);
  Document doc = new Document();
  doc.add(makeIDField("id", 17));
  doc.add(makeIDField("id", 17));
  expectThrows(IllegalArgumentException.class, () -> {
    w.addDocument(doc);
    w.commit(false);
  });
  w.close();
  dir.close();
}
Example 9
Source File: TestBlockPostingsFormat2.java From lucene-solr with Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  dir = newFSDirectory(createTempDir("testDFBlockSize"));
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc.setCodec(TestUtil.alwaysPostingsFormat(new Lucene50RWPostingsFormat()));
  iw = new RandomIndexWriter(random(), dir, iwc);
  iw.setDoRandomForceMerge(false); // we will ourselves
}
Example 10
Source File: FilterCacheTest.java From incubator-retired-blur with Apache License 2.0
private void writeDocs(FilterCache filterCache, RAMDirectory directory) throws IOException {
  IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43, new KeywordAnalyzer());
  conf.setCodec(new Blur024Codec());
  IndexWriter indexWriter = new IndexWriter(directory, conf);
  int count = 10000;
  addDocs(indexWriter, count);
  indexWriter.close();
}
Example 11
Source File: TestDocTermOrds.java From lucene-solr with Apache License 2.0
public void testRandom() throws Exception {
  Directory dir = newDirectory();

  final int NUM_TERMS = atLeast(20);
  final Set<BytesRef> terms = new HashSet<>();
  while (terms.size() < NUM_TERMS) {
    final String s = TestUtil.randomRealisticUnicodeString(random());
    //final String s = _TestUtil.randomSimpleString(random);
    if (s.length() > 0) {
      terms.add(new BytesRef(s));
    }
  }
  final BytesRef[] termsArray = terms.toArray(new BytesRef[terms.size()]);
  Arrays.sort(termsArray);

  final int NUM_DOCS = atLeast(100);

  IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));

  // Sometimes swap in codec that impls ord():
  if (random().nextInt(10) == 7) {
    // Make sure terms index has ords:
    Codec codec = TestUtil.alwaysPostingsFormat(TestUtil.getPostingsFormatWithOrds(random()));
    conf.setCodec(codec);
  }

  final RandomIndexWriter w = new RandomIndexWriter(random(), dir, conf);

  final int[][] idToOrds = new int[NUM_DOCS][];
  final Set<Integer> ordsForDocSet = new HashSet<>();

  for (int id = 0; id < NUM_DOCS; id++) {
    Document doc = new Document();

    doc.add(new LegacyIntField("id", id, Field.Store.YES));

    final int termCount = TestUtil.nextInt(random(), 0, 20 * RANDOM_MULTIPLIER);
    while (ordsForDocSet.size() < termCount) {
      ordsForDocSet.add(random().nextInt(termsArray.length));
    }
    final int[] ordsForDoc = new int[termCount];
    int upto = 0;
    if (VERBOSE) {
      System.out.println("TEST: doc id=" + id);
    }
    for (int ord : ordsForDocSet) {
      ordsForDoc[upto++] = ord;
      Field field = newStringField("field", termsArray[ord].utf8ToString(), Field.Store.NO);
      if (VERBOSE) {
        System.out.println("  f=" + termsArray[ord].utf8ToString());
      }
      doc.add(field);
    }
    ordsForDocSet.clear();
    Arrays.sort(ordsForDoc);
    idToOrds[id] = ordsForDoc;
    w.addDocument(doc);
  }

  final DirectoryReader r = w.getReader();
  w.close();

  if (VERBOSE) {
    System.out.println("TEST: reader=" + r);
  }

  for (LeafReaderContext ctx : r.leaves()) {
    if (VERBOSE) {
      System.out.println("\nTEST: sub=" + ctx.reader());
    }
    verify(ctx.reader(), idToOrds, termsArray, null);
  }

  // Also test top-level reader: its enum does not support
  // ord, so this forces the OrdWrapper to run:
  if (VERBOSE) {
    System.out.println("TEST: top reader");
  }
  LeafReader slowR = SlowCompositeReaderWrapper.wrap(r);
  TestUtil.checkReader(slowR);
  verify(slowR, idToOrds, termsArray, null);
  FieldCache.DEFAULT.purgeByCacheKey(slowR.getCoreCacheHelper().getKey());
  r.close();
  dir.close();
}
Example 12
Source File: MtasSearchTestConsistency.java From mtas with Apache License 2.0
/**
 * Creates the index.
 *
 * @param configFile the config file
 * @param files the files
 * @throws IOException Signals that an I/O exception has occurred.
 */
private static void createIndex(String configFile, HashMap<String, String> files) throws IOException {
  // analyzer
  Map<String, String> paramsCharFilterMtas = new HashMap<>();
  paramsCharFilterMtas.put("type", "file");
  Map<String, String> paramsTokenizer = new HashMap<>();
  paramsTokenizer.put("configFile", configFile);
  Analyzer mtasAnalyzer = CustomAnalyzer
      .builder(Paths.get("docker").toAbsolutePath())
      .addCharFilter("mtas", paramsCharFilterMtas)
      .withTokenizer("mtas", paramsTokenizer).build();
  Map<String, Analyzer> analyzerPerField = new HashMap<>();
  analyzerPerField.put(FIELD_CONTENT, mtasAnalyzer);
  PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
      new StandardAnalyzer(), analyzerPerField);
  // indexwriter
  IndexWriterConfig config = new IndexWriterConfig(analyzer);
  config.setUseCompoundFile(false);
  config.setCodec(Codec.forName("MtasCodec"));
  IndexWriter w = new IndexWriter(directory, config);
  // delete
  w.deleteAll();
  // add
  int counter = 0;
  for (Entry<String, String> entry : files.entrySet()) {
    addDoc(w, counter, entry.getKey(), entry.getValue());
    if (counter == 0) {
      w.commit();
    } else {
      addDoc(w, counter, entry.getKey(), entry.getValue());
      addDoc(w, counter, "deletable", entry.getValue());
      w.commit();
      w.deleteDocuments(new Term(FIELD_ID, Integer.toString(counter)));
      w.deleteDocuments(new Term(FIELD_TITLE, "deletable"));
      addDoc(w, counter, entry.getKey(), entry.getValue());
    }
    counter++;
  }
  w.commit();
  // finish
  w.close();
}
Example 13
Source File: Blur022CodecTest.java From incubator-retired-blur with Apache License 2.0
@Test
public void testLargeDocs() throws IOException {
  Random random = new Random();
  Iterable<? extends IndexableField> doc = getLargeDoc(random);
  RAMDirectory directory = new RAMDirectory();
  IndexWriterConfig conf1 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
  conf1.setCodec(new Blur022Codec());
  IndexWriter writer1 = new IndexWriter(directory, conf1);
  writer1.addDocument(doc);
  writer1.close();

  DirectoryReader reader1 = DirectoryReader.open(directory);
  int numDocs1 = reader1.numDocs();
  assertEquals(1, numDocs1);

  // for (int i = 0; i < numDocs1; i++) {
  //   System.out.println(reader1.document(i));
  // }

  IndexWriterConfig conf2 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
  conf2.setCodec(new Blur022Codec(1 << 16, CompressionMode.HIGH_COMPRESSION));
  IndexWriter writer2 = new IndexWriter(directory, conf2);
  writer2.addDocument(doc);
  writer2.close();

  DirectoryReader reader2 = DirectoryReader.open(directory);
  int numDocs2 = reader2.numDocs();
  assertEquals(2, numDocs2);

  for (int i = 0; i < 2; i++) {
    long t1 = System.nanoTime();
    Document document1 = reader1.document(0);
    long t2 = System.nanoTime();
    Document document2 = reader2.document(1);
    long t3 = System.nanoTime();
    System.out.println((t3 - t2) / 1000000.0);
    System.out.println((t2 - t1) / 1000000.0);
    System.out.println("doc1 " + document1.hashCode());
    System.out.println("doc2 " + document2.hashCode());
  }

  // for (int i = 0; i < numDocs2; i++) {
  //   System.out.println(reader2.document(i));
  // }

  // long fileLength = directory.fileLength("_0.fdt");

  for (String name : directory.listAll()) {
    if (name.endsWith(".fdt")) {
      System.out.println(name);
      System.out.println(directory.fileLength(name));
    }
  }
}
Example 14
Source File: TestNearest.java From lucene-solr with Apache License 2.0
private IndexWriterConfig getIndexWriterConfig() {
  IndexWriterConfig iwc = newIndexWriterConfig();
  iwc.setCodec(Codec.forName("Lucene86"));
  return iwc;
}
Example 15
Source File: Blur024CodecTest.java From incubator-retired-blur with Apache License 2.0
@Test
public void testLargeDocs() throws IOException {
  Random random = new Random();
  Iterable<? extends IndexableField> doc = getLargeDoc(random);
  RAMDirectory directory = new RAMDirectory();
  IndexWriterConfig conf1 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
  conf1.setCodec(new Blur024Codec());
  IndexWriter writer1 = new IndexWriter(directory, conf1);
  writer1.addDocument(doc);
  writer1.close();

  DirectoryReader reader1 = DirectoryReader.open(directory);
  int numDocs1 = reader1.numDocs();
  assertEquals(1, numDocs1);

  // for (int i = 0; i < numDocs1; i++) {
  //   System.out.println(reader1.document(i));
  // }

  IndexWriterConfig conf2 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
  conf2.setCodec(new Blur024Codec(1 << 16, CompressionMode.HIGH_COMPRESSION));
  IndexWriter writer2 = new IndexWriter(directory, conf2);
  writer2.addDocument(doc);
  writer2.close();

  DirectoryReader reader2 = DirectoryReader.open(directory);
  int numDocs2 = reader2.numDocs();
  assertEquals(2, numDocs2);

  for (int i = 0; i < 2; i++) {
    long t1 = System.nanoTime();
    Document document1 = reader1.document(0);
    long t2 = System.nanoTime();
    Document document2 = reader2.document(1);
    long t3 = System.nanoTime();
    System.out.println((t3 - t2) / 1000000.0);
    System.out.println((t2 - t1) / 1000000.0);
    System.out.println("doc1 " + document1.hashCode());
    System.out.println("doc2 " + document2.hashCode());
  }

  // for (int i = 0; i < numDocs2; i++) {
  //   System.out.println(reader2.document(i));
  // }

  // long fileLength = directory.fileLength("_0.fdt");

  for (String name : directory.listAll()) {
    if (name.endsWith(".fdt")) {
      System.out.println(name);
      System.out.println(directory.fileLength(name));
    }
  }
}
Example 16
Source File: TestCompressingStoredFieldsFormat.java From lucene-solr with Apache License 2.0
public void testDeletePartiallyWrittenFilesIfAbort() throws IOException {
  Directory dir = newDirectory();
  IndexWriterConfig iwConf = newIndexWriterConfig(new MockAnalyzer(random()));
  iwConf.setMaxBufferedDocs(RandomNumbers.randomIntBetween(random(), 2, 30));
  iwConf.setCodec(getCodec());
  // disable CFS because this test checks file names
  iwConf.setMergePolicy(newLogMergePolicy(false));
  iwConf.setUseCompoundFile(false);

  // Cannot use RIW because this test wants CFS to stay off:
  IndexWriter iw = new IndexWriter(dir, iwConf);

  final Document validDoc = new Document();
  validDoc.add(new IntPoint("id", 0));
  validDoc.add(new StoredField("id", 0));
  iw.addDocument(validDoc);
  iw.commit();

  // make sure that #writeField will fail to trigger an abort
  final Document invalidDoc = new Document();
  FieldType fieldType = new FieldType();
  fieldType.setStored(true);
  invalidDoc.add(new Field("invalid", fieldType) {
    @Override
    public String stringValue() {
      // TODO: really bad & scary that this causes IW to
      // abort the segment!! We should fix this.
      return null;
    }
  });

  try {
    iw.addDocument(invalidDoc);
    iw.commit();
  } catch (IllegalArgumentException iae) {
    // expected
    assertEquals(iae, iw.getTragicException());
  }
  // Writer should be closed by tragedy
  assertFalse(iw.isOpen());
  dir.close();
}
Example 17
Source File: TestPerFieldPostingsFormat2.java From lucene-solr with Apache License 2.0
@Test
public void testChangeCodecAndMerge() throws IOException {
  Directory dir = newDirectory();
  if (VERBOSE) {
    System.out.println("TEST: make new index");
  }
  IndexWriterConfig iwconf = newIndexWriterConfig(new MockAnalyzer(random()))
      .setOpenMode(OpenMode.CREATE).setCodec(new MockCodec());
  iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);
  //((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10);
  IndexWriter writer = newWriter(dir, iwconf);

  addDocs(writer, 10);
  writer.commit();
  assertQuery(new Term("content", "aaa"), dir, 10);
  if (VERBOSE) {
    System.out.println("TEST: addDocs3");
  }
  addDocs3(writer, 10);
  writer.commit();
  writer.close();

  assertQuery(new Term("content", "ccc"), dir, 10);
  assertQuery(new Term("content", "aaa"), dir, 10);
  Codec codec = iwconf.getCodec();

  iwconf = newIndexWriterConfig(new MockAnalyzer(random()))
      .setOpenMode(OpenMode.APPEND).setCodec(codec);
  //((LogMergePolicy) iwconf.getMergePolicy()).setNoCFSRatio(0.0);
  //((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10);
  iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);

  iwconf.setCodec(new MockCodec()); // uses standard for field content
  writer = newWriter(dir, iwconf);
  // swap in new codec for currently written segments
  if (VERBOSE) {
    System.out.println("TEST: add docs w/ Standard codec for content field");
  }
  addDocs2(writer, 10);
  writer.commit();
  codec = iwconf.getCodec();
  assertEquals(30, writer.getDocStats().maxDoc);
  assertQuery(new Term("content", "bbb"), dir, 10);
  assertQuery(new Term("content", "ccc"), dir, 10);
  //// assertQuery(new Term("content", "aaa"), dir, 10);

  if (VERBOSE) {
    System.out.println("TEST: add more docs w/ new codec");
  }
  addDocs2(writer, 10);
  writer.commit();
  assertQuery(new Term("content", "ccc"), dir, 10);
  assertQuery(new Term("content", "bbb"), dir, 20);
  assertQuery(new Term("content", "aaa"), dir, 10);
  assertEquals(40, writer.getDocStats().maxDoc);

  if (VERBOSE) {
    System.out.println("TEST: now optimize");
  }
  writer.forceMerge(1);
  assertEquals(40, writer.getDocStats().maxDoc);
  writer.close();
  assertQuery(new Term("content", "ccc"), dir, 10);
  assertQuery(new Term("content", "bbb"), dir, 20);
  assertQuery(new Term("content", "aaa"), dir, 10);
  dir.close();
}
Example 18
Source File: MtasDocumentIndex.java From inception with Apache License 2.0
private synchronized IndexWriter getIndexWriter() throws IOException {
  if (_indexWriter == null) {
    log.debug("Opening index for project [{}]({})", project.getName(), project.getId());

    OPEN_INDEXES.put(project.getId(), this);

    // Initialize and populate the hash maps for the layers and features
    features = schemaService.listAnnotationFeature(project).stream()
        .filter(feat -> feat.getLayer().isEnabled())
        .filter(feat -> feat.isEnabled())
        .collect(Collectors.toList());

    // Add the project id to the configuration
    JSONObject jsonParserConfiguration = new JSONObject();
    jsonParserConfiguration.put(PARAM_PROJECT_ID, project.getId());

    // Tokenizer parameters
    Map<String, String> tokenizerArguments = new HashMap<>();
    tokenizerArguments.put(ARGUMENT_PARSER, MtasUimaParser.class.getName());
    tokenizerArguments.put(ARGUMENT_PARSER_ARGS, jsonParserConfiguration.toString());

    // Build analyzer
    Analyzer mtasAnalyzer = CustomAnalyzer.builder()
        .withTokenizer(MtasTokenizerFactory.class, tokenizerArguments)
        .build();

    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put(FIELD_CONTENT, mtasAnalyzer);

    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(),
        analyzerPerField);

    // Build IndexWriter
    FileUtils.forceMkdir(getIndexDir());
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setCodec(Codec.forName(MTAS_CODEC_NAME));
    IndexWriter indexWriter = new IndexWriter(FSDirectory.open(getIndexDir().toPath()), config);

    // Initialize the index
    indexWriter.commit();

    // After the index has been initialized, assign the _indexWriter - this is also used
    // by isOpen() to check if the index writer is available.
    _indexWriter = indexWriter;
  }
  return _indexWriter;
}
Example 19
Source File: Blur022CodecTest.java From incubator-retired-blur with Apache License 2.0
@Test
public void testSmallDocs() throws IOException {
  RAMDirectory directory = new RAMDirectory();
  IndexWriterConfig conf1 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
  conf1.setCodec(new Blur022Codec());
  Random random1 = new Random(1);
  IndexWriter writer1 = new IndexWriter(directory, conf1);
  for (int i = 0; i < 1000; i++) {
    writer1.addDocument(getSmallDoc(random1));
  }
  writer1.close();

  DirectoryReader reader1 = DirectoryReader.open(directory);
  int numDocs1 = reader1.numDocs();
  assertEquals(1000, numDocs1);

  // for (int i = 0; i < numDocs1; i++) {
  //   System.out.println(reader1.document(i));
  // }

  IndexWriterConfig conf2 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
  conf2.setCodec(new Blur022Codec(1 << 16, CompressionMode.HIGH_COMPRESSION));
  Random random2 = new Random(1);
  IndexWriter writer2 = new IndexWriter(directory, conf2);
  for (int i = 0; i < 1000; i++) {
    writer2.addDocument(getSmallDoc(random2));
  }
  writer2.close();

  DirectoryReader reader2 = DirectoryReader.open(directory);
  int numDocs2 = reader2.numDocs();
  assertEquals(2000, numDocs2);

  for (int i = 0; i < 2; i++) {
    long t1 = System.nanoTime();
    long hash1 = 0;
    long hash2 = 0;
    for (int d = 0; d < 1000; d++) {
      Document document1 = reader1.document(d);
      hash1 += document1.hashCode();
    }
    long t2 = System.nanoTime();
    for (int d = 0; d < 1000; d++) {
      Document document2 = reader2.document(d + 1000);
      hash2 += document2.hashCode();
    }
    long t3 = System.nanoTime();
    System.out.println((t3 - t2) / 1000000.0);
    System.out.println((t2 - t1) / 1000000.0);
    System.out.println("doc1 " + hash1);
    System.out.println("doc2 " + hash2);
  }

  // for (int i = 0; i < numDocs2; i++) {
  //   System.out.println(reader2.document(i));
  // }

  // long fileLength = directory.fileLength("_0.fdt");

  for (String name : directory.listAll()) {
    if (name.endsWith(".fdt")) {
      System.out.println(name);
      System.out.println(directory.fileLength(name));
    }
  }
}
Example 20
Source File: Blur024CodecTest.java From incubator-retired-blur with Apache License 2.0
@Test
public void testSmallDocs() throws IOException {
  RAMDirectory directory = new RAMDirectory();
  IndexWriterConfig conf1 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
  conf1.setCodec(new Blur024Codec());
  Random random1 = new Random(1);
  IndexWriter writer1 = new IndexWriter(directory, conf1);
  for (int i = 0; i < 1000; i++) {
    writer1.addDocument(getSmallDoc(random1));
  }
  writer1.close();

  DirectoryReader reader1 = DirectoryReader.open(directory);
  int numDocs1 = reader1.numDocs();
  assertEquals(1000, numDocs1);

  // for (int i = 0; i < numDocs1; i++) {
  //   System.out.println(reader1.document(i));
  // }

  IndexWriterConfig conf2 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
  conf2.setCodec(new Blur024Codec(1 << 16, CompressionMode.HIGH_COMPRESSION));
  Random random2 = new Random(1);
  IndexWriter writer2 = new IndexWriter(directory, conf2);
  for (int i = 0; i < 1000; i++) {
    writer2.addDocument(getSmallDoc(random2));
  }
  writer2.close();

  DirectoryReader reader2 = DirectoryReader.open(directory);
  int numDocs2 = reader2.numDocs();
  assertEquals(2000, numDocs2);

  for (int i = 0; i < 2; i++) {
    long t1 = System.nanoTime();
    long hash1 = 0;
    long hash2 = 0;
    for (int d = 0; d < 1000; d++) {
      Document document1 = reader1.document(d);
      hash1 += document1.hashCode();
    }
    long t2 = System.nanoTime();
    for (int d = 0; d < 1000; d++) {
      Document document2 = reader2.document(d + 1000);
      hash2 += document2.hashCode();
    }
    long t3 = System.nanoTime();
    System.out.println((t3 - t2) / 1000000.0);
    System.out.println((t2 - t1) / 1000000.0);
    System.out.println("doc1 " + hash1);
    System.out.println("doc2 " + hash2);
  }

  // for (int i = 0; i < numDocs2; i++) {
  //   System.out.println(reader2.document(i));
  // }

  // long fileLength = directory.fileLength("_0.fdt");

  for (String name : directory.listAll()) {
    if (name.endsWith(".fdt")) {
      System.out.println(name);
      System.out.println(directory.fileLength(name));
    }
  }
}