org.apache.lucene.search.similarities.ClassicSimilarity Java Examples
The following examples show how to use
org.apache.lucene.search.similarities.ClassicSimilarity.
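Most of the examples below share one basic pattern: set ClassicSimilarity (Lucene's classic TF-IDF scoring) on the IndexWriterConfig at index time and on the IndexSearcher at query time, so norms and query scores are computed consistently. The following minimal sketch illustrates that pattern only; the index path, field name, and sample text are illustrative placeholders and are not taken from any of the projects listed.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ClassicSimilarityBasics {
  public static void main(String[] args) throws Exception {
    // Illustrative index location; any Directory implementation works.
    Directory dir = FSDirectory.open(Paths.get("/tmp/classic-sim-index"));

    // Index with ClassicSimilarity so norms are encoded the TF-IDF way.
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer())
        .setSimilarity(new ClassicSimilarity());
    try (IndexWriter writer = new IndexWriter(dir, iwc)) {
      Document doc = new Document();
      doc.add(new TextField("body", "lucene is a search engine library", Field.Store.YES));
      writer.addDocument(doc);
    }

    // Search with the same similarity so query-time scoring matches the index-time norms.
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      IndexSearcher searcher = new IndexSearcher(reader);
      searcher.setSimilarity(new ClassicSimilarity());
      TopDocs hits = searcher.search(new TermQuery(new Term("body", "lucene")), 10);
      System.out.println("hits: " + hits.totalHits);
    }
  }
}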
Example #1
Source File: TestBooleanQueryVisitSubscorers.java From lucene-solr with Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  analyzer = new MockAnalyzer(random());
  dir = newDirectory();
  IndexWriterConfig config = newIndexWriterConfig(analyzer);
  config.setMergePolicy(newLogMergePolicy()); // we will use docids to validate
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);
  writer.addDocument(doc("lucene", "lucene is a very popular search engine library"));
  writer.addDocument(doc("solr", "solr is a very popular search server and is using lucene"));
  writer.addDocument(doc("nutch", "nutch is an internet search engine with web crawler and is using lucene and hadoop"));
  reader = writer.getReader();
  writer.close();
  // we do not use newSearcher because the assertingXXX layers break
  // the toString representations we are relying on
  // TODO: clean that up
  searcher = new IndexSearcher(reader);
  searcher.setSimilarity(new ClassicSimilarity());
  scorerSearcher = new ScorerIndexSearcher(reader);
  scorerSearcher.setSimilarity(new CountingSimilarity());
}
Example #2
Source File: SORecommender.java From scava with Eclipse Public License 2.0
public TopDocs executeQuery(org.apache.lucene.search.Query query) throws IOException, ParseException {
  Directory indexDir = FSDirectory.open(Paths.get(INDEX_DIRECTORY));
  try {
    IndexReader reader = DirectoryReader.open(indexDir);
    IndexSearcher searcher = new IndexSearcher(reader);
    if (isBm25 == false) {
      ClassicSimilarity CS = new ClassicSimilarity();
      searcher.setSimilarity(CS);
    }
    TopDocs docs = searcher.search(query, hitsPerPage);
    return docs;
  } catch (Exception e) {
    logger.error(e.getMessage());
    return null;
  }
}
Example #3
Source File: TestTermScorer.java From lucene-solr with Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
      newIndexWriterConfig(new MockAnalyzer(random()))
          .setMergePolicy(newLogMergePolicy())
          .setSimilarity(new ClassicSimilarity()));
  for (int i = 0; i < values.length; i++) {
    Document doc = new Document();
    doc.add(newTextField(FIELD, values[i], Field.Store.YES));
    writer.addDocument(doc);
  }
  writer.forceMerge(1);
  indexReader = getOnlyLeafReader(writer.getReader());
  writer.close();
  indexSearcher = newSearcher(indexReader, false);
  indexSearcher.setSimilarity(new ClassicSimilarity());
}
Example #4
Source File: TestSimilarities.java From lucene-solr with Apache License 2.0
public void testNonStandardSimilarity() throws Exception {
  try (Monitor monitor = newMonitor()) {
    monitor.register(new MonitorQuery("1", MonitorTestBase.parse("test")));

    Similarity similarity = new ClassicSimilarity() {
      @Override
      public float tf(float freq) {
        return 1000f;
      }
    };

    Document doc = new Document();
    doc.add(newTextField("field", "this is a test", Field.Store.NO));

    MatchingQueries<ScoringMatch> standard = monitor.match(doc, ScoringMatch.matchWithSimilarity(new ClassicSimilarity()));
    MatchingQueries<ScoringMatch> withSim = monitor.match(doc, ScoringMatch.matchWithSimilarity(similarity));

    float standScore = standard.getMatches().iterator().next().getScore();
    float simScore = withSim.getMatches().iterator().next().getScore();
    assertEquals(standScore, simScore / 1000, 0.1f);
  }
}
Example #5
Source File: TestValueSources.java From lucene-solr with Apache License 2.0
public void testNorm() throws Exception {
  Similarity saved = searcher.getSimilarity();
  try {
    // no norm field (so agnostic to indexed similarity)
    searcher.setSimilarity(new ClassicSimilarity());
    ValueSource vs = new NormValueSource("byte");
    assertHits(new FunctionQuery(vs), new float[] { 1f, 1f });
    // regardless of whether norms exist, value source exists == 0
    assertAllExist(vs);
    vs = new NormValueSource("text");
    assertAllExist(vs);
  } finally {
    searcher.setSimilarity(saved);
  }
}
Example #6
Source File: TestValueSources.java From lucene-solr with Apache License 2.0
public void testTF() throws Exception {
  Similarity saved = searcher.getSimilarity();
  try {
    // no norm field (so agnostic to indexed similarity)
    searcher.setSimilarity(new ClassicSimilarity());

    ValueSource vs = new TFValueSource("bogus", "bogus", "text", new BytesRef("test"));
    assertHits(new FunctionQuery(vs),
        new float[] { (float) Math.sqrt(3d), (float) Math.sqrt(1d) });
    assertAllExist(vs);

    vs = new TFValueSource("bogus", "bogus", "string", new BytesRef("bar"));
    assertHits(new FunctionQuery(vs), new float[] { 0f, 1f });
    assertAllExist(vs);

    // regardless of whether norms exist, value source exists == 0
    vs = new TFValueSource("bogus", "bogus", "bogus", new BytesRef("bogus"));
    assertHits(new FunctionQuery(vs), new float[] { 0F, 0F });
    assertAllExist(vs);
  } finally {
    searcher.setSimilarity(saved);
  }
}
Example #7
Source File: TestPayloadSpanUtil.java From lucene-solr with Apache License 2.0
public void testPayloadSpanUtil() throws Exception {
  Directory directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
      newIndexWriterConfig(new PayloadAnalyzer()).setSimilarity(new ClassicSimilarity()));
  Document doc = new Document();
  doc.add(newTextField(FIELD, "xx rr yy mm pp", Field.Store.YES));
  writer.addDocument(doc);
  IndexReader reader = writer.getReader();
  writer.close();
  IndexSearcher searcher = newSearcher(reader);

  PayloadSpanUtil psu = new PayloadSpanUtil(searcher.getTopReaderContext());

  Collection<byte[]> payloads = psu.getPayloadsForQuery(new TermQuery(new Term(FIELD, "rr")));
  if (VERBOSE) {
    System.out.println("Num payloads:" + payloads.size());
    for (final byte[] bytes : payloads) {
      System.out.println(new String(bytes, StandardCharsets.UTF_8));
    }
  }
  reader.close();
  directory.close();
}
Example #8
Source File: TestTaxonomyFacetCounts.java From lucene-solr with Apache License 2.0
public void testReallyNoNormsForDrillDown() throws Exception {
  Directory dir = newDirectory();
  Directory taxoDir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc.setSimilarity(new PerFieldSimilarityWrapper() {
    final Similarity sim = new ClassicSimilarity();

    @Override
    public Similarity get(String name) {
      assertEquals("field", name);
      return sim;
    }
  });
  TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE);
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
  FacetsConfig config = new FacetsConfig();

  Document doc = new Document();
  doc.add(newTextField("field", "text", Field.Store.NO));
  doc.add(new FacetField("a", "path"));
  writer.addDocument(config.build(taxoWriter, doc));
  writer.close();
  IOUtils.close(taxoWriter, dir, taxoDir);
}
Example #9
Source File: TestSweetSpotSimilarityFactory.java From lucene-solr with Apache License 2.0
/** default parameters */
public void testDefaults() throws Exception {
  SweetSpotSimilarity sim = getSimilarity("text", SweetSpotSimilarity.class);

  // SSS tf w/defaults should behave just like DS
  ClassicSimilarity d = new ClassicSimilarity();
  for (int i = 0; i <= 1000; i++) {
    assertEquals("tf: i=" + i, d.tf(i), sim.tf(i), 0.0F);
  }

  // default norm sanity check
  assertEquals("norm 1", 1.00F, computeNorm(sim, 1), 0.0F);
  assertEquals("norm 4", 0.50F, computeNorm(sim, 4), 0.0F);
  assertEquals("norm 16", 0.25F, computeNorm(sim, 16), 0.0F);
}
Example #10
Source File: TestQueryRescorer.java From lucene-solr with Apache License 2.0
private IndexSearcher getSearcher(IndexReader r) {
  IndexSearcher searcher = newSearcher(r);

  // We rely on more tokens = lower score:
  searcher.setSimilarity(new ClassicSimilarity());

  return searcher;
}
Example #11
Source File: TestBooleanQuery.java From lucene-solr with Apache License 2.0
public void testNullOrSubScorer() throws Throwable {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  doc.add(newTextField("field", "a b c d", Field.Store.NO));
  w.addDocument(doc);

  IndexReader r = w.getReader();
  IndexSearcher s = newSearcher(r);
  // this test relies upon coord being the default implementation,
  // otherwise scores are different!
  s.setSimilarity(new ClassicSimilarity());

  BooleanQuery.Builder q = new BooleanQuery.Builder();
  q.add(new TermQuery(new Term("field", "a")), BooleanClause.Occur.SHOULD);

  // PhraseQuery w/ no terms added returns a null scorer
  PhraseQuery pq = new PhraseQuery("field", new String[0]);
  q.add(pq, BooleanClause.Occur.SHOULD);
  assertEquals(1, s.search(q.build(), 10).totalHits.value);

  // A required clause which returns null scorer should return null scorer to
  // IndexSearcher.
  q = new BooleanQuery.Builder();
  pq = new PhraseQuery("field", new String[0]);
  q.add(new TermQuery(new Term("field", "a")), BooleanClause.Occur.SHOULD);
  q.add(pq, BooleanClause.Occur.MUST);
  assertEquals(0, s.search(q.build(), 10).totalHits.value);

  DisjunctionMaxQuery dmq = new DisjunctionMaxQuery(
      Arrays.asList(new TermQuery(new Term("field", "a")), pq),
      1.0f);
  assertEquals(1, s.search(dmq, 10).totalHits.value);

  r.close();
  w.close();
  dir.close();
}
Example #12
Source File: TestPhraseQuery.java From lucene-solr with Apache License 2.0
public void testSlopScoring() throws IOException {
  Directory directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
      newIndexWriterConfig(new MockAnalyzer(random()))
          .setMergePolicy(newLogMergePolicy())
          .setSimilarity(new BM25Similarity()));

  Document doc = new Document();
  doc.add(newTextField("field", "foo firstname lastname foo", Field.Store.YES));
  writer.addDocument(doc);

  Document doc2 = new Document();
  doc2.add(newTextField("field", "foo firstname zzz lastname foo", Field.Store.YES));
  writer.addDocument(doc2);

  Document doc3 = new Document();
  doc3.add(newTextField("field", "foo firstname zzz yyy lastname foo", Field.Store.YES));
  writer.addDocument(doc3);

  IndexReader reader = writer.getReader();
  writer.close();

  IndexSearcher searcher = newSearcher(reader);
  searcher.setSimilarity(new ClassicSimilarity());
  PhraseQuery query = new PhraseQuery(Integer.MAX_VALUE, "field", "firstname", "lastname");
  ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
  assertEquals(3, hits.length);
  // Make sure that those matches where the terms appear closer to
  // each other get a higher score:
  assertEquals(1.0, hits[0].score, 0.01);
  assertEquals(0, hits[0].doc);
  assertEquals(0.63, hits[1].score, 0.01);
  assertEquals(1, hits[1].doc);
  assertEquals(0.47, hits[2].score, 0.01);
  assertEquals(2, hits[2].doc);
  QueryUtils.check(random(), query, searcher);
  reader.close();
  directory.close();
}
Example #13
Source File: TestSweetSpotSimilarityFactory.java From lucene-solr with Apache License 2.0
/** baseline with parameters */
public void testBaselineParameters() throws Exception {
  SweetSpotSimilarity sim = getSimilarity("text_baseline", SweetSpotSimilarity.class);

  ClassicSimilarity d = new ClassicSimilarity();

  // constant up to 6
  for (int i = 1; i <= 6; i++) {
    assertEquals("tf i=" + i, 1.5F, sim.tf(i), 0.0F);
  }
  // less than the default sim above 6
  for (int i = 6; i <= 1000; i++) {
    assertTrue("tf: i=" + i + " : s=" + sim.tf(i) + " < d=" + d.tf(i), sim.tf(i) < d.tf(i));
  }

  // norms: plateau from 3-5
  assertEquals("norm 1 == 7", computeNorm(sim, 1), computeNorm(sim, 7), 0.0F);
  assertEquals("norm 2 == 6", computeNorm(sim, 2), computeNorm(sim, 6), 0.0F);
  assertEquals("norm 3", 1.00F, computeNorm(sim, 3), 0.0F);
  assertEquals("norm 4", 1.00F, computeNorm(sim, 4), 0.0F);
  assertEquals("norm 5", 1.00F, computeNorm(sim, 5), 0.0F);
  assertTrue("norm 6 too high: " + computeNorm(sim, 6), computeNorm(sim, 6) < 1.0F);
  assertTrue("norm 7 higher than norm 6", computeNorm(sim, 7) < computeNorm(sim, 6));
  assertEquals("norm 20", 0.25F, computeNorm(sim, 20), 0.0F);
}
Example #14
Source File: LtrQueryTests.java From elasticsearch-learning-to-rank with Apache License 2.0
@Before
public void setupIndex() throws IOException {
  dirUnderTest = newDirectory();
  List<Similarity> sims = Arrays.asList(
      new ClassicSimilarity(),
      new SweetSpotSimilarity(), // extends Classic
      new BM25Similarity(),
      new LMDirichletSimilarity(),
      new BooleanSimilarity(),
      new LMJelinekMercerSimilarity(0.2F),
      new AxiomaticF3LOG(0.5F, 10),
      new DFISimilarity(new IndependenceChiSquared()),
      new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()),
      new IBSimilarity(new DistributionLL(), new LambdaDF(), new NormalizationH3())
  );
  similarity = sims.get(random().nextInt(sims.size()));

  indexWriterUnderTest = new RandomIndexWriter(random(), dirUnderTest,
      newIndexWriterConfig().setSimilarity(similarity));
  for (int i = 0; i < docs.length; i++) {
    Document doc = new Document();
    doc.add(newStringField("id", "" + i, Field.Store.YES));
    doc.add(newField("field", docs[i], Store.YES));
    indexWriterUnderTest.addDocument(doc);
  }
  indexWriterUnderTest.commit();
  indexWriterUnderTest.forceMerge(1);
  indexWriterUnderTest.flush();

  indexReaderUnderTest = indexWriterUnderTest.getReader();
  searcherUnderTest = newSearcher(indexReaderUnderTest);
  searcherUnderTest.setSimilarity(similarity);
}
Example #15
Source File: LuceneTermQueryBuilderTest.java From querqy with Apache License 2.0
@Test
public void testThatQueryUsesTermButNoFieldBoost() throws Exception {
  Analyzer analyzer = new StandardAnalyzer();

  Directory directory = new ByteBuffersDirectory();
  IndexWriterConfig config = new IndexWriterConfig(analyzer);
  config.setSimilarity(new ClassicSimilarity());
  IndexWriter indexWriter = new IndexWriter(directory, config);

  TestUtil.addNumDocsWithTextField("f1", "v1 v1", indexWriter, 4);
  TestUtil.addNumDocsWithTextField("f1", "v2", indexWriter, 1);
  indexWriter.close();

  IndexReader indexReader = DirectoryReader.open(directory);
  IndexSearcher indexSearcher = new IndexSearcher(indexReader);
  indexSearcher.setSimilarity(new ClassicSimilarity());

  final TermQuery termQuery = new LuceneTermQueryBuilder()
      .createTermQuery(new Term("f1", "v1"), new ConstantFieldBoost(3f));
  final Term term = termQuery.getTerm();
  assertEquals("f1", term.field());
  assertEquals("v1", term.text());

  TopDocs topDocs = indexSearcher.search(termQuery, 10);
  final Weight weight = termQuery.createWeight(indexSearcher, ScoreMode.COMPLETE, 4.5f);
  final Explanation explain = weight.explain(indexReader.getContext().leaves().get(0), topDocs.scoreDocs[0].doc);

  String explainText = explain.toString();
  assertTrue(explainText.contains("4.5 = boost"));  // 4.5 (query) but ignore field boost
  assertTrue(explainText.contains("4 = docFreq"));  // 4 * v1
  assertTrue(explainText.contains("2.0 = freq"));   // 2 * v1 in field
}
Example #16
Source File: DocFreq.java From lumongo with Apache License 2.0
public DocFreq(IndexReader indexReader, String field) {
  this.indexReader = indexReader;
  this.field = field;
  this.docFreqMap = new HashMap<>();
  this.similarity = new ClassicSimilarity();
  this.numDocs = indexReader.numDocs();
}
Example #17
Source File: LumongoSegment.java From lumongo with Apache License 2.0
private PerFieldSimilarityWrapper getSimilarity(final QueryWithFilters queryWithFilters) {
  return new PerFieldSimilarityWrapper() {
    @Override
    public Similarity get(String name) {
      AnalyzerSettings analyzerSettings = indexConfig.getAnalyzerSettingsForIndexField(name);
      AnalyzerSettings.Similarity similarity = AnalyzerSettings.Similarity.BM25;
      if (analyzerSettings != null) {
        similarity = analyzerSettings.getSimilarity();
      }

      AnalyzerSettings.Similarity fieldSimilarityOverride = queryWithFilters.getFieldSimilarityOverride(name);
      if (fieldSimilarityOverride != null) {
        similarity = fieldSimilarityOverride;
      }

      if (AnalyzerSettings.Similarity.TFIDF.equals(similarity)) {
        return new ClassicSimilarity();
      } else if (AnalyzerSettings.Similarity.BM25.equals(similarity)) {
        return new BM25Similarity();
      } else if (AnalyzerSettings.Similarity.CONSTANT.equals(similarity)) {
        return new ConstantSimilarity();
      } else if (AnalyzerSettings.Similarity.TF.equals(similarity)) {
        return new TFSimilarity();
      } else {
        throw new RuntimeException("Unknown similarity type <" + similarity + ">");
      }
    }
  };
}
Example #18
Source File: TestFuzzyQuery.java From lucene-solr with Apache License 2.0
public void testSingleQueryExactMatchScoresHighest() throws Exception {
  // See issue LUCENE-329 - IDF shouldn't wreck similarity ranking
  Directory directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
  addDoc("smith", writer);
  addDoc("smith", writer);
  addDoc("smith", writer);
  addDoc("smith", writer);
  addDoc("smith", writer);
  addDoc("smith", writer);
  addDoc("smythe", writer);
  addDoc("smdssasd", writer);

  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  searcher.setSimilarity(new ClassicSimilarity()); // avoid randomisation of similarity algo by test framework
  writer.close();

  String searchTerms[] = { "smith", "smythe", "smdssasd" };
  for (String searchTerm : searchTerms) {
    FuzzyQuery query = new FuzzyQuery(new Term("field", searchTerm), 2, 1);
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
    Document bestDoc = searcher.doc(hits[0].doc);
    assertTrue(hits.length > 0);
    String topMatch = bestDoc.get("field");
    assertEquals(searchTerm, topMatch);
    if (hits.length > 1) {
      Document worstDoc = searcher.doc(hits[hits.length - 1].doc);
      String worstMatch = worstDoc.get("field");
      assertNotSame(searchTerm, worstMatch);
    }
  }
  reader.close();
  directory.close();
}
Example #19
Source File: TestMemoryIndex.java From lucene-solr with Apache License 2.0
@Test
public void testFreezeAPI() {
  MemoryIndex mi = new MemoryIndex();
  mi.addField("f1", "some text", analyzer);

  assertThat(mi.search(new MatchAllDocsQuery()), not(is(0.0f)));
  assertThat(mi.search(new TermQuery(new Term("f1", "some"))), not(is(0.0f)));

  // check we can add a new field after searching
  mi.addField("f2", "some more text", analyzer);
  assertThat(mi.search(new TermQuery(new Term("f2", "some"))), not(is(0.0f)));

  // freeze!
  mi.freeze();

  RuntimeException expected = expectThrows(RuntimeException.class, () -> {
    mi.addField("f3", "and yet more", analyzer);
  });
  assertThat(expected.getMessage(), containsString("frozen"));

  expected = expectThrows(RuntimeException.class, () -> {
    mi.setSimilarity(new BM25Similarity(1, 1));
  });
  assertThat(expected.getMessage(), containsString("frozen"));

  assertThat(mi.search(new TermQuery(new Term("f1", "some"))), not(is(0.0f)));

  mi.reset();
  mi.addField("f1", "wibble", analyzer);
  assertThat(mi.search(new TermQuery(new Term("f1", "some"))), is(0.0f));
  assertThat(mi.search(new TermQuery(new Term("f1", "wibble"))), not(is(0.0f)));

  // check we can set the Similarity again
  mi.setSimilarity(new ClassicSimilarity());
}
Example #20
Source File: SearchImpl.java From lucene-solr with Apache License 2.0
private Similarity createSimilarity(SimilarityConfig config) {
  Similarity similarity;

  if (config.isUseClassicSimilarity()) {
    ClassicSimilarity tfidf = new ClassicSimilarity();
    tfidf.setDiscountOverlaps(config.isDiscountOverlaps());
    similarity = tfidf;
  } else {
    BM25Similarity bm25 = new BM25Similarity(config.getK1(), config.getB());
    bm25.setDiscountOverlaps(config.isDiscountOverlaps());
    similarity = bm25;
  }

  return similarity;
}
Example #21
Source File: TestValueSources.java From lucene-solr with Apache License 2.0
public void testIDF() throws Exception {
  Similarity saved = searcher.getSimilarity();
  try {
    searcher.setSimilarity(new ClassicSimilarity());
    ValueSource vs = new IDFValueSource("bogus", "bogus", "text", new BytesRef("test"));
    assertHits(new FunctionQuery(vs), new float[] { 1.0f, 1.0f });
    assertAllExist(vs);
  } finally {
    searcher.setSimilarity(saved);
  }
}
Example #22
Source File: TestPayloadScoreQuery.java From lucene-solr with Apache License 2.0
@Test
public void testNestedNearQuery() throws Exception {
  // (one OR hundred) NEAR (twenty two) ~ 1
  //  payloads: 2 4 4 4
  //  one hundred twenty two
  //  two hundred twenty two

  SpanNearQuery q = new SpanNearQuery(new SpanQuery[]{
      new SpanOrQuery(new SpanTermQuery(new Term("field", "one")),
          new SpanTermQuery(new Term("field", "hundred"))),
      new SpanNearQuery(new SpanQuery[]{
          new SpanTermQuery(new Term("field", "twenty")),
          new SpanTermQuery(new Term("field", "two"))
      }, 0, true)
  }, 1, true);

  // check includeSpanScore makes a difference here
  searcher.setSimilarity(new ClassicSimilarity());
  try {
    checkQuery(q, new MaxPayloadFunction(), new int[]{ 122, 222 },
        new float[]{ 20.901256561279297f, 17.06580352783203f });
    checkQuery(q, new MinPayloadFunction(), new int[]{ 222, 122 },
        new float[]{ 17.06580352783203f, 10.450628280639648f });
    checkQuery(q, new AveragePayloadFunction(), new int[]{ 122, 222 },
        new float[]{ 19.15948486328125f, 17.06580352783203f });
    checkQuery(q, new MaxPayloadFunction(), false, new int[]{ 122, 222 }, new float[]{ 4.0f, 4.0f });
    checkQuery(q, new MinPayloadFunction(), false, new int[]{ 222, 122 }, new float[]{ 4.0f, 2.0f });
    checkQuery(q, new AveragePayloadFunction(), false, new int[]{ 222, 122 }, new float[]{ 4.0f, 3.666666f });
  } finally {
    searcher.setSimilarity(similarity);
  }
}
Example #23
Source File: TestMinShouldMatch2.java From lucene-solr with Apache License 2.0
@BeforeClass
public static void beforeClass() throws Exception {
  dir = newDirectory();
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
  final int numDocs = atLeast(300);
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();

    addSome(doc, alwaysTerms);

    if (random().nextInt(100) < 90) {
      addSome(doc, commonTerms);
    }
    if (random().nextInt(100) < 50) {
      addSome(doc, mediumTerms);
    }
    if (random().nextInt(100) < 10) {
      addSome(doc, rareTerms);
    }
    iw.addDocument(doc);
  }
  iw.forceMerge(1);
  iw.close();
  r = DirectoryReader.open(dir);
  reader = getOnlyLeafReader(r);
  searcher = new IndexSearcher(reader);
  searcher.setSimilarity(new ClassicSimilarity());
}
Example #24
Source File: TestElevationComparator.java From lucene-solr with Apache License 2.0
public void testSorting() throws Throwable {
  Directory directory = newDirectory();
  IndexWriter writer = new IndexWriter(
      directory,
      newIndexWriterConfig(new MockAnalyzer(random())).
          setMaxBufferedDocs(2).
          setMergePolicy(newLogMergePolicy(1000)).
          setSimilarity(new ClassicSimilarity())
  );
  writer.addDocument(adoc(new String[] {"id", "a", "title", "ipod", "str_s", "a"}));
  writer.addDocument(adoc(new String[] {"id", "b", "title", "ipod ipod", "str_s", "b"}));
  writer.addDocument(adoc(new String[] {"id", "c", "title", "ipod ipod ipod", "str_s", "c"}));
  writer.addDocument(adoc(new String[] {"id", "x", "title", "boosted", "str_s", "x"}));
  writer.addDocument(adoc(new String[] {"id", "y", "title", "boosted boosted", "str_s", "y"}));
  writer.addDocument(adoc(new String[] {"id", "z", "title", "boosted boosted boosted", "str_s", "z"}));

  IndexReader r = DirectoryReader.open(writer);
  writer.close();

  IndexSearcher searcher = newSearcher(r);
  searcher.setSimilarity(new BM25Similarity());

  runTest(searcher, true);
  runTest(searcher, false);

  r.close();
  directory.close();
}
Example #25
Source File: TestQueryRescorer.java From lucene-solr with Apache License 2.0
public static IndexWriterConfig newIndexWriterConfig() {
  // We rely on more tokens = lower score:
  return LuceneTestCase.newIndexWriterConfig().setSimilarity(new ClassicSimilarity());
}
Example #26
Source File: SweetSpotSimilarityTest.java From lucene-solr with Apache License 2.0
public void testSweetSpotTf() {
  SweetSpotSimilarity ss = new SweetSpotSimilarity();

  TFIDFSimilarity d = new ClassicSimilarity();
  TFIDFSimilarity s = ss;

  // tf equal
  ss.setBaselineTfFactors(0.0f, 0.0f);
  for (int i = 1; i < 1000; i++) {
    assertEquals("tf: i=" + i, d.tf(i), s.tf(i), 0.0f);
  }

  // tf higher
  ss.setBaselineTfFactors(1.0f, 0.0f);
  for (int i = 1; i < 1000; i++) {
    assertTrue("tf: i=" + i + " : d=" + d.tf(i) + " < s=" + s.tf(i), d.tf(i) < s.tf(i));
  }

  // tf flat
  ss.setBaselineTfFactors(1.0f, 6.0f);
  for (int i = 1; i <= 6; i++) {
    assertEquals("tf flat1: i=" + i, 1.0f, s.tf(i), 0.0f);
  }
  ss.setBaselineTfFactors(2.0f, 6.0f);
  for (int i = 1; i <= 6; i++) {
    assertEquals("tf flat2: i=" + i, 2.0f, s.tf(i), 0.0f);
  }
  for (int i = 6; i <= 1000; i++) {
    assertTrue("tf: i=" + i + " : s=" + s.tf(i) + " < d=" + d.tf(i), s.tf(i) < d.tf(i));
  }

  // stupidity
  assertEquals("tf zero", 0.0f, s.tf(0), 0.0f);
}
Example #27
Source File: ClassicSimilarityFactory.java From lucene-solr with Apache License 2.0
@Override
public Similarity getSimilarity() {
  ClassicSimilarity sim = new ClassicSimilarity();
  sim.setDiscountOverlaps(discountOverlaps);
  return sim;
}
Example #28
Source File: TestBoolean2.java From lucene-solr with Apache License 2.0
@Test
public void testRandomQueries() throws Exception {
  String[] vals = {"w1", "w2", "w3", "w4", "w5", "xx", "yy", "zzz"};

  int tot = 0;

  BooleanQuery q1 = null;
  try {
    // increase number of iterations for more complete testing
    int num = atLeast(3);
    for (int i = 0; i < num; i++) {
      int level = random().nextInt(3);
      q1 = randBoolQuery(new Random(random().nextLong()), random().nextBoolean(), level, field, vals, null).build();

      // Can't sort by relevance since floating point numbers may not quite
      // match up.
      Sort sort = Sort.INDEXORDER;

      QueryUtils.check(random(), q1, searcher); // baseline sim
      try {
        // a little hackish, QueryUtils.check is too costly to do on bigSearcher in this loop.
        searcher.setSimilarity(bigSearcher.getSimilarity()); // random sim
        QueryUtils.check(random(), q1, searcher);
      } finally {
        searcher.setSimilarity(new ClassicSimilarity()); // restore
      }

      // check diff (randomized) scorers (from AssertingSearcher) produce the same results
      TopFieldCollector collector = TopFieldCollector.create(sort, 1000, 1);
      searcher.search(q1, collector);
      ScoreDoc[] hits1 = collector.topDocs().scoreDocs;
      collector = TopFieldCollector.create(sort, 1000, 1);
      searcher.search(q1, collector);
      ScoreDoc[] hits2 = collector.topDocs().scoreDocs;
      tot += hits2.length;
      CheckHits.checkEqual(q1, hits1, hits2);

      BooleanQuery.Builder q3 = new BooleanQuery.Builder();
      q3.add(q1, BooleanClause.Occur.SHOULD);
      q3.add(new PrefixQuery(new Term("field2", "b")), BooleanClause.Occur.SHOULD);
      assertEquals(mulFactor * collector.totalHits + NUM_EXTRA_DOCS / 2, bigSearcher.count(q3.build()));

      // test diff (randomized) scorers produce the same results on bigSearcher as well
      collector = TopFieldCollector.create(sort, 1000 * mulFactor, 1);
      bigSearcher.search(q1, collector);
      hits1 = collector.topDocs().scoreDocs;
      collector = TopFieldCollector.create(sort, 1000 * mulFactor, 1);
      bigSearcher.search(q1, collector);
      hits2 = collector.topDocs().scoreDocs;
      CheckHits.checkEqual(q1, hits1, hits2);
    }
  } catch (Exception e) {
    // For easier debugging
    System.out.println("failed query: " + q1);
    throw e;
  }

  // System.out.println("Total hits:" + tot);
}
Example #29
Source File: TestComplexExplanations.java From lucene-solr with Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  // TODO: switch to BM25?
  searcher.setSimilarity(new ClassicSimilarity());
}
Example #30
Source File: TestClassicSimilarityFactory.java From lucene-solr with Apache License 2.0
/** Classic w/ default parameters */
public void testDefaults() throws Exception {
  ClassicSimilarity sim = getSimilarity("text", ClassicSimilarity.class);
  assertEquals(true, sim.getDiscountOverlaps());
}