org.apache.lucene.search.similarities.BM25Similarity Java Examples

Source File: BM25FQuery.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Creates the query and pre-computes one {@link Term} per (field, term) combination.
 *
 * <p>Note that {@code terms} is stored as-is and then sorted in place, so the array
 * passed by the caller is reordered by this constructor.
 *
 * @param similarity      the similarity stored on this query
 * @param fieldAndWeights the fields (with their weights), keyed by field name
 * @param terms           the terms to combine with every field
 * @throws IndexSearcher.TooManyClauses if the number of (field, term) combinations exceeds
 *         {@link IndexSearcher#getMaxClauseCount()}
 */
private BM25FQuery(BM25Similarity similarity, TreeMap<String, FieldAndWeight> fieldAndWeights, BytesRef[] terms) {
  this.similarity = similarity;
  this.fieldAndWeights = fieldAndWeights;
  this.terms = terms;
  // Multiply in long arithmetic: an int product can silently wrap around and slip
  // past the clause-count check below (then fail later with a confusing array error).
  long numFieldTerms = (long) fieldAndWeights.size() * terms.length;
  if (numFieldTerms > IndexSearcher.getMaxClauseCount()) {
    throw new IndexSearcher.TooManyClauses();
  }
  // Safe narrowing: the value was just checked against an int limit.
  this.fieldTerms = new Term[(int) numFieldTerms];
  Arrays.sort(terms);
  int pos = 0;
  // Fields are visited in the map's key order, terms in sorted order.
  for (String field : fieldAndWeights.keySet()) {
    for (BytesRef term : terms) {
      fieldTerms[pos++] = new Term(field, term);
    }
  }

  this.ramBytesUsed = BASE_RAM_BYTES +
      RamUsageEstimator.sizeOfObject(fieldAndWeights) +
      RamUsageEstimator.sizeOfObject(fieldTerms) +
      RamUsageEstimator.sizeOfObject(terms);
}

Source File: KNearestNeighborClassifier.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Builds a {@link KNearestNeighborClassifier} backed by a {@link MoreLikeThis} instance
 * and an {@link IndexSearcher} over the given reader.
 *
 * @param indexReader    the reader on the index to be used for classification
 * @param similarity     the {@link Similarity} to set on the underlying {@link IndexSearcher}; when {@code null}
 *                       a new {@link org.apache.lucene.search.similarities.BM25Similarity} is used instead
 * @param analyzer       an {@link Analyzer} used to analyze unseen text
 * @param query          a {@link Query} to eventually filter the docs used for training the classifier, or {@code null}
 *                       if all the indexed docs should be used
 * @param k              the no. of docs to select in the MLT results to find the nearest neighbor
 * @param minDocsFreq    {@link MoreLikeThis#minDocFreq} parameter; only applied when greater than zero
 * @param minTermFreq    {@link MoreLikeThis#minTermFreq} parameter; only applied when greater than zero
 * @param classFieldName the name of the field used as the output for the classifier
 * @param textFieldNames the name of the fields used as the inputs for the classifier, they can contain boosting indication e.g. title^10
 */
public KNearestNeighborClassifier(IndexReader indexReader, Similarity similarity, Analyzer analyzer, Query query, int k, int minDocsFreq,
                                  int minTermFreq, String classFieldName, String... textFieldNames) {
  this.textFieldNames = textFieldNames;
  this.classFieldName = classFieldName;

  // "More like this" helper configured with the caller's analyzer and input fields.
  this.mlt = new MoreLikeThis(indexReader);
  this.mlt.setAnalyzer(analyzer);
  this.mlt.setFieldNames(textFieldNames);

  // Searcher scoring with the requested similarity, or BM25 when none was given.
  this.indexSearcher = new IndexSearcher(indexReader);
  this.indexSearcher.setSimilarity(similarity != null ? similarity : new BM25Similarity());

  // Non-positive thresholds leave the MoreLikeThis settings untouched.
  if (minDocsFreq > 0) {
    mlt.setMinDocFreq(minDocsFreq);
  }
  if (minTermFreq > 0) {
    mlt.setMinTermFreq(minTermFreq);
  }

  this.query = query;
  this.k = k;
}

Source File: KNearestNeighborClassifierTest.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Classifies the technology sample with several classifier configurations and checks
 * that BM25 and LM-Dirichlet scoring produce different scores for the same input.
 */
@Test
public void testBasicUsage() throws Exception {
  LeafReader reader = null;
  try {
    MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
    reader = getSampleIndex(mockAnalyzer);

    // k = 1, no frequency thresholds: default similarity (null) and LM-Dirichlet.
    KNearestNeighborClassifier defaultSimClassifier =
        new KNearestNeighborClassifier(reader, null, mockAnalyzer, null, 1, 0, 0, categoryFieldName, textFieldName);
    checkCorrectClassification(defaultSimClassifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);

    KNearestNeighborClassifier lmClassifier =
        new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), mockAnalyzer, null, 1, 0, 0, categoryFieldName, textFieldName);
    checkCorrectClassification(lmClassifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);

    // k = 3 with frequency thresholds: explicit BM25 versus LM-Dirichlet.
    KNearestNeighborClassifier bm25Classifier =
        new KNearestNeighborClassifier(reader, new BM25Similarity(), mockAnalyzer, null, 3, 2, 1, categoryFieldName, textFieldName);
    ClassificationResult<BytesRef> bm25Result =
        checkCorrectClassification(bm25Classifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);

    KNearestNeighborClassifier lmThresholdClassifier =
        new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), mockAnalyzer, null, 3, 2, 1, categoryFieldName, textFieldName);
    ClassificationResult<BytesRef> lmResult =
        checkCorrectClassification(lmThresholdClassifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);

    assertTrue(bm25Result.getScore() != lmResult.getScore());
  } finally {
    // The reader stays null if index creation itself failed.
    if (reader != null) {
      reader.close();
    }
  }
}

Source File: TestPhraseQuery.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Indexes three documents whose "firstname" and "lastname" tokens are separated by
 * zero, one and two other words, then checks that a sloppy phrase query ranks the
 * closer matches higher.
 */
public void testSlopScoring() throws IOException {
  Directory directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
      newIndexWriterConfig(new MockAnalyzer(random()))
        .setMergePolicy(newLogMergePolicy())
        .setSimilarity(new BM25Similarity()));

  // One document per entry, added in this order.
  String[] contents = {
      "foo firstname lastname foo",
      "foo firstname zzz lastname foo",
      "foo firstname zzz yyy lastname foo"
  };
  for (String content : contents) {
    Document doc = new Document();
    doc.add(newTextField("field", content, Field.Store.YES));
    writer.addDocument(doc);
  }

  IndexReader reader = writer.getReader();
  writer.close();

  IndexSearcher searcher = newSearcher(reader);
  searcher.setSimilarity(new ClassicSimilarity());
  PhraseQuery query = new PhraseQuery(Integer.MAX_VALUE, "field", "firstname", "lastname");
  ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
  assertEquals(3, hits.length);

  // Matches whose terms appear closer to each other must score higher,
  // so hit i is expected to be document i with the score listed here.
  double[] expectedScores = {1.0, 0.63, 0.47};
  for (int i = 0; i < expectedScores.length; i++) {
    assertEquals(expectedScores[i], hits[i].score, 0.01);
    assertEquals(i, hits[i].doc);
  }

  QueryUtils.check(random(), query, searcher);
  reader.close();
  directory.close();
}

Source File: LumongoSegment.java From lumongo with Apache License 2.0

5 votes

/**
 * Builds a per-field similarity wrapper. For each field the similarity type starts as BM25,
 * is replaced by the field's analyzer settings when those exist, and is finally replaced by
 * a per-query override when the query supplies one.
 */
private PerFieldSimilarityWrapper getSimilarity(final QueryWithFilters queryWithFilters) {
	return new PerFieldSimilarityWrapper() {
		@Override
		public Similarity get(String name) {
			// Index-level configuration, falling back to BM25 when the field has no settings.
			AnalyzerSettings fieldSettings = indexConfig.getAnalyzerSettingsForIndexField(name);
			AnalyzerSettings.Similarity chosen =
					(fieldSettings != null) ? fieldSettings.getSimilarity() : AnalyzerSettings.Similarity.BM25;

			// A query-level override wins over the index configuration.
			AnalyzerSettings.Similarity override = queryWithFilters.getFieldSimilarityOverride(name);
			if (override != null) {
				chosen = override;
			}

			if (AnalyzerSettings.Similarity.TFIDF.equals(chosen)) {
				return new ClassicSimilarity();
			}
			if (AnalyzerSettings.Similarity.BM25.equals(chosen)) {
				return new BM25Similarity();
			}
			if (AnalyzerSettings.Similarity.CONSTANT.equals(chosen)) {
				return new ConstantSimilarity();
			}
			if (AnalyzerSettings.Similarity.TF.equals(chosen)) {
				return new TFSimilarity();
			}
			throw new RuntimeException("Unknown similarity type <" + chosen + ">");
		}
	};
}

Source File: Lucene.java From uncc2014watsonsim with GNU General Public License v2.0

5 votes

/**
 * Opens an {@link IndexWriter} on the directory at {@code path}, using a
 * {@link StandardAnalyzer} and BM25 similarity.
 *
 * @param path filesystem location of the index
 * @throws IOException if the directory or the writer cannot be opened
 */
public Lucene(Path path) throws IOException {
	Directory indexDirectory = FSDirectory.open(path);

	// StandardAnalyzer is just one choice; other analyzers could be used here instead.
	IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());

	// CREATE_OR_APPEND: per the Lucene docs this opens an existing index for appending
	// and only creates a new one when none exists (it does not overwrite).
	config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
	config.setSimilarity(new BM25Similarity());

	index = new IndexWriter(indexDirectory, config);
}

Source File: LtrQueryTests.java From elasticsearch-learning-to-rank with Apache License 2.0

5 votes

/**
 * Picks one similarity at random, indexes every entry of {@code docs} with it,
 * merges down to a single segment and opens a searcher using the same similarity.
 */
@Before
public void setupIndex() throws IOException {
    dirUnderTest = newDirectory();

    // Candidate similarities; exactly one is chosen per test run.
    List<Similarity> candidates = Arrays.asList(
            new ClassicSimilarity(),
            new SweetSpotSimilarity(), // extends Classic
            new BM25Similarity(),
            new LMDirichletSimilarity(),
            new BooleanSimilarity(),
            new LMJelinekMercerSimilarity(0.2F),
            new AxiomaticF3LOG(0.5F, 10),
            new DFISimilarity(new IndependenceChiSquared()),
            new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()),
            new IBSimilarity(new DistributionLL(), new LambdaDF(), new NormalizationH3())
        );
    int choice = random().nextInt(candidates.size());
    similarity = candidates.get(choice);

    indexWriterUnderTest = new RandomIndexWriter(random(), dirUnderTest, newIndexWriterConfig().setSimilarity(similarity));

    // The "id" field holds the document's position in the docs array.
    int docIndex = 0;
    while (docIndex < docs.length) {
        Document document = new Document();
        document.add(newStringField("id", "" + docIndex, Field.Store.YES));
        document.add(newField("field", docs[docIndex], Store.YES));
        indexWriterUnderTest.addDocument(document);
        docIndex++;
    }
    indexWriterUnderTest.commit();
    indexWriterUnderTest.forceMerge(1);
    indexWriterUnderTest.flush();

    indexReaderUnderTest = indexWriterUnderTest.getReader();
    searcherUnderTest = newSearcher(indexReaderUnderTest);
    searcherUnderTest.setSimilarity(similarity);
}

Source File: TestBM25SimilarityFactory.java From lucene-solr with Apache License 2.0

5 votes

/** Checks that the "text_params" field resolves to BM25 with k1 = 1.2 and b = 0.76. */
public void testParameters() throws Exception {
  Similarity resolved = getSimilarity("text_params");
  assertEquals(BM25Similarity.class, resolved.getClass());

  // The exact-class check above makes this cast safe.
  BM25Similarity configured = (BM25Similarity) resolved;
  assertEquals(1.2f, configured.getK1(), 0.01f);
  assertEquals(0.76f, configured.getB(), 0.01f);
}

Source File: SchemaSimilarityFactory.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Returns the schema-wide similarity, building it on first use.
 *
 * @throws IllegalStateException if no core has been set yet
 * @throws SolrException if the configured default field type does not exist or
 *         does not define a similarity
 */
@Override
public Similarity getSimilarity() {
  if (null == core) {
    throw new IllegalStateException("SchemaSimilarityFactory can not be used until SolrCoreAware.inform has been called");
  }
  if (null != similarity) {
    return similarity;
  }

  // Need to instantiate lazily, can't do this in inform(SolrCore) because of chicken/egg
  // circular initialization hell with core.getLatestSchema() to lookup defaultSimFromFieldType
  final Similarity defaultSim;
  if (null == defaultSimFromFieldType) {
    // nothing configured, choose a sensible implicit default...
    if (coreVersion.onOrAfter(Version.LUCENE_8_0_0)) {
      defaultSim = new BM25Similarity();
    } else {
      defaultSim = new LegacyBM25Similarity();
    }
  } else {
    FieldType defSimFT = core.getLatestSchema().getFieldTypeByName(defaultSimFromFieldType);
    if (null == defSimFT) {
      throw new SolrException(ErrorCode.SERVER_ERROR,
                              "SchemaSimilarityFactory configured with " + INIT_OPT + "='" +
                              defaultSimFromFieldType + "' but that <fieldType> does not exist");
    }
    Similarity fromFieldType = defSimFT.getSimilarity();
    if (null == fromFieldType) {
      throw new SolrException(ErrorCode.SERVER_ERROR,
                              "SchemaSimilarityFactory configured with " + INIT_OPT + "='" +
                              defaultSimFromFieldType +
                              "' but that <fieldType> does not define a <similarity>");
    }
    defaultSim = fromFieldType;
  }

  similarity = new SchemaSimilarity(defaultSim);
  return similarity;
}

Source File: TestQueryRescorer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Rescores the same term-query hits twice, once for all hits and once for a random
 * smaller top-N, and checks that the second result equals the first result's prefix.
 */
public void testRescoreIsIdempotent() throws Exception {
  Directory dir = newDirectory();
  int numDocs = 100;
  String fieldName = "field";
  IndexReader reader = publishDocs(numDocs, fieldName, dir);

  // First-pass query: a single term, searched twice so each rescore gets its own TopDocs.
  String firstWord = dictionary.get(0);
  TermQuery firstPass = new TermQuery(new Term(fieldName, firstWord));
  IndexSearcher searcher = getSearcher(reader);
  searcher.setSimilarity(new BM25Similarity());
  TopDocs firstSearch = searcher.search(firstPass, numDocs);
  TopDocs secondSearch = searcher.search(firstPass, numDocs);

  // Rescoring query: a more specific phrase query that yields different scores
  // from the term query above.
  String secondWord = RandomPicks.randomFrom(random(), dictionary);
  PhraseQuery rescoreQuery = new PhraseQuery(1, fieldName, firstWord, secondWord);

  // Rescore requesting every hit.
  TopDocs fullRescore = QueryRescorer.rescore(searcher, firstSearch, rescoreQuery, 2.0, numDocs);

  // Rescore again with a top-N smaller than numDocs.
  int smallerTopN = random().nextInt(numDocs-1);
  ScoreDoc[] partialRescore = QueryRescorer.rescore(searcher, secondSearch, rescoreQuery, 2.0, smallerTopN).scoreDocs;
  ScoreDoc[] expectedPrefix = ArrayUtil.copyOfSubArray(fullRescore.scoreDocs, 0, smallerTopN);
  CheckHits.checkEqual(rescoreQuery, expectedPrefix, partialRescore);

  reader.close();
  dir.close();
}

Source File: TestQueryRescorer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Rescores term-query hits with a phrase query for a random smaller top-N and checks
 * both the number of returned hits and that their scores are in non-increasing order.
 */
public void testRescoreOfASubsetOfHits() throws Exception {
  Directory dir = newDirectory();
  int numDocs = 100;
  String fieldName = "field";
  IndexReader reader = publishDocs(numDocs, fieldName, dir);

  // First-pass query on a single term.
  String firstWord = dictionary.get(0);
  TermQuery firstPass = new TermQuery(new Term(fieldName, firstWord));
  IndexSearcher searcher = getSearcher(reader);
  searcher.setSimilarity(new BM25Similarity());
  TopDocs firstPassHits = searcher.search(firstPass, numDocs);

  // Rescoring query: a more specific phrase query that yields different scores
  // from the term query above.
  String secondWord = RandomPicks.randomFrom(random(), dictionary);
  PhraseQuery rescoreQuery = new PhraseQuery(1, fieldName, firstWord, secondWord);

  // Rescore, requesting a smaller top-N.
  int topN = random().nextInt(numDocs-1);
  ScoreDoc[] rescored = QueryRescorer.rescore(searcher, firstPassHits, rescoreQuery, 2.0, topN).scoreDocs;
  assertEquals(topN, rescored.length);

  // Each hit must score no higher than the one before it.
  for (int i = 1; i < rescored.length; i++) {
    assertTrue(rescored[i].score <= rescored[i-1].score);
  }
  reader.close();
  dir.close();
}

Source File: BM25SimilarityProvider.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Creates the provider's BM25 similarity from settings: {@code k1} (default 1.2),
 * {@code b} (default 0.75) and {@code discount_overlaps} (default true).
 */
@Inject
public BM25SimilarityProvider(@Assisted String name, @Assisted Settings settings) {
    super(name);

    // Read every setting up front, each with its fallback value.
    final float k1 = settings.getAsFloat("k1", 1.2f);
    final float b = settings.getAsFloat("b", 0.75f);
    final boolean discountOverlaps = settings.getAsBoolean("discount_overlaps", true);

    this.similarity = new BM25Similarity(k1, b);
    this.similarity.setDiscountOverlaps(discountOverlaps);
}

Source File: TestElevationComparator.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Indexes six small documents with ClassicSimilarity on the writer, then runs the
 * sorting checks through a searcher using BM25, once with {@code true} and once
 * with {@code false}.
 */
public void testSorting() throws Throwable {
  Directory directory = newDirectory();
  IndexWriter writer = new IndexWriter(
      directory,
      newIndexWriterConfig(new MockAnalyzer(random())).
          setMaxBufferedDocs(2).
          setMergePolicy(newLogMergePolicy(1000)).
          setSimilarity(new ClassicSimilarity())
  );

  // Each row is a flat name/value list handed to adoc(), added in this order.
  String[][] rows = {
      {"id", "a", "title", "ipod", "str_s", "a"},
      {"id", "b", "title", "ipod ipod", "str_s", "b"},
      {"id", "c", "title", "ipod ipod ipod", "str_s", "c"},
      {"id", "x", "title", "boosted", "str_s", "x"},
      {"id", "y", "title", "boosted boosted", "str_s", "y"},
      {"id", "z", "title", "boosted boosted boosted", "str_s", "z"}
  };
  for (String[] row : rows) {
    writer.addDocument(adoc(row));
  }

  IndexReader r = DirectoryReader.open(writer);
  writer.close();

  IndexSearcher searcher = newSearcher(r);
  searcher.setSimilarity(new BM25Similarity());

  runTest(searcher, true);
  runTest(searcher, false);

  r.close();
  directory.close();
}

Source File: TestMemoryIndex.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Exercises the freeze life cycle of a {@link MemoryIndex}: fields can be added before
 * freezing, mutations after freezing fail with a "frozen" message, and {@code reset()}
 * makes the index writable again.
 */
@Test
public void testFreezeAPI() {

  MemoryIndex mi = new MemoryIndex();
  mi.addField("f1", "some text", analyzer);

  assertThat(mi.search(new MatchAllDocsQuery()), not(is(0.0f)));
  assertThat(mi.search(new TermQuery(new Term("f1", "some"))), not(is(0.0f)));

  // adding another field after a search is still allowed
  mi.addField("f2", "some more text", analyzer);
  assertThat(mi.search(new TermQuery(new Term("f2", "some"))), not(is(0.0f)));

  // freeze!
  mi.freeze();

  // adding a field to a frozen index must fail
  RuntimeException addFailure =
      expectThrows(RuntimeException.class, () -> mi.addField("f3", "and yet more", analyzer));
  assertThat(addFailure.getMessage(), containsString("frozen"));

  // changing the similarity of a frozen index must fail as well
  RuntimeException similarityFailure =
      expectThrows(RuntimeException.class, () -> mi.setSimilarity(new BM25Similarity(1, 1)));
  assertThat(similarityFailure.getMessage(), containsString("frozen"));

  // searching a frozen index still works
  assertThat(mi.search(new TermQuery(new Term("f1", "some"))), not(is(0.0f)));

  // after reset the old content is gone and new content can be added
  mi.reset();
  mi.addField("f1", "wibble", analyzer);
  assertThat(mi.search(new TermQuery(new Term("f1", "some"))), is(0.0f));
  assertThat(mi.search(new TermQuery(new Term("f1", "wibble"))), not(is(0.0f)));

  // check we can set the Similarity again
  mi.setSimilarity(new ClassicSimilarity());

}

Source File: SearchImpl.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Builds the similarity described by {@code config}: a {@link ClassicSimilarity} when the
 * config asks for it, otherwise a {@link BM25Similarity} with the configured k1 and b.
 * The discount-overlaps flag is applied in both cases.
 */
private Similarity createSimilarity(SimilarityConfig config) {
  if (config.isUseClassicSimilarity()) {
    ClassicSimilarity classic = new ClassicSimilarity();
    classic.setDiscountOverlaps(config.isDiscountOverlaps());
    return classic;
  }

  BM25Similarity bm25 = new BM25Similarity(config.getK1(), config.getB());
  bm25.setDiscountOverlaps(config.isDiscountOverlaps());
  return bm25;
}

Source File: KNearestFuzzyClassifier.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Builds a {@link KNearestFuzzyClassifier} searching over the given reader.
 *
 * @param indexReader    the reader on the index to be used for classification
 * @param similarity     the {@link Similarity} to set on the underlying {@link IndexSearcher}; when {@code null}
 *                       a new {@link BM25Similarity} is used instead
 * @param analyzer       an {@link Analyzer} used to analyze unseen text
 * @param query          a {@link Query} to eventually filter the docs used for training the classifier, or {@code null}
 *                       if all the indexed docs should be used
 * @param k              the no. of docs to select in the MLT results to find the nearest neighbor
 * @param classFieldName the name of the field used as the output for the classifier
 * @param textFieldNames the name of the fields used as the inputs for the classifier, they can contain boosting indication e.g. title^10
 */
public KNearestFuzzyClassifier(IndexReader indexReader, Similarity similarity, Analyzer analyzer, Query query, int k,
                               String classFieldName, String... textFieldNames) {
  this.textFieldNames = textFieldNames;
  this.classFieldName = classFieldName;
  this.analyzer = analyzer;

  // Searcher scoring with the requested similarity, or BM25 when none was given.
  this.indexSearcher = new IndexSearcher(indexReader);
  this.indexSearcher.setSimilarity(similarity != null ? similarity : new BM25Similarity());

  this.query = query;
  this.k = k;
}

Source File: TestNonDefinedSimilarityFactory.java From lucene-solr with Apache License 2.0

4 votes

/** With the basic config and tiny schema, the "text" field must use BM25 with b = 0.75. */
public void testCurrentBM25FromV8() throws Exception {
  // no sys prop set, rely on LATEST
  initCore("solrconfig-basic.xml","schema-tiny.xml");
  float actualB = getSimilarity("text", BM25Similarity.class).getB();
  assertEquals(0.75F, actualB, 0.0F);
}

Source File: TestBM25SimilarityFactory.java From lucene-solr with Apache License 2.0

4 votes

/** The "text" field must resolve to exactly {@link BM25Similarity}. */
public void test() throws Exception {
  Class<?> actualType = getSimilarity("text").getClass();
  assertEquals(BM25Similarity.class, actualType);
}

Source File: TestPerFieldSimilarity.java From lucene-solr with Apache License 2.0

4 votes

/** A field name that does not exist must still resolve to {@link BM25Similarity}. */
public void testNonexistent() throws Exception {
  Class<?> actualType = getSimilarity("sdfdsfdsfdswr5fsdfdsfdsfs").getClass();
  assertEquals(BM25Similarity.class, actualType);
}

Source File: TestPerFieldSimilarity.java From lucene-solr with Apache License 2.0

4 votes

/** The dynamic field "text_sim3" must resolve to {@link BM25Similarity}. */
public void testDefaultsDynamic() throws Exception {
  Class<?> actualType = getSimilarity("text_sim3").getClass();
  assertEquals(BM25Similarity.class, actualType);
}

Source File: TestPerFieldSimilarity.java From lucene-solr with Apache License 2.0

4 votes

/** The field "sim3text", which specifies no similarity, must resolve to {@link BM25Similarity}. */
public void testDefaults() throws Exception {
  Class<?> actualType = getSimilarity("sim3text").getClass();
  assertEquals(BM25Similarity.class, actualType);
}

Source File: TestLegacyBM25Similarity.java From lucene-solr with Apache License 2.0

4 votes

/** Default-constructed legacy and current BM25 similarities must agree on b and k1. */
public void testDefaults() {
  LegacyBM25Similarity legacy = new LegacyBM25Similarity();
  BM25Similarity current = new BM25Similarity();

  float expectedB = current.getB();
  assertEquals(expectedB, legacy.getB(), 0f);

  float expectedK1 = current.getK1();
  assertEquals(expectedK1, legacy.getK1(), 0f);
}

Source File: BM25SimilarityFactory.java From lucene-solr with Apache License 2.0

4 votes

/** Returns a new {@link BM25Similarity} built from this factory's k1, b and discount-overlaps values. */
@Override
public Similarity getSimilarity() {
  BM25Similarity bm25 = new BM25Similarity(k1, b);
  bm25.setDiscountOverlaps(discountOverlaps);
  return bm25;
}

Source File: TestFeatureField.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Indexes four documents that each carry a "pagerank" feature and a text body, then checks
 * the ranking of a text query combined with a saturation query on that feature.
 */
public void testDemo() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig()
      .setMergePolicy(newLogMergePolicy(random().nextBoolean())));

  // A single Document instance is reused; only the two field values change per add.
  Document doc = new Document();
  FeatureField pagerank = new FeatureField("features", "pagerank", 1);
  doc.add(pagerank);
  TextField body = new TextField("body", "", Store.NO);
  doc.add(body);

  // Entry i of both arrays describes the i-th document added.
  float[] pagerankValues = {10, 1000, 1, 42};
  String[] bodyValues = {
      "Apache Lucene",
      "Apache Web HTTP server",
      "Lucene is a search engine",
      "Lucene in the sky with diamonds"
  };
  for (int i = 0; i < bodyValues.length; i++) {
    pagerank.setFeatureValue(pagerankValues[i]);
    body.setStringValue(bodyValues[i]);
    writer.addDocument(doc);
  }

  DirectoryReader reader = writer.getReader();
  writer.close();

  // NOTE: If you need to make changes below, then you likely also need to
  // update javadocs of FeatureField.

  IndexSearcher searcher = new IndexSearcher(reader);
  searcher.setSimilarity(new BM25Similarity());
  Query query = new BooleanQuery.Builder()
      .add(new TermQuery(new Term("body", "apache")), Occur.SHOULD)
      .add(new TermQuery(new Term("body", "lucene")), Occur.SHOULD)
      .build();
  Query boost = FeatureField.newSaturationQuery("features", "pagerank");
  Query boostedQuery = new BooleanQuery.Builder()
      .add(query, Occur.MUST)
      .add(boost, Occur.SHOULD)
      .build();
  TopDocs topDocs = searcher.search(boostedQuery, 10);
  assertEquals(4, topDocs.scoreDocs.length);

  // Expected doc ids, best hit first.
  int[] expectedOrder = {1, 0, 3, 2};
  for (int rank = 0; rank < expectedOrder.length; rank++) {
    assertEquals(expectedOrder[rank], topDocs.scoreDocs[rank].doc);
  }

  reader.close();
  dir.close();
}

Source File: TestLegacyBM25Similarity.java From lucene-solr with Apache License 2.0

4 votes

/** Default-constructed legacy and current BM25 similarities must have the same string form. */
public void testToString() {
  LegacyBM25Similarity legacy = new LegacyBM25Similarity();
  BM25Similarity current = new BM25Similarity();
  String expected = current.toString();
  assertEquals(expected, legacy.toString());
}

Source File: BM25FQuery.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Creates a builder whose similarity is a default-constructed {@link BM25Similarity}.
 */
public Builder() {
  BM25Similarity defaultSimilarity = new BM25Similarity();
  this.similarity = defaultSimilarity;
}

Source File: BM25NBClassifier.java From lucene-solr with Apache License 2.0

3 votes

/**
 * Builds a NaiveBayes classifier whose {@link IndexSearcher} always scores with
 * {@link BM25Similarity}.
 *
 * @param indexReader    the reader on the index to be used for classification
 * @param analyzer       an {@link Analyzer} used to analyze unseen text
 * @param query          a {@link Query} to eventually filter the docs used for training the classifier, or {@code null}
 *                       if all the indexed docs should be used
 * @param classFieldName the name of the field used as the output for the classifier NOTE: must not be heavily analyzed
 *                       as the returned class will be a token indexed for this field
 * @param textFieldNames the name of the fields used as the inputs for the classifier, NO boosting supported per field
 */
public BM25NBClassifier(IndexReader indexReader, Analyzer analyzer, Query query, String classFieldName, String... textFieldNames) {
  // Plain configuration values first.
  this.analyzer = analyzer;
  this.query = query;
  this.classFieldName = classFieldName;
  this.textFieldNames = textFieldNames;

  // Searcher over the supplied reader, scoring with BM25.
  this.indexReader = indexReader;
  this.indexSearcher = new IndexSearcher(indexReader);
  this.indexSearcher.setSimilarity(new BM25Similarity());
}

Source File: LegacyBM25Similarity.java From lucene-solr with Apache License 2.0

2 votes

/**
 * Creates a legacy BM25 similarity that wraps a {@link BM25Similarity} built from the
 * supplied parameter values.
 * @param k1 Controls non-linear term frequency normalization (saturation).
 * @param b Controls to what degree document length normalizes tf values.
 * @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code b} is
 *         not within the range {@code [0..1]}
 */
public LegacyBM25Similarity(float k1, float b) {
  BM25Similarity delegate = new BM25Similarity(k1, b);
  this.bm25Similarity = delegate;
}

Source File: LegacyBM25Similarity.java From lucene-solr with Apache License 2.0

2 votes

/**
 * Creates a legacy BM25 similarity that wraps a default-constructed {@link BM25Similarity},
 * i.e. with these default values:
 * <ul>
 *   <li>{@code k1 = 1.2}</li>
 *   <li>{@code b = 0.75}</li>
 * </ul>
 */
public LegacyBM25Similarity() {
  BM25Similarity delegate = new BM25Similarity();
  this.bm25Similarity = delegate;
}

Source File: BM25FQuery.java From lucene-solr with Apache License 2.0

2 votes

/**
 * Creates a builder whose similarity is a {@link BM25Similarity} built from the supplied values.
 * @param k1 Controls non-linear term frequency normalization (saturation).
 * @param b Controls to what degree document length normalizes tf values.
 */
public Builder(float k1, float b) {
  BM25Similarity configured = new BM25Similarity(k1, b);
  this.similarity = configured;
}

org.apache.lucene.search.similarities.BM25Similarity Java Examples