Java Code Examples for org.apache.lucene.util.automaton.Automata#makeString()
The following examples show how to use org.apache.lucene.util.automaton.Automata#makeString().
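For orientation, Automata.makeString builds a minimal deterministic automaton that accepts exactly one string; an overload accepts a slice of an int[] of Unicode code points (used in Example 2). Below is a minimal sketch, not taken from any of the projects above, that exercises both overloads and checks membership via CharacterRunAutomaton; the class name and the sample term "foo" are illustrative only.

import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;

public class MakeStringSketch {
  public static void main(String[] args) {
    // Automaton accepting exactly the single string "foo"
    Automaton exact = Automata.makeString("foo");

    // Overload taking a slice of Unicode code points (cf. Example 2)
    int[] codePoints = {'f', 'o', 'o'};
    Automaton fromInts = Automata.makeString(codePoints, 0, codePoints.length);

    // CharacterRunAutomaton wraps the automaton for fast per-character matching
    CharacterRunAutomaton run = new CharacterRunAutomaton(exact);
    System.out.println(run.run("foo"));    // true: exact match
    System.out.println(run.run("foobar")); // false: only the exact string is accepted
  }
}

Most of the examples below follow this pattern: a single-string automaton is either wrapped in a CharacterRunAutomaton (typically as a stopword set for MockAnalyzer) or combined with other automata via Operations.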
Example 1
Source File: TestQPHelper.java From lucene-solr with Apache License 2.0
public void testBoost() throws Exception {
  CharacterRunAutomaton stopSet = new CharacterRunAutomaton(Automata.makeString("on"));
  Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
  StandardQueryParser qp = new StandardQueryParser();
  qp.setAnalyzer(oneStopAnalyzer);

  Query q = qp.parse("on^1.0", "field");
  assertNotNull(q);

  q = qp.parse("\"hello\"^2.0", "field");
  assertNotNull(q);
  assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);

  q = qp.parse("hello^2.0", "field");
  assertNotNull(q);
  assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);

  q = qp.parse("\"on\"^1.0", "field");
  assertNotNull(q);

  StandardQueryParser qp2 = new StandardQueryParser();
  qp2.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
  q = qp2.parse("the^3", "field");
  // "the" is a stop word so the result is an empty query:
  assertNotNull(q);
  assertMatchNoDocsQuery(q);
  assertFalse(q instanceof BoostQuery);
}
Example 2
Source File: ContextQuery.java From lucene-solr with Apache License 2.0
private static Automaton toContextAutomaton(final Map<IntsRef, ContextMetaData> contexts, final boolean matchAllContexts) {
  final Automaton matchAllAutomaton = Operations.repeat(Automata.makeAnyString());
  final Automaton sep = Automata.makeChar(ContextSuggestField.CONTEXT_SEPARATOR);
  if (matchAllContexts || contexts.size() == 0) {
    return Operations.concatenate(matchAllAutomaton, sep);
  } else {
    Automaton contextsAutomaton = null;
    for (Map.Entry<IntsRef, ContextMetaData> entry : contexts.entrySet()) {
      final ContextMetaData contextMetaData = entry.getValue();
      final IntsRef ref = entry.getKey();
      Automaton contextAutomaton = Automata.makeString(ref.ints, ref.offset, ref.length);
      if (contextMetaData.exact == false) {
        contextAutomaton = Operations.concatenate(contextAutomaton, matchAllAutomaton);
      }
      contextAutomaton = Operations.concatenate(contextAutomaton, sep);
      if (contextsAutomaton == null) {
        contextsAutomaton = contextAutomaton;
      } else {
        contextsAutomaton = Operations.union(contextsAutomaton, contextAutomaton);
      }
    }
    return contextsAutomaton;
  }
}
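Note that this example is the only one on this page using the int[] overload: makeString(ref.ints, ref.offset, ref.length) builds the automaton from a slice of code points held in the IntsRef rather than from a java.lang.String.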
Example 3
Source File: TestPrecedenceQueryParser.java From lucene-solr with Apache License 2.0
public void testBoost() throws Exception {
  CharacterRunAutomaton stopSet = new CharacterRunAutomaton(Automata.makeString("on"));
  Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
  PrecedenceQueryParser qp = new PrecedenceQueryParser();
  qp.setAnalyzer(oneStopAnalyzer);

  Query q = qp.parse("on^1.0", "field");
  assertNotNull(q);

  q = qp.parse("\"hello\"^2.0", "field");
  assertNotNull(q);
  assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);

  q = qp.parse("hello^2.0", "field");
  assertNotNull(q);
  assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);

  q = qp.parse("\"on\"^1.0", "field");
  assertNotNull(q);

  q = getParser(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)).parse("the^3", "field");
  assertNotNull(q);
}
Example 4
Source File: QueryParserTestBase.java From lucene-solr with Apache License 2.0
public void testBoost() throws Exception {
  CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("on"));
  Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
  CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer);

  Query q = getQuery("on^1.0", qp);
  assertNotNull(q);

  q = getQuery("\"hello\"^2.0", qp);
  assertNotNull(q);
  assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);

  q = getQuery("hello^2.0", qp);
  assertNotNull(q);
  assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);

  q = getQuery("\"on\"^1.0", qp);
  assertNotNull(q);

  Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
  CommonQueryParserConfiguration qp2 = getParserConfig(a2);
  q = getQuery("the^3", qp2);
  // "the" is a stop word so the result is an empty query:
  assertNotNull(q);
  assertMatchNoDocsQuery(q);
  assertFalse(q instanceof BoostQuery);
}
Example 5
Source File: GeolocationContextMapping.java From Elasticsearch with Apache License 2.0
@Override
public Automaton toAutomaton() {
  Automaton automaton;
  if (precisions == null || precisions.length == 0) {
    automaton = Automata.makeString(location);
  } else {
    automaton = Automata.makeString(location.substring(0, Math.max(1, Math.min(location.length(), precisions[0]))));
    for (int i = 1; i < precisions.length; i++) {
      final String cell = location.substring(0, Math.max(1, Math.min(location.length(), precisions[i])));
      automaton = Operations.union(automaton, Automata.makeString(cell));
    }
  }
  return automaton;
}
Example 6
Source File: TokenStreamOffsetStrategy.java From lucene-solr with Apache License 2.0
private static CharArrayMatcher[] convertTermsToMatchers(BytesRef[] terms, CharArrayMatcher[] matchers) {
  CharArrayMatcher[] newAutomata = new CharArrayMatcher[terms.length + matchers.length];
  for (int i = 0; i < terms.length; i++) {
    String termString = terms[i].utf8ToString();
    CharacterRunAutomaton a = new CharacterRunAutomaton(Automata.makeString(termString));
    newAutomata[i] = LabelledCharArrayMatcher.wrap(termString, a::run);
  }
  // Append existing automata (that which is used for MTQs)
  System.arraycopy(matchers, 0, newAutomata, terms.length, matchers.length);
  return newAutomata;
}
Example 7
Source File: SearchEquivalenceTestBase.java From lucene-solr with Apache License 2.0
@BeforeClass
public static void beforeClass() throws Exception {
  Random random = random();
  directory = newDirectory();
  stopword = "" + randomChar();
  CharacterRunAutomaton stopset = new CharacterRunAutomaton(Automata.makeString(stopword));
  analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset);
  RandomIndexWriter iw = new RandomIndexWriter(random, directory, analyzer);
  Document doc = new Document();
  Field id = new StringField("id", "", Field.Store.NO);
  Field field = new TextField("field", "", Field.Store.NO);
  doc.add(id);
  doc.add(field);

  // index some docs
  int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(100);
  for (int i = 0; i < numDocs; i++) {
    id.setStringValue(Integer.toString(i));
    field.setStringValue(randomFieldContents());
    iw.addDocument(doc);
  }

  // delete some docs
  int numDeletes = numDocs / 20;
  for (int i = 0; i < numDeletes; i++) {
    Term toDelete = new Term("id", Integer.toString(random.nextInt(numDocs)));
    if (random.nextBoolean()) {
      iw.deleteDocuments(toDelete);
    } else {
      iw.deleteDocuments(new TermQuery(toDelete));
    }
  }

  reader = iw.getReader();
  s1 = newSearcher(reader);
  s2 = newSearcher(reader);
  iw.close();
}
Example 8
Source File: TestAutomatonQuery.java From lucene-solr with Apache License 2.0
public void testEquals() {
  AutomatonQuery a1 = new AutomatonQuery(newTerm("foobar"), Automata.makeString("foobar"));
  // reference to a1
  AutomatonQuery a2 = a1;
  // same as a1 (accepts the same language, same term)
  AutomatonQuery a3 = new AutomatonQuery(newTerm("foobar"),
      Operations.concatenate(Automata.makeString("foo"), Automata.makeString("bar")));
  // different than a1 (same term, but different language)
  AutomatonQuery a4 = new AutomatonQuery(newTerm("foobar"), Automata.makeString("different"));
  // different than a1 (different term, same language)
  AutomatonQuery a5 = new AutomatonQuery(newTerm("blah"), Automata.makeString("foobar"));

  assertEquals(a1.hashCode(), a2.hashCode());
  assertEquals(a1, a2);

  assertEquals(a1.hashCode(), a3.hashCode());
  assertEquals(a1, a3);

  // different class
  AutomatonQuery w1 = new WildcardQuery(newTerm("foobar"));
  // different class
  AutomatonQuery w2 = new RegexpQuery(newTerm("foobar"));

  assertFalse(a1.equals(w1));
  assertFalse(a1.equals(w2));
  assertFalse(w1.equals(w2));
  assertFalse(a1.equals(a4));
  assertFalse(a1.equals(a5));
  assertFalse(a1.equals(null));
}
Example 9
Source File: TestAutomatonQuery.java From lucene-solr with Apache License 2.0
/**
 * Test that rewriting to a single term works as expected, preserves
 * MultiTermQuery semantics.
 */
public void testRewriteSingleTerm() throws IOException {
  AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), Automata.makeString("piece"));
  Terms terms = MultiTerms.getTerms(searcher.getIndexReader(), FN);
  assertTrue(aq.getTermsEnum(terms) instanceof SingleTermsEnum);
  assertEquals(1, automatonQueryNrHits(aq));
}
Example 10
Source File: TestAutomatonQuery.java From lucene-solr with Apache License 2.0
/**
 * Test that rewriting to a prefix query works as expected, preserves
 * MultiTermQuery semantics.
 */
public void testRewritePrefix() throws IOException {
  Automaton pfx = Automata.makeString("do");
  Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
  AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
  assertEquals(3, automatonQueryNrHits(aq));
}
Example 11
Source File: TestIndexWriter.java From lucene-solr with Apache License 2.0
public void testStopwordsPosIncHole2() throws Exception {
  // use two stopfilters for testing here
  Directory dir = newDirectory();
  final Automaton secondSet = Automata.makeString("foobar");
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer();
      TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
      stream = new MockTokenFilter(stream, new CharacterRunAutomaton(secondSet));
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a);
  Document doc = new Document();
  doc.add(new TextField("body", "just a foobar", Field.Store.NO));
  doc.add(new TextField("body", "test of gaps", Field.Store.NO));
  iw.addDocument(doc);
  IndexReader ir = iw.getReader();
  iw.close();
  IndexSearcher is = newSearcher(ir);
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  builder.add(new Term("body", "just"), 0);
  builder.add(new Term("body", "test"), 3);
  PhraseQuery pq = builder.build(); // body:"just ? ? test"
  assertEquals(1, is.search(pq, 5).totalHits.value);
  ir.close();
  dir.close();
}
Example 12
Source File: HighlighterTest.java From lucene-solr with Apache License 2.0
public void testMaxSizeHighlightTruncates() throws Exception {
  TestHighlightRunner helper = new TestHighlightRunner() {
    @Override
    public void run() throws Exception {
      String goodWord = "goodtoken";
      CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("stoppedtoken"));
      // we disable MockTokenizer checks because we will forcefully limit the
      // tokenstream and call end() before incrementToken() returns false.
      final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
      analyzer.setEnableChecks(false);
      TermQuery query = new TermQuery(new Term("data", goodWord));

      String match;
      StringBuilder sb = new StringBuilder();
      sb.append(goodWord);
      for (int i = 0; i < 10000; i++) {
        sb.append(" ");
        // only one stopword
        sb.append("stoppedtoken");
      }
      SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
      Highlighter hg = getHighlighter(query, "data", fm); // new Highlighter(fm, new QueryTermScorer(query));
      hg.setTextFragmenter(new NullFragmenter());
      hg.setMaxDocCharsToAnalyze(100);
      match = hg.getBestFragment(analyzer, "data", sb.toString());
      assertTrue("Matched text should be no more than 100 chars in length ",
          match.length() < hg.getMaxDocCharsToAnalyze());

      // add another tokenized word to the overall length - but set way beyond
      // the length of text under consideration (after a large slug of stop words
      // + whitespace)
      sb.append(" ");
      sb.append(goodWord);
      match = hg.getBestFragment(analyzer, "data", sb.toString());
      assertTrue("Matched text should be no more than 100 chars in length ",
          match.length() < hg.getMaxDocCharsToAnalyze());
    }
  };

  helper.start();
}
Example 13
Source File: TestGraphTokenizers.java From lucene-solr with Apache License 2.0
private Automaton s2a(String s) {
  return Automata.makeString(s);
}