org.apache.lucene.util.automaton.Operations Java Exaples

Source File: XAnalyzingSuggester.java From Elasticsearch with Apache License 2.0

6 votes

final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    // Turn tokenstream into automaton:
    Automaton automaton = null;

    try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
        automaton = getTokenStreamToAutomaton().toAutomaton(ts);
    }

    automaton = replaceSep(automaton);

    // TODO: we can optimize this somewhat by determinizing
    // while we convert

    // This automaton should not blow up during determinize:
    automaton = Operations.determinize(automaton, Integer.MAX_VALUE);
    return automaton;
}

Source File: IncludeExclude.java From Elasticsearch with Apache License 2.0

6 votes

private Automaton toAutomaton() {
    Automaton a = null;
    if (include != null) {
        a = include.toAutomaton();
    } else if (includeValues != null) {
        a = Automata.makeStringUnion(includeValues);
    } else {
        a = Automata.makeAnyString();
    }
    if (exclude != null) {
        a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    } else if (excludeValues != null) {
        a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    }
    return a;
}

Source File: ContextMapping.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Create a automaton for a given context query this automaton will be used
 * to find the matching paths with the fst
 *
 * @param preserveSep set an additional char (<code>XAnalyzingSuggester.SEP_LABEL</code>) between each context query
 * @param queries list of {@link ContextQuery} defining the lookup context
 *
 * @return Automaton matching the given Query
 */
public static Automaton toAutomaton(boolean preserveSep, Iterable<ContextQuery> queries) {
    Automaton a = Automata.makeEmptyString();

    Automaton gap = Automata.makeChar(ContextMapping.SEPARATOR);
    if (preserveSep) {
        // if separators are preserved the fst contains a SEP_LABEL
        // behind each gap. To have a matching automaton, we need to
        // include the SEP_LABEL in the query as well
        gap = Operations.concatenate(gap, Automata.makeChar(XAnalyzingSuggester.SEP_LABEL));
    }

    for (ContextQuery query : queries) {
        a = Operations.concatenate(Arrays.asList(query.toAutomaton(), gap, a));
    }

    // TODO: should we limit this?  Do any of our ContextQuery impls really create exponential regexps?  GeoQuery looks safe (union
    // of strings).
    return Operations.determinize(a, Integer.MAX_VALUE);
}

Source File: TestRegexpQuery.java From lucene-solr with Apache License 2.0

6 votes

public void testCustomProvider() throws IOException {
  AutomatonProvider myProvider = new AutomatonProvider() {
    // automaton that matches quick or brown
    private Automaton quickBrownAutomaton = Operations.union(Arrays
        .asList(Automata.makeString("quick"),
        Automata.makeString("brown"),
        Automata.makeString("bob")));
    
    @Override
    public Automaton getAutomaton(String name) {
      if (name.equals("quickBrown")) return quickBrownAutomaton;
      else return null;
    }
  };
  RegexpQuery query = new RegexpQuery(newTerm("<quickBrown>"), RegExp.ALL,
    myProvider, DEFAULT_MAX_DETERMINIZED_STATES);
  assertEquals(1, searcher.search(query, 5).totalHits.value);
}

Source File: TestDuelingAnalyzers.java From lucene-solr with Apache License 2.0

6 votes

@BeforeClass
public static void beforeClass() throws Exception {
  Automaton single = new Automaton();
  int initial = single.createState();
  int accept = single.createState();
  single.setAccept(accept, true);

  // build an automaton matching this jvm's letter definition
  for (int i = 0; i <= 0x10FFFF; i++) {
    if (Character.isLetter(i)) {
      single.addTransition(initial, accept, i);
    }
  }
  Automaton repeat = Operations.repeat(single);
  jvmLetter = new CharacterRunAutomaton(repeat);
}

Source File: TestAutomatonQuery.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Test some very simple automata.
 */
public void testAutomata() throws IOException {
  assertAutomatonHits(0, Automata.makeEmpty());
  assertAutomatonHits(0, Automata.makeEmptyString());
  assertAutomatonHits(2, Automata.makeAnyChar());
  assertAutomatonHits(3, Automata.makeAnyString());
  assertAutomatonHits(2, Automata.makeString("doc"));
  assertAutomatonHits(1, Automata.makeChar('a'));
  assertAutomatonHits(2, Automata.makeCharRange('a', 'b'));
  assertAutomatonHits(2, Automata.makeDecimalInterval(1233, 2346, 0));
  assertAutomatonHits(1, Automata.makeDecimalInterval(0, 2000, 0));
  assertAutomatonHits(2, Operations.union(Automata.makeChar('a'),
      Automata.makeChar('b')));
  assertAutomatonHits(0, Operations.intersection(Automata
      .makeChar('a'), Automata.makeChar('b')));
  assertAutomatonHits(1, Operations.minus(Automata.makeCharRange('a', 'b'), 
      Automata.makeChar('a'), DEFAULT_MAX_DETERMINIZED_STATES));
}

Source File: FuzzyCompletionQuery.java From lucene-solr with Apache License 2.0

6 votes

@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
  final Automaton originalAutomata;
  try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()) ) {
    originalAutomata = stream.toAutomaton(unicodeAware);
  }
  Set<IntsRef> refs = new HashSet<>();
  Automaton automaton = toLevenshteinAutomata(originalAutomata, refs);
  if (unicodeAware) {
    Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
    utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates);
    automaton = utf8automaton;
  }
  // TODO Accumulating all refs is bad, because the resulting set may be very big.
  // TODO Better iterate over automaton again inside FuzzyCompletionWeight?
  return new FuzzyCompletionWeight(this, automaton, refs);
}

Source File: ContextQuery.java From lucene-solr with Apache License 2.0

6 votes

private static Automaton toContextAutomaton(final Map<IntsRef, ContextMetaData> contexts, final boolean matchAllContexts) {
  final Automaton matchAllAutomaton = Operations.repeat(Automata.makeAnyString());
  final Automaton sep = Automata.makeChar(ContextSuggestField.CONTEXT_SEPARATOR);
  if (matchAllContexts || contexts.size() == 0) {
    return Operations.concatenate(matchAllAutomaton, sep);
  } else {
    Automaton contextsAutomaton = null;
    for (Map.Entry<IntsRef, ContextMetaData> entry : contexts.entrySet()) {
      final ContextMetaData contextMetaData = entry.getValue();
      final IntsRef ref = entry.getKey();
      Automaton contextAutomaton = Automata.makeString(ref.ints, ref.offset, ref.length);
      if (contextMetaData.exact == false) {
        contextAutomaton = Operations.concatenate(contextAutomaton, matchAllAutomaton);
      }
      contextAutomaton = Operations.concatenate(contextAutomaton, sep);
      if (contextsAutomaton == null) {
        contextsAutomaton = contextAutomaton;
      } else {
        contextsAutomaton = Operations.union(contextsAutomaton, contextAutomaton);
      }
    }
    return contextsAutomaton;
  }
}

Source File: TermInSetQuery.java From lucene-solr with Apache License 2.0

5 votes

private ByteRunAutomaton asByteRunAutomaton() {
  TermIterator iterator = termData.iterator();
  List<Automaton> automata = new ArrayList<>();
  for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
    automata.add(Automata.makeBinary(term));
  }
  return new CompiledAutomaton(Operations.union(automata)).runAutomaton;

}

Source File: TestMockAnalyzer.java From lucene-solr with Apache License 2.0

5 votes

/** Test a configuration that behaves a lot like KeepWordFilter */
public void testKeep() throws Exception {
  CharacterRunAutomaton keepWords = 
    new CharacterRunAutomaton(
        Operations.complement(
            Operations.union(
                Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar"))),
            DEFAULT_MAX_DETERMINIZED_STATES));
  Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords);
  assertAnalyzesTo(a, "quick foo brown bar bar fox foo",
      new String[] { "foo", "bar", "bar", "foo" },
      new int[] { 2, 2, 1, 2 });
}

Source File: TestFuzzyQuery.java From lucene-solr with Apache License 2.0

5 votes

public void testErrorMessage() {
  // 45 states per vector from Lev2TParametricDescription
  final int length = (Operations.DEFAULT_MAX_DETERMINIZED_STATES / 45) + 10;
  final String value = randomRealisticMultiByteUnicode(length);

  FuzzyTermsEnum.FuzzyTermsException expected = expectThrows(FuzzyTermsEnum.FuzzyTermsException.class, () -> {
    new FuzzyAutomatonBuilder(value, 2, 0, true).buildMaxEditAutomaton();
  });
  assertThat(expected.getMessage(), containsString(value));

  expected = expectThrows(FuzzyTermsEnum.FuzzyTermsException.class,
      () -> new FuzzyAutomatonBuilder(value, 2, 0, true).buildAutomatonSet());
  assertThat(expected.getMessage(), containsString(value));
}

Source File: TestAutomatonQuery.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Test that a nondeterministic automaton works correctly. (It should will be
 * determinized)
 */
public void testNFA() throws IOException {
  // accept this or three, the union is an NFA (two transitions for 't' from
  // initial state)
  Automaton nfa = Operations.union(Automata.makeString("this"),
      Automata.makeString("three"));
  assertAutomatonHits(2, nfa);
}

Source File: TestAutomatonQuery.java From lucene-solr with Apache License 2.0

5 votes

public void testEquals() {
  AutomatonQuery a1 = new AutomatonQuery(newTerm("foobar"), Automata
      .makeString("foobar"));
  // reference to a1
  AutomatonQuery a2 = a1;
  // same as a1 (accepts the same language, same term)
  AutomatonQuery a3 = new AutomatonQuery(newTerm("foobar"),
                          Operations.concatenate(
                               Automata.makeString("foo"),
                               Automata.makeString("bar")));
  // different than a1 (same term, but different language)
  AutomatonQuery a4 = new AutomatonQuery(newTerm("foobar"),
                                         Automata.makeString("different"));
  // different than a1 (different term, same language)
  AutomatonQuery a5 = new AutomatonQuery(newTerm("blah"),
                                         Automata.makeString("foobar"));
  
  assertEquals(a1.hashCode(), a2.hashCode());
  assertEquals(a1, a2);
  
  assertEquals(a1.hashCode(), a3.hashCode());
  assertEquals(a1, a3);

  // different class
  AutomatonQuery w1 = new WildcardQuery(newTerm("foobar"));
  // different class
  AutomatonQuery w2 = new RegexpQuery(newTerm("foobar"));
  
  assertFalse(a1.equals(w1));
  assertFalse(a1.equals(w2));
  assertFalse(w1.equals(w2));
  assertFalse(a1.equals(a4));
  assertFalse(a1.equals(a5));
  assertFalse(a1.equals(null));
}

Source File: TestAutomatonQuery.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Test that rewriting to a prefix query works as expected, preserves
 * MultiTermQuery semantics.
 */
public void testRewritePrefix() throws IOException {
  Automaton pfx = Automata.makeString("do");
  Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
  AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
  assertEquals(3, automatonQueryNrHits(aq));
}

Source File: TestGraphTokenizers.java From lucene-solr with Apache License 2.0

5 votes

public void testSynOverMultipleHoles() throws Exception {
  final TokenStream ts = new CannedTokenStream(
    new Token[] {
      token("a", 1, 1),
      token("x", 0, 3),
      token("b", 3, 1),
    });
  final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")); 
  final Automaton a2 = join(s2a("x"), SEP_A, s2a("b")); 
  assertSameLanguage(Operations.union(a1, a2), ts);
}

Source File: TestGraphTokenizers.java From lucene-solr with Apache License 2.0

5 votes

private Automaton join(String ... strings) {
  List<Automaton> as = new ArrayList<>();
  for(String s : strings) {
    as.add(s2a(s));
    as.add(SEP_A);
  }
  as.remove(as.size()-1);
  return Operations.concatenate(as);
}

Source File: TestGraphTokenizers.java From lucene-solr with Apache License 2.0

5 votes

public void testOverlappedTokensSausage() throws Exception {

    // Two tokens on top of each other (sausage):
    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("xyz", 0, 1)
      });
    final Automaton a1 = s2a("abc");
    final Automaton a2 = s2a("xyz");
    assertSameLanguage(Operations.union(a1, a2), ts);
  }

Source File: TestGraphTokenizers.java From lucene-solr with Apache License 2.0

5 votes

public void testOverlappedTokensLattice() throws Exception {

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("xyz", 0, 2),
        token("def", 1, 1),
      });
    final Automaton a1 = s2a("xyz");
    final Automaton a2 = join("abc", "def");
    assertSameLanguage(Operations.union(a1, a2), ts);
  }

Source File: TestGraphTokenizers.java From lucene-solr with Apache License 2.0

5 votes

public void testSynOverHole() throws Exception {

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("a", 1, 1),
        token("X", 0, 2),
        token("b", 2, 1),
      });
    final Automaton a1 = Operations.union(join(s2a("a"), SEP_A, HOLE_A), s2a("X"));
    final Automaton expected = Operations.concatenate(a1, join(SEP_A, s2a("b")));
    assertSameLanguage(expected, ts);
  }

Source File: TestGraphTokenizers.java From lucene-solr with Apache License 2.0

5 votes

public void testSynOverHole2() throws Exception {

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("xyz", 1, 1),
        token("abc", 0, 3),
        token("def", 2, 1),
      });
    final Automaton expected = Operations.union(
      join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), s2a("abc"));
    assertSameLanguage(expected, ts);
  }

Source File: TestGraphTokenizers.java From lucene-solr with Apache License 2.0

5 votes

public void testOverlappedTokensLattice2() throws Exception {

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("xyz", 0, 3),
        token("def", 1, 1),
        token("ghi", 1, 1),
      });
    final Automaton a1 = s2a("xyz");
    final Automaton a2 = join("abc", "def", "ghi");
    assertSameLanguage(Operations.union(a1, a2), ts);
  }

Source File: TestGraphTokenizers.java From lucene-solr with Apache License 2.0

5 votes

public void testSynHangingOverEnd() throws Exception {
  final TokenStream ts = new CannedTokenStream(
    new Token[] {
      token("a", 1, 1),
      token("X", 0, 10),
    });
  assertSameLanguage(Operations.union(s2a("a"), s2a("X")), ts);
}

Source File: TestGraphTokenizers.java From lucene-solr with Apache License 2.0

5 votes

public void testTokenStreamGraphWithHoles() throws Exception {
  final TokenStream ts = new CannedTokenStream(
    new Token[] {
      token("abc", 1, 1),
      token("xyz", 1, 8),
      token("def", 1, 1),
      token("ghi", 1, 1),
    });
  assertSameLanguage(Operations.union(join(s2a("abc"), SEP_A, s2a("xyz")),
                                      join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"), SEP_A, s2a("ghi"))), ts);
}

Source File: TestReversedWildcardFilterFactory.java From lucene-solr with Apache License 2.0

5 votes

/** fragile assert: depends on our implementation, but cleanest way to check for now */ 
private boolean wasReversed(SolrQueryParser qp, String query) throws Exception {
  Query q = qp.parse(query);
  if (!(q instanceof AutomatonQuery)) {
    return false;
  }
  Automaton automaton = ((AutomatonQuery) q).getAutomaton();
  String prefix = Operations.getCommonPrefix(Operations.determinize(automaton,
    Operations.DEFAULT_MAX_DETERMINIZED_STATES));
  return prefix.length() > 0 && prefix.charAt(0) == '\u0001';
}

Source File: Regex.java From crate with Apache License 2.0

5 votes

/** Return an {@link Automaton} that matches the given pattern. */
public static Automaton simpleMatchToAutomaton(String pattern) {
    List<Automaton> automata = new ArrayList<>();
    int previous = 0;
    for (int i = pattern.indexOf('*'); i != -1; i = pattern.indexOf('*', i + 1)) {
        automata.add(Automata.makeString(pattern.substring(previous, i)));
        automata.add(Automata.makeAnyString());
        previous = i + 1;
    }
    automata.add(Automata.makeString(pattern.substring(previous)));
    return Operations.concatenate(automata);
}

Source File: Regex.java From crate with Apache License 2.0

5 votes

/**
 * Return an Automaton that matches the union of the provided patterns.
 */
public static Automaton simpleMatchToAutomaton(String... patterns) {
    if (patterns.length < 1) {
        throw new IllegalArgumentException("There must be at least one pattern, zero given");
    }
    List<Automaton> automata = new ArrayList<>();
    for (String pattern : patterns) {
        automata.add(simpleMatchToAutomaton(pattern));
    }
    return Operations.union(automata);
}

Source File: CategoryContextMapping.java From Elasticsearch with Apache License 2.0

5 votes

@Override
public Automaton toAutomaton() {
    List<Automaton> automatons = new ArrayList<>();
    for (CharSequence value : values) {
        automatons.add(Automata.makeString(value.toString()));
    }
    return Operations.union(automatons);
}

Source File: XAnalyzingSuggester.java From Elasticsearch with Apache License 2.0

5 votes

protected Automaton convertAutomaton(Automaton a) {
  if (queryPrefix != null) {
    a = Operations.concatenate(Arrays.asList(queryPrefix, a));
    // This automaton should not blow up during determinize:
    a = Operations.determinize(a, Integer.MAX_VALUE);
  }
  return a;
}

Source File: TestSynonymGraphFilter.java From lucene-solr with Apache License 2.0

5 votes

/** Renumbers nodes according to their topo sort */
private Automaton topoSort(Automaton in) {
  int[] newToOld = Operations.topoSortStates(in);
  int[] oldToNew = new int[newToOld.length];

  Automaton.Builder a = new Automaton.Builder();
  //System.out.println("remap:");
  for(int i=0;i<newToOld.length;i++) {
    a.createState();
    oldToNew[newToOld[i]] = i;
    //System.out.println("  " + newToOld[i] + " -> " + i);
    if (in.isAccept(newToOld[i])) {
      a.setAccept(i, true);
      //System.out.println("    **");
    }
  }

  Transition t = new Transition();
  for(int i=0;i<newToOld.length;i++) {
    int count = in.initTransition(newToOld[i], t);
    for(int j=0;j<count;j++) {
      in.getNextTransition(t);
      a.addTransition(i, oldToNew[t.dest], t.min, t.max);
    }
  }

  return a.finish();
}

Source File: ConcatenateGraphFilter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Converts the tokenStream to an automaton.  Does *not* close it.
 */
public Automaton toAutomaton(boolean unicodeAware) throws IOException {
  // TODO refactor this
  // maybe we could hook up a modified automaton from TermAutomatonQuery here?

  // Create corresponding automaton: labels are bytes
  // from each analyzed token, with byte 0 used as
  // separator between tokens:
  final TokenStreamToAutomaton tsta;
  if (tokenSeparator != null) {
    tsta = new EscapingTokenStreamToAutomaton(tokenSeparator);
  } else {
    // When we're not preserving sep, we don't steal 0xff
    // byte, so we don't need to do any escaping:
    tsta = new TokenStreamToAutomaton();
  }
  tsta.setPreservePositionIncrements(preservePositionIncrements);
  tsta.setUnicodeArcs(unicodeAware);

  Automaton automaton = tsta.toAutomaton(inputTokenStream);

  // TODO: we can optimize this somewhat by determinizing
  // while we convert
  automaton = replaceSep(automaton, tokenSeparator);
  // This automaton should not blow up during determinize:
  return Operations.determinize(automaton, maxGraphExpansions);
}

org.apache.lucene.util.automaton.Operations Java Examples