org.apache.lucene.util.automaton.Automaton Java Examples
The following examples show how to use
org.apache.lucene.util.automaton.Automaton.
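Before the examples, here is a minimal sketch (not taken from any project below) of the Automaton lifecycle most of them rely on: states are created explicitly, transitions carry integer labels (Unicode code points or bytes), finishState() freezes the automaton, and a run-automaton wrapper answers membership queries. All classes are from org.apache.lucene.util.automaton.

  Automaton a = new Automaton();
  int start = a.createState();
  int end = a.createState();
  a.setAccept(end, true);
  a.addTransition(start, end, 'a', 'z'); // one transition covering the label range a..z
  a.finishState();                       // freeze the automaton before use
  CharacterRunAutomaton run = new CharacterRunAutomaton(a);
  // run.run("q") == true; run.run("qq") == false (accepts exactly one lowercase letter)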
Example #1
Source File: TermAutomatonQuery.java from lucene-solr with Apache License 2.0
public TermAutomatonWeight(Automaton automaton, IndexSearcher searcher,
                           Map<Integer,TermStates> termStates, float boost) throws IOException {
  super(TermAutomatonQuery.this);
  this.automaton = automaton;
  this.termStates = termStates;
  this.similarity = searcher.getSimilarity();
  List<TermStatistics> allTermStats = new ArrayList<>();
  for (Map.Entry<Integer,BytesRef> ent : idToTerm.entrySet()) {
    Integer termID = ent.getKey();
    if (ent.getValue() != null) {
      TermStates ts = termStates.get(termID);
      if (ts.docFreq() > 0) {
        allTermStats.add(searcher.termStatistics(new Term(field, ent.getValue()), ts.docFreq(), ts.totalTermFreq()));
      }
    }
  }

  if (allTermStats.isEmpty()) {
    stats = null; // no terms matched at all, will not use sim
  } else {
    stats = similarity.scorer(boost, searcher.collectionStatistics(field),
                              allTermStats.toArray(new TermStatistics[allTermStats.size()]));
  }
}
Example #2
Source File: TestDuelingAnalyzers.java from lucene-solr with Apache License 2.0
@BeforeClass
public static void beforeClass() throws Exception {
  Automaton single = new Automaton();
  int initial = single.createState();
  int accept = single.createState();
  single.setAccept(accept, true);

  // build an automaton matching this jvm's letter definition
  for (int i = 0; i <= 0x10FFFF; i++) {
    if (Character.isLetter(i)) {
      single.addTransition(initial, accept, i);
    }
  }
  Automaton repeat = Operations.repeat(single);
  jvmLetter = new CharacterRunAutomaton(repeat);
}
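Once wrapped in a CharacterRunAutomaton, the repeated letter automaton answers membership per string. A hedged sketch of how jvmLetter might then be exercised (these assertions are illustrative, not part of the test):

  // illustrative use of the jvmLetter run automaton built above
  assert jvmLetter.run("abc");           // letters only: accepted
  assert jvmLetter.run("ab1") == false;  // contains a digit: rejected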
Example #3
Source File: AnalyzingSuggester.java from lucene-solr with Apache License 2.0
final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
  // Analyze surface form:
  Automaton automaton;
  try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {
    // Create corresponding automaton: labels are bytes
    // from each analyzed token, with byte 0 used as
    // separator between tokens:
    automaton = ts2a.toAutomaton(ts);
  }

  automaton = replaceSep(automaton);
  automaton = convertAutomaton(automaton);

  // TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings
  // assert SpecialOperations.isFinite(automaton);

  // Get all paths from the automaton (there can be
  // more than one path, eg if the analyzer created a
  // graph using SynFilter or WDF):
  return automaton;
}
Example #4
Source File: MtasToken.java from mtas with Apache License 2.0
/**
 * Creates the automaton map.
 *
 * @param prefix the prefix
 * @param valueList the value list
 * @param filter the filter
 * @return the map
 */
public static Map<String, Automaton> createAutomatonMap(String prefix, List<String> valueList, Boolean filter) {
  HashMap<String, Automaton> automatonMap = new HashMap<>();
  if (valueList != null) {
    for (String item : valueList) {
      if (filter) {
        item = item.replaceAll("([\\\"\\)\\(\\<\\>\\.\\@\\#\\]\\[\\{\\}])", "\\\\$1");
      }
      automatonMap.put(item,
          new RegExp(prefix + MtasToken.DELIMITER + item + "\u0000*").toAutomaton());
    }
  }
  return automatonMap;
}
Example #5
Source File: TestSynonymGraphFilter.java from lucene-solr with Apache License 2.0
/** Just creates a side path from startState to endState with the provided tokens. */
private static void addSidePath(Automaton.Builder a, int startState, int endState, char[] tokens, List<Integer> flatStates) {
  int lastState = startState;
  for (int i = 0; i < tokens.length; i++) {
    int nextState;
    if (i == tokens.length - 1) {
      nextState = endState;
    } else if (flatStates == null || i >= flatStates.size()) {
      nextState = a.createState();
      if (flatStates != null) {
        assert i == flatStates.size();
        flatStates.add(nextState);
      }
    } else {
      nextState = flatStates.get(i);
    }
    a.addTransition(lastState, nextState, tokens[i]);
    lastState = nextState;
  }
}
Example #6
Source File: FuzzySuggester.java from lucene-solr with Apache License 2.0
@Override
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(
    List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
    Automaton lookupAutomaton,
    FST<Pair<Long,BytesRef>> fst) throws IOException {

  // TODO: right now there's no penalty for fuzzy/edits,
  // ie a completion whose prefix matched exactly what the
  // user typed gets no boost over completions that
  // required an edit, which get no boost over completions
  // requiring two edits.  I suspect a multiplicative
  // factor is appropriate (eg, say a fuzzy match must be at
  // least 2X better weight than the non-fuzzy match to
  // "compete") ... in which case I think the wFST needs
  // to be log weights or something ...

  Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
  /*
    Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
    w.write(levA.toDot());
    w.close();
    System.out.println("Wrote LevA to out.dot");
  */
  return FSTUtil.intersectPrefixPaths(levA, fst);
}
Example #7
Source File: PrefixQuery.java from lucene-solr with Apache License 2.0
/** Build an automaton accepting all terms with the specified prefix. */
public static Automaton toAutomaton(BytesRef prefix) {
  final int numStatesAndTransitions = prefix.length + 1;
  final Automaton automaton = new Automaton(numStatesAndTransitions, numStatesAndTransitions);
  int lastState = automaton.createState();
  for (int i = 0; i < prefix.length; i++) {
    int state = automaton.createState();
    automaton.addTransition(lastState, state, prefix.bytes[prefix.offset + i] & 0xff);
    lastState = state;
  }
  automaton.setAccept(lastState, true);
  automaton.addTransition(lastState, lastState, 0, 255);
  automaton.finishState();
  assert automaton.isDeterministic();
  return automaton;
}
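Since the transition labels here are term bytes (note the & 0xff), the result pairs naturally with a ByteRunAutomaton for membership checks. A small sketch under that assumption:

  // hypothetical check against the automaton built by the method above
  Automaton a = PrefixQuery.toAutomaton(new BytesRef("do"));
  ByteRunAutomaton run = new ByteRunAutomaton(a);
  BytesRef term = new BytesRef("document");
  boolean matches = run.run(term.bytes, term.offset, term.length); // true: "document" starts with "do"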
Example #8
Source File: GraphTokenStreamFiniteStrings.java from lucene-solr with Apache License 2.0
/**
 * Returns the articulation points (or cut vertices) of the graph:
 * https://en.wikipedia.org/wiki/Biconnected_component
 */
public int[] articulationPoints() {
  if (det.getNumStates() == 0) {
    return new int[0];
  }
  Automaton.Builder undirect = new Automaton.Builder();
  undirect.copy(det);
  for (int i = 0; i < det.getNumStates(); i++) {
    int numT = det.initTransition(i, transition);
    for (int j = 0; j < numT; j++) {
      det.getNextTransition(transition);
      undirect.addTransition(transition.dest, i, transition.min);
    }
  }
  int numStates = det.getNumStates();
  BitSet visited = new BitSet(numStates);
  int[] depth = new int[det.getNumStates()];
  int[] low = new int[det.getNumStates()];
  int[] parent = new int[det.getNumStates()];
  Arrays.fill(parent, -1);
  List<Integer> points = new ArrayList<>();
  articulationPointsRecurse(undirect.finish(), 0, 0, depth, low, parent, visited, points);
  Collections.reverse(points);
  return points.stream().mapToInt(p -> p).toArray();
}
Example #9
Source File: ContextMapping.java from Elasticsearch with Apache License 2.0
/**
 * Create an automaton for a given context query; this automaton will be used
 * to find the matching paths with the fst
 *
 * @param preserveSep set an additional char (<code>XAnalyzingSuggester.SEP_LABEL</code>) between each context query
 * @param queries list of {@link ContextQuery} defining the lookup context
 *
 * @return Automaton matching the given Query
 */
public static Automaton toAutomaton(boolean preserveSep, Iterable<ContextQuery> queries) {
  Automaton a = Automata.makeEmptyString();

  Automaton gap = Automata.makeChar(ContextMapping.SEPARATOR);
  if (preserveSep) {
    // if separators are preserved the fst contains a SEP_LABEL
    // behind each gap. To have a matching automaton, we need to
    // include the SEP_LABEL in the query as well
    gap = Operations.concatenate(gap, Automata.makeChar(XAnalyzingSuggester.SEP_LABEL));
  }

  for (ContextQuery query : queries) {
    a = Operations.concatenate(Arrays.asList(query.toAutomaton(), gap, a));
  }

  // TODO: should we limit this? Do any of our ContextQuery impls really create
  // exponential regexps? GeoQuery looks safe (union of strings).
  return Operations.determinize(a, Integer.MAX_VALUE);
}
Example #10
Source File: IncludeExclude.java from Elasticsearch with Apache License 2.0
private Automaton toAutomaton() {
  Automaton a = null;
  if (include != null) {
    a = include.toAutomaton();
  } else if (includeValues != null) {
    a = Automata.makeStringUnion(includeValues);
  } else {
    a = Automata.makeAnyString();
  }
  if (exclude != null) {
    a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  } else if (excludeValues != null) {
    a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  }
  return a;
}
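The include-minus-exclude idea above can be reproduced standalone. A minimal sketch with made-up values (Automata.makeStringUnion expects its input in sorted order, which a TreeSet guarantees):

  SortedSet<BytesRef> includeValues = new TreeSet<>(Arrays.asList(new BytesRef("alpha"), new BytesRef("beta")));
  SortedSet<BytesRef> excludeValues = new TreeSet<>(Arrays.asList(new BytesRef("beta")));
  Automaton a = Operations.minus(
      Automata.makeStringUnion(includeValues),
      Automata.makeStringUnion(excludeValues),
      Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  // the result accepts "alpha" but no longer "beta"
  CharacterRunAutomaton run = new CharacterRunAutomaton(a);
  assert run.run("alpha") && run.run("beta") == false;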
Example #11
Source File: TestRegexpQuery.java from lucene-solr with Apache License 2.0
public void testCustomProvider() throws IOException {
  AutomatonProvider myProvider = new AutomatonProvider() {
    // automaton that matches quick, brown, or bob
    private Automaton quickBrownAutomaton = Operations.union(Arrays
        .asList(Automata.makeString("quick"), Automata.makeString("brown"),
            Automata.makeString("bob")));

    @Override
    public Automaton getAutomaton(String name) {
      if (name.equals("quickBrown")) return quickBrownAutomaton;
      else return null;
    }
  };
  RegexpQuery query = new RegexpQuery(newTerm("<quickBrown>"), RegExp.ALL,
      myProvider, DEFAULT_MAX_DETERMINIZED_STATES);
  assertEquals(1, searcher.search(query, 5).totalHits.value);
}
Example #12
Source File: XAnalyzingSuggester.java from Elasticsearch with Apache License 2.0
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
  // TODO: is there a Reader from a CharSequence?
  // Turn tokenstream into automaton:
  Automaton automaton = null;
  try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
    automaton = getTokenStreamToAutomaton().toAutomaton(ts);
  }

  automaton = replaceSep(automaton);

  // TODO: we can optimize this somewhat by determinizing
  // while we convert

  // This automaton should not blow up during determinize:
  automaton = Operations.determinize(automaton, Integer.MAX_VALUE);
  return automaton;
}
Example #13
Source File: FuzzyCompletionQuery.java from lucene-solr with Apache License 2.0
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
  final Automaton originalAutomata;
  try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) {
    originalAutomata = stream.toAutomaton(unicodeAware);
  }
  Set<IntsRef> refs = new HashSet<>();
  Automaton automaton = toLevenshteinAutomata(originalAutomata, refs);
  if (unicodeAware) {
    Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
    utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates);
    automaton = utf8automaton;
  }
  // TODO Accumulating all refs is bad, because the resulting set may be very big.
  // TODO Better iterate over automaton again inside FuzzyCompletionWeight?
  return new FuzzyCompletionWeight(this, automaton, refs);
}
Example #14
Source File: ContextQuery.java from lucene-solr with Apache License 2.0
private static Automaton toContextAutomaton(final Map<IntsRef, ContextMetaData> contexts, final boolean matchAllContexts) {
  final Automaton matchAllAutomaton = Operations.repeat(Automata.makeAnyString());
  final Automaton sep = Automata.makeChar(ContextSuggestField.CONTEXT_SEPARATOR);
  if (matchAllContexts || contexts.size() == 0) {
    return Operations.concatenate(matchAllAutomaton, sep);
  } else {
    Automaton contextsAutomaton = null;
    for (Map.Entry<IntsRef, ContextMetaData> entry : contexts.entrySet()) {
      final ContextMetaData contextMetaData = entry.getValue();
      final IntsRef ref = entry.getKey();
      Automaton contextAutomaton = Automata.makeString(ref.ints, ref.offset, ref.length);
      if (contextMetaData.exact == false) {
        contextAutomaton = Operations.concatenate(contextAutomaton, matchAllAutomaton);
      }
      contextAutomaton = Operations.concatenate(contextAutomaton, sep);
      if (contextsAutomaton == null) {
        contextsAutomaton = contextAutomaton;
      } else {
        contextsAutomaton = Operations.union(contextsAutomaton, contextAutomaton);
      }
    }
    return contextsAutomaton;
  }
}
Example #15
Source File: TestGraphTokenizers.java from lucene-solr with Apache License 2.0
public void testSynOverMultipleHoles() throws Exception {
  final TokenStream ts = new CannedTokenStream(
    new Token[] {
      token("a", 1, 1),
      token("x", 0, 3),
      token("b", 3, 1),
    });
  final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
  final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
  assertSameLanguage(Operations.union(a1, a2), ts);
}
Example #16
Source File: TestAutomatonQuery.java from lucene-solr with Apache License 2.0
private void assertAutomatonHits(int expected, Automaton automaton) throws IOException {
  AutomatonQuery query = new AutomatonQuery(newTerm("bogus"), automaton);

  query.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE);
  assertEquals(expected, automatonQueryNrHits(query));

  query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
  assertEquals(expected, automatonQueryNrHits(query));

  query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE);
  assertEquals(expected, automatonQueryNrHits(query));
}
Example #17
Source File: TestAutomatonQueryUnicode.java from lucene-solr with Apache License 2.0
private void assertAutomatonHits(int expected, Automaton automaton) throws IOException {
  AutomatonQuery query = new AutomatonQuery(newTerm("bogus"), automaton);

  query.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE);
  assertEquals(expected, automatonQueryNrHits(query));

  query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
  assertEquals(expected, automatonQueryNrHits(query));

  query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE);
  assertEquals(expected, automatonQueryNrHits(query));
}
Example #18
Source File: TestAutomatonQuery.java from lucene-solr with Apache License 2.0
/**
 * Test that rewriting to a prefix query works as expected and preserves
 * MultiTermQuery semantics.
 */
public void testRewritePrefix() throws IOException {
  Automaton pfx = Automata.makeString("do");
  Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
  AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
  assertEquals(3, automatonQueryNrHits(aq));
}
Example #19
Source File: XContentMapValues.java from crate with Apache License 2.0
/**
 * Returns a function that filters a document map based on the given include and exclude rules.
 * @see #filter(Map, String[], String[]) for details
 */
public static Function<Map<String, ?>, Map<String, Object>> filter(String[] includes, String[] excludes) {
  CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString());

  CharacterRunAutomaton include;
  if (includes == null || includes.length == 0) {
    include = matchAllAutomaton;
  } else {
    Automaton includeA = Regex.simpleMatchToAutomaton(includes);
    includeA = makeMatchDotsInFieldNames(includeA);
    include = new CharacterRunAutomaton(includeA);
  }

  Automaton excludeA;
  if (excludes == null || excludes.length == 0) {
    excludeA = Automata.makeEmpty();
  } else {
    excludeA = Regex.simpleMatchToAutomaton(excludes);
    excludeA = makeMatchDotsInFieldNames(excludeA);
  }
  CharacterRunAutomaton exclude = new CharacterRunAutomaton(excludeA);

  // NOTE: We cannot use Operations.minus because of the special case that
  // we want all sub properties to match as soon as an object matches
  return (map) -> filter(map, include, 0, exclude, 0, matchAllAutomaton);
}
Example #20
Source File: TestIndexWriter.java from lucene-solr with Apache License 2.0
public void testStopwordsPosIncHole2() throws Exception {
  // use two stopfilters for testing here
  Directory dir = newDirectory();
  final Automaton secondSet = Automata.makeString("foobar");
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer();
      TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
      stream = new MockTokenFilter(stream, new CharacterRunAutomaton(secondSet));
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a);
  Document doc = new Document();
  doc.add(new TextField("body", "just a foobar", Field.Store.NO));
  doc.add(new TextField("body", "test of gaps", Field.Store.NO));
  iw.addDocument(doc);
  IndexReader ir = iw.getReader();
  iw.close();
  IndexSearcher is = newSearcher(ir);
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  builder.add(new Term("body", "just"), 0);
  builder.add(new Term("body", "test"), 3);
  PhraseQuery pq = builder.build();
  // body:"just ? ? test"
  assertEquals(1, is.search(pq, 5).totalHits.value);
  ir.close();
  dir.close();
}
Example #21
Source File: TestGraphTokenizers.java from lucene-solr with Apache License 2.0
public void testOverlappedTokensLattice() throws Exception {
  final TokenStream ts = new CannedTokenStream(
    new Token[] {
      token("abc", 1, 1),
      token("xyz", 0, 2),
      token("def", 1, 1),
    });
  final Automaton a1 = s2a("xyz");
  final Automaton a2 = join("abc", "def");
  assertSameLanguage(Operations.union(a1, a2), ts);
}
Example #22
Source File: XAnalyzingSuggester.java from Elasticsearch with Apache License 2.0
/**
 * Creates a new suggester.
 *
 * @param indexAnalyzer Analyzer that will be used for
 *   analyzing suggestions while building the index.
 * @param queryAnalyzer Analyzer that will be used for
 *   analyzing query text during lookup
 * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
 * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
 *   surface forms to keep for a single analyzed form.
 *   When there are too many surface forms we discard the
 *   lowest weighted ones.
 * @param maxGraphExpansions Maximum number of graph paths
 *   to expand from the analyzed form. Set this to -1 for
 *   no limit.
 */
public XAnalyzingSuggester(Analyzer indexAnalyzer, Automaton queryPrefix, Analyzer queryAnalyzer,
                           int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                           boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst,
                           boolean hasPayloads, int maxAnalyzedPathsForOneInput,
                           int sepLabel, int payloadSep, int endByte, int holeCharacter) {
  // SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput
  this.indexAnalyzer = indexAnalyzer;
  this.queryAnalyzer = queryAnalyzer;
  this.fst = fst;
  this.hasPayloads = hasPayloads;
  if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {
    throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options);
  }
  this.exactFirst = (options & EXACT_FIRST) != 0;
  this.preserveSep = (options & PRESERVE_SEP) != 0;

  // FLORIAN EDIT: I added <code>queryPrefix</code> for context dependent suggestions
  this.queryPrefix = queryPrefix;

  // NOTE: this is just an implementation limitation; if
  // somehow this is a problem we could fix it by using
  // more than one byte to disambiguate ... but 256 seems
  // like it should be way more than enough.
  if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {
    throw new IllegalArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")");
  }
  this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;

  if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {
    throw new IllegalArgumentException("maxGraphExpansions must be -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
  }
  this.maxGraphExpansions = maxGraphExpansions;
  this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
  this.preservePositionIncrements = preservePositionIncrements;
  this.sepLabel = sepLabel;
  this.payloadSep = payloadSep;
  this.endByte = endByte;
  this.holeCharacter = holeCharacter;
}
Example #23
Source File: TestSimplePatternTokenizer.java from lucene-solr with Apache License 2.0
public void testNotDeterminized() throws Exception {
  Automaton a = new Automaton();
  int start = a.createState();
  int mid1 = a.createState();
  int mid2 = a.createState();
  int end = a.createState();
  a.setAccept(end, true);
  a.addTransition(start, mid1, 'a', 'z');
  a.addTransition(start, mid2, 'a', 'z');
  a.addTransition(mid1, end, 'b');
  a.addTransition(mid2, end, 'b');
  expectThrows(IllegalArgumentException.class, () -> new SimplePatternTokenizer(a));
}
Example #24
Source File: SimplePatternTokenizer.java from lucene-solr with Apache License 2.0
/** Runs a pre-built automaton. */
public SimplePatternTokenizer(AttributeFactory factory, Automaton dfa) {
  super(factory);

  // we require user to do this up front because it is a possibly very costly operation,
  // and user may be creating us frequently, not realizing this ctor is otherwise trappy
  if (dfa.isDeterministic() == false) {
    throw new IllegalArgumentException("please determinize the incoming automaton first");
  }

  runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
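A caller holding a possibly nondeterministic automaton is therefore expected to determinize it first; a hedged sketch:

  // assuming `nfa` is a nondeterministic Automaton, e.g. the one built in Example #23
  Automaton dfa = Operations.determinize(nfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  Tokenizer tokenizer = new SimplePatternTokenizer(dfa); // now accepted: isDeterministic() is true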
Example #25
Source File: TestSynonymGraphFilter.java from lucene-solr with Apache License 2.0
private Automaton toAutomaton(TokenStream ts) throws IOException {
  PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
  PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
  CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  Automaton a = new Automaton();
  int srcNode = -1;
  int destNode = -1;
  int state = a.createState();
  while (ts.incrementToken()) {
    assert termAtt.length() == 1;
    char c = termAtt.charAt(0);
    int posInc = posIncAtt.getPositionIncrement();
    if (posInc != 0) {
      srcNode += posInc;
      while (state < srcNode) {
        state = a.createState();
      }
    }
    destNode = srcNode + posLenAtt.getPositionLength();
    while (state < destNode) {
      state = a.createState();
    }
    a.addTransition(srcNode, destNode, c);
  }
  ts.end();
  ts.close();
  a.finishState();
  a.setAccept(destNode, true);
  return a;
}
Example #26
Source File: ConcatenateGraphFilter.java from lucene-solr with Apache License 2.0
@Override
public boolean incrementToken() throws IOException {
  if (finiteStrings == null) {
    if (wasReset == false) {
      throw new IllegalStateException("reset() missing before incrementToken");
    }
    // lazy init/consume
    Automaton automaton = toAutomaton(); // calls reset(), incrementToken() repeatedly, and end() on inputTokenStream
    finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
    // note: would be nice to know the startOffset but toAutomaton doesn't capture it. We'll assume 0
    endOffset = inputTokenStream.getAttribute(OffsetAttribute.class).endOffset();
  }

  IntsRef string = finiteStrings.next();
  if (string == null) {
    return false;
  }

  clearAttributes();

  if (finiteStrings.size() > 1) { // if number of iterated strings so far is more than one...
    posIncrAtt.setPositionIncrement(0); // stacked
  }

  offsetAtt.setOffset(0, endOffset);

  Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8

  if (charTermAttribute != null) {
    charTermAttribute.setLength(0);
    charTermAttribute.append(bytesAtt.toUTF16());
  }

  return true;
}
Example #27
Source File: TestSynonymGraphFilter.java from lucene-solr with Apache License 2.0
private boolean approxSubsetOf(Automaton a1, Automaton a2) {
  AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(a1);
  for (int i = 0; i < 2000; i++) {
    int[] ints = ras.getRandomAcceptedString(random());
    IntsRef path = new IntsRef(ints, 0, ints.length);
    if (accepts(a2, path) == false) {
      throw new RuntimeException("a2 does not accept " + path);
    }
  }

  // Presumed true
  return true;
}
Example #28
Source File: GraphEdgeCollector.java from lucene-solr with Apache License 2.0
@Override
public Query getResultQuery(SchemaField matchField, boolean useAutomaton) {
  if (collectorTerms == null || collectorTerms.size() == 0) {
    // return null if there are no terms (edges) to traverse.
    return null;
  } else {
    // Create a query
    Query q = null;

    // TODO: see if we should dynamically select this based on the frontier size.
    if (useAutomaton) {
      // build an automaton based query for the frontier.
      Automaton autn = buildAutomaton(collectorTerms);
      AutomatonQuery autnQuery = new AutomatonQuery(new Term(matchField.getName()), autn);
      q = autnQuery;
    } else {
      List<BytesRef> termList = new ArrayList<>(collectorTerms.size());
      for (int i = 0; i < collectorTerms.size(); i++) {
        BytesRef ref = new BytesRef();
        collectorTerms.get(i, ref);
        termList.add(ref);
      }
      q = (matchField.hasDocValues() && !matchField.indexed())
          ? new DocValuesTermsQuery(matchField.getName(), termList)
          : new TermInSetQuery(matchField.getName(), termList);
    }

    return q;
  }
}
Example #29
Source File: Regex.java from crate with Apache License 2.0
/**
 * Return an Automaton that matches the union of the provided patterns.
 */
public static Automaton simpleMatchToAutomaton(String... patterns) {
  if (patterns.length < 1) {
    throw new IllegalArgumentException("There must be at least one pattern, zero given");
  }
  List<Automaton> automata = new ArrayList<>();
  for (String pattern : patterns) {
    automata.add(simpleMatchToAutomaton(pattern));
  }
  return Operations.union(automata);
}
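A hedged usage sketch: simpleMatchToAutomaton compiles simple * wildcards, so the union matches a string whenever any single pattern does (the patterns here are made up):

  Automaton a = Regex.simpleMatchToAutomaton("foo*", "*.bar");
  CharacterRunAutomaton run = new CharacterRunAutomaton(a); // determinizes up to a default state limit
  // run.run("foobar") == true (matches "foo*"); run.run("x.bar") == true; run.run("baz") == false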
Example #30
Source File: GraphEdgeCollector.java from lucene-solr with Apache License 2.0
/**
 * Build an automaton to represent the frontier query
 */
private Automaton buildAutomaton(BytesRefHash termBytesHash) {
  // need to pass a sorted set of terms to the automaton builder (maybe a better way to avoid this?)
  final TreeSet<BytesRef> terms = new TreeSet<BytesRef>();
  for (int i = 0; i < termBytesHash.size(); i++) {
    BytesRef ref = new BytesRef();
    termBytesHash.get(i, ref);
    terms.add(ref);
  }
  final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
  return a;
}
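DaciukMihovAutomatonBuilder produces a minimal, deterministic automaton but requires its input terms in sorted order, which is why the method above copies the hash into a TreeSet first. A tiny standalone sketch with made-up terms:

  TreeSet<BytesRef> terms = new TreeSet<>(Arrays.asList(new BytesRef("ab"), new BytesRef("abc")));
  Automaton a = DaciukMihovAutomatonBuilder.build(terms); // accepts exactly "ab" and "abc"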