org.apache.lucene.util.automaton.Transition Java Examples

Source File: XAnalyzingSuggester.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Returns the states of {@code a} in breadth-first discovery order, starting
 * from state 0.
 *
 * <p>Every state reachable from state 0 appears exactly once, in the order in
 * which it was first discovered. The returned array always has
 * {@code a.getNumStates()} entries; if some states are not reachable from
 * state 0, the trailing slots keep their default value of 0.
 *
 * <p>NOTE(review): despite the name, the traversal is a plain BFS, which is
 * not guaranteed to be a topological order for every DAG -- confirm that
 * callers only pass automata for which this ordering is sufficient.
 *
 * @param a the automaton to traverse
 * @return the discovery order; an empty array when the automaton has no states
 */
private int[] topoSortStates(Automaton a) {
  final int numStates = a.getNumStates();
  final int[] states = new int[numStates];
  if (numStates == 0) {
    // There is no state 0 to seed the traversal with; previously this case
    // failed with ArrayIndexOutOfBoundsException.
    return states;
  }

  // Primitive visited flags avoid boxing every state id into a Set<Integer>.
  final boolean[] visited = new boolean[numStates];

  // The output array doubles as the BFS queue: entries in [0, head) have been
  // expanded, entries in [head, upto) are discovered but still pending.
  states[0] = 0;
  visited[0] = true;
  int upto = 1;

  Transition t = new Transition();
  for (int head = 0; head < upto; head++) {
    int count = a.initTransition(states[head], t);
    for (int i = 0; i < count; i++) {
      a.getNextTransition(t);
      if (!visited[t.dest]) {
        visited[t.dest] = true;
        states[upto++] = t.dest;
      }
    }
  }
  return states;
}

Source File: TestSynonymGraphFilter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Builds a copy of {@code in} whose states are renumbered to follow the order
 * returned by {@code Operations.topoSortStates(in)}.
 */
private Automaton topoSort(Automaton in) {
  final int[] order = Operations.topoSortStates(in);
  final int stateCount = order.length;
  final int[] renumbered = new int[stateCount];
  final Automaton.Builder builder = new Automaton.Builder();

  // Pass 1: allocate one new state per sorted position, remember the
  // old -> new mapping, and carry over the accept flag.
  for (int newState = 0; newState < stateCount; newState++) {
    final int oldState = order[newState];
    builder.createState();
    renumbered[oldState] = newState;
    if (in.isAccept(oldState)) {
      builder.setAccept(newState, true);
    }
  }

  // Pass 2: copy each transition, translating both endpoints to the new ids.
  final Transition scratch = new Transition();
  for (int newState = 0; newState < stateCount; newState++) {
    int remaining = in.initTransition(order[newState], scratch);
    while (remaining-- > 0) {
      in.getNextTransition(scratch);
      builder.addTransition(newState, renumbered[scratch.dest], scratch.min, scratch.max);
    }
  }

  return builder.finish();
}

Source File: GraphTokenStreamFiniteStrings.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Depth-first helper that appends the articulation points found at or below
 * {@code state} to {@code points}.
 *
 * <p>For each visited state it records the DFS depth in {@code depth} and the
 * smallest depth reachable from its subtree in {@code low}. A state whose
 * {@code parent} entry is -1 is treated as the traversal root and is reported
 * only when it has more than one DFS child; any other state is reported when
 * at least one DFS child cannot reach a depth above it.
 *
 * @param a       automaton being traversed
 * @param state   state to expand
 * @param d       DFS depth assigned to {@code state}
 * @param depth   per-state DFS depth, written by this method
 * @param low     per-state lowest reachable depth, written by this method
 * @param parent  per-state DFS parent; -1 marks the root
 * @param visited states already expanded
 * @param points  output list of articulation points
 */
private static void articulationPointsRecurse(Automaton a, int state, int d, int[] depth, int[] low, int[] parent,
                                              BitSet visited, List<Integer> points) {
  visited.set(state);
  depth[state] = d;
  low[state] = d;

  int treeChildren = 0;
  boolean separatesChild = false;

  final Transition scratch = new Transition();
  final int transitionCount = a.initTransition(state, scratch);
  for (int i = 0; i < transitionCount; i++) {
    a.getNextTransition(scratch);
    final int dest = scratch.dest;

    if (visited.get(dest)) {
      // Already-visited target: it can lower our low-point unless it is
      // simply the edge back to our own DFS parent.
      if (dest != parent[state]) {
        low[state] = Math.min(low[state], depth[dest]);
      }
      continue;
    }

    // Tree edge: descend, then fold the child's low-point into ours.
    parent[dest] = state;
    articulationPointsRecurse(a, dest, d + 1, depth, low, parent, visited, points);
    treeChildren++;
    separatesChild |= low[dest] >= depth[state];
    low[state] = Math.min(low[state], low[dest]);
  }

  final boolean isRoot = parent[state] == -1;
  final boolean articulation = isRoot ? treeChildren > 1 : separatesChild;
  if (articulation) {
    points.add(state);
  }
}

Source File: TermAutomatonQuery.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Freezes the automaton under construction; call this once all states and
 * transitions have been added.
 *
 * <p>When an ANY term is in use, its transitions are widened to cover every
 * known term id before the automaton is determinized and stripped of dead
 * states.
 *
 * @param maxDeterminizedStates Maximum number of states created when
 *   determinizing the automaton.  Higher numbers allow this operation to
 *   consume more memory but allow more complex automatons.
 * @throws IllegalStateException if the automaton starts or ends with an ANY
 *   transition, or if the determinized automaton accepts the empty string
 */
public void finish(int maxDeterminizedStates) {
  Automaton automaton = builder.finish();
  final Transition scratch = new Transition();

  // TODO: should we add "eps back to initial node" for all states,
  // and det that?  then we don't need to revisit initial node at
  // every position?  but automaton could blow up?  And, this makes it
  // harder to skip useless positions at search time?

  if (anyTermID != -1) {

    // ANY may not label a transition leaving the initial state ...
    int remaining = automaton.initTransition(0, scratch);
    while (remaining-- > 0) {
      automaton.getNextTransition(scratch);
      if (scratch.min <= anyTermID && anyTermID <= scratch.max) {
        throw new IllegalStateException("automaton cannot lead with an ANY transition");
      }
    }

    // ... nor a transition arriving at an accept state.
    final int numStates = automaton.getNumStates();
    for (int state = 0; state < numStates; state++) {
      remaining = automaton.initTransition(state, scratch);
      while (remaining-- > 0) {
        automaton.getNextTransition(scratch);
        final boolean coversAny = scratch.min <= anyTermID && anyTermID <= scratch.max;
        if (automaton.isAccept(scratch.dest) && coversAny) {
          throw new IllegalStateException("automaton cannot end with an ANY transition");
        }
      }
    }

    final int termCount = termToID.size();

    // Rebuild the automaton so that every ANY transition spans the full
    // range of term ids and therefore also matches all other terms.
    final Automaton widened = new Automaton();
    for (int state = 0; state < numStates; state++) {
      widened.createState();
      widened.setAccept(state, automaton.isAccept(state));
    }

    for (int state = 0; state < numStates; state++) {
      remaining = automaton.initTransition(state, scratch);
      while (remaining-- > 0) {
        automaton.getNextTransition(scratch);
        final boolean coversAny = scratch.min <= anyTermID && anyTermID <= scratch.max;
        final int lo = coversAny ? 0 : scratch.min;
        final int hi = coversAny ? termCount - 1 : scratch.max;
        widened.addTransition(scratch.source, scratch.dest, lo, hi);
      }
    }
    widened.finishState();
    automaton = widened;
  }

  final Automaton determinized = Operations.determinize(automaton, maxDeterminizedStates);
  det = Operations.removeDeadStates(determinized);

  if (det.isAccept(0)) {
    throw new IllegalStateException("cannot accept the empty string");
  }
}

Source File: TermAutomatonQuery.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Rewrites this query to a simpler one when the determinized automaton allows it:
 * <ul>
 *   <li>an empty automaton becomes a {@code MatchNoDocsQuery};</li>
 *   <li>an automaton accepting exactly one single-term string becomes a {@code TermQuery};</li>
 *   <li>a "sausage" (every state has transitions to exactly one next state and the
 *       only accept state is the final one) becomes a {@code PhraseQuery} when each
 *       position carries one term, otherwise a {@code MultiPhraseQuery}.</li>
 * </ul>
 * Positions whose transitions include the ANY term contribute no terms but still
 * advance the position counter. In every other case the query is returned unchanged.
 */
public Query rewrite(IndexReader reader) throws IOException {
  if (Operations.isEmpty(det)) {
    return new MatchNoDocsQuery();
  }

  final IntsRef onlyString = Operations.getSingleton(det);
  if (onlyString != null && onlyString.length == 1) {
    final int onlyTermID = onlyString.ints[onlyString.offset];
    return new TermQuery(new Term(field, idToTerm.get(onlyTermID)));
  }

  // TODO: can PhraseQuery really handle multiple terms at the same position?  If so, why do we even have MultiPhraseQuery?

  // Both builders are filled in parallel while walking the automaton; the two
  // flags below record which of them (if any) is still a valid rewrite.
  final MultiPhraseQuery.Builder multiPhrase = new MultiPhraseQuery.Builder();
  final PhraseQuery.Builder phrase = new PhraseQuery.Builder();
  boolean isSausage = true;
  boolean oneTermPerPosition = true;

  final Transition scratch = new Transition();
  int state = 0;
  int pos = 0;
  walk:
  while (true) {
    final int count = det.initTransition(state, scratch);
    final boolean accepting = det.isAccept(state);
    if (count == 0) {
      // End of the chain: only valid when it ends on an accept state.
      if (!accepting) {
        isSausage = false;
      }
      break;
    }
    if (accepting) {
      // An accept state with outgoing transitions is not a sausage.
      isSausage = false;
      break;
    }

    int dest = -1;
    final List<Term> terms = new ArrayList<>();
    boolean matchesAny = false;
    for (int i = 0; i < count; i++) {
      det.getNextTransition(scratch);
      if (i == 0) {
        dest = scratch.dest;
      } else if (scratch.dest != dest) {
        // Transitions fan out to different states: give up on the rewrite.
        isSausage = false;
        break walk;
      }

      if (scratch.min <= anyTermID && anyTermID <= scratch.max) {
        matchesAny = true;
      }

      if (!matchesAny) {
        for (int termID = scratch.min; termID <= scratch.max; termID++) {
          terms.add(new Term(field, idToTerm.get(termID)));
        }
      }
    }

    if (!matchesAny) {
      multiPhrase.add(terms.toArray(new Term[terms.size()]), pos);
      if (oneTermPerPosition) {
        if (terms.size() == 1) {
          phrase.add(terms.get(0), pos);
        } else {
          oneTermPerPosition = false;
        }
      }
    }
    state = dest;
    pos++;
  }

  if (isSausage) {
    if (oneTermPerPosition) {
      return phrase.build();
    }
    return multiPhrase.build();
  }

  // TODO: we could maybe also rewrite to union of PhraseQuery (pull all finite strings) if it's "worth it"?
  return this;
}

org.apache.lucene.util.automaton.Transition Java Examples