org.apache.lucene.util.automaton.CompiledAutomaton Java Examples

Source File: SortedSetDocValues.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Filters the values of this instance through a {@link CompiledAutomaton} and
 * exposes the accepted values as a {@link TermsEnum}.
 * The returned enum supports {@link TermsEnum#ord()}.
 *
 * @param automaton compiled automaton whose {@code type} selects the filtering strategy
 * @return an enum over the values accepted by {@code automaton}
 * @throws IOException if an I/O error is reported while building the enum
 */
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
  // The unfiltered enum is always obtained first, even when the NONE branch does not use it.
  final TermsEnum allValues = termsEnum();
  switch (automaton.type) {
    case ALL:
      // Everything is accepted: hand back the unfiltered enum.
      return allValues;
    case NONE:
      return TermsEnum.EMPTY;
    case NORMAL:
      return new AutomatonTermsEnum(allValues, automaton);
    case SINGLE:
      return new SingleTermsEnum(allValues, automaton.term);
    default:
      // unreachable
      throw new RuntimeException("unhandled case");
  }
}

Source File: MultiTerms.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Intersects every entry of {@code subs} with the compiled automaton and merges
 * the non-null results into a single enum.
 *
 * @param compiled  automaton forwarded to each sub's {@code intersect}
 * @param startTerm start term forwarded to each sub's {@code intersect}
 * @return a {@link MultiTermsEnum} over the collected sub-enums, or
 *         {@link TermsEnum#EMPTY} when no sub produced an enum
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final List<MultiTermsEnum.TermsEnumIndex> collected = new ArrayList<>();
  for (int subIndex = 0; subIndex < subs.length; subIndex++) {
    final TermsEnum subEnum = subs[subIndex].intersect(compiled, startTerm);
    if (subEnum == null) {
      continue;
    }
    // Remember which sub the enum came from.
    collected.add(new MultiTermsEnum.TermsEnumIndex(subEnum, subIndex));
  }

  if (collected.isEmpty()) {
    return TermsEnum.EMPTY;
  }

  final MultiTermsEnum.TermsEnumIndex[] indexed =
      collected.toArray(MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY);
  return new MultiTermsEnum(subSlices).reset(indexed);
}

Source File: SortedDocValues.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Returns a {@link TermsEnum} restricted to the values that the given
 * {@link CompiledAutomaton} accepts.
 * The enum supports {@link TermsEnum#ord()}.
 *
 * @param automaton compiled automaton; its {@code type} decides which enum is returned
 * @return an enum over the accepted values
 * @throws IOException if an I/O error is reported while building the enum
 */
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
  // Obtained up front for every automaton type, matching the order of the switch below.
  final TermsEnum unfiltered = termsEnum();
  switch (automaton.type) {
    case NORMAL:
      return new AutomatonTermsEnum(unfiltered, automaton);
    case SINGLE:
      return new SingleTermsEnum(unfiltered, automaton.term);
    case ALL:
      return unfiltered;
    case NONE:
      return TermsEnum.EMPTY;
    default:
      // unreachable
      throw new RuntimeException("unhandled case");
  }
}

Source File: TestIntervals.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Checks the intervals produced by a multi-term source built from the regular
 * expression {@code p.*e}, and that a source limited to one expansion fails
 * with an {@link IllegalStateException}.
 */
public void testMultiTerm() throws IOException {
  RegExp re = new RegExp("p.*e");
  IntervalsSource source = Intervals.multiterm(new CompiledAutomaton(re.toAutomaton()), re.toString());

  // Expected values handed to checkIntervals, one row per entry.
  final int[][] expected = {
      {},
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 7, 7 },
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 0, 0 }
  };
  checkIntervals(source, "field1", 5, expected);

  // With the limit argument set to 1, asking for intervals on the leaves must throw.
  IllegalStateException tooManyTerms = expectThrows(IllegalStateException.class, () -> {
    IntervalsSource limited = Intervals.multiterm(new CompiledAutomaton(re.toAutomaton()), 1, re.toString());
    for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
      limited.intervals("field1", leaf);
    }
  });
  assertEquals("Automaton [\\p(.)*\\e] expanded to too many terms (limit 1)", tooManyTerms.getMessage());

  checkVisits(source, 1);
}

Source File: TestTermsEnum.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Verifies that {@code Terms.intersect} rejects the automaton compiled from the
 * regular expression {@code do_not_match_anything} with an
 * {@link IllegalArgumentException} carrying the expected message.
 *
 * <p>The directory, writer and reader are managed with try-with-resources so
 * they are released even when an assertion fails. They are closed in the order
 * reader, writer, directory.
 */
public void testIntersectRegexp() throws Exception {
  try (Directory d = newDirectory();
       RandomIndexWriter w = new RandomIndexWriter(random(), d)) {
    Document doc = new Document();
    doc.add(newStringField("field", "foobar", Field.Store.NO));
    w.addDocument(doc);
    try (IndexReader r = w.getReader()) {
      Terms terms = MultiTerms.getTerms(r, "field");
      CompiledAutomaton automaton = new CompiledAutomaton(new RegExp("do_not_match_anything").toAutomaton());
      String message = expectThrows(IllegalArgumentException.class, () -> {
        terms.intersect(automaton, null);
      }).getMessage();
      assertEquals("please use CompiledAutomaton.getTermsEnum instead", message);
    }
  }
}

Source File: AutomatonTermsEnum.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Builds an enumerator driven by a compiled automaton that operates on the
 * supplied TermsEnum.
 *
 * @param tenum    TermsEnum passed to the superclass constructor
 * @param compiled CompiledAutomaton; its type must be {@code NORMAL}
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 * @lucene.experimental
 */
public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) {
  super(tenum);
  if (CompiledAutomaton.AUTOMATON_TYPE.NORMAL != compiled.type) {
    throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }

  // Copy what is needed out of the compiled automaton.
  this.automaton = compiled.automaton;
  this.commonSuffixRef = compiled.commonSuffixRef;
  this.runAutomaton = compiled.runAutomaton;
  assert this.runAutomaton != null;
  this.finite = compiled.finite;

  if (finite) {
    // No need to track visited states for a finite language without loops.
    visited = null;
  } else {
    visited = new short[runAutomaton.getSize()];
  }
}

Source File: Terms.java From lucene-solr with Apache License 2.0

5 votes

/** Returns a TermsEnum that iterates over all terms and
 *  documents that are accepted by the provided {@link
 *  CompiledAutomaton}.  If the <code>startTerm</code> is
 *  provided then the returned enum will only return terms
 *  {@code > startTerm}, but you still must call
 *  next() first to get to the first term.  Note that the
 *  provided <code>startTerm</code> must be accepted by
 *  the automaton.
 *
 *  <p>This is an expert low-level API and will only work
 *  for {@code NORMAL} compiled automata.  To handle any
 *  compiled automata you should use
 *  {@link CompiledAutomaton#getTermsEnum} instead.
 *
 *  <p><b>NOTE</b>: the returned TermsEnum cannot seek.
 *
 *  @param compiled  the compiled automaton; its type must be {@code NORMAL}
 *  @param startTerm optional term to start after, or {@code null} to start
 *                   from the beginning
 *  @return an enum over the accepted terms
 *  @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) throws IOException {
  
  // TODO: could we factor out a common interface b/w
  // CompiledAutomaton and FST?  Then we could pass FST there too,
  // and likely speed up resolving terms to deleted docs ... but
  // AutomatonTermsEnum makes this tricky because of its on-the-fly cycle
  // detection
  
  // TODO: eventually we could support seekCeil/Exact on
  // the returned enum, instead of only being able to seek
  // at the start

  // Validate the argument before creating the iterator, so that an
  // unsupported automaton type does not allocate an enum that is thrown away.
  if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }

  TermsEnum termsEnum = iterator();

  if (startTerm == null) {
    return new AutomatonTermsEnum(termsEnum, compiled);
  } else {
    return new AutomatonTermsEnum(termsEnum, compiled) {
      @Override
      protected BytesRef nextSeekTerm(BytesRef term) throws IOException {
        // A null term is replaced by startTerm before delegating.
        if (term == null) {
          term = startTerm;
        }
        return super.nextSeekTerm(term);
      }
    };
  }
}

Source File: TermInSetQuery.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Builds one binary automaton per term in {@code termData}, unions them, and
 * returns the {@code runAutomaton} of the compiled union.
 */
private ByteRunAutomaton asByteRunAutomaton() {
  final TermIterator terms = termData.iterator();
  final List<Automaton> perTerm = new ArrayList<>();
  BytesRef current;
  // The iterator signals exhaustion by returning null.
  while ((current = terms.next()) != null) {
    perTerm.add(Automata.makeBinary(current));
  }
  final CompiledAutomaton compiledUnion = new CompiledAutomaton(Operations.union(perTerm));
  return compiledUnion.runAutomaton;
}

Source File: FuzzyAutomatonBuilder.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Compiles the automaton that {@code levBuilder} produces for {@code maxEdits}
 * and {@code prefix}.
 *
 * @return the compiled automaton
 * @throws FuzzyTermsEnum.FuzzyTermsException wrapping any
 *         {@link TooComplexToDeterminizeException} raised while building it
 */
CompiledAutomaton buildMaxEditAutomaton() {
  final CompiledAutomaton maxEditAutomaton;
  try {
    maxEditAutomaton = new CompiledAutomaton(levBuilder.toAutomaton(maxEdits, prefix), true, false);
  } catch (TooComplexToDeterminizeException tooComplex) {
    // Rethrow with the term attached, keeping the original cause.
    throw new FuzzyTermsEnum.FuzzyTermsException(term, tooComplex);
  }
  return maxEditAutomaton;
}

Source File: FuzzyAutomatonBuilder.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Compiles one automaton for every edit count from 0 through {@code maxEdits}
 * (inclusive). The automaton for edit count {@code i} is stored at index {@code i}.
 *
 * @return an array of {@code maxEdits + 1} compiled automata
 * @throws FuzzyTermsEnum.FuzzyTermsException wrapping any
 *         {@link TooComplexToDeterminizeException} raised while building them
 */
CompiledAutomaton[] buildAutomatonSet() {
  final CompiledAutomaton[] byEditCount = new CompiledAutomaton[maxEdits + 1];
  try {
    for (int edits = 0; edits < byEditCount.length; edits++) {
      byEditCount[edits] = new CompiledAutomaton(levBuilder.toAutomaton(edits, prefix), true, false);
    }
  } catch (TooComplexToDeterminizeException tooComplex) {
    // The first failure aborts the whole build.
    throw new FuzzyTermsEnum.FuzzyTermsException(term, tooComplex);
  }
  return byEditCount;
}

Source File: FieldReader.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Creates an {@code IntersectTermsEnum} for a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // TODO: we could push "it's a range" or "it's a prefix" down into IntersectTermsEnum?
  // can we optimize knowing that...?
  if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return new IntersectTermsEnum(
        this, compiled.automaton, compiled.runAutomaton, compiled.commonSuffixRef, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}

Source File: LuceneTestCase.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Asserts that two {@link Terms} instances are equivalent: statistics,
 * offsets/positions/payloads flags, full enumeration and seeking. When
 * {@code deep} is set, intersections with random regular expressions are
 * compared as well.
 *
 * @param info       message passed through to the individual assertions
 * @param leftReader reader handed to the enum comparison
 * @param leftTerms  first terms instance; may be {@code null} only if {@code rightTerms} is too
 * @param rightTerms second terms instance
 * @param deep       whether to also compare automaton intersections
 */
public void assertTermsEquals(String info, IndexReader leftReader, Terms leftTerms, Terms rightTerms, boolean deep) throws IOException {
  if (leftTerms == null || rightTerms == null) {
    // If either side is missing, both must be.
    assertNull(info, leftTerms);
    assertNull(info, rightTerms);
    return;
  }

  assertTermsStatisticsEquals(info, leftTerms, rightTerms);
  assertEquals("hasOffsets", leftTerms.hasOffsets(), rightTerms.hasOffsets());
  assertEquals("hasPositions", leftTerms.hasPositions(), rightTerms.hasPositions());
  assertEquals("hasPayloads", leftTerms.hasPayloads(), rightTerms.hasPayloads());

  final TermsEnum leftAll = leftTerms.iterator();
  final TermsEnum rightAll = rightTerms.iterator();
  assertTermsEnumEquals(info, leftReader, leftAll, rightAll, true);

  assertTermsSeekingEquals(info, leftTerms, rightTerms);

  if (!deep) {
    return;
  }

  final int rounds = atLeast(3);
  for (int round = 0; round < rounds; round++) {
    final String regexp = AutomatonTestUtil.randomRegexp(random());
    final CompiledAutomaton compiled = new CompiledAutomaton(new RegExp(regexp, RegExp.NONE).toAutomaton());
    if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
      // Only NORMAL automata are intersected here.
      continue;
    }
    // TODO: test start term too
    final TermsEnum leftFiltered = leftTerms.intersect(compiled, null);
    final TermsEnum rightFiltered = rightTerms.intersect(compiled, null);
    assertTermsEnumEquals(info, leftReader, leftFiltered, rightFiltered, rarely());
  }
}

Source File: TestTermsEnum.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Feeds the bytes of {@code b} through the run automaton of {@code c},
 * starting in state 0, and reports whether the final state is accepting.
 * Asserts before every step that the current state is not -1.
 */
private boolean accepts(CompiledAutomaton c, BytesRef b) {
  final int end = b.offset + b.length;
  int current = 0;
  for (int pos = b.offset; pos < end; pos++) {
    assertTrue(current != -1);
    // Bytes are treated as unsigned values.
    final int label = b.bytes[pos] & 0xff;
    current = c.runAutomaton.step(current, label);
  }
  return c.runAutomaton.isAccept(current);
}

Source File: AssertingLeafReader.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Delegates to the wrapped instance, asserts that the result is non-null and
 * that {@code bytes} is either {@code null} or valid, and wraps the result in
 * an {@code AssertingTermsEnum}.
 */
@Override
public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws IOException {
  final TermsEnum delegate = in.intersect(automaton, bytes);
  assert delegate != null;
  assert bytes == null || bytes.isValid();
  final boolean withFreqs = hasFreqs();
  return new AssertingTermsEnum(delegate, withFreqs);
}

Source File: FSTTermsReader.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Creates an {@code IntersectTermsEnum} for a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final boolean supported = compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL;
  if (!supported) {
    throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }
  return new IntersectTermsEnum(compiled, startTerm);
}

Source File: TestBlockPostingsFormat3.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Asserts that two {@link Terms} instances agree on statistics, full
 * enumeration and seeking. When {@code deep} is set, intersections with
 * random regular expressions are compared too.
 *
 * @param leftTerms  first terms instance; may be {@code null} only if {@code rightTerms} is too
 * @param rightTerms second terms instance
 * @param deep       whether to also compare automaton intersections
 */
public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception {
  if (leftTerms == null || rightTerms == null) {
    // If either side is missing, both must be.
    assertNull(leftTerms);
    assertNull(rightTerms);
    return;
  }
  assertTermsStatistics(leftTerms, rightTerms);

  // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different

  // Frequencies and positions are only compared when both sides have them.
  final boolean freqsOnBoth = leftTerms.hasFreqs() && rightTerms.hasFreqs();
  final boolean positionsOnBoth = leftTerms.hasPositions() && rightTerms.hasPositions();
  final TermsEnum leftAll = leftTerms.iterator();
  final TermsEnum rightAll = rightTerms.iterator();
  assertTermsEnum(leftAll, rightAll, true, freqsOnBoth, positionsOnBoth);

  assertTermsSeeking(leftTerms, rightTerms);

  if (!deep) {
    return;
  }

  final int rounds = atLeast(3);
  for (int round = 0; round < rounds; round++) {
    final String regexp = AutomatonTestUtil.randomRegexp(random());
    final CompiledAutomaton compiled = new CompiledAutomaton(new RegExp(regexp, RegExp.NONE).toAutomaton());
    if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
      // Only NORMAL automata are intersected here.
      continue;
    }
    // TODO: test start term too
    final TermsEnum leftFiltered = leftTerms.intersect(compiled, null);
    final TermsEnum rightFiltered = rightTerms.intersect(compiled, null);
    assertTermsEnum(leftFiltered, rightFiltered, rarely(), freqsOnBoth, positionsOnBoth);
  }
}

Source File: FSTTermsReader.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Sets up the enum state for intersecting with {@code compiled.runAutomaton},
 * optionally positioned relative to {@code startTerm}.
 *
 * @param compiled  compiled automaton; only its {@code runAutomaton} is read here
 * @param startTerm term to seek to via {@code doSeekCeil}, or {@code null} for none
 */
IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  super();
  //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
  this.fst = dict;
  this.fstReader = fst.getBytesReader();
  this.fstOutputs = dict.outputs;
  this.fsa = compiled.runAutomaton;
  this.level = -1;
  // Pre-allocate 16 frames so every slot of the stack is non-null.
  this.stack = new Frame[16];
  for (int i = 0 ; i < stack.length; i++) {
    this.stack[i] = new Frame();
  }

  // The virtual frame is loaded while level is still -1; the first frame is
  // pushed after level has been incremented.
  // NOTE(review): the exact roles of the virtual/first frames are defined by helpers not shown here.
  loadVirtualFrame(newFrame());
  this.level++;
  pushFrame(loadFirstFrame(newFrame()));

  this.meta = null;
  this.metaUpto = 1;
  this.decoded = false;
  this.pending = false;

  if (startTerm == null) {
    // Without a start term, pending reflects whether the top frame is accepting.
    pending = isAccept(topFrame());
  } else {
    doSeekCeil(startTerm);
    // pending is set only if the seek did not land exactly on startTerm and
    // the top frame is both valid and accepting.
    pending = (term == null || !startTerm.equals(term.get())) && isValid(topFrame()) && isAccept(topFrame());
  }
}

Source File: BlockTreeTermsReader.java From incubator-retired-blur with Apache License 2.0

5 votes

/**
 * Creates an {@code IntersectEnum} for a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return new IntersectEnum(compiled, startTerm);
  }
  // Every other automaton type is rejected.
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}

Source File: DirectPostingsFormat.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Creates a {@code DirectIntersectTermsEnum} for a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) {
  switch (compiled.type) {
    case NORMAL:
      return new DirectIntersectTermsEnum(compiled, startTerm);
    default:
      throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }
}

Source File: IntersectBlockReader.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Creates a block reader that intersects with the given compiled automaton.
 * All arguments except {@code compiled} and {@code startTerm} are forwarded
 * unchanged to the superclass constructor.
 *
 * @param compiled  compiled automaton; its {@code automaton}, {@code runAutomaton},
 *                  {@code finite} and {@code commonSuffixRef} fields are copied
 * @param startTerm stored as the initial {@code seekTerm}
 */
protected IntersectBlockReader(CompiledAutomaton compiled, BytesRef startTerm,
                               IndexDictionary.BrowserSupplier dictionaryBrowserSupplier, IndexInput blockInput,
                               PostingsReaderBase postingsReader, FieldMetadata fieldMetadata,
                               BlockDecoder blockDecoder) throws IOException {
  super(dictionaryBrowserSupplier, blockInput, postingsReader, fieldMetadata, blockDecoder);
  automaton = compiled.automaton;
  runAutomaton = compiled.runAutomaton;
  finite = compiled.finite;
  commonSuffix = compiled.commonSuffixRef;
  // NOTE(review): getMinTermLength() presumably reads the fields assigned above, so keep it after them — confirm.
  minTermLength = getMinTermLength();
  nextStringCalculator = new AutomatonNextTermCalculator(compiled);
  seekTerm = startTerm;
}

Source File: MultiTermIntervalsSource.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Creates a multi-term intervals source.
 *
 * @param automaton     compiled automaton stored on this source
 * @param maxExpansions expansion limit; must not exceed {@code IndexSearcher.getMaxClauseCount()}
 * @param pattern       pattern string stored on this source
 * @throws IllegalArgumentException if {@code maxExpansions} exceeds the max clause count
 */
MultiTermIntervalsSource(CompiledAutomaton automaton, int maxExpansions, String pattern) {
  // Reject an over-large limit before any field is assigned.
  if (maxExpansions > IndexSearcher.getMaxClauseCount()) {
    throw new IllegalArgumentException("maxExpansions [" + maxExpansions
        + "] cannot be greater than BooleanQuery.getMaxClauseCount [" + IndexSearcher.getMaxClauseCount() + "]");
  }
  this.pattern = pattern;
  this.maxExpansions = maxExpansions;
  this.automaton = automaton;
}

Source File: AssertingLeafReader.java From crate with Apache License 2.0

5 votes

/**
 * Forwards to the wrapped instance and returns the result wrapped in an
 * {@code AssertingTermsEnum}. Asserts that the delegate result is non-null
 * and that {@code bytes}, when given, is valid.
 */
@Override
public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws IOException {
    final TermsEnum wrapped = in.intersect(automaton, bytes);
    assert wrapped != null;
    assert bytes == null || bytes.isValid();
    final boolean freqsAvailable = hasFreqs();
    return new AssertingTermsEnum(wrapped, freqsAvailable);
}

Source File: OrdsFieldReader.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Creates an {@code OrdsIntersectTermsEnum} for a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  if (CompiledAutomaton.AUTOMATON_TYPE.NORMAL == compiled.type) {
    return new OrdsIntersectTermsEnum(this, compiled, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}

Source File: SecureAtomicReader.java From incubator-retired-blur with Apache License 2.0

4 votes

/**
 * Intersects the wrapped instance and wraps the resulting enum in a
 * {@code SecureTermsEnum} built with the access-control reader and max doc.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final TermsEnum unsecured = in.intersect(compiled, startTerm);
  return new SecureTermsEnum(unsecured, _accessControlReader, _maxDoc);
}

Source File: IncludeExclude.java From Elasticsearch with Apache License 2.0

4 votes

private AutomatonBackedOrdinalsFilter(Automaton automaton) {
    this.compiled = new CompiledAutomaton(automaton);
}

Source File: FilterSortedDocValues.java From lucene-solr with Apache License 2.0

4 votes

/** Forwards the call unchanged to the wrapped instance. */
@Override
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
  final TermsEnum delegated = in.intersect(automaton);
  return delegated;
}

Source File: SortingLeafReader.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Intersects the wrapped instance and wraps the resulting enum in a
 * {@code SortingTermsEnum} built with {@code docMap}, {@code indexOptions}
 * and {@code hasPositions()}.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final TermsEnum unsorted = in.intersect(compiled, startTerm);
  return new SortingTermsEnum(unsorted, docMap, indexOptions, hasPositions());
}

Source File: SecureAtomicReader.java From incubator-retired-blur with Apache License 2.0

4 votes

/**
 * Intersects both the read mask and the wrapped instance with the same
 * arguments and pairs the two enums in a {@code ReadMaskTermsEnum}.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // The read mask is intersected first, then the wrapped instance.
  final TermsEnum maskEnum = _readMask.intersect(compiled, startTerm);
  final TermsEnum dataEnum = in.intersect(compiled, startTerm);
  return new ReadMaskTermsEnum(maskEnum, dataEnum);
}

Source File: BlockTreeTermsReader.java From incubator-retired-blur with Apache License 2.0

4 votes

/**
 * Initialises the enum for intersecting with the given compiled automaton:
 * clones the input, pre-allocates frames and arcs, sets up the root frame,
 * and optionally seeks to {@code startTerm}.
 *
 * @param compiled  compiled automaton; stored and used for its {@code runAutomaton}
 * @param startTerm term to seek to via {@code seekToStartTerm}, or {@code null} for none
 */
public IntersectEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // if (DEBUG) {
  //   System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef));
  // }
  runAutomaton = compiled.runAutomaton;
  compiledAutomaton = compiled;
  // Work on a clone of the enclosing reader's input.
  in = BlockTreeTermsReader.this.in.clone();
  // Start with 5 pre-allocated frames, each constructed with its own index.
  stack = new Frame[5];
  for(int idx=0;idx<stack.length;idx++) {
    stack[idx] = new Frame(idx);
  }
  for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
    arcs[arcIdx] = new FST.Arc<BytesRef>();
  }

  if (index == null) {
    fstReader = null;
  } else {
    fstReader = index.getBytesReader();
  }

  // TODO: if the automaton is "smallish" we really
  // should use the terms index to seek at least to
  // the initial term and likely to subsequent terms
  // (or, maybe just fallback to ATE for such cases).
  // Else the seek cost of loading the frames will be
  // too costly.

  // NOTE(review): index is dereferenced here unconditionally although the
  // branch above allows index == null; looks like an NPE in that case — confirm.
  final FST.Arc<BytesRef> arc = index.getFirstArc(arcs[0]);
  // Empty string prefix must have an output in the index!
  assert arc.isFinal();

  // Special pushFrame since it's the first one:
  final Frame f = stack[0];
  f.fp = f.fpOrig = rootBlockFP;
  f.prefix = 0;
  f.setState(runAutomaton.getInitialState());
  f.arc = arc;
  f.outputPrefix = arc.output;
  f.load(rootCode);

  // for assert:
  assert setSavedStartTerm(startTerm);

  currentFrame = f;
  if (startTerm != null) {
    seekToStartTerm(startTerm);
  }
}

Source File: ExitableReader.java From incubator-retired-blur with Apache License 2.0

4 votes

/**
 * Intersects the wrapped terms and wraps the resulting enum in an
 * {@code ExitableTermsEnum} built with the exit object.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final TermsEnum inner = _terms.intersect(compiled, startTerm);
  return new ExitableTermsEnum(inner, _exitObject);
}

org.apache.lucene.util.automaton.CompiledAutomaton Java Examples