org.apache.lucene.util.automaton.CompiledAutomaton Java Examples
The following examples show how to use
org.apache.lucene.util.automaton.CompiledAutomaton.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SortedSetDocValues.java From lucene-solr with Apache License 2.0 | 6 votes |
/** * Returns a {@link TermsEnum} over the values, filtered by a {@link CompiledAutomaton} * The enum supports {@link TermsEnum#ord()}. */ public TermsEnum intersect(CompiledAutomaton automaton) throws IOException { TermsEnum in = termsEnum(); switch (automaton.type) { case NONE: return TermsEnum.EMPTY; case ALL: return in; case SINGLE: return new SingleTermsEnum(in, automaton.term); case NORMAL: return new AutomatonTermsEnum(in, automaton); default: // unreachable throw new RuntimeException("unhandled case"); } }
Example #2
Source File: MultiTerms.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Intersects each sub-terms instance with the compiled automaton and merges
 * the non-null results into one enum.
 *
 * @return a merged enum over the per-sub results, or {@link TermsEnum#EMPTY} when no sub produced one
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final List<MultiTermsEnum.TermsEnumIndex> perSub = new ArrayList<>();
  for (int sub = 0; sub < subs.length; sub++) {
    final TermsEnum subEnum = subs[sub].intersect(compiled, startTerm);
    if (subEnum == null) {
      continue;
    }
    // Remember which sub this enum came from.
    perSub.add(new MultiTermsEnum.TermsEnumIndex(subEnum, sub));
  }

  if (perSub.isEmpty()) {
    return TermsEnum.EMPTY;
  }
  return new MultiTermsEnum(subSlices).reset(perSub.toArray(MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY));
}
Example #3
Source File: SortedDocValues.java From lucene-solr with Apache License 2.0 | 6 votes |
/** * Returns a {@link TermsEnum} over the values, filtered by a {@link CompiledAutomaton} * The enum supports {@link TermsEnum#ord()}. */ public TermsEnum intersect(CompiledAutomaton automaton) throws IOException { TermsEnum in = termsEnum(); switch (automaton.type) { case NONE: return TermsEnum.EMPTY; case ALL: return in; case SINGLE: return new SingleTermsEnum(in, automaton.term); case NORMAL: return new AutomatonTermsEnum(in, automaton); default: // unreachable throw new RuntimeException("unhandled case"); } }
Example #4
Source File: TestIntervals.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Checks the intervals produced by a multiterm source built from the regexp
 * {@code p.*e}, and that an expansion limit of 1 is reported as exceeded.
 */
public void testMultiTerm() throws IOException {
  final RegExp regexp = new RegExp("p.*e");
  final IntervalsSource source =
      Intervals.multiterm(new CompiledAutomaton(regexp.toAutomaton()), regexp.toString());

  // One row per entry passed to checkIntervals, as in the original inline literal.
  final int[][] expected = {
      {},
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 7, 7 },
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 0, 0 }
  };
  checkIntervals(source, "field1", 5, expected);

  // With a limit of 1, pulling intervals from the leaves must fail.
  final IllegalStateException tooManyTerms = expectThrows(IllegalStateException.class, () -> {
    final IntervalsSource limited =
        Intervals.multiterm(new CompiledAutomaton(regexp.toAutomaton()), 1, regexp.toString());
    for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
      limited.intervals("field1", leaf);
    }
  });
  assertEquals("Automaton [\\p(.)*\\e] expanded to too many terms (limit 1)", tooManyTerms.getMessage());

  checkVisits(source, 1);
}
Example #5
Source File: TestTermsEnum.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Verifies that {@code Terms.intersect} rejects the compiled automaton built
 * here with an {@link IllegalArgumentException} carrying the expected message.
 */
public void testIntersectRegexp() throws Exception {
  final Directory dir = newDirectory();
  final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);

  // Index a single document with one string field.
  final Document document = new Document();
  document.add(newStringField("field", "foobar", Field.Store.NO));
  writer.addDocument(document);

  final IndexReader reader = writer.getReader();
  final Terms terms = MultiTerms.getTerms(reader, "field");
  final CompiledAutomaton automaton =
      new CompiledAutomaton(new RegExp("do_not_match_anything").toAutomaton());

  final IllegalArgumentException rejected =
      expectThrows(IllegalArgumentException.class, () -> terms.intersect(automaton, null));
  assertEquals("please use CompiledAutomaton.getTermsEnum instead", rejected.getMessage());

  reader.close();
  writer.close();
  dir.close();
}
Example #6
Source File: AutomatonTermsEnum.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * Construct an enumerator based upon an automaton, enumerating the specified * field, working on a supplied TermsEnum * * @lucene.experimental * @param compiled CompiledAutomaton */ public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) { super(tenum); if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); } this.finite = compiled.finite; this.runAutomaton = compiled.runAutomaton; assert this.runAutomaton != null; this.commonSuffixRef = compiled.commonSuffixRef; this.automaton = compiled.automaton; // No need to track visited states for a finite language without loops. visited = finite ? null : new short[runAutomaton.getSize()]; }
Example #7
Source File: Terms.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Returns a TermsEnum that iterates over all terms and * documents that are accepted by the provided {@link * CompiledAutomaton}. If the <code>startTerm</code> is * provided then the returned enum will only return terms * {@code > startTerm}, but you still must call * next() first to get to the first term. Note that the * provided <code>startTerm</code> must be accepted by * the automaton. * * <p>This is an expert low-level API and will only work * for {@code NORMAL} compiled automata. To handle any * compiled automata you should instead use * {@link CompiledAutomaton#getTermsEnum} instead. * * <p><b>NOTE</b>: the returned TermsEnum cannot seek</p>. */ public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) throws IOException { // TODO: could we factor out a common interface b/w // CompiledAutomaton and FST? Then we could pass FST there too, // and likely speed up resolving terms to deleted docs ... but // AutomatonTermsEnum makes this tricky because of its on-the-fly cycle // detection // TODO: eventually we could support seekCeil/Exact on // the returned enum, instead of only being able to seek // at the start TermsEnum termsEnum = iterator(); if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); } if (startTerm == null) { return new AutomatonTermsEnum(termsEnum, compiled); } else { return new AutomatonTermsEnum(termsEnum, compiled) { @Override protected BytesRef nextSeekTerm(BytesRef term) throws IOException { if (term == null) { term = startTerm; } return super.nextSeekTerm(term); } }; } }
Example #8
Source File: TermInSetQuery.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Builds one binary automaton per term in {@code termData}, unions them, and
 * returns the run automaton of the compiled union.
 */
private ByteRunAutomaton asByteRunAutomaton() {
  final TermIterator terms = termData.iterator();
  final List<Automaton> perTerm = new ArrayList<>();

  // The iterator signals exhaustion by returning null.
  BytesRef current;
  while ((current = terms.next()) != null) {
    perTerm.add(Automata.makeBinary(current));
  }

  final CompiledAutomaton union = new CompiledAutomaton(Operations.union(perTerm));
  return union.runAutomaton;
}
Example #9
Source File: FuzzyAutomatonBuilder.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Compiles the automaton produced by {@code levBuilder} for {@code maxEdits}
 * and {@code prefix}.
 *
 * @throws FuzzyTermsEnum.FuzzyTermsException wrapping a {@link TooComplexToDeterminizeException} raised during construction
 */
CompiledAutomaton buildMaxEditAutomaton() {
  final CompiledAutomaton maxEditAutomaton;
  try {
    maxEditAutomaton = new CompiledAutomaton(levBuilder.toAutomaton(maxEdits, prefix), true, false);
  } catch (TooComplexToDeterminizeException tooComplex) {
    // Rethrow with the term attached, keeping the original cause.
    throw new FuzzyTermsEnum.FuzzyTermsException(term, tooComplex);
  }
  return maxEditAutomaton;
}
Example #10
Source File: FuzzyAutomatonBuilder.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Compiles one automaton per edit distance from 0 through {@code maxEdits}
 * (inclusive); the array index is the edit distance.
 *
 * @throws FuzzyTermsEnum.FuzzyTermsException wrapping a {@link TooComplexToDeterminizeException} raised for any distance
 */
CompiledAutomaton[] buildAutomatonSet() {
  final CompiledAutomaton[] byEdits = new CompiledAutomaton[maxEdits + 1];
  try {
    for (int edits = 0; edits < byEdits.length; edits++) {
      byEdits[edits] = new CompiledAutomaton(levBuilder.toAutomaton(edits, prefix), true, false);
    }
  } catch (TooComplexToDeterminizeException tooComplex) {
    // The first failure aborts the whole build; the term is attached for context.
    throw new FuzzyTermsEnum.FuzzyTermsException(term, tooComplex);
  }
  return byEdits;
}
Example #11
Source File: FieldReader.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { // if (DEBUG) System.out.println(" FieldReader.intersect startTerm=" + BlockTreeTermsWriter.brToString(startTerm)); //System.out.println("intersect: " + compiled.type + " a=" + compiled.automaton); // TODO: we could push "it's a range" or "it's a prefix" down into IntersectTermsEnum? // can we optimize knowing that...? if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); } return new IntersectTermsEnum(this, compiled.automaton, compiled.runAutomaton, compiled.commonSuffixRef, startTerm); }
Example #12
Source File: LuceneTestCase.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * Terms api equivalency */ public void assertTermsEquals(String info, IndexReader leftReader, Terms leftTerms, Terms rightTerms, boolean deep) throws IOException { if (leftTerms == null || rightTerms == null) { assertNull(info, leftTerms); assertNull(info, rightTerms); return; } assertTermsStatisticsEquals(info, leftTerms, rightTerms); assertEquals("hasOffsets", leftTerms.hasOffsets(), rightTerms.hasOffsets()); assertEquals("hasPositions", leftTerms.hasPositions(), rightTerms.hasPositions()); assertEquals("hasPayloads", leftTerms.hasPayloads(), rightTerms.hasPayloads()); TermsEnum leftTermsEnum = leftTerms.iterator(); TermsEnum rightTermsEnum = rightTerms.iterator(); assertTermsEnumEquals(info, leftReader, leftTermsEnum, rightTermsEnum, true); assertTermsSeekingEquals(info, leftTerms, rightTerms); if (deep) { int numIntersections = atLeast(3); for (int i = 0; i < numIntersections; i++) { String re = AutomatonTestUtil.randomRegexp(random()); CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton()); if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { // TODO: test start term too TermsEnum leftIntersection = leftTerms.intersect(automaton, null); TermsEnum rightIntersection = rightTerms.intersect(automaton, null); assertTermsEnumEquals(info, leftReader, leftIntersection, rightIntersection, rarely()); } } } }
Example #13
Source File: TestTermsEnum.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Steps the run automaton of {@code c} over the bytes of {@code b}, starting
 * from state 0, and reports whether the final state is accepting.
 * Asserts before every step that the current state is not -1.
 */
private boolean accepts(CompiledAutomaton c, BytesRef b) {
  final byte[] bytes = b.bytes;
  final int base = b.offset;
  int state = 0;
  int consumed = 0;
  while (consumed < b.length) {
    assertTrue(state != -1);
    // Mask so the byte is passed on as an unsigned value.
    final int label = bytes[base + consumed] & 0xff;
    state = c.runAutomaton.step(state, label);
    consumed++;
  }
  return c.runAutomaton.isAccept(state);
}
Example #14
Source File: AssertingLeafReader.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Delegates to the wrapped instance, asserts the result is non-null and that
 * a non-null {@code bytes} is valid, then wraps the result in an
 * {@code AssertingTermsEnum}.
 */
@Override
public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws IOException {
  final TermsEnum delegate = in.intersect(automaton, bytes);
  assert delegate != null;
  assert bytes == null || bytes.isValid();
  final boolean withFreqs = hasFreqs();
  return new AssertingTermsEnum(delegate, withFreqs);
}
Example #15
Source File: FSTTermsReader.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Creates an {@code IntersectTermsEnum} for a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return new IntersectTermsEnum(compiled, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
Example #16
Source File: TestBlockPostingsFormat3.java From lucene-solr with Apache License 2.0 | 5 votes |
public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception { if (leftTerms == null || rightTerms == null) { assertNull(leftTerms); assertNull(rightTerms); return; } assertTermsStatistics(leftTerms, rightTerms); // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different boolean bothHaveFreqs = leftTerms.hasFreqs() && rightTerms.hasFreqs(); boolean bothHavePositions = leftTerms.hasPositions() && rightTerms.hasPositions(); TermsEnum leftTermsEnum = leftTerms.iterator(); TermsEnum rightTermsEnum = rightTerms.iterator(); assertTermsEnum(leftTermsEnum, rightTermsEnum, true, bothHaveFreqs, bothHavePositions); assertTermsSeeking(leftTerms, rightTerms); if (deep) { int numIntersections = atLeast(3); for (int i = 0; i < numIntersections; i++) { String re = AutomatonTestUtil.randomRegexp(random()); CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton()); if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { // TODO: test start term too TermsEnum leftIntersection = leftTerms.intersect(automaton, null); TermsEnum rightIntersection = rightTerms.intersect(automaton, null); assertTermsEnum(leftIntersection, rightIntersection, rarely(), bothHaveFreqs, bothHavePositions); } } } }
Example #17
Source File: FSTTermsReader.java From lucene-solr with Apache License 2.0 | 5 votes |
IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { super(); //if (TEST) System.out.println("Enum init, startTerm=" + startTerm); this.fst = dict; this.fstReader = fst.getBytesReader(); this.fstOutputs = dict.outputs; this.fsa = compiled.runAutomaton; this.level = -1; this.stack = new Frame[16]; for (int i = 0 ; i < stack.length; i++) { this.stack[i] = new Frame(); } loadVirtualFrame(newFrame()); this.level++; pushFrame(loadFirstFrame(newFrame())); this.meta = null; this.metaUpto = 1; this.decoded = false; this.pending = false; if (startTerm == null) { pending = isAccept(topFrame()); } else { doSeekCeil(startTerm); pending = (term == null || !startTerm.equals(term.get())) && isValid(topFrame()) && isAccept(topFrame()); } }
Example #18
Source File: BlockTreeTermsReader.java From incubator-retired-blur with Apache License 2.0 | 5 votes |
/**
 * Creates an {@code IntersectEnum} for a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return new IntersectEnum(compiled, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
Example #19
Source File: DirectPostingsFormat.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Creates a {@code DirectIntersectTermsEnum} for a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) {
  if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return new DirectIntersectTermsEnum(compiled, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
Example #20
Source File: IntersectBlockReader.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Passes the reader dependencies to the superclass, then copies the pieces of
 * {@code compiled} this reader uses and records {@code startTerm} as the seek term.
 */
protected IntersectBlockReader(
    CompiledAutomaton compiled,
    BytesRef startTerm,
    IndexDictionary.BrowserSupplier dictionaryBrowserSupplier,
    IndexInput blockInput,
    PostingsReaderBase postingsReader,
    FieldMetadata fieldMetadata,
    BlockDecoder blockDecoder) throws IOException {
  super(dictionaryBrowserSupplier, blockInput, postingsReader, fieldMetadata, blockDecoder);

  this.automaton = compiled.automaton;
  this.runAutomaton = compiled.runAutomaton;
  this.finite = compiled.finite;
  this.commonSuffix = compiled.commonSuffixRef;

  // Kept after the assignments above, as in the original ordering;
  // presumably it reads them, so confirm before reordering.
  this.minTermLength = getMinTermLength();
  this.nextStringCalculator = new AutomatonNextTermCalculator(compiled);
  this.seekTerm = startTerm;
}
Example #21
Source File: MultiTermIntervalsSource.java From lucene-solr with Apache License 2.0 | 5 votes |
MultiTermIntervalsSource(CompiledAutomaton automaton, int maxExpansions, String pattern) { this.automaton = automaton; if (maxExpansions > IndexSearcher.getMaxClauseCount()) { throw new IllegalArgumentException("maxExpansions [" + maxExpansions + "] cannot be greater than BooleanQuery.getMaxClauseCount [" + IndexSearcher.getMaxClauseCount() + "]"); } this.maxExpansions = maxExpansions; this.pattern = pattern; }
Example #22
Source File: AssertingLeafReader.java From crate with Apache License 2.0 | 5 votes |
/**
 * Forwards to the wrapped instance and returns the result wrapped in an
 * {@code AssertingTermsEnum}. Asserts that the result is non-null and that
 * {@code bytes}, when given, is valid.
 */
@Override
public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws IOException {
  final TermsEnum wrappedResult = in.intersect(automaton, bytes);
  assert wrappedResult != null;
  assert bytes == null || bytes.isValid();
  final boolean freqsAvailable = hasFreqs();
  return new AssertingTermsEnum(wrappedResult, freqsAvailable);
}
Example #23
Source File: OrdsFieldReader.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Creates an {@code OrdsIntersectTermsEnum} for a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return new OrdsIntersectTermsEnum(this, compiled, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
Example #24
Source File: SecureAtomicReader.java From incubator-retired-blur with Apache License 2.0 | 4 votes |
/**
 * Intersects via the wrapped instance and wraps the result in a
 * {@code SecureTermsEnum} built with this reader's access-control reader and max doc.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final TermsEnum unsecured = in.intersect(compiled, startTerm);
  return new SecureTermsEnum(unsecured, _accessControlReader, _maxDoc);
}
Example #25
Source File: IncludeExclude.java From Elasticsearch with Apache License 2.0 | 4 votes |
private AutomatonBackedOrdinalsFilter(Automaton automaton) { this.compiled = new CompiledAutomaton(automaton); }
Example #26
Source File: FilterSortedDocValues.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Forwards the intersection request unchanged to the wrapped instance. */
@Override
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
  final TermsEnum fromDelegate = in.intersect(automaton);
  return fromDelegate;
}
Example #27
Source File: SortingLeafReader.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Intersects via the wrapped instance and wraps the result in a
 * {@code SortingTermsEnum} built with this reader's doc map, index options
 * and positions flag.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final TermsEnum unsorted = in.intersect(compiled, startTerm);
  return new SortingTermsEnum(unsorted, docMap, indexOptions, hasPositions());
}
Example #28
Source File: SecureAtomicReader.java From incubator-retired-blur with Apache License 2.0 | 4 votes |
/**
 * Intersects both the read mask and the wrapped instance with the compiled
 * automaton, and combines the two enums in a {@code ReadMaskTermsEnum}.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // The mask is intersected first, then the wrapped terms, as in the original.
  final TermsEnum maskEnum = _readMask.intersect(compiled, startTerm);
  final TermsEnum dataEnum = in.intersect(compiled, startTerm);
  return new ReadMaskTermsEnum(maskEnum, dataEnum);
}
Example #29
Source File: BlockTreeTermsReader.java From incubator-retired-blur with Apache License 2.0 | 4 votes |
public IntersectEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { // if (DEBUG) { // System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef)); // } runAutomaton = compiled.runAutomaton; compiledAutomaton = compiled; in = BlockTreeTermsReader.this.in.clone(); stack = new Frame[5]; for(int idx=0;idx<stack.length;idx++) { stack[idx] = new Frame(idx); } for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) { arcs[arcIdx] = new FST.Arc<BytesRef>(); } if (index == null) { fstReader = null; } else { fstReader = index.getBytesReader(); } // TODO: if the automaton is "smallish" we really // should use the terms index to seek at least to // the initial term and likely to subsequent terms // (or, maybe just fallback to ATE for such cases). // Else the seek cost of loading the frames will be // too costly. final FST.Arc<BytesRef> arc = index.getFirstArc(arcs[0]); // Empty string prefix must have an output in the index! assert arc.isFinal(); // Special pushFrame since it's the first one: final Frame f = stack[0]; f.fp = f.fpOrig = rootBlockFP; f.prefix = 0; f.setState(runAutomaton.getInitialState()); f.arc = arc; f.outputPrefix = arc.output; f.load(rootCode); // for assert: assert setSavedStartTerm(startTerm); currentFrame = f; if (startTerm != null) { seekToStartTerm(startTerm); } }
Example #30
Source File: ExitableReader.java From incubator-retired-blur with Apache License 2.0 | 4 votes |
/**
 * Intersects via the wrapped terms and wraps the result in an
 * {@code ExitableTermsEnum} built with this reader's exit object.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final TermsEnum wrapped = _terms.intersect(compiled, startTerm);
  return new ExitableTermsEnum(wrapped, _exitObject);
}