org.apache.lucene.util.fst.Util Java Examples

Source File: DatawaveFieldIndexListIteratorJexl.java From datawave with Apache License 2.0

6 votes

/**
 * Builds an output-less FST from the given values.
 *
 * <p>Every value is converted with {@code Util.toUTF16} and added in the iteration order of the
 * supplied set.
 *
 * <p>NOTE(review): the input type is {@code BYTE1} while the labels are UTF-16 code units;
 * confirm that callers only pass values whose chars fit into a single byte.
 *
 * @param values the strings to add to the FST
 * @return whatever the underlying builder's {@code finish()} returns
 * @throws IOException if the underlying builder reports an I/O failure
 */
public static FST<?> getFST(SortedSet<String> values) throws IOException {
    // Builder options, listed explicitly with their default values.
    final FST.INPUT_TYPE labelType = FST.INPUT_TYPE.BYTE1;
    final int minSuffixCount1 = 0;
    final int minSuffixCount2 = 0;
    final boolean shareSuffixes = true;
    final boolean shareNonSingletonNodes = true;
    final int maxSharedTailLength = Integer.MAX_VALUE;
    final boolean arrayArcsAllowed = true;
    final int pageBits = 15;
    final Outputs<Object> noOutputs = NoOutputs.getSingleton();

    final org.apache.lucene.util.fst.Builder<Object> fstBuilder = new org.apache.lucene.util.fst.Builder<>(labelType, minSuffixCount1,
                    minSuffixCount2, shareSuffixes, shareNonSingletonNodes, maxSharedTailLength, noOutputs, arrayArcsAllowed, pageBits);

    // One scratch buffer is reused for the key of every value.
    final IntsRefBuilder scratch = new IntsRefBuilder();
    for (String value : values) {
        fstBuilder.add(Util.toUTF16(value, scratch), noOutputs.getNoOutput());
    }
    return fstBuilder.finish();
}

Source File: Dictionary.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Reads {@code num} conversion lines from {@code reader} and compiles them into an FST that
 * maps the second field of each line to its third field.
 *
 * @param reader source of the conversion lines; its current line number is reported in parse
 *     errors
 * @param num number of lines to consume
 * @return the compiled FST, keyed on the UTF-16 code units of each second field
 * @throws IOException if reading from {@code reader} or compiling the FST fails
 * @throws ParseException if the input ends before {@code num} lines were read, or a line does
 *     not consist of exactly three whitespace-separated fields
 * @throws IllegalStateException if the same second field appears on more than one line
 */
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
  // A TreeMap so that the entries are fed to the FST compiler in sorted key order below.
  Map<String,String> mappings = new TreeMap<>();

  for (int i = 0; i < num; i++) {
    String line = reader.readLine();
    if (line == null) {
      // readLine() signals end of input with null; report it instead of failing with an NPE.
      throw new ParseException("unexpected end of input: expected " + num + " conversion lines but found " + i,
          reader.getLineNumber());
    }
    String[] parts = line.split("\\s+");
    if (parts.length != 3) {
      throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
    }
    if (mappings.put(parts[1], parts[2]) != null) {
      throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
    }
  }

  Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
  FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String,String> entry : mappings.entrySet()) {
    Util.toUTF16(entry.getKey(), scratchInts);
    fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
  }

  return fstCompiler.compile();
}

Source File: NormalizeCharMap.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Compiles all pending mappings into an FST and wraps it in a new {@link NormalizeCharMap}.
 * Call this once you are done calling {@link #add}; the pending mappings are cleared after
 * the FST has been compiled.
 */
public NormalizeCharMap build() {
  try {
    final Outputs<CharsRef> charOutputs = CharSequenceOutputs.getSingleton();
    final FSTCompiler<CharsRef> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, charOutputs);
    final IntsRefBuilder keyScratch = new IntsRefBuilder();

    for (Map.Entry<String,String> pair : pendingPairs.entrySet()) {
      compiler.add(Util.toUTF16(pair.getKey(), keyScratch), new CharsRef(pair.getValue()));
    }

    final FST<CharsRef> compiled = compiler.compile();
    pendingPairs.clear();
    return new NormalizeCharMap(compiled);
  } catch (IOException ioe) {
    // Bogus FST IOExceptions!!  (will never happen)
    throw new RuntimeException(ioe);
  }
}

Source File: BooleanPerceptronClassifier.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Classifies {@code text} by summing the FST values of its analyzed tokens and comparing the
 * sum with {@code bias}.
 *
 * <p>Tokens for which the FST lookup returns {@code null} contribute nothing to the sum.
 *
 * @param text the text to classify
 * @return a result whose class is {@code true} when the summed value is at least {@code bias},
 *     with a score computed as {@code 1 - exp(-|bias - sum| / bias)}
 * @throws IOException if the token stream or the FST lookup fails
 */
@Override
public ClassificationResult<Boolean> assignClass(String text)
        throws IOException {
  // Primitive accumulator: avoids unboxing and re-boxing a Long for every matching token.
  long output = 0L;
  try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, text)) {
    CharTermAttribute charTermAttribute = tokenStream
            .addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      String s = charTermAttribute.toString();
      Long d = Util.get(fst, new BytesRef(s));
      if (d != null) {
        output += d;
      }
    }
    tokenStream.end();
  }

  double score = 1 - Math.exp(-1 * Math.abs(bias - (double) output) / bias);
  return new ClassificationResult<>(output >= bias, score);
}

Source File: NRTSuggesterBuilder.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Writes all the entries for the FST input term.
 *
 * <p>Extends {@code analyzed} by one trailing byte, then, for each pending entry, stores a
 * running arc counter in that trailing byte and adds the resulting key to the FST compiler
 * together with the entry's (weight, payload) pair. Afterwards it records the largest number
 * of entries seen for a single term and clears {@code entries}.
 *
 * @throws IOException if adding to the FST compiler fails
 */
public void finishTerm() throws IOException {
  int numArcs = 0;
  int numDedupBytes = 1;
  // Make room for one extra trailing byte that will hold the per-entry counter.
  analyzed.grow(analyzed.length() + 1);
  analyzed.setLength(analyzed.length() + 1);
  for (Entry entry : entries) {
    // The counter has reached the limit for the current number of dedup bytes: freeze the
    // current last byte at that value, append one more byte and restart the counter.
    if (numArcs == maxNumArcsForDedupByte(numDedupBytes)) {
      analyzed.setByteAt(analyzed.length() - 1, (byte) (numArcs));
      analyzed.grow(analyzed.length() + 1);
      analyzed.setLength(analyzed.length() + 1);
      numArcs = 0;
      numDedupBytes++;
    }
    // Overwrite the last byte with the current counter value, then advance the counter.
    analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++);
    Util.toIntsRef(analyzed.get(), scratchInts);
    fstCompiler.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
  }
  maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size());
  entries.clear();
}

Source File: FiniteStringsIteratorTest.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Unions two random 50000-char strings and checks that enumerating the finite strings of the
 * resulting automaton yields exactly two results, each matching one of the inputs.
 */
public void testFiniteStringsEatsStack() {
  final char[] buffer = new char[50000];
  TestUtil.randomFixedLengthUnicodeString(random(), buffer, 0, buffer.length);
  final String first = new String(buffer);
  TestUtil.randomFixedLengthUnicodeString(random(), buffer, 0, buffer.length);
  final String second = new String(buffer);

  final Automaton union = Operations.union(Automata.makeString(first), Automata.makeString(second));
  final FiniteStringsIterator strings = new FiniteStringsIterator(union);
  final List<IntsRef> found = getFiniteStrings(strings);
  assertEquals(2, found.size());

  // The same scratch builder is reused for both expected strings.
  final IntsRefBuilder scratch = new IntsRefBuilder();
  for (String expected : new String[] {first, second}) {
    Util.toUTF32(expected.toCharArray(), 0, expected.length(), scratch);
    assertTrue(found.contains(scratch.get()));
  }
}

Source File: XAnalyzingSuggester.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Adds every collected surface form of the current analyzed term to the FST builder.
 *
 * <p>Sorts the first {@code count} collected surface forms, appends a zero byte plus one
 * further byte to {@code analyzed}, and then, for each surface form, writes a running
 * counter into that last byte and adds the resulting key with a (cost, payload) pair.
 * Finally clears {@code seenSurfaceForms} and resets {@code count}.
 *
 * @param defaultWeight weight used for surface forms whose own weight is {@code -1}; it is
 *     capped at {@code Integer.MAX_VALUE} and passed through {@code encodeWeight}
 * @throws IOException if adding to the builder fails
 */
public void finishTerm(long defaultWeight) throws IOException {
    ArrayUtil.timSort(surfaceFormsAndPayload, 0, count);
    int deduplicator = 0;
    // Append a zero byte, then extend by one more byte that will hold the counter below.
    analyzed.append((byte) 0);
    analyzed.setLength(analyzed.length() + 1);
    analyzed.grow(analyzed.length());
    for (int i = 0; i < count; i++) {
        // Overwrite the last byte with the counter; note the value is narrowed to a byte.
        analyzed.setByteAt(analyzed.length() - 1, (byte) deduplicator++);
        Util.toIntsRef(analyzed.get(), scratchInts);
        SurfaceFormAndPayload candiate = surfaceFormsAndPayload[i];
        // A weight of -1 selects the (capped, encoded) default weight instead.
        long cost = candiate.weight == -1 ? encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight)) : candiate.weight;
        builder.add(scratchInts.get(), outputs.newPair(cost, candiate.payload));
    }
    seenSurfaceForms.clear();
    count = 0;
}

Source File: TrieBuilder.java From ambiverse-nlu with Apache License 2.0

6 votes

/**
 * Builds an FST over the given strings, assigning each one a value taken from a running
 * counter that starts at 0.
 *
 * <p>Strings are added in the iteration order of {@code sortedStrings}. A string whose
 * insertion raises an {@link AssertionError} is logged at debug level and left out.
 *
 * @param sortedStrings the strings to add
 * @return whatever the underlying builder's {@code finish()} returns
 * @throws IOException if the underlying builder reports an I/O failure
 */
public static FST<Long> buildTrie(Set<String> sortedStrings) throws IOException {
  final PositiveIntOutputs longOutputs = PositiveIntOutputs.getSingleton();
  final Builder<Long> trie = new Builder<>(FST.INPUT_TYPE.BYTE1, longOutputs);
  final BytesRefBuilder utf8Scratch = new BytesRefBuilder();
  final IntsRefBuilder keyScratch = new IntsRefBuilder();

  long nextId = 0;
  for (String mention : sortedStrings) {
    utf8Scratch.copyChars(mention);
    try {
      trie.add(Util.toIntsRef(utf8Scratch.get(), keyScratch), nextId++);
    } catch (java.lang.AssertionError ae) {
      logger.debug("Assertion error for mention " + mention);
    }
  }
  return trie.finish();
}

Source File: TestAutomaton.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Randomized check of {@code makeBinaryInterval}: for random min/max terms and inclusiveness
 * flags, verifies that the resulting automaton accepts a random term exactly when the term
 * lies inside the interval according to {@code BytesRef.compareTo}.
 */
public void testMakeBinaryIntervalRandom() throws Exception {
  int iters = atLeast(100);
  for(int iter=0;iter<iters;iter++) {
    BytesRef minTerm = TestUtil.randomBinaryTerm(random());
    boolean minInclusive = random().nextBoolean();
    BytesRef maxTerm = TestUtil.randomBinaryTerm(random());
    boolean maxInclusive = random().nextBoolean();

    Automaton a = makeBinaryInterval(minTerm, minInclusive, maxTerm, maxInclusive);

    // Probe the same automaton with 500 random terms.
    for(int iter2=0;iter2<500;iter2++) {
      BytesRef term = TestUtil.randomBinaryTerm(random());
      int minCmp = minTerm.compareTo(term);
      int maxCmp = maxTerm.compareTo(term);

      // Compute the expected verdict independently of the automaton.
      boolean expected;
      if (minCmp > 0 || maxCmp < 0) {
        // term is below the minimum or above the maximum
        expected = false;
      } else if (minCmp == 0 && maxCmp == 0) {
        // term equals both bounds: both ends must be inclusive
        expected = minInclusive && maxInclusive;
      } else if (minCmp == 0) {
        expected = minInclusive;
      } else if (maxCmp == 0) {
        expected = maxInclusive;
      } else {
        // strictly between the bounds
        expected = true;
      }

      if (VERBOSE) {
        System.out.println("  check term=" + term + " expected=" + expected);
      }
      IntsRefBuilder intsBuilder = new IntsRefBuilder();
      Util.toIntsRef(term, intsBuilder);
      assertEquals(expected, Operations.run(a, intsBuilder.toIntsRef()));
    }
  }
}

Source File: TestAutomaton.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Asserts that the finite strings of the determinized form of {@code a} equal the given
 * strings, each converted with {@code Util.toUTF32}.
 */
private void assertMatches(Automaton a, String... strings) {
  final Set<IntsRef> wanted = new HashSet<>();
  for (int i = 0; i < strings.length; i++) {
    wanted.add(Util.toUTF32(strings[i], new IntsRefBuilder()));
  }

  final Automaton deterministic = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
  assertEquals(wanted, TestOperations.getFiniteStrings(deterministic));
}

Source File: FiniteStringsIteratorTest.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Checks that the minimized union of "x" and "xy" enumerates exactly two finite strings and
 * that both inputs are among them.
 */
public void testShortAccept() {
  Automaton automaton = Operations.union(Automata.makeString("x"), Automata.makeString("xy"));
  automaton = MinimizationOperations.minimize(automaton, DEFAULT_MAX_DETERMINIZED_STATES);
  final FiniteStringsIterator strings = new FiniteStringsIterator(automaton);
  final List<IntsRef> found = getFiniteStrings(strings);
  assertEquals(2, found.size());

  for (String expected : new String[] {"x", "xy"}) {
    final IntsRefBuilder ints = new IntsRefBuilder();
    Util.toIntsRef(new BytesRef(expected), ints);
    assertTrue(found.contains(ints.get()));
  }
}

Source File: FiniteStringsIteratorTest.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Checks that an automaton built from the single string "foobar" enumerates exactly that one
 * finite string.
 */
public void testSingletonNoLimit() {
  final String only = "foobar";
  final FiniteStringsIterator strings = new FiniteStringsIterator(Automata.makeString(only));
  final List<IntsRef> found = getFiniteStrings(strings);
  assertEquals(1, found.size());

  final char[] chars = only.toCharArray();
  final IntsRefBuilder codePoints = new IntsRefBuilder();
  Util.toUTF32(chars, 0, chars.length, codePoints);
  assertTrue(found.contains(codePoints.get()));
}

Source File: TestUTF32ToUTF8.java From lucene-solr with Apache License 2.0

5 votes

/**
 * For a number of random strings, converts the single-string automaton with
 * {@code UTF32ToUTF8} and checks that its finite strings are exactly the UTF-8 bytes of the
 * original string.
 */
public void testSingleton() throws Exception {
  for (int remaining = atLeast(100); remaining > 0; remaining--) {
    final String text = TestUtil.randomRealisticUnicodeString(random());
    final Automaton utf32 = Automata.makeString(text);
    final Automaton utf8 = new UTF32ToUTF8().convert(utf32);

    final IntsRefBuilder utf8Units = new IntsRefBuilder();
    Util.toIntsRef(new BytesRef(text), utf8Units);
    final Set<IntsRef> expected = new HashSet<>();
    expected.add(utf8Units.get());

    assertEquals(expected, TestOperations.getFiniteStrings(utf8));
  }
}

Source File: FiniteStringsIteratorTest.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Basic test for getFiniteStrings: the minimized union of "dog" and "duck" must enumerate
 * exactly those two strings.
 */
public void testFiniteStringsBasic() {
  Automaton automaton = Operations.union(Automata.makeString("dog"), Automata.makeString("duck"));
  automaton = MinimizationOperations.minimize(automaton, DEFAULT_MAX_DETERMINIZED_STATES);
  final FiniteStringsIterator strings = new FiniteStringsIterator(automaton);
  final List<IntsRef> found = getFiniteStrings(strings);
  assertFiniteStringsRecursive(automaton, found);
  assertEquals(2, found.size());

  for (String expected : new String[] {"dog", "duck"}) {
    final IntsRefBuilder ints = new IntsRefBuilder();
    Util.toIntsRef(new BytesRef(expected), ints);
    assertTrue(found.contains(ints.get()));
  }
}

Source File: TestGraphTokenizers.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Returns all paths of the automaton as strings, with every
 * {@code TokenStreamToAutomaton.POS_SEP} character replaced by a space.
 */
private Set<String> toPathStrings(Automaton a) {
  final char separator = (char) TokenStreamToAutomaton.POS_SEP;
  final BytesRefBuilder utf8Scratch = new BytesRefBuilder();
  final Set<String> result = new HashSet<>();
  for (IntsRef path : AutomatonTestUtil.getFiniteStringsRecursive(a, -1)) {
    final String decoded = Util.toBytesRef(path, utf8Scratch).utf8ToString();
    result.add(decoded.replace(separator, ' '));
  }
  return result;
}

Source File: BlockTreeTermsWriter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Adds every (input, output) pair enumerated from {@code subIndex} to {@code fstCompiler},
 * in enumeration order.
 *
 * @param fstCompiler compiler receiving the entries
 * @param subIndex FST whose entries are copied
 * @param scratchIntsRef scratch buffer reused for each converted input
 * @throws IOException if enumerating or adding fails
 */
private void append(FSTCompiler<BytesRef> fstCompiler, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
  final BytesRefFSTEnum<BytesRef> entries = new BytesRefFSTEnum<>(subIndex);
  for (BytesRefFSTEnum.InputOutput<BytesRef> entry = entries.next(); entry != null; entry = entries.next()) {
    fstCompiler.add(Util.toIntsRef(entry.input, scratchIntsRef), entry.output);
  }
}

Source File: BaseTokenStreamTestCase.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Returns all paths accepted by the token stream graph produced by the already initialized
 * {@link TokenStream}, with every {@code TokenStreamToAutomaton.POS_SEP} character replaced
 * by a space.
 */
public static Set<String> getGraphStrings(TokenStream tokenStream) throws IOException {
  final Automaton graph = new TokenStreamToAutomaton().toAutomaton(tokenStream);
  final Set<IntsRef> rawPaths = AutomatonTestUtil.getFiniteStringsRecursive(graph, -1);

  final char separator = (char) TokenStreamToAutomaton.POS_SEP;
  final BytesRefBuilder utf8Scratch = new BytesRefBuilder();
  final Set<String> result = new HashSet<>();
  for (IntsRef path : rawPaths) {
    final String decoded = Util.toBytesRef(path, utf8Scratch).utf8ToString();
    result.add(decoded.replace(separator, ' '));
  }
  return result;
}

Source File: DatawaveArithmetic.java From datawave with Apache License 2.0

5 votes

/**
 * Tests whether looking up the string form of {@code object} in {@code fst} yields a
 * non-null output. The lookup runs while holding the monitor of {@code fst}.
 *
 * @param object value whose {@code toString()} is used as the lookup key
 * @param fst the FST to query
 * @return {@code true} if {@code Util.get} returns a non-null output for the key
 * @throws IOException if the lookup fails
 */
public static boolean matchesFst(Object object, FST fst) throws IOException {
    final IntsRefBuilder scratch = new IntsRefBuilder();
    final IntsRef key = Util.toUTF16(object.toString(), scratch);
    final boolean found;
    synchronized (fst) {
        found = Util.get(fst, key) != null;
    }
    return found;
}

Source File: BooleanPerceptronClassifier.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Rebuilds the {@code fst} field from the given weights.
 *
 * <p>Entries are added in the map's iteration order; each weight is converted with
 * {@code Double.longValue()} before being stored.
 *
 * @param weights term-to-weight mapping to compile
 * @throws IOException if adding to or compiling the FST fails
 */
private void updateFST(SortedMap<String, Double> weights) throws IOException {
  final PositiveIntOutputs longOutputs = PositiveIntOutputs.getSingleton();
  final FSTCompiler<Long> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, longOutputs);
  final BytesRefBuilder utf8Scratch = new BytesRefBuilder();
  final IntsRefBuilder keyScratch = new IntsRefBuilder();

  for (Map.Entry<String, Double> weight : weights.entrySet()) {
    utf8Scratch.copyChars(weight.getKey());
    compiler.add(Util.toIntsRef(utf8Scratch.get(), keyScratch), weight.getValue().longValue());
  }

  fst = compiler.compile();
}

Source File: LimitedFiniteStringsIteratorTest.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Checks that a {@code LimitedFiniteStringsIterator} with limit 1 over the single-string
 * automaton "foobar" yields exactly that string.
 */
public void testSingleton() {
  final String only = "foobar";
  final Automaton automaton = Automata.makeString(only);
  final List<IntsRef> found = getFiniteStrings(new LimitedFiniteStringsIterator(automaton, 1));
  assertEquals(1, found.size());

  final char[] chars = only.toCharArray();
  final IntsRefBuilder codePoints = new IntsRefBuilder();
  Util.toUTF32(chars, 0, chars.length, codePoints);
  assertTrue(found.contains(codePoints.get()));
}

Source File: BooleanPerceptronClassifier.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Updates {@code weights} for every term in the term vector of document {@code docId} and
 * optionally rebuilds the FST from the updated weights.
 *
 * @param indexReader reader used to fetch the document's term vector
 * @param docId id of the document whose terms are processed
 * @param assignedClass only checked for {@code null} here; when {@code null} no weight is
 *     changed
 * @param weights map updated in place, keyed by the term's UTF-8 decoded string
 * @param modifier factor multiplied with the term frequency and added to the previous weight
 * @param updateFST whether to call {@code updateFST(weights)} after processing all terms
 * @throws IOException if no term vector is stored for {@code textFieldName}, or on I/O failure
 */
private void updateWeights(IndexReader indexReader,
                           int docId, Boolean assignedClass, SortedMap<String, Double> weights,
                           double modifier, boolean updateFST) throws IOException {
  TermsEnum cte = textTerms.iterator();

  // get the doc term vectors
  Terms terms = indexReader.getTermVector(docId, textFieldName);

  if (terms == null) {
    throw new IOException("term vectors must be stored for field "
            + textFieldName);
  }

  TermsEnum termsEnum = terms.iterator();

  BytesRef term;

  while ((term = termsEnum.next()) != null) {
    // NOTE(review): the result of seekExact is ignored and cte is not read again in this
    // method — confirm whether this positioning is still needed.
    cte.seekExact(term);
    if (assignedClass != null) {
      long termFreqLocal = termsEnum.totalTermFreq();
      // update weights
      Long previousValue = Util.get(fst, term);
      String termString = term.utf8ToString();
      // Terms without a previous FST value get weight 0; otherwise the adjusted weight is
      // clamped so that it never drops below 0.
      weights.put(termString, previousValue == null ? 0 : Math.max(0, previousValue + modifier * termFreqLocal));
    }
  }
  if (updateFST) {
    updateFST(weights);
  }
}

Source File: CompletionTokenStream.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Emits one token per finite string derived from the input.
 *
 * <p>On the first call the finite strings are computed via {@code toFiniteStrings}; each
 * later call emits the next one until the iterator is exhausted.
 *
 * @return {@code true} if a token was emitted, {@code false} once all finite strings have
 *     been consumed
 * @throws IllegalArgumentException if the input expands to more than {@code MAX_PATHS}
 *     finite strings
 * @throws IOException if computing the finite strings fails
 */
@Override
public boolean incrementToken() throws IOException {
    clearAttributes();
    // Lazily compute the finite strings on the first call.
    if (finiteStrings == null) {
        Set<IntsRef> strings = toFiniteStrings.toFiniteStrings(input);

        if (strings.size() > MAX_PATHS) {
            throw new IllegalArgumentException("TokenStream expanded to " + strings.size() + " finite strings. Only <= " + MAX_PATHS
                    + " finite strings are supported");
        }
        // The first emitted token carries the total number of paths as its position increment.
        posInc = strings.size();
        finiteStrings = strings.iterator();
    }
    if (finiteStrings.hasNext()) {
        posAttr.setPositionIncrement(posInc);
        /*
         * this posInc encodes the number of paths that this surface form
         * produced. Multi Fields have the same surface form and therefore sum up
         */
        posInc = 0;
        Util.toBytesRef(finiteStrings.next(), bytesAtt.builder()); // now we have UTF-8
        // Mirror the bytes into the char term attribute when one is present.
        if (charTermAttribute != null) {
            charTermAttribute.setLength(0);
            charTermAttribute.append(bytesAtt.toUTF16());
        }
        if (payload != null) {
            payloadAttr.setPayload(this.payload);
        }
        return true;
    }

    return false;
}

Source File: ConcatenateGraphFilter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Emits one token per finite string of the automaton built from the input token stream.
 *
 * <p>On the first call after {@code reset()} the input is consumed into an automaton and a
 * {@code LimitedFiniteStringsIterator} bounded by {@code maxGraphExpansions} is created; every
 * call then emits the next string. All tokens get offsets {@code (0, endOffset)}, and every
 * token after the first gets a position increment of 0.
 *
 * @return {@code true} if a token was emitted, {@code false} when the iterator returns
 *     {@code null}
 * @throws IllegalStateException if {@code reset()} was not called before the first invocation
 * @throws IOException if consuming the input token stream fails
 */
@Override
public boolean incrementToken() throws IOException {
  if (finiteStrings == null) {
    if (wasReset == false) {
      throw new IllegalStateException("reset() missing before incrementToken");
    }
    // lazy init/consume
    Automaton automaton = toAutomaton(); // calls reset(), incrementToken() repeatedly, and end() on inputTokenStream
    finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
    //note: would be nice to know the startOffset but toAutomaton doesn't capture it.  We'll assume 0
    endOffset = inputTokenStream.getAttribute(OffsetAttribute.class).endOffset();
  }

  IntsRef string = finiteStrings.next();
  if (string == null) {
    return false;
  }

  // Attributes are only cleared once we know there is a token to emit.
  clearAttributes();

  if (finiteStrings.size() > 1) { // if number of iterated strings so far is more than one...
    posIncrAtt.setPositionIncrement(0); // stacked
  }

  offsetAtt.setOffset(0, endOffset);

  Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8
  // Mirror the bytes into the char term attribute when one is present.
  if (charTermAttribute != null) {
    charTermAttribute.setLength(0);
    charTermAttribute.append(bytesAtt.toUTF16());
  }

  return true;
}

Source File: Dictionary.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Compiles the given affix table into an FST that maps each key, as UTF-32 code points, to
 * the sequence of integers stored for it.
 *
 * @param affixes sorted mapping from affix key to its list of integer ids
 * @return the compiled FST
 * @throws IOException if adding to or compiling the FST fails
 */
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
  final IntSequenceOutputs sequenceOutputs = IntSequenceOutputs.getSingleton();
  final FSTCompiler<IntsRef> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, sequenceOutputs);
  final IntsRefBuilder keyScratch = new IntsRefBuilder();

  for (Map.Entry<String,List<Integer>> affix : affixes.entrySet()) {
    Util.toUTF32(affix.getKey(), keyScratch);

    // Copy the boxed ids into a fresh IntsRef sized to hold all of them.
    final List<Integer> ids = affix.getValue();
    final IntsRef packed = new IntsRef(ids.size());
    int filled = 0;
    for (Integer id : ids) {
      packed.ints[filled++] = id;
    }
    packed.length = filled;

    compiler.add(keyScratch.get(), packed);
  }
  return compiler.compile();
}

Source File: VersionBlockTreeTermsWriter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Adds every (input, output) pair enumerated from {@code subIndex} to {@code fstCompiler},
 * in enumeration order.
 *
 * @param fstCompiler compiler receiving the entries
 * @param subIndex FST whose entries are copied
 * @param scratchIntsRef scratch buffer reused for each converted input
 * @throws IOException if enumerating or adding fails
 */
private void append(FSTCompiler<Pair<BytesRef,Long>> fstCompiler, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
  final BytesRefFSTEnum<Pair<BytesRef,Long>> entries = new BytesRefFSTEnum<>(subIndex);
  for (BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> entry = entries.next(); entry != null; entry = entries.next()) {
    fstCompiler.add(Util.toIntsRef(entry.input, scratchIntsRef), entry.output);
  }
}

Source File: ContextQuery.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Renders the configured contexts followed by the inner query.
 *
 * <p>When at least one context exists the output starts with {@code contexts:[...],}; each
 * context is followed by {@code *} when it is not exact and by {@code ^boost} when its boost
 * is non-zero.
 *
 * @param field field name forwarded to the inner query's {@code toString}
 */
@Override
public String toString(String field) {
  final StringBuilder sb = new StringBuilder();
  final BytesRefBuilder utf8Scratch = new BytesRefBuilder();
  for (Map.Entry<IntsRef, ContextMetaData> context : contexts.entrySet()) {
    // The first entry opens the list; every later entry is preceded by a comma.
    sb.append(sb.length() == 0 ? "contexts:[" : ",");
    sb.append(Util.toBytesRef(context.getKey(), utf8Scratch).utf8ToString());

    final ContextMetaData meta = context.getValue();
    if (!meta.exact) {
      sb.append("*");
    }
    if (meta.boost != 0) {
      sb.append("^").append(Float.toString(meta.boost));
    }
  }
  if (sb.length() != 0) {
    sb.append("],");
  }
  return sb.append(innerQuery.toString(field)).toString();
}

Source File: ContextQuery.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Adds a context with boost; set <code>exact</code> to false
 * if the context is a prefix of any indexed contexts.
 *
 * @param context the context value; must not contain
 *     {@code ContextSuggestField.CONTEXT_SEPARATOR}
 * @param boost boost for this context; must not be negative
 * @param exact whether the context must match exactly
 * @throws IllegalArgumentException if {@code boost} is negative or {@code context} contains
 *     the reserved separator character
 */
public void addContext(CharSequence context, float boost, boolean exact) {
  if (boost < 0f) {
    throw new IllegalArgumentException("'boost' must be >= 0");
  }
  final int length = context.length();
  for (int pos = 0; pos < length; pos++) {
    final char current = context.charAt(pos);
    if (current == ContextSuggestField.CONTEXT_SEPARATOR) {
      throw new IllegalArgumentException("Illegal value [" + context + "] UTF-16 codepoint [0x"
          + Integer.toHexString(current) + "] at position " + pos + " is a reserved character");
    }
  }

  // Store a deep copy of the key, since the shared scratch builder is reused across calls.
  final BytesRef utf8 = new BytesRef(context);
  final IntsRef key = IntsRef.deepCopyOf(Util.toIntsRef(utf8, scratch));
  contexts.put(key, new ContextMetaData(boost, exact));
  updateRamBytesUsed();
}

Source File: FSTTermsWriter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Records one finished term in the FST together with its metadata.
 *
 * @param text the term bytes, used as the FST input
 * @param state supplies {@code docFreq} and {@code totalTermFreq} and is passed to the
 *     postings writer for encoding
 * @throws IOException if encoding the term or adding it to the FST fails
 */
public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
  // write term meta data into fst
  final FSTTermOutputs.TermData meta = new FSTTermOutputs.TermData();
  meta.bytes = null;
  meta.docFreq = state.docFreq;
  meta.totalTermFreq = state.totalTermFreq;
  // The postings writer encodes into metaWriter; bytes stays null when nothing was written.
  postingsWriter.encodeTerm(metaWriter, fieldInfo, state, true);
  if (metaWriter.size() > 0) {
    meta.bytes = metaWriter.toArrayCopy();
    // Reset the shared writer so that the next term starts from an empty buffer.
    metaWriter.reset();
  }
  fstCompiler.add(Util.toIntsRef(text, scratchTerm), meta);
  numTerms++;
}

Source File: BaseSynonymParserTestCase.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Validates the synonyms stored in a parsed synonym map for one word.
 *
 * @param synonynMap  the synonym map generated by parsing
 * @param word        word (phrase) whose synonyms are validated, as it comes out of the analyzer;
 *                    every space is replaced by the word separator before lookup
 * @param includeOrig whether the entry is expected to keep the original
 * @param synonyms    the expected synonyms; word separators in the stored synonyms are replaced
 *                    with a single space before comparison
 */
public static void assertEntryEquals(SynonymMap synonynMap, String word, boolean includeOrig, String[] synonyms)
    throws Exception {
  final String key = word.replace(' ', SynonymMap.WORD_SEPARATOR);
  final IntsRefBuilder keyInts = new IntsRefBuilder();
  final BytesRef encoded = Util.get(synonynMap.fst, Util.toUTF32(new CharsRef(key), keyInts));
  assertNotNull("No synonyms found for: " + key, encoded);

  // The entry starts with a vInt header: lowest bit is the keep-original flag (0 = keep),
  // the remaining bits are the number of synonyms.
  final ByteArrayDataInput in = new ByteArrayDataInput(encoded.bytes, encoded.offset, encoded.length);
  final int header = in.readVInt();

  final boolean keepsOriginal = (header & 0x1) == 0;
  assertEquals("Include original different than expected. Expected " + includeOrig + " was " + keepsOriginal,
      includeOrig, keepsOriginal);

  final int synonymCount = header >>> 1;
  assertEquals("Invalid synonym count. Expected " + synonyms.length + " was " + synonymCount,
      synonyms.length, synonymCount);

  final Set<String> allowed = new HashSet<>(Arrays.asList(synonyms));
  final BytesRef wordBytes = new BytesRef();
  for (int read = 0; read < synonymCount; read++) {
    // Each following vInt is an id that is resolved through the map's word table.
    synonynMap.words.get(in.readVInt(), wordBytes);
    final String found = wordBytes.utf8ToString().replace(SynonymMap.WORD_SEPARATOR, ' ');
    assertTrue("Unexpected synonym found: " + found, allowed.contains(found));
  }
}

Source File: FSTTermsReader.java From lucene-solr with Apache License 2.0

5 votes

/** Load frame for target arc(node) on fst, so that 
 *  arc.label &gt;= label and !fsa.reject(arc.label)
 *
 *  @param label the label the arc is looked up for via {@code Util.readCeilArc}
 *  @param top the parent frame; its arc and automaton state are the starting point
 *  @param frame the frame to fill in
 *  @return {@code frame} with its state and output set, the result of
 *          {@code loadNextFrame(top, frame)} when the automaton step yields -1, or
 *          {@code null} when {@code Util.readCeilArc} returns {@code null}
 */
Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException {
  FST.Arc<FSTTermOutputs.TermData> arc = frame.fstArc;
  arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader);
  if (arc == null) {
    return null;
  }
  frame.fsaState = fsa.step(top.fsaState, arc.label());
  //if (TEST) System.out.println(" loadCeil frame="+frame);
  // A state of -1 means the automaton has no transition for this label: try the next arc.
  if (frame.fsaState == -1) {
    return loadNextFrame(top, frame);
  }
  // NOTE(review): the output is read from frame.fstArc rather than the local arc — presumably
  // readCeilArc fills in and returns the arc it was given; confirm.
  frame.output = frame.fstArc.output();
  return frame;
}

org.apache.lucene.util.fst.Util Java Examples