org.apache.lucene.util.fst.Builder Java Examples
The following examples show how to use org.apache.lucene.util.fst.Builder.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TrieBuilder.java From ambiverse-nlu with Apache License 2.0 | 6 votes |
/**
 * Builds an FST whose keys are the UTF-8 encoded strings of {@code sortedStrings} and whose
 * outputs are a running counter that starts at 0.
 *
 * <p>An {@link AssertionError} raised while adding an entry is logged at debug level and
 * iteration continues with the next string. The counter is post-incremented as part of the
 * {@code add} call's argument list, so it has already advanced when {@code add} itself fails.
 *
 * @param sortedStrings the strings to insert, in iteration order
 *     (NOTE(review): the name suggests the set must iterate in sorted order — confirm with caller)
 * @return the result of {@code Builder.finish()}
 * @throws IOException if adding an entry or finishing the FST fails
 */
public static FST<Long> buildTrie(Set<String> sortedStrings) throws IOException {
  final Builder<Long> fstBuilder =
      new Builder<Long>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());

  // Scratch buffers reused across iterations.
  final BytesRefBuilder utf8Scratch = new BytesRefBuilder();
  final IntsRefBuilder intsScratch = new IntsRefBuilder();

  long nextOutput = 0;
  for (String mention : sortedStrings) {
    utf8Scratch.copyChars(mention);
    try {
      fstBuilder.add(Util.toIntsRef(utf8Scratch.get(), intsScratch), nextOutput++);
    } catch (AssertionError ae) {
      logger.debug("Assertion error for mention " + mention);
    }
  }
  return fstBuilder.finish();
}
Example #2
Source File: XAnalyzingSuggester.java From Elasticsearch with Apache License 2.0 | 5 votes |
/**
 * Creates a builder backed by a BYTE1 FST whose outputs are pairs of a positive-int output
 * and a byte-sequence output.
 *
 * @param maxSurfaceFormsPerAnalyzedForm stored as-is and also used as the length of the
 *     {@code surfaceFormsAndPayload} array
 * @param hasPayloads stored as-is
 * @param payloadSep stored as-is
 */
public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads, int payloadSep) {
  // Plain configuration values first.
  this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
  this.hasPayloads = hasPayloads;
  this.payloadSep = payloadSep;

  // The outputs must be assigned before the FST builder, which is constructed from them.
  this.outputs =
      new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
  this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);

  surfaceFormsAndPayload = new SurfaceFormAndPayload[maxSurfaceFormsPerAnalyzedForm];
}
Example #3
Source File: FstDecompounder.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Builds a BYTE4 FST with no outputs from the reversed forms of the given glue morphemes.
 *
 * <p>Note that {@code glue} is modified in place: every element is replaced by its reversed
 * string and the list is then sorted.
 *
 * @param glue the glue morphemes; reversed and sorted in place by this method
 * @return the result of {@code Builder.finish()}
 * @throws IOException if adding an entry or finishing the FST fails
 */
private FST<Object> createGlueMorphemes(List<String> glue) throws IOException {
  // Reverse each morpheme in place, then order the reversed forms.
  glue.replaceAll(morpheme -> new StringBuilder(morpheme).reverse().toString());
  Collections.sort(glue);

  final NoOutputs noOutputs = NoOutputs.getSingleton();
  final Builder<Object> fstBuilder = new Builder<>(INPUT_TYPE.BYTE4, noOutputs);
  final Object noOutput = noOutputs.getNoOutput();

  // Scratch buffer refilled by fromUTF16ToUTF32 for each entry.
  final IntsRefBuilder utf32Scratch = new IntsRefBuilder();
  for (String reversed : glue) {
    fromUTF16ToUTF32(reversed, utf32Scratch);
    fstBuilder.add(utf32Scratch.get(), noOutput);
  }
  return fstBuilder.finish();
}
Example #4
Source File: FstCompiler.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 4 votes |
/**
 * Compiles a word list read from {@code inputStream} into an FST and saves it to
 * {@code outputStream}.
 *
 * <p>The input is read as UTF-8, line by line. Lines containing a {@code '#'} anywhere are
 * skipped. For every other line, the first element of {@code pattern.split(line)} is trimmed
 * and lower-cased with {@link Locale#ROOT}; a word equal to the immediately preceding word is
 * skipped. Each remaining word is stored twice: once followed by {@code '>'} and once reversed
 * and followed by {@code '<'}. The distinct entries are sorted with {@code BytesRef::compareTo}
 * and added to a BYTE4 FST with no outputs.
 *
 * <p>The reader, and with it {@code inputStream}, is closed when reading finishes, whether
 * normally or with an exception.
 *
 * @param inputStream the input stream
 * @param outputStream the output stream
 * @throws IOException if compilation fails
 */
public void compile(InputStream inputStream, OutputStream outputStream) throws IOException {
  final HashSet<BytesRef> words = new HashSet<>();

  // try-with-resources: the original closed the reader only on the success path, leaking it
  // (and the underlying stream) whenever reading or line parsing threw.
  try (BufferedReader reader =
      new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
    String line;
    String last = null;
    StringBuilder stringBuilder = new StringBuilder();
    while ((line = reader.readLine()) != null) {
      // Any line containing '#' is skipped entirely, not only lines starting with it.
      if (line.indexOf('#') >= 0) {
        continue;
      }
      line = pattern.split(line)[0].trim();
      line = line.toLowerCase(Locale.ROOT);
      // Cheap filter for consecutive duplicates; the hash set removes the remaining ones.
      if (line.equals(last)) {
        continue;
      }
      last = line;
      /*
       * Add the word to the hash set in left-to-right characters order and reversed
       * for easier matching later on.
       */
      stringBuilder.setLength(0);
      stringBuilder.append(line);
      final int len = stringBuilder.length();
      stringBuilder.append('>');
      words.add(new BytesRef(stringBuilder));
      // Drop the '>' marker again before reversing the bare word.
      stringBuilder.setLength(len);
      stringBuilder.reverse().append('<');
      words.add(new BytesRef(stringBuilder));
    }
  }

  final BytesRef[] all = new BytesRef[words.size()];
  words.toArray(all);
  // Entries are sorted before being added to the builder.
  Arrays.sort(all, BytesRef::compareTo);

  final Object nothing = NoOutputs.getSingleton().getNoOutput();
  final Builder<Object> builder = new Builder<>(INPUT_TYPE.BYTE4, NoOutputs.getSingleton());
  final IntsRefBuilder intsRef = new IntsRefBuilder();
  for (BytesRef bytesRef : all) {
    intsRef.clear();
    intsRef.copyUTF8Bytes(bytesRef);
    builder.add(intsRef.get(), nothing);
  }
  final FST<Object> fst = builder.finish();

  try (OutputStreamDataOutput out = new OutputStreamDataOutput(outputStream)) {
    fst.save(out);
  }
}