org.apache.lucene.util.CharsRef Java Examples
The following examples show how to use
org.apache.lucene.util.CharsRef.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CharSequenceOutputs.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Concatenates {@code prefix} and {@code output} into a single output.
 *
 * <p>If either argument is the {@code NO_OUTPUT} sentinel, the other argument is returned
 * as-is (no copy). Otherwise a fresh {@link CharsRef} holding the characters of
 * {@code prefix} followed by the characters of {@code output} is returned.
 *
 * @param prefix leading part; must not be null
 * @param output trailing part; must not be null
 * @return the combined output
 */
@Override
public CharsRef add(CharsRef prefix, CharsRef output) {
  assert prefix != null;
  assert output != null;

  if (prefix == NO_OUTPUT) {
    return output;
  }
  if (output == NO_OUTPUT) {
    return prefix;
  }

  assert prefix.length > 0;
  assert output.length > 0;

  final int combinedLength = prefix.length + output.length;
  final CharsRef joined = new CharsRef(combinedLength);
  // Copy prefix first, then append output directly behind it.
  System.arraycopy(prefix.chars, prefix.offset, joined.chars, 0, prefix.length);
  System.arraycopy(output.chars, output.offset, joined.chars, prefix.length, output.length);
  joined.length = combinedLength;
  return joined;
}
Example #2
Source File: Dictionary.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Reads {@code num} conversion lines from {@code reader} and compiles them into an FST
 * that maps each source string to its replacement.
 *
 * <p>Every line is split on whitespace and must yield exactly three fields; field 1 is
 * the source string and field 2 is the replacement (field 0 is not used here).
 *
 * @param reader source of the conversion lines
 * @param num number of lines to consume
 * @return the compiled FST
 * @throws IOException if reading fails or the FST compiler reports an I/O error
 * @throws ParseException if a line does not have exactly three fields, or if the input
 *     ends before {@code num} lines were read
 * @throws IllegalStateException if the same source string is mapped more than once
 */
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
  // Sorted map: entries are fed to the FST compiler in key order below.
  Map<String,String> mappings = new TreeMap<>();

  for (int i = 0; i < num; i++) {
    String line = reader.readLine();
    if (line == null) {
      // readLine() returns null at end of stream; report it as a syntax problem
      // instead of failing with a NullPointerException on line.split().
      throw new ParseException(
          "unexpected end of input: expected " + num + " conversion lines but found only " + i,
          reader.getLineNumber());
    }
    String[] parts = line.split("\\s+");
    if (parts.length != 3) {
      throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
    }
    if (mappings.put(parts[1], parts[2]) != null) {
      throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
    }
  }

  Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
  FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String,String> entry : mappings.entrySet()) {
    Util.toUTF16(entry.getKey(), scratchInts);
    fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
  }

  return fstCompiler.compile();
}
Example #3
Source File: Stemmer.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Returns the stems of the given word with duplicates removed, keeping the first
 * occurrence of each stem in its original order.
 *
 * <p>Duplicate detection goes through a {@code CharArraySet} created with the
 * dictionary's {@code ignoreCase} setting.
 *
 * @param word Word to find the stems for
 * @param length number of characters passed on to {@code stem}
 * @return List of stems for the word
 */
public List<CharsRef> uniqueStems(char word[], int length) {
  final List<CharsRef> candidates = stem(word, length);
  // Zero or one stem cannot contain a duplicate; hand the list back untouched.
  if (candidates.size() < 2) {
    return candidates;
  }

  final CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
  final List<CharsRef> unique = new ArrayList<>();
  for (CharsRef candidate : candidates) {
    if (seen.contains(candidate)) {
      continue;
    }
    unique.add(candidate);
    seen.add(candidate);
  }
  return unique;
}
Example #4
Source File: NormalizeCharMap.java From lucene-solr with Apache License 2.0 | 6 votes |
/** Builds the NormalizeCharMap; call this once you * are done calling {@link #add}. */ public NormalizeCharMap build() { final FST<CharsRef> map; try { final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); for(Map.Entry<String,String> ent : pendingPairs.entrySet()) { fstCompiler.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue())); } map = fstCompiler.compile(); pendingPairs.clear(); } catch (IOException ioe) { // Bogus FST IOExceptions!! (will never happen) throw new RuntimeException(ioe); } return new NormalizeCharMap(map); }
Example #5
Source File: NormalizeCharMap.java From lucene-solr with Apache License 2.0 | 6 votes |
private NormalizeCharMap(FST<CharsRef> map) { this.map = map; if (map != null) { try { // Pre-cache root arcs: final FST.Arc<CharsRef> scratchArc = new FST.Arc<>(); final FST.BytesReader fstReader = map.getBytesReader(); map.getFirstArc(scratchArc); if (FST.targetHasArcs(scratchArc)) { map.readFirstRealTargetArc(scratchArc.target(), scratchArc, fstReader); while(true) { assert scratchArc.label() != FST.END_LABEL; cachedRootArcs.put(Character.valueOf((char) scratchArc.label()), new FST.Arc<CharsRef>().copyFrom(scratchArc)); if (scratchArc.isLast()) { break; } map.readNextRealArc(scratchArc, fstReader); } } //System.out.println("cached " + cachedRootArcs.size() + " root arcs"); } catch (IOException ioe) { // Bogus FST IOExceptions!! (will never happen) throw new RuntimeException(ioe); } } }
Example #6
Source File: WordnetSynonymParser.java From lucene-solr with Apache License 2.0 | 6 votes |
private void addInternal(CharsRef synset[], int size) { if (size <= 1) { return; // nothing to do } if (expand) { for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) { if (i != j) { add(synset[i], synset[j], true); } } } } else { for (int i = 0; i < size; i++) { add(synset[i], synset[0], false); } } }
Example #7
Source File: TestFSTDirectAddressing.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Loads an FST from {@code fstFilePath}, then for a fixed series of oversizing factors
 * recompiles it and walks the result, printing wall-clock timings and the RAM usage of
 * each recompiled FST to standard output.
 *
 * @param fstFilePath path of the serialized FST to read
 * @throws IOException if reading, recompiling or walking the FST fails
 */
private static void recompileAndWalk(String fstFilePath) throws IOException {
  try (InputStreamDataInput in = new InputStreamDataInput(newInputStream(Paths.get(fstFilePath)))) {
    System.out.println("Reading FST");
    final long readStartMs = System.currentTimeMillis();
    final FST<CharsRef> originalFst = new FST<>(in, in, CharSequenceOutputs.getSingleton());
    final long readEndMs = System.currentTimeMillis();
    System.out.println("time = " + (readEndMs - readStartMs) + " ms");

    // Each factor is exercised three times in a row.
    final float[] oversizingFactors = {0f, 0f, 0f, 1f, 1f, 1f};
    for (final float oversizingFactor : oversizingFactors) {
      System.out.println("\nFST construction (oversizingFactor=" + oversizingFactor + ")");
      final long buildStartMs = System.currentTimeMillis();
      final FST<CharsRef> fst = recompile(originalFst, oversizingFactor);
      final long buildEndMs = System.currentTimeMillis();
      System.out.println("time = " + (buildEndMs - buildStartMs) + " ms");
      System.out.println("FST RAM = " + fst.ramBytesUsed() + " B");

      System.out.println("FST enum");
      final long walkStartMs = System.currentTimeMillis();
      walk(fst);
      final long walkEndMs = System.currentTimeMillis();
      System.out.println("time = " + (walkEndMs - walkStartMs) + " ms");
    }
  }
}
Example #8
Source File: CharSequenceOutputs.java From lucene-solr with Apache License 2.0 | 6 votes |
@Override public CharsRef subtract(CharsRef output, CharsRef inc) { assert output != null; assert inc != null; if (inc == NO_OUTPUT) { // no prefix removed return output; } else if (inc.length == output.length) { // entire output removed return NO_OUTPUT; } else { assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length; assert inc.length > 0; return new CharsRef(output.chars, output.offset + inc.length, output.length-inc.length); } }
Example #9
Source File: SynonymMap.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Only used for asserting! Reports whether {@code chars} contains two adjacent
 * {@link SynonymMap#WORD_SEPARATOR} characters, or starts or ends with a NUL character.
 *
 * @param chars the character range to inspect
 * @return true if such a hole is present
 */
private boolean hasHoles(CharsRef chars) {
  final char[] buffer = chars.chars;
  final int first = chars.offset;
  final int end = first + chars.length;

  // Two separators in a row anywhere in the range.
  for (int i = first + 1; i < end; i++) {
    final boolean currentIsSeparator = buffer[i] == SynonymMap.WORD_SEPARATOR;
    if (currentIsSeparator && buffer[i - 1] == SynonymMap.WORD_SEPARATOR) {
      return true;
    }
  }

  // NUL at the very start or the very end.
  return buffer[first] == '\u0000' || buffer[end - 1] == '\u0000';
}
Example #10
Source File: SynonymMap.java From lucene-solr with Apache License 2.0 | 6 votes |
/** Sugar: just joins the provided terms with {@link * SynonymMap#WORD_SEPARATOR}. reuse and its chars * must not be null. */ public static CharsRef join(String[] words, CharsRefBuilder reuse) { int upto = 0; char[] buffer = reuse.chars(); for (String word : words) { final int wordLen = word.length(); final int needed = (0 == upto ? wordLen : 1 + upto + wordLen); // Add 1 for WORD_SEPARATOR if (needed > buffer.length) { reuse.grow(needed); buffer = reuse.chars(); } if (upto > 0) { buffer[upto++] = SynonymMap.WORD_SEPARATOR; } word.getChars(0, wordLen, buffer, upto); upto += wordLen; } reuse.setLength(upto); return reuse.get(); }
Example #11
Source File: TestConcatenateGraphFilter.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Checks that ConcatenateGraphFilter, using '-' as separator, emits one concatenated
 * token per synonym path when the input contains a keyword with two synonyms.
 */
@Test
public void testSeparatorWithSynonyms() throws IOException {
  final SynonymMap.Builder synonyms = new SynonymMap.Builder(true);
  final CharsRef keyword = new CharsRef("mykeyword");
  synonyms.add(keyword, new CharsRef("mysynonym"), true);
  synonyms.add(new CharsRef("mykeyword"), new CharsRef("three words synonym"), true);

  final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  tokenizer.setReader(new StringReader(" mykeyword another keyword "));

  final SynonymGraphFilter synonymFilter = new SynonymGraphFilter(tokenizer, synonyms.build(), true);
  final ConcatenateGraphFilter stream = new ConcatenateGraphFilter(synonymFilter, '-', false, 100);

  final String[] expectedTerms = {
    "mykeyword-another-keyword",
    "mysynonym-another-keyword",
    "three words synonym-another-keyword"
  };
  final int[] expectedPosIncrements = {1, 0, 0};
  assertTokenStreamContents(stream, expectedTerms, null, null, expectedPosIncrements);
}
Example #12
Source File: DaciukMihovAutomatonBuilder.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef}
 * representing strings in UTF-8. These strings must be binary-sorted.
 *
 * <p>Each input is decoded to UTF-16 into one shared, growing buffer and fed to the
 * builder through a single reused {@link CharsRef}.
 *
 * @param input the sorted UTF-8 terms
 * @return the finished automaton
 */
public static Automaton build(Collection<BytesRef> input) {
  final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();

  final CharsRef scratch = new CharsRef();
  char[] utf16 = new char[0];
  for (BytesRef term : input) {
    utf16 = ArrayUtil.grow(utf16, term.length);
    scratch.chars = utf16;
    scratch.length = UnicodeUtil.UTF8toUTF16(term, utf16);
    builder.add(scratch);
  }

  final Automaton.Builder automaton = new Automaton.Builder();
  convert(automaton, builder.complete(), new IdentityHashMap<State,Integer>());
  return automaton.finish();
}
Example #13
Source File: TestLimitTokenPositionFilter.java From lucene-solr with Apache License 2.0 | 6 votes |
public void testMaxPosition3WithSynomyms() throws IOException { for (final boolean consumeAll : new boolean[]{true, false}) { MockTokenizer tokenizer = whitespaceMockTokenizer("one two three four five"); // if we are consuming all tokens, we can use the checks, otherwise we can't tokenizer.setEnableChecks(consumeAll); SynonymMap.Builder builder = new SynonymMap.Builder(true); builder.add(new CharsRef("one"), new CharsRef("first"), true); builder.add(new CharsRef("one"), new CharsRef("alpha"), true); builder.add(new CharsRef("one"), new CharsRef("beguine"), true); CharsRefBuilder multiWordCharsRef = new CharsRefBuilder(); SynonymMap.Builder.join(new String[]{"and", "indubitably", "single", "only"}, multiWordCharsRef); builder.add(new CharsRef("one"), multiWordCharsRef.get(), true); SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef); builder.add(new CharsRef("two"), multiWordCharsRef.get(), true); SynonymMap synonymMap = builder.build(); @SuppressWarnings("deprecation") TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true); stream = new LimitTokenPositionFilter(stream, 3, consumeAll); // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3. assertTokenStreamContents(stream, new String[]{"one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger"}, new int[]{1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0}); } }
Example #14
Source File: ESSolrSynonymParser.java From crate with Apache License 2.0 | 5 votes |
/**
 * Delegates to the parent's {@code analyze}. When that throws an
 * {@link IllegalArgumentException} and {@code lenient} is set, the failure is logged at
 * info level and an empty {@link CharsRef} is returned; otherwise the exception is rethrown.
 *
 * @param text the synonym rule text to analyze
 * @param reuse builder passed through to the parent implementation
 * @return the analyzed text, or an empty {@link CharsRef} for an ignored rule
 * @throws IOException if the parent implementation throws it
 */
@Override
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try {
    return super.analyze(text, reuse);
  } catch (IllegalArgumentException ex) {
    if (!lenient) {
      throw ex;
    }
    LOGGER.info("Synonym rule for [" + text + "] was ignored");
    return new CharsRef("");
  }
}
Example #15
Source File: EnumField.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Decodes the prefix-coded int in {@code input}, looks up its string value through
 * {@code enumMapping}, and copies that string into {@code output}.
 *
 * @param input the indexed (prefix-coded) value
 * @param output builder that receives the readable characters
 * @return the {@link CharsRef} obtained from {@code output}
 */
@Override
public CharsRef indexedToReadable(BytesRef input, CharsRefBuilder output) {
  final Integer decoded = LegacyNumericUtils.prefixCodedToInt(input);
  final String readable = enumMapping.intValueToStringValue(decoded);
  final int readableLength = readable.length();

  // Make room, fix the length, then copy the characters in from index 0.
  output.grow(readableLength);
  output.setLength(readableLength);
  readable.getChars(0, output.length(), output.chars(), 0);
  return output.get();
}
Example #16
Source File: QueryAutoFilteringComponent.java From query-autofiltering-component with Apache License 2.0 | 5 votes |
/**
 * Wraps {@code terms} in a {@code TermsResponse} and, for each search field that has a
 * term list, passes every term to {@code addTerm} together with the field name.
 *
 * @param terms raw terms data
 * @param fieldBuilder builder passed through to {@code addTerm}
 * @param termBuilder builder passed through to {@code addTerm}
 * @param searchFields names of the fields to look up in the response
 * @throws IOException if {@code addTerm} throws it
 */
private void addTerms( NamedList<NamedList<Number>> terms, SynonymMap.Builder fieldBuilder, SynonymMap.Builder termBuilder, ArrayList<String> searchFields ) throws IOException {
  final TermsResponse response = new TermsResponse( terms );

  for (String fieldName : searchFields) {
    final CharsRef fieldChars = new CharsRef( fieldName );
    final List<TermsResponse.Term> fieldTerms = response.getTerms( fieldName );
    if (fieldTerms == null) {
      continue;
    }
    for (TermsResponse.Term entry : fieldTerms) {
      final String term = entry.getTerm();
      Log.debug( "Add distributed term: " + fieldName + " = " + term );
      addTerm( fieldChars, term, fieldBuilder, termBuilder );
    }
  }
}
Example #17
Source File: QueryAutoFilteringComponent.java From query-autofiltering-component with Apache License 2.0 | 5 votes |
private void buildFieldMap( ResponseBuilder rb ) throws IOException { Log.debug( "buildFieldMap" ); SolrIndexSearcher searcher = rb.req.getSearcher(); // build a synonym map from the SortedDocValues - // for each field value: lower case, stemmed, lookup synonyms from synonyms.txt - map to fieldValue SynonymMap.Builder fieldBuilder = new SynonymMap.Builder( true ); SynonymMap.Builder termBuilder = new SynonymMap.Builder( true ); ArrayList<String> searchFields = getStringFields( searcher ); for (String searchField : searchFields ) { Log.debug( "adding searchField " + searchField ); CharsRef fieldChars = new CharsRef( searchField ); SortedSetDocValues sdv = FieldCache.DEFAULT.getDocTermOrds( searcher.getAtomicReader( ), searchField ); if (sdv == null) continue; Log.debug( "got SortedSetDocValues for " + searchField ); TermsEnum te = sdv.termsEnum(); while (te.next() != null) { BytesRef term = te.term(); String fieldValue = term.utf8ToString( ); addTerm ( fieldChars, fieldValue, fieldBuilder, termBuilder ); } } addDistributedTerms( rb, fieldBuilder, termBuilder, searchFields ); fieldMap = fieldBuilder.build( ); termMap = termBuilder.build( ); }
Example #18
Source File: ESWordnetSynonymParser.java From crate with Apache License 2.0 | 5 votes |
@Override public void add(CharsRef input, CharsRef output, boolean includeOrig) { // This condition follows up on the overridden analyze method. In case lenient was set to true and there was an // exception during super.analyze we return a zero-length CharsRef for that word which caused an exception. When // the synonym mappings for the words are added using the add method we skip the ones that were left empty by // analyze i.e., in the case when lenient is set we only add those combinations which are non-zero-length. The // else would happen only in the case when the input or output is empty and lenient is set, in which case we // quietly ignore it. For more details on the control-flow see SolrSynonymParser::addInternal. if (lenient == false || (input.length > 0 && output.length > 0)) { super.add(input, output, includeOrig); } }
Example #19
Source File: QueryAutoFilteringComponent.java From query-autofiltering-component with Apache License 2.0 | 5 votes |
/**
 * Wraps {@code terms} in a {@code TermsResponse} and, for each search field that has a
 * term list, passes every term to {@code addTerm} together with the field name.
 *
 * @param terms raw terms data
 * @param fieldBuilder builder passed through to {@code addTerm}
 * @param termBuilder builder passed through to {@code addTerm}
 * @param searchFields names of the fields to look up in the response
 * @throws IOException if {@code addTerm} throws it
 */
private void addTerms( NamedList<NamedList<Number>> terms, SynonymMap.Builder fieldBuilder, SynonymMap.Builder termBuilder, ArrayList<String> searchFields ) throws IOException {
  final TermsResponse response = new TermsResponse( terms );

  for (String fieldName : searchFields) {
    final CharsRef fieldChars = new CharsRef( fieldName );
    final List<TermsResponse.Term> fieldTerms = response.getTerms( fieldName );
    if (fieldTerms == null) {
      continue;
    }
    for (TermsResponse.Term entry : fieldTerms) {
      final String term = entry.getTerm();
      Log.debug( "Add distributed term: " + fieldName + " = " + term );
      addTerm( fieldChars, term, fieldBuilder, termBuilder );
    }
  }
}
Example #20
Source File: SynonymFilter.java From elasticsearch-analysis-synonym with Apache License 2.0 | 5 votes |
/**
 * Returns the next pending output and advances the read position.
 *
 * <p>Records the end offset and position length of the consumed slot, sets
 * {@code posIncr} to 0, and calls {@code reset()} once the last slot has been consumed.
 *
 * @return the characters of the consumed output
 */
public CharsRef pullNext() {
  assert upto < count;

  final int slot = upto;
  lastEndOffset = endOffsets[slot];
  lastPosLength = posLengths[slot];
  final CharsRefBuilder pending = outputs[slot];
  upto = slot + 1;

  posIncr = 0;
  if (upto == count) {
    reset();
  }
  // Read the builder only after the possible reset, as before.
  return pending.get();
}
Example #21
Source File: DaciukMihovAutomatonBuilder.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * Add another character sequence to this automaton. The sequence must be * lexicographically larger or equal compared to any previous sequences added * to this automaton (the input must be sorted). */ public void add(CharsRef current) { if (current.length > MAX_TERM_LENGTH) { throw new IllegalArgumentException("This builder doesn't allow terms that are larger than 1,000 characters, got " + current); } assert stateRegistry != null : "Automaton already built."; assert previous == null || comparator.compare(previous, current) <= 0 : "Input must be in sorted UTF-8 order: " + previous + " >= " + current;
Example #22
Source File: ManagedSynonymGraphFilterFactory.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * Add the managed synonyms and their mappings into the SynonymMap builder. */ @Override public void parse(Reader in) throws IOException, ParseException { boolean ignoreCase = synonymManager.getIgnoreCase(); for (CasePreservedSynonymMappings cpsm : synonymManager.synonymMappings.values()) { for (Map.Entry<String, Set<String>> entry : cpsm.mappings.entrySet()) { for (String mapping : entry.getValue()) { // apply the case setting to match the behavior of the SynonymMap builder CharsRef casedTerm = analyze(synonymManager.applyCaseSetting(ignoreCase, entry.getKey()), new CharsRefBuilder()); CharsRef casedMapping = analyze(synonymManager.applyCaseSetting(ignoreCase, mapping), new CharsRefBuilder()); add(casedTerm, casedMapping, false); } } } }
Example #23
Source File: SuggestComponent.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Convert NamedList (suggester response) to {@link SuggesterResult} */ private SuggesterResult toSuggesterResult(Map<String, SimpleOrderedMap<NamedList<Object>>> suggestionsMap) { SuggesterResult result = new SuggesterResult(); if (suggestionsMap == null) { return result; } // for each token for(Map.Entry<String, SimpleOrderedMap<NamedList<Object>>> entry : suggestionsMap.entrySet()) { String suggesterName = entry.getKey(); for (Iterator<Map.Entry<String, NamedList<Object>>> suggestionsIter = entry.getValue().iterator(); suggestionsIter.hasNext();) { Map.Entry<String, NamedList<Object>> suggestions = suggestionsIter.next(); String tokenString = suggestions.getKey(); List<LookupResult> lookupResults = new ArrayList<>(); NamedList<Object> suggestion = suggestions.getValue(); // for each suggestion for (int j = 0; j < suggestion.size(); j++) { String property = suggestion.getName(j); if (property.equals(SuggesterResultLabels.SUGGESTIONS)) { @SuppressWarnings("unchecked") List<NamedList<Object>> suggestionEntries = (List<NamedList<Object>>) suggestion.getVal(j); for(NamedList<Object> suggestionEntry : suggestionEntries) { String term = (String) suggestionEntry.get(SuggesterResultLabels.SUGGESTION_TERM); Long weight = (Long) suggestionEntry.get(SuggesterResultLabels.SUGGESTION_WEIGHT); String payload = (String) suggestionEntry.get(SuggesterResultLabels.SUGGESTION_PAYLOAD); LookupResult res = new LookupResult(new CharsRef(term), weight, new BytesRef(payload)); lookupResults.add(res); } } result.add(suggesterName, tokenString, lookupResults); } } } return result; }
Example #24
Source File: TestFSTDirectAddressing.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Enumerates every entry of the given FST and sums the input length and output length
 * of each entry.
 *
 * @param read the FST to enumerate
 * @return the accumulated total of all input and output lengths
 * @throws IOException if enumerating the FST fails
 */
private static int walk(FST<CharsRef> read) throws IOException {
  final IntsRefFSTEnum<CharsRef> entries = new IntsRefFSTEnum<>(read);
  int total = 0;
  for (IntsRefFSTEnum.InputOutput<CharsRef> entry = entries.next();
       entry != null;
       entry = entries.next()) {
    total += entry.input.length;
    total += entry.output.length;
  }
  return total;
}
Example #25
Source File: TestFSTDirectAddressing.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Rebuilds an FST from the entries of {@code fst}, using the given direct-addressing
 * oversizing factor. Each output is deep-copied before being added to the new compiler.
 *
 * @param fst the FST whose entries are replayed
 * @param oversizingFactor value passed to {@code directAddressingMaxOversizingFactor}
 * @return the newly compiled FST
 * @throws IOException if enumerating or compiling fails
 */
private static FST<CharsRef> recompile(FST<CharsRef> fst, float oversizingFactor) throws IOException {
  final FSTCompiler.Builder<CharsRef> settings =
      new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, CharSequenceOutputs.getSingleton())
          .directAddressingMaxOversizingFactor(oversizingFactor);
  final FSTCompiler<CharsRef> compiler = settings.build();

  final IntsRefFSTEnum<CharsRef> entries = new IntsRefFSTEnum<>(fst);
  for (IntsRefFSTEnum.InputOutput<CharsRef> entry = entries.next();
       entry != null;
       entry = entries.next()) {
    compiler.add(entry.input, CharsRef.deepCopyOf(entry.output));
  }
  return compiler.compile();
}
Example #26
Source File: CharSequenceOutputs.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Reads an output: a vInt length followed by that many vInts, each cast to one char.
 *
 * @param in the input to read from
 * @return {@code NO_OUTPUT} when the stored length is 0, otherwise a new {@link CharsRef}
 * @throws IOException if reading fails
 */
@Override
public CharsRef read(DataInput in) throws IOException {
  final int count = in.readVInt();
  if (count == 0) {
    return NO_OUTPUT;
  }

  final CharsRef decoded = new CharsRef(count);
  final char[] target = decoded.chars;
  for (int i = 0; i < count; i++) {
    target[i] = (char) in.readVInt();
  }
  decoded.length = count;
  return decoded;
}
Example #27
Source File: CharSequenceOutputs.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public void write(CharsRef prefix, DataOutput out) throws IOException { assert prefix != null; out.writeVInt(prefix.length); // TODO: maybe UTF8? for(int idx=0;idx<prefix.length;idx++) { out.writeVInt(prefix.chars[prefix.offset+idx]); } }
Example #28
Source File: CharSequenceOutputs.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public CharsRef common(CharsRef output1, CharsRef output2) { assert output1 != null; assert output2 != null; int pos1 = output1.offset; int pos2 = output2.offset; int stopAt1 = pos1 + Math.min(output1.length, output2.length); while(pos1 < stopAt1) { if (output1.chars[pos1] != output2.chars[pos2]) { break; } pos1++; pos2++; } if (pos1 == output1.offset) { // no common prefix return NO_OUTPUT; } else if (pos1 == output1.offset + output1.length) { // output1 is a prefix of output2 return output1; } else if (pos2 == output2.offset + output2.length) { // output2 is a prefix of output1 return output2; } else { return new CharsRef(output1.chars, output1.offset, pos1-output1.offset); } }
Example #29
Source File: DaciukMihovAutomatonBuilder.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * Copy <code>current</code> into an internal buffer. */ private boolean setPrevious(CharsRef current) { // don't need to copy, once we fix https://issues.apache.org/jira/browse/LUCENE-3277 // still, called only from assert previous = CharsRef.deepCopyOf(current); return true; }
Example #30
Source File: ESWordnetSynonymParser.java From crate with Apache License 2.0 | 5 votes |
/**
 * Delegates to the parent's {@code analyze}. When that throws an
 * {@link IllegalArgumentException} and {@code lenient} is set, the failure is logged at
 * info level and an empty {@link CharsRef} is returned; otherwise the exception is rethrown.
 *
 * @param text the synonym rule text to analyze
 * @param reuse builder passed through to the parent implementation
 * @return the analyzed text, or an empty {@link CharsRef} for an ignored rule
 * @throws IOException if the parent implementation throws it
 */
@Override
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try {
    return super.analyze(text, reuse);
  } catch (IllegalArgumentException ex) {
    if (!lenient) {
      throw ex;
    }
    LOGGER.info("Synonym rule for [" + text + "] was ignored");
    return new CharsRef("");
  }
}