org.apache.lucene.util.AttributeSource Java Examples
The following examples show how to use
org.apache.lucene.util.AttributeSource.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: OpenNLPPOSTaggerFilter.java From jate with GNU Lesser General Public License v3.0 | 6 votes |
/**
 * Consumes the entire upstream TokenStream, recording each token's text and a
 * clone of its full attribute state (appended to {@code tokenAttrs} for later replay).
 *
 * @return the token texts, in stream order (empty array if the stream is empty)
 * @throws IOException if the underlying stream fails
 */
protected String[] walkTokens() throws IOException {
    List<String> wordList = new ArrayList<>();
    while (input.incrementToken()) {
        CharTermAttribute textAtt = input.getAttribute(CharTermAttribute.class);
        // Use the term's own length rather than (endOffset - startOffset): the two
        // differ whenever an upstream filter altered the token text (e.g. stemming),
        // which would read garbage (or past the valid region) from the term buffer.
        String word = new String(textAtt.buffer(), 0, textAtt.length());
        wordList.add(word);
        // Clone the complete attribute state so downstream consumers can replay it.
        tokenAttrs.add(input.cloneAttributes());
    }
    return wordList.toArray(new String[0]);
}
Example #2
Source File: AnalysisImpl.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 * @param result output list that receives one simplified {@code Token} per term
 * @return List of tokens produced from the TokenStream (full attribute clones)
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream, List<Token> result) {
  final List<AttributeSource> tokens = new ArrayList<>();
  try {
    // TokenStream contract: reset() must be called before incrementToken().
    tokenStream.reset();
    CharTermAttribute charAtt = tokenStream.getAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
      // Keep a full clone of the attribute state for the returned list...
      tokens.add(tokenStream.cloneAttributes());
      // ...and also append a simplified Token view to the caller's 'result'.
      List<TokenAttribute> attributes = copyAttributes(tokenStream, charAtt);
      result.add(new Token(charAtt.toString(), attributes));
    }
    tokenStream.end();
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over TokenStream", ioe);
  } finally {
    // Close even on failure, suppressing any secondary close exception.
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return tokens;
}
Example #3
Source File: TestTermAutomatonQuery.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Emits tokens from the input stream, randomly (p=1/10) stacking a synthetic
 * one-letter synonym ('a'..'c') at the same position as the previous token.
 */
@Override
public boolean incrementToken() throws IOException {
  if (synNext) {
    // Re-emit the previous token's state with posInc=0 so the synonym stacks
    // on the same position, then overwrite the term text.
    AttributeSource.State state = captureState();
    clearAttributes();
    restoreState(state);
    posIncAtt.setPositionIncrement(0);
    // The cast must wrap the whole sum: "(char) 97 + n" promotes back to int
    // and would append the digits "97".."99" instead of a letter 'a'..'c'.
    termAtt.append("" + (char) (97 + random().nextInt(3)));
    synNext = false;
    return true;
  }
  if (input.incrementToken()) {
    if (random().nextInt(10) == 8) {
      synNext = true; // schedule a synonym for the next call
    }
    return true;
  } else {
    return false;
  }
}
Example #4
Source File: AnalysisRequestHandlerBase.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 * @return List of tokens produced from the TokenStream, as full attribute clones
 *         with the tracker attribute set to each token's absolute position
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> tokens = new ArrayList<>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      // Accumulate increments into an absolute position and record it on the
      // tracker BEFORE cloning, so the clone carries the position along.
      position += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(position);
      tokens.add(tokenStream.cloneAttributes());
    }
    tokenStream.end(); // TODO should we capture?
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    // Close even on failure, suppressing any secondary close exception.
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return tokens;
}
Example #5
Source File: FuzzyTermsEnum.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Creates the enum, registering boost and automaton attributes on the shared
 * AttributeSource so state can be exchanged with the query rewrite machinery.
 *
 * @param terms the terms to enumerate
 * @param atts shared attribute source used to communicate with the rewrite method
 * @param term the base term being fuzzily matched
 * @param automatonBuilder lazily supplies the automaton construction
 * @throws IOException if term enumeration setup fails
 */
private FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, Supplier<FuzzyAutomatonBuilder> automatonBuilder) throws IOException {
  this.terms = terms;
  this.atts = atts;
  this.term = term;
  // Attributes used to exchange competitive-boost state with the collector/rewrite.
  this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
  this.boostAtt = atts.addAttribute(BoostAttribute.class);
  // Register a concrete impl first, then fetch the interface view and initialize
  // it from the supplied builder.
  atts.addAttributeImpl(new AutomatonAttributeImpl());
  AutomatonAttribute aa = atts.addAttribute(AutomatonAttribute.class);
  aa.init(automatonBuilder);
  this.automata = aa.getAutomata();
  this.termLength = aa.getTermLength();
  // presumably one automaton per edit distance 0..maxEdits — hence length - 1
  this.maxEdits = this.automata.length - 1;
  // Seed the competitive-boost bottom values, then recompute enumeration bounds.
  bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
  bottomTerm = maxBoostAtt.getCompetitiveTerm();
  bottomChanged(null);
}
Example #6
Source File: ConcatenatingTokenStream.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Builds one AttributeSource carrying the union of all sources' attribute
 * classes, seeded from the first stream's cloned state.
 *
 * @param sources the streams to combine; must share compatible attribute impls
 * @throws IllegalArgumentException if the streams' attribute types are incompatible
 */
private static AttributeSource combineSources(TokenStream... sources) {
  AttributeSource base = sources[0].cloneAttributes();
  try {
    for (int i = 1; i < sources.length; i++) {
      // Register every attribute class of this source on the base.
      Iterator<Class<? extends Attribute>> it = sources[i].getAttributeClassesIterator();
      while (it.hasNext()) {
        base.addAttribute(it.next());
      }
      // check attributes can be captured
      sources[i].copyTo(base);
    }
    return base;
  } catch (IllegalArgumentException e) {
    // copyTo throws IAE when impls differ; rethrow with a clearer message.
    throw new IllegalArgumentException("Attempted to concatenate TokenStreams with different attribute types", e);
  }
}
Example #7
Source File: FuzzyQuery.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { if (maxEdits == 0) { // can only match if it's exact return new SingleTermsEnum(terms.iterator(), term.bytes()); } return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions); }
Example #8
Source File: WordDelimiterFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Exchanges the entries for positions i and j across all three parallel arrays. */
@Override
protected void swap(int i, int j) {
  final AttributeSource.State savedState = buffered[i];
  buffered[i] = buffered[j];
  buffered[j] = savedState;

  final int savedStart = startOff[i];
  startOff[i] = startOff[j];
  startOff[j] = savedStart;

  final int savedInc = posInc[i];
  posInc[i] = posInc[j];
  posInc[j] = savedInc;
}
Example #9
Source File: AnalysisImpl.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Replays the next buffered token's attributes onto this stream, if any remain. */
@Override
public boolean incrementToken() {
  if (!tokenIterator.hasNext()) {
    return false;
  }
  clearAttributes();
  AttributeSource source = tokenIterator.next();
  // Register any attribute classes we haven't seen yet, then copy the values.
  addAttributes(source);
  source.copyTo(this);
  return true;
}
Example #10
Source File: AnalysisImpl.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Creates a new ListBasedTokenStream which uses the given tokens as its token source.
 *
 * @param attributeSource source of the attribute factory and attribute impls
 * @param tokens Source of tokens to be used
 */
ListBasedTokenStream(AttributeSource attributeSource, List<AttributeSource> tokens) {
  super(attributeSource.getAttributeFactory());
  // Mirror every attribute of the source onto this stream before storing tokens.
  addAttributes(attributeSource);
  this.tokens = tokens;
}
Example #11
Source File: TermsQuery.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Returns an empty enum for an empty term set, otherwise a seeking enum over it. */
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  return (this.terms.size() == 0)
      ? TermsEnum.EMPTY
      : new SeekingTermSetTermsEnum(terms.iterator(), this.terms, ords);
}
Example #12
Source File: WikipediaTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Collapses a run of same-typed wiki tokens into one untokenized token (set as the
 * current attributes) while ALSO saving each individual token's state into
 * {@code tokens} for later replay by incrementToken().
 *
 * @param tokenType scanner token type that delimits the run
 * @param type token type string applied to each saved token
 * @throws IOException if the scanner fails
 */
private void collapseAndSaveTokens(int tokenType, String type) throws IOException {
  //collapse
  StringBuilder buffer = new StringBuilder(32);
  int numAdded = scanner.setText(buffer);
  //TODO: how to know how much whitespace to add
  int theStart = scanner.yychar();
  int lastPos = theStart + numAdded;
  int tmpTokType;
  int numSeen = 0;
  // Saved per-token states; replayed one at a time by incrementToken().
  List<AttributeSource.State> tmp = new ArrayList<>();
  setupSavedToken(0, type);
  tmp.add(captureState());
  //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
  while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
    int currPos = scanner.yychar();
    //append whitespace
    for (int i = 0; i < (currPos - lastPos); i++){
      buffer.append(' ');
    }
    numAdded = scanner.setText(buffer);
    setupSavedToken(scanner.getPositionIncrement(), type);
    tmp.add(captureState());
    numSeen++;
    lastPos = currPos + numAdded;
  }
  //trim the buffer
  // TODO: this is inefficient
  String s = buffer.toString().trim();
  // The collapsed form becomes the current token, flagged as untokenized.
  termAtt.setEmpty().append(s);
  offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
  flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
  //The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
  if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
    scanner.yypushback(scanner.yylength());
  }
  tokens = tmp.iterator();
}
Example #13
Source File: WikipediaTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Emits the next token: first drains any saved states queued by the collapse
 * methods, then pulls a fresh token from the scanner and dispatches on the
 * configured output mode (plain, collapsed, or both).
 */
@Override
public final boolean incrementToken() throws IOException {
  // Replay tokens saved by collapseAndSaveTokens() before scanning further.
  if (tokens != null && tokens.hasNext()){
    AttributeSource.State state = tokens.next();
    restoreState(state);
    return true;
  }
  clearAttributes();
  int tokenType = scanner.getNextToken();
  if (tokenType == WikipediaTokenizerImpl.YYEOF) {
    return false;
  }
  String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
  if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){
    setupToken();
  } else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){
    collapseTokens(tokenType);
  } else if (tokenOutput == BOTH){
    //collapse into a single token, add it to tokens AND output the individual tokens
    //output the untokenized Token first
    collapseAndSaveTokens(tokenType, type);
  }
  int posinc = scanner.getPositionIncrement();
  if (first && posinc == 0) {
    posinc = 1; // don't emit posinc=0 for the first token!
  }
  posIncrAtt.setPositionIncrement(posinc);
  typeAtt.setType(type);
  first = false;
  return true;
}
Example #14
Source File: TestRandomChains.java From lucene-solr with Apache License 2.0 | 5 votes |
static Object[] newTokenizerArgs(Random random, Class<?>[] paramTypes) { Object[] args = new Object[paramTypes.length]; for (int i = 0; i < args.length; i++) { Class<?> paramType = paramTypes[i]; if (paramType == AttributeSource.class) { // TODO: args[i] = new AttributeSource(); // this is currently too scary to deal with! args[i] = null; // force IAE } else { args[i] = newRandomArg(random, paramType); } } return args; }
Example #15
Source File: TeeSinkTokenFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Replays the next captured attribute state from the tee buffer, if any remain. */
@Override
public final boolean incrementToken() {
  if (it.hasNext()) {
    restoreState(it.next());
    return true;
  }
  return false;
}
Example #16
Source File: MockSynonymFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Mock filter that injects synonyms: "dogs" -> "dog", and the two-token phrase
 * "guinea pig" -> "cavy". Queued tokens (from lookahead) are drained first.
 */
@Override
public final boolean incrementToken() throws IOException {
  // Drain any tokens queued by an earlier lookahead before reading the input.
  if (tokenQueue.size() > 0) {
    tokenQueue.remove(0).copyTo(this);
    return true;
  }
  if (endOfInput == false && input.incrementToken()) {
    if (termAtt.toString().equals("dogs")) {
      addSynonymAndRestoreOrigToken("dog", 1, offsetAtt.endOffset());
    } else if (termAtt.toString().equals("guinea")) {
      // Save "guinea", then look one token ahead to detect the phrase.
      AttributeSource firstSavedToken = cloneAttributes();
      if (input.incrementToken()) {
        if (termAtt.toString().equals("pig")) {
          // "guinea pig": queue a 2-position "cavy" synonym plus the consumed "pig".
          AttributeSource secondSavedToken = cloneAttributes();
          int secondEndOffset = offsetAtt.endOffset();
          firstSavedToken.copyTo(this);
          addSynonym("cavy", 2, secondEndOffset);
          tokenQueue.add(secondSavedToken);
        } else if (termAtt.toString().equals("dogs")) {
          // The lookahead token itself needs the "dogs" -> "dog" treatment.
          tokenQueue.add(cloneAttributes());
          addSynonym("dog", 1, offsetAtt.endOffset());
        }
      } else {
        endOfInput = true;
      }
      // Emit the saved "guinea" now; queued tokens follow on later calls.
      firstSavedToken.copyTo(this);
    }
    return true;
  } else {
    endOfInput = true;
    return false;
  }
}
Example #17
Source File: NumericTokenizer.java From Elasticsearch with Apache License 2.0 | 5 votes |
/** Make this tokenizer get attributes from the delegate token stream. */
private static final AttributeFactory delegatingAttributeFactory(final AttributeSource source) {
  return new AttributeFactory() {
    @Override
    public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
      // Delegate to the source: addAttribute returns the source's (possibly
      // pre-existing) impl, so attribute instances are shared rather than copied.
      return (AttributeImpl) source.addAttribute(attClass);
    }
  };
}
Example #18
Source File: FieldInvertState.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Sets attributeSource to a new instance, re-caching the attribute references
 * used during inversion. No-op when the same source is set again.
 */
void setAttributeSource(AttributeSource attributeSource) {
  if (this.attributeSource != attributeSource) {
    this.attributeSource = attributeSource;
    // getAttribute (vs addAttribute) does not register new attributes —
    // NOTE(review): presumably term bytes and payload are expected to already
    // exist (or be absent) on the source; confirm against AttributeSource docs.
    termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
    // These are added with default impls if the source lacks them.
    termFreqAttribute = attributeSource.addAttribute(TermFrequencyAttribute.class);
    posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
    offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
    payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);
  }
}
Example #19
Source File: GraphTokenStreamFiniteStrings.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Returns the list of tokens that start at the provided state
 */
public List<AttributeSource> getTerms(int state) {
  final List<AttributeSource> result = new ArrayList<>();
  final int transitionCount = det.initTransition(state, transition);
  for (int t = 0; t < transitionCount; t++) {
    det.getNextTransition(transition);
    // Tokens are indexed by transition label; collect the inclusive range.
    for (int label = transition.min; label <= transition.max; label++) {
      result.add(this.tokens[label]);
    }
  }
  return result;
}
Example #20
Source File: LegacyNumericRangeQuery.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override @SuppressWarnings("unchecked") protected TermsEnum getTermsEnum(final Terms terms, AttributeSource atts) throws IOException { // very strange: java.lang.Number itself is not Comparable, but all subclasses used here are if (min != null && max != null && ((Comparable<T>) min).compareTo(max) > 0) { return TermsEnum.EMPTY; } return new NumericRangeTermsEnum(terms.iterator()); }
Example #21
Source File: AnalysisRequestHandlerBase.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Creates a new ListBasedTokenStream which uses the given tokens as its token source.
 *
 * @param attributeSource source of the attribute factory and attribute impls
 * @param tokens Source of tokens to be used
 */
ListBasedTokenStream(AttributeSource attributeSource, List<AttributeSource> tokens) {
  super(attributeSource.getAttributeFactory());
  // Mirror every attribute of the source onto this stream before storing tokens.
  addAttributes(attributeSource);
  this.tokens = tokens;
}
Example #22
Source File: AnalysisRequestHandlerBase.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public boolean incrementToken() { if (tokenIterator.hasNext()) { clearAttributes(); AttributeSource next = tokenIterator.next(); addAttributes(next); // just in case there were delayed attribute additions next.copyTo(this); return true; } else { return false; } }
Example #23
Source File: AnalysisRequestHandlerBase.java From lucene-solr with Apache License 2.0 | 5 votes |
protected void addAttributes(AttributeSource attributeSource) { // note: ideally we wouldn't call addAttributeImpl which is marked internal. But nonetheless it's possible // this method is used by some custom attributes, especially since Solr doesn't provide a way to customize the // AttributeFactory which is the recommended way to choose which classes implement which attributes. Iterator<AttributeImpl> atts = attributeSource.getAttributeImplsIterator(); while (atts.hasNext()) { addAttributeImpl(atts.next()); // adds both impl & interfaces } }
Example #24
Source File: PreAnalyzedField.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public final boolean incrementToken() { if (!it.hasNext()) { return false; } AttributeSource.State state = it.next(); restoreState(state.clone()); // TODO: why can't I lookup the OffsetAttribute up in ctor instead? lastEndOffset = addAttribute(OffsetAttribute.class).endOffset(); return true; }
Example #25
Source File: Zemberek2DeASCIIfyFilterFactory.java From lucene-solr-analysis-turkish with Apache License 2.0 | 5 votes |
/**
 * Emits a de-ASCIIfied variant stacked on the original token's position.
 *
 * @param synonym replacement term text
 * @param current saved attribute state of the original token to restore first
 * @return always true (a token was produced)
 */
private boolean createToken(String synonym, AttributeSource.State current) {
  // Restore the original token's full state, then overwrite term/type.
  restoreState(current);
  termAttribute.setEmpty().append(synonym);
  typeAtt.setType(DEASCII_TOKEN_TYPE);
  // posInc=0 stacks this token on the same position as the original.
  posIncrAtt.setPositionIncrement(0);
  return true;
}
Example #26
Source File: OpenNLPPOSTaggerFilter.java From jate with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * On the first call, buffers the whole document's tokens (via walkTokens) and
 * POS-tags them; subsequent calls replay one buffered token per call, attaching
 * its POS tag to the payload metadata.
 */
@Override
public boolean incrementToken() throws IOException {
  //clearAttributes();
  if (first) {
    //gather all tokens from doc
    String[] words = walkTokens();
    if (words.length == 0) {
      return false;
    }
    //tagging
    posTags = createTags(words);
    first = false;
    tokenIdx = 0;
  }
  // All buffered tokens consumed: reset for potential reuse and signal end.
  if (tokenIdx == tokenAttrs.size()) {
    resetParams();
    return false;
  }
  AttributeSource as = tokenAttrs.get(tokenIdx);
  // Make sure this stream declares every attribute class of the saved token
  // before copying, otherwise copyTo would fail on unknown attributes.
  Iterator<? extends Class<? extends Attribute>> it = as.getAttributeClassesIterator();
  while (it.hasNext()) {
    Class<? extends Attribute> attrClass = it.next();
    if (!hasAttribute(attrClass)) {
      addAttribute(attrClass);
    }
  }
  as.copyTo(this);
  // Merge the POS tag into the (possibly pre-existing) payload metadata.
  MWEMetadata metadata = exitingPayload.getPayload() == null ? new MWEMetadata() : MWEMetadata.deserialize(exitingPayload.getPayload().utf8ToString());
  metadata.addMetaData(MWEMetadataType.POS, posTags[tokenIdx]);
  exitingPayload.setPayload(new BytesRef(MWEMetadata.serialize(metadata)));
  tokenIdx++;
  return true;
}
Example #27
Source File: MtasPreAnalyzedParser.java From mtas with Apache License 2.0 | 4 votes |
/**
 * Parses a serialized MtasUpdateRequestProcessorResult from the reader, filling
 * the parent AttributeSource per item and capturing one state per token into the
 * ParseResult. Stored string/binary values are copied when at least one item exists.
 *
 * @param reader serialized result to consume fully
 * @param parent attribute source used as a scratch area for building states
 * @return the parse result; states empty and str/bin null when there are no items
 * @throws IOException declared by the interface; read errors inside the try are
 *         logged at debug level and otherwise ignored
 */
@Override
public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
  ParseResult res = new ParseResult();
  // get MtasUpdateRequestProcessorResult
  // Read the entire reader into a string buffer first.
  StringBuilder sb = new StringBuilder();
  char[] buf = new char[128];
  int cnt;
  while ((cnt = reader.read(buf)) > 0) {
    sb.append(buf, 0, cnt);
  }
  Iterator<MtasUpdateRequestProcessorResultItem> iterator;
  try (
      MtasUpdateRequestProcessorResultReader result = new MtasUpdateRequestProcessorResultReader(
          sb.toString());) {
    iterator = result.getIterator();
    if (iterator != null && iterator.hasNext()) {
      res.str = result.getStoredStringValue();
      res.bin = result.getStoredBinValue();
    } else {
      // Nothing to parse: return an empty result.
      res.str = null;
      res.bin = null;
      result.close();
      return res;
    }
    parent.clearAttributes();
    while (iterator.hasNext()) {
      MtasUpdateRequestProcessorResultItem item = iterator.next();
      // Each non-null field populates the corresponding attribute on 'parent'.
      if (item.tokenTerm != null) {
        CharTermAttribute catt = parent.addAttribute(CharTermAttribute.class);
        catt.append(item.tokenTerm);
      }
      if (item.tokenFlags != null) {
        FlagsAttribute flags = parent.addAttribute(FlagsAttribute.class);
        flags.setFlags(item.tokenFlags);
      }
      if (item.tokenPosIncr != null) {
        PositionIncrementAttribute patt = parent
            .addAttribute(PositionIncrementAttribute.class);
        patt.setPositionIncrement(item.tokenPosIncr);
      }
      if (item.tokenPayload != null) {
        PayloadAttribute p = parent.addAttribute(PayloadAttribute.class);
        p.setPayload(new BytesRef(item.tokenPayload));
      }
      if (item.tokenOffsetStart != null && item.tokenOffsetEnd != null) {
        OffsetAttribute offset = parent.addAttribute(OffsetAttribute.class);
        offset.setOffset(item.tokenOffsetStart, item.tokenOffsetEnd);
      }
      // capture state and add to result
      State state = parent.captureState();
      res.states.add(state.clone());
      // reset for reuse
      parent.clearAttributes();
    }
  } catch (IOException e) {
    // ignore
    log.debug(e);
  }
  return res;
}
Example #28
Source File: SingleTokenTokenizer.java From attic-polygene-java with Apache License 2.0 | 4 votes |
/**
 * Creates the tokenizer, delegating attribute storage to the given source.
 *
 * @param source attribute source shared with this tokenizer
 * @param in the character stream to tokenize
 */
public SingleTokenTokenizer( AttributeSource source, Reader in )
{
    super( source, in );
}
Example #29
Source File: STMergingTermsEnum.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Per-enum attributes are not supported by this merging view. */
@Override
public AttributeSource attributes() {
  throw new UnsupportedOperationException();
}
Example #30
Source File: SolrRangeQuery.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Delegates to the wrapped TermsEnum's attribute source. */
@Override
public AttributeSource attributes() {
  return te.attributes();
}