org.apache.lucene.util.AttributeSource Java Examples
The following examples show how to use
org.apache.lucene.util.AttributeSource.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: OpenNLPPOSTaggerFilter.java From jate with GNU Lesser General Public License v3.0 | 6 votes |
/**
 * Consumes the entire upstream TokenStream, recording each token's text and a
 * clone of its full attribute state (appended to {@code tokenAttrs} for later replay).
 *
 * @return the token texts, in stream order (empty array if the stream is empty)
 * @throws IOException if the underlying stream fails
 */
protected String[] walkTokens() throws IOException {
    List<String> wordList = new ArrayList<>();
    while (input.incrementToken()) {
        CharTermAttribute textAtt = input.getAttribute(CharTermAttribute.class);
        // Use the term's own length rather than (endOffset - startOffset): the two
        // differ whenever an upstream filter altered the token text (e.g. stemming),
        // which would read garbage (or past the valid region) from the term buffer.
        String word = new String(textAtt.buffer(), 0, textAtt.length());
        wordList.add(word);
        // Clone the complete attribute state so downstream consumers can replay it.
        tokenAttrs.add(input.cloneAttributes());
    }
    return wordList.toArray(new String[0]);
}
Example #2
Source File: AnalysisImpl.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 * @param result output list that receives one simplified {@code Token} per term
 * @return List of tokens produced from the TokenStream (full attribute clones)
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream, List<Token> result) {
  final List<AttributeSource> tokens = new ArrayList<>();
  try {
    // TokenStream contract: reset() must be called before incrementToken().
    tokenStream.reset();
    CharTermAttribute charAtt = tokenStream.getAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
      // Keep a full clone of the attribute state for the returned list...
      tokens.add(tokenStream.cloneAttributes());
      // ...and also append a simplified Token view to the caller's 'result'.
      List<TokenAttribute> attributes = copyAttributes(tokenStream, charAtt);
      result.add(new Token(charAtt.toString(), attributes));
    }
    tokenStream.end();
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over TokenStream", ioe);
  } finally {
    // Close even on failure, suppressing any secondary close exception.
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return tokens;
}
Example #3
Source File: TestTermAutomatonQuery.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Emits tokens from the input stream, randomly (p=1/10) stacking a synthetic
 * one-letter synonym ('a'..'c') at the same position as the previous token.
 */
@Override
public boolean incrementToken() throws IOException {
  if (synNext) {
    // Re-emit the previous token's state with posInc=0 so the synonym stacks
    // on the same position, then overwrite the term text.
    AttributeSource.State state = captureState();
    clearAttributes();
    restoreState(state);
    posIncAtt.setPositionIncrement(0);
    // The cast must wrap the whole sum: "(char) 97 + n" promotes back to int
    // and would append the digits "97".."99" instead of a letter 'a'..'c'.
    termAtt.append("" + (char) (97 + random().nextInt(3)));
    synNext = false;
    return true;
  }
  if (input.incrementToken()) {
    if (random().nextInt(10) == 8) {
      synNext = true; // schedule a synonym for the next call
    }
    return true;
  } else {
    return false;
  }
}
Example #4
Source File: AnalysisRequestHandlerBase.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 * @return List of tokens produced from the TokenStream, as full attribute clones
 *         with the tracker attribute set to each token's absolute position
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> tokens = new ArrayList<>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      // Accumulate increments into an absolute position and record it on the
      // tracker BEFORE cloning, so the clone carries the position along.
      position += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(position);
      tokens.add(tokenStream.cloneAttributes());
    }
    tokenStream.end(); // TODO should we capture?
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    // Close even on failure, suppressing any secondary close exception.
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return tokens;
}
Example #5
Source File: FuzzyTermsEnum.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Creates the enum, registering boost and automaton attributes on the shared
 * AttributeSource so state can be exchanged with the query rewrite machinery.
 *
 * @param terms the terms to enumerate
 * @param atts shared attribute source used to communicate with the rewrite method
 * @param term the base term being fuzzily matched
 * @param automatonBuilder lazily supplies the automaton construction
 * @throws IOException if term enumeration setup fails
 */
private FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, Supplier<FuzzyAutomatonBuilder> automatonBuilder) throws IOException {
  this.terms = terms;
  this.atts = atts;
  this.term = term;
  // Attributes used to exchange competitive-boost state with the collector/rewrite.
  this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
  this.boostAtt = atts.addAttribute(BoostAttribute.class);
  // Register a concrete impl first, then fetch the interface view and initialize
  // it from the supplied builder.
  atts.addAttributeImpl(new AutomatonAttributeImpl());
  AutomatonAttribute aa = atts.addAttribute(AutomatonAttribute.class);
  aa.init(automatonBuilder);
  this.automata = aa.getAutomata();
  this.termLength = aa.getTermLength();
  // presumably one automaton per edit distance 0..maxEdits — hence length - 1
  this.maxEdits = this.automata.length - 1;
  // Seed the competitive-boost bottom values, then recompute enumeration bounds.
  bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
  bottomTerm = maxBoostAtt.getCompetitiveTerm();
  bottomChanged(null);
}
Example #6
Source File: ConcatenatingTokenStream.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Builds one AttributeSource carrying the union of all sources' attribute
 * classes, seeded from the first stream's cloned state.
 *
 * @param sources the streams to combine; must share compatible attribute impls
 * @throws IllegalArgumentException if the streams' attribute types are incompatible
 */
private static AttributeSource combineSources(TokenStream... sources) {
  AttributeSource base = sources[0].cloneAttributes();
  try {
    for (int i = 1; i < sources.length; i++) {
      // Register every attribute class of this source on the base.
      Iterator<Class<? extends Attribute>> it = sources[i].getAttributeClassesIterator();
      while (it.hasNext()) {
        base.addAttribute(it.next());
      }
      // check attributes can be captured
      sources[i].copyTo(base);
    }
    return base;
  } catch (IllegalArgumentException e) {
    // copyTo throws IAE when impls differ; rethrow with a clearer message.
    throw new IllegalArgumentException("Attempted to concatenate TokenStreams with different attribute types", e);
  }
}
Example #7
Source File: FuzzyQuery.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { if (maxEdits == 0) { // can only match if it's exact return new SingleTermsEnum(terms.iterator(), term.bytes()); } return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions); }
Example #8
Source File: WordDelimiterFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Exchanges the entries for positions i and j across all three parallel arrays. */
@Override
protected void swap(int i, int j) {
  final AttributeSource.State savedState = buffered[i];
  buffered[i] = buffered[j];
  buffered[j] = savedState;

  final int savedStart = startOff[i];
  startOff[i] = startOff[j];
  startOff[j] = savedStart;

  final int savedInc = posInc[i];
  posInc[i] = posInc[j];
  posInc[j] = savedInc;
}
Example #9
Source File: AnalysisImpl.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Replays the next buffered token's attributes onto this stream, if any remain. */
@Override
public boolean incrementToken() {
  if (!tokenIterator.hasNext()) {
    return false;
  }
  clearAttributes();
  AttributeSource source = tokenIterator.next();
  // Register any attribute classes we haven't seen yet, then copy the values.
  addAttributes(source);
  source.copyTo(this);
  return true;
}
Example #10
Source File: AnalysisImpl.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Creates a new ListBasedTokenStream which uses the given tokens as its token source.
 *
 * @param attributeSource source of the attribute factory and attribute impls
 * @param tokens Source of tokens to be used
 */
ListBasedTokenStream(AttributeSource attributeSource, List<AttributeSource> tokens) {
  super(attributeSource.getAttributeFactory());
  // Mirror every attribute of the source onto this stream before storing tokens.
  addAttributes(attributeSource);
  this.tokens = tokens;
}
Example #11
Source File: TermsQuery.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Returns an empty enum for an empty term set, otherwise a seeking enum over it. */
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  return (this.terms.size() == 0)
      ? TermsEnum.EMPTY
      : new SeekingTermSetTermsEnum(terms.iterator(), this.terms, ords);
}
Example #12
Source File: WikipediaTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Collapses a run of same-typed wiki tokens into one untokenized token (set as the
 * current attributes) while ALSO saving each individual token's state into
 * {@code tokens} for later replay by incrementToken().
 *
 * @param tokenType scanner token type that delimits the run
 * @param type token type string applied to each saved token
 * @throws IOException if the scanner fails
 */
private void collapseAndSaveTokens(int tokenType, String type) throws IOException {
  //collapse
  StringBuilder buffer = new StringBuilder(32);
  int numAdded = scanner.setText(buffer);
  //TODO: how to know how much whitespace to add
  int theStart = scanner.yychar();
  int lastPos = theStart + numAdded;
  int tmpTokType;
  int numSeen = 0;
  // Saved per-token states; replayed one at a time by incrementToken().
  List<AttributeSource.State> tmp = new ArrayList<>();
  setupSavedToken(0, type);
  tmp.add(captureState());
  //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
  while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
    int currPos = scanner.yychar();
    //append whitespace
    for (int i = 0; i < (currPos - lastPos); i++){
      buffer.append(' ');
    }
    numAdded = scanner.setText(buffer);
    setupSavedToken(scanner.getPositionIncrement(), type);
    tmp.add(captureState());
    numSeen++;
    lastPos = currPos + numAdded;
  }
  //trim the buffer
  // TODO: this is inefficient
  String s = buffer.toString().trim();
  // The collapsed form becomes the current token, flagged as untokenized.
  termAtt.setEmpty().append(s);
  offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
  flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
  //The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
  if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
    scanner.yypushback(scanner.yylength());
  }
  tokens = tmp.iterator();
}
Example #13
Source File: WikipediaTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Emits the next token: first drains any saved states queued by the collapse
 * methods, then pulls a fresh token from the scanner and dispatches on the
 * configured output mode (plain, collapsed, or both).
 */
@Override
public final boolean incrementToken() throws IOException {
  // Replay tokens saved by collapseAndSaveTokens() before scanning further.
  if (tokens != null && tokens.hasNext()){
    AttributeSource.State state = tokens.next();
    restoreState(state);
    return true;
  }
  clearAttributes();
  int tokenType = scanner.getNextToken();
  if (tokenType == WikipediaTokenizerImpl.YYEOF) {
    return false;
  }
  String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
  if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){
    setupToken();
  } else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){
    collapseTokens(tokenType);
  } else if (tokenOutput == BOTH){
    //collapse into a single token, add it to tokens AND output the individual tokens
    //output the untokenized Token first
    collapseAndSaveTokens(tokenType, type);
  }
  int posinc = scanner.getPositionIncrement();
  if (first && posinc == 0) {
    posinc = 1; // don't emit posinc=0 for the first token!
  }
  posIncrAtt.setPositionIncrement(posinc);
  typeAtt.setType(type);
  first = false;
  return true;
}
Example #14
Source File: TestRandomChains.java From lucene-solr with Apache License 2.0 | 5 votes |
static Object[] newTokenizerArgs(Random random, Class<?>[] paramTypes) { Object[] args = new Object[paramTypes.length]; for (int i = 0; i < args.length; i++) { Class<?> paramType = paramTypes[i]; if (paramType == AttributeSource.class) { // TODO: args[i] = new AttributeSource(); // this is currently too scary to deal with! args[i] = null; // force IAE } else { args[i] = newRandomArg(random, paramType); } } return args; }
Example #15
Source File: TeeSinkTokenFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Replays the next captured attribute state from the tee buffer, if any remain. */
@Override
public final boolean incrementToken() {
  if (it.hasNext()) {
    restoreState(it.next());
    return true;
  }
  return false;
}
Example #16
Source File: MockSynonymFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Mock filter that injects synonyms: "dogs" -> "dog", and the two-token phrase
 * "guinea pig" -> "cavy". Queued tokens (from lookahead) are drained first.
 */
@Override
public final boolean incrementToken() throws IOException {
  // Drain any tokens queued by an earlier lookahead before reading the input.
  if (tokenQueue.size() > 0) {
    tokenQueue.remove(0).copyTo(this);
    return true;
  }
  if (endOfInput == false && input.incrementToken()) {
    if (termAtt.toString().equals("dogs")) {
      addSynonymAndRestoreOrigToken("dog", 1, offsetAtt.endOffset());
    } else if (termAtt.toString().equals("guinea")) {
      // Save "guinea", then look one token ahead to detect the phrase.
      AttributeSource firstSavedToken = cloneAttributes();
      if (input.incrementToken()) {
        if (termAtt.toString().equals("pig")) {
          // "guinea pig": queue a 2-position "cavy" synonym plus the consumed "pig".
          AttributeSource secondSavedToken = cloneAttributes();
          int secondEndOffset = offsetAtt.endOffset();
          firstSavedToken.copyTo(this);
          addSynonym("cavy", 2, secondEndOffset);
          tokenQueue.add(secondSavedToken);
        } else if (termAtt.toString().equals("dogs")) {
          // The lookahead token itself needs the "dogs" -> "dog" treatment.
          tokenQueue.add(cloneAttributes());
          addSynonym("dog", 1, offsetAtt.endOffset());
        }
      } else {
        endOfInput = true;
      }
      // Emit the saved "guinea" now; queued tokens follow on later calls.
      firstSavedToken.copyTo(this);
    }
    return true;
  } else {
    endOfInput = true;
    return false;
  }
}
Example #17
Source File: NumericTokenizer.java From Elasticsearch with Apache License 2.0 | 5 votes |
/** Make this tokenizer get attributes from the delegate token stream. */
private static final AttributeFactory delegatingAttributeFactory(final AttributeSource source) {
  return new AttributeFactory() {
    @Override
    public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
      // Delegate to the source: addAttribute returns the source's (possibly
      // pre-existing) impl, so attribute instances are shared rather than copied.
      return (AttributeImpl) source.addAttribute(attClass);
    }
  };
}
Example #18
Source File: FieldInvertState.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Sets attributeSource to a new instance, re-caching the attribute references
 * used during inversion. No-op when the same source is set again.
 */
void setAttributeSource(AttributeSource attributeSource) {
  if (this.attributeSource != attributeSource) {
    this.attributeSource = attributeSource;
    // getAttribute (vs addAttribute) does not register new attributes —
    // NOTE(review): presumably term bytes and payload are expected to already
    // exist (or be absent) on the source; confirm against AttributeSource docs.
    termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
    // These are added with default impls if the source lacks them.
    termFreqAttribute = attributeSource.addAttribute(TermFrequencyAttribute.class);
    posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
    offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
    payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);
  }
}
Example #19
Source File: GraphTokenStreamFiniteStrings.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Returns the list of tokens that start at the provided state
 */
public List<AttributeSource> getTerms(int state) {
  final List<AttributeSource> result = new ArrayList<>();
  final int transitionCount = det.initTransition(state, transition);
  for (int t = 0; t < transitionCount; t++) {
    det.getNextTransition(transition);
    // Tokens are indexed by transition label; collect the inclusive range.
    for (int label = transition.min; label <= transition.max; label++) {
      result.add(this.tokens[label]);
    }
  }
  return result;
}
Example #20
Source File: LegacyNumericRangeQuery.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override @SuppressWarnings("unchecked") protected TermsEnum getTermsEnum(final Terms terms, AttributeSource atts) throws IOException { // very strange: java.lang.Number itself is not Comparable, but all subclasses used here are if (min != null && max != null && ((Comparable<T>) min).compareTo(max) > 0) { return TermsEnum.EMPTY; } return new NumericRangeTermsEnum(terms.iterator()); }
Example #21
Source File: AnalysisRequestHandlerBase.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Creates a new ListBasedTokenStream which uses the given tokens as its token source.
 *
 * @param attributeSource source of the attribute factory and attribute impls
 * @param tokens Source of tokens to be used
 */
ListBasedTokenStream(AttributeSource attributeSource, List<AttributeSource> tokens) {
  super(attributeSource.getAttributeFactory());
  // Mirror every attribute of the source onto this stream before storing tokens.
  addAttributes(attributeSource);
  this.tokens = tokens;
}
Example #22
Source File: AnalysisRequestHandlerBase.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public boolean incrementToken() { if (tokenIterator.hasNext()) { clearAttributes(); AttributeSource next = tokenIterator.next(); addAttributes(next); // just in case there were delayed attribute additions next.copyTo(this); return true; } else { return false; } }
Example #23
Source File: AnalysisRequestHandlerBase.java From lucene-solr with Apache License 2.0 | 5 votes |
protected void addAttributes(AttributeSource attributeSource) { // note: ideally we wouldn't call addAttributeImpl which is marked internal. But nonetheless it's possible // this method is used by some custom attributes, especially since Solr doesn't provide a way to customize the // AttributeFactory which is the recommended way to choose which classes implement which attributes. Iterator<AttributeImpl> atts = attributeSource.getAttributeImplsIterator(); while (atts.hasNext()) { addAttributeImpl(atts.next()); // adds both impl & interfaces } }
Example #24
Source File: PreAnalyzedField.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public final boolean incrementToken() { if (!it.hasNext()) { return false; } AttributeSource.State state = it.next(); restoreState(state.clone()); // TODO: why can't I lookup the OffsetAttribute up in ctor instead? lastEndOffset = addAttribute(OffsetAttribute.class).endOffset(); return true; }
Example #25
Source File: Zemberek2DeASCIIfyFilterFactory.java From lucene-solr-analysis-turkish with Apache License 2.0 | 5 votes |
/**
 * Emits a de-ASCIIfied variant stacked on the original token's position.
 *
 * @param synonym replacement term text
 * @param current saved attribute state of the original token to restore first
 * @return always true (a token was produced)
 */
private boolean createToken(String synonym, AttributeSource.State current) {
  // Restore the original token's full state, then overwrite term/type.
  restoreState(current);
  termAttribute.setEmpty().append(synonym);
  typeAtt.setType(DEASCII_TOKEN_TYPE);
  // posInc=0 stacks this token on the same position as the original.
  posIncrAtt.setPositionIncrement(0);
  return true;
}
Example #26
Source File: OpenNLPPOSTaggerFilter.java From jate with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * On the first call, buffers the whole document's tokens (via walkTokens) and
 * POS-tags them; subsequent calls replay one buffered token per call, attaching
 * its POS tag to the payload metadata.
 */
@Override
public boolean incrementToken() throws IOException {
  //clearAttributes();
  if (first) {
    //gather all tokens from doc
    String[] words = walkTokens();
    if (words.length == 0) {
      return false;
    }
    //tagging
    posTags = createTags(words);
    first = false;
    tokenIdx = 0;
  }
  // All buffered tokens consumed: reset for potential reuse and signal end.
  if (tokenIdx == tokenAttrs.size()) {
    resetParams();
    return false;
  }
  AttributeSource as = tokenAttrs.get(tokenIdx);
  // Make sure this stream declares every attribute class of the saved token
  // before copying, otherwise copyTo would fail on unknown attributes.
  Iterator<? extends Class<? extends Attribute>> it = as.getAttributeClassesIterator();
  while (it.hasNext()) {
    Class<? extends Attribute> attrClass = it.next();
    if (!hasAttribute(attrClass)) {
      addAttribute(attrClass);
    }
  }
  as.copyTo(this);
  // Merge the POS tag into the (possibly pre-existing) payload metadata.
  MWEMetadata metadata = exitingPayload.getPayload() == null ? new MWEMetadata() : MWEMetadata.deserialize(exitingPayload.getPayload().utf8ToString());
  metadata.addMetaData(MWEMetadataType.POS, posTags[tokenIdx]);
  exitingPayload.setPayload(new BytesRef(MWEMetadata.serialize(metadata)));
  tokenIdx++;
  return true;
}
Example #27
Source File: MtasPreAnalyzedParser.java From mtas with Apache License 2.0 | 4 votes |
/**
 * Parses a serialized MtasUpdateRequestProcessorResult from the reader, filling
 * the parent AttributeSource per item and capturing one state per token into the
 * ParseResult. Stored string/binary values are copied when at least one item exists.
 *
 * @param reader serialized result to consume fully
 * @param parent attribute source used as a scratch area for building states
 * @return the parse result; states empty and str/bin null when there are no items
 * @throws IOException declared by the interface; read errors inside the try are
 *         logged at debug level and otherwise ignored
 */
@Override
public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
  ParseResult res = new ParseResult();
  // get MtasUpdateRequestProcessorResult
  // Read the entire reader into a string buffer first.
  StringBuilder sb = new StringBuilder();
  char[] buf = new char[128];
  int cnt;
  while ((cnt = reader.read(buf)) > 0) {
    sb.append(buf, 0, cnt);
  }
  Iterator<MtasUpdateRequestProcessorResultItem> iterator;
  try (
      MtasUpdateRequestProcessorResultReader result = new MtasUpdateRequestProcessorResultReader(
          sb.toString());) {
    iterator = result.getIterator();
    if (iterator != null && iterator.hasNext()) {
      res.str = result.getStoredStringValue();
      res.bin = result.getStoredBinValue();
    } else {
      // Nothing to parse: return an empty result.
      res.str = null;
      res.bin = null;
      result.close();
      return res;
    }
    parent.clearAttributes();
    while (iterator.hasNext()) {
      MtasUpdateRequestProcessorResultItem item = iterator.next();
      // Each non-null field populates the corresponding attribute on 'parent'.
      if (item.tokenTerm != null) {
        CharTermAttribute catt = parent.addAttribute(CharTermAttribute.class);
        catt.append(item.tokenTerm);
      }
      if (item.tokenFlags != null) {
        FlagsAttribute flags = parent.addAttribute(FlagsAttribute.class);
        flags.setFlags(item.tokenFlags);
      }
      if (item.tokenPosIncr != null) {
        PositionIncrementAttribute patt = parent
            .addAttribute(PositionIncrementAttribute.class);
        patt.setPositionIncrement(item.tokenPosIncr);
      }
      if (item.tokenPayload != null) {
        PayloadAttribute p = parent.addAttribute(PayloadAttribute.class);
        p.setPayload(new BytesRef(item.tokenPayload));
      }
      if (item.tokenOffsetStart != null && item.tokenOffsetEnd != null) {
        OffsetAttribute offset = parent.addAttribute(OffsetAttribute.class);
        offset.setOffset(item.tokenOffsetStart, item.tokenOffsetEnd);
      }
      // capture state and add to result
      State state = parent.captureState();
      res.states.add(state.clone());
      // reset for reuse
      parent.clearAttributes();
    }
  } catch (IOException e) {
    // ignore
    log.debug(e);
  }
  return res;
}
Example #28
Source File: SingleTokenTokenizer.java From attic-polygene-java with Apache License 2.0 | 4 votes |
/**
 * Creates the tokenizer, delegating attribute storage to the given source.
 *
 * @param source attribute source shared with this tokenizer
 * @param in the character stream to tokenize
 */
public SingleTokenTokenizer( AttributeSource source, Reader in )
{
    super( source, in );
}
Example #29
Source File: STMergingTermsEnum.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Per-enum attributes are not supported by this merging view. */
@Override
public AttributeSource attributes() {
  throw new UnsupportedOperationException();
}
Example #30
Source File: SolrRangeQuery.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Delegates to the wrapped TermsEnum's attribute source. */
@Override
public AttributeSource attributes() {
  return te.attributes();
}