Java Code Examples for org.apache.lucene.util.AttributeSource#State
The following examples show how to use
org.apache.lucene.util.AttributeSource#State.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestTermAutomatonQuery.java From lucene-solr with Apache License 2.0 | 6 votes |
@Override public boolean incrementToken() throws IOException { if (synNext) { AttributeSource.State state = captureState(); clearAttributes(); restoreState(state); posIncAtt.setPositionIncrement(0); termAtt.append(""+((char) 97 + random().nextInt(3))); synNext = false; return true; } if (input.incrementToken()) { if (random().nextInt(10) == 8) { synNext = true; } return true; } else { return false; } }
Example 2
Source File: TeeSinkTokenFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public final boolean incrementToken() { if (!it.hasNext()) { return false; } AttributeSource.State state = it.next(); restoreState(state); return true; }
Example 3
Source File: WikipediaTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public final boolean incrementToken() throws IOException { if (tokens != null && tokens.hasNext()){ AttributeSource.State state = tokens.next(); restoreState(state); return true; } clearAttributes(); int tokenType = scanner.getNextToken(); if (tokenType == WikipediaTokenizerImpl.YYEOF) { return false; } String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType]; if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){ setupToken(); } else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){ collapseTokens(tokenType); } else if (tokenOutput == BOTH){ //collapse into a single token, add it to tokens AND output the individual tokens //output the untokenized Token first collapseAndSaveTokens(tokenType, type); } int posinc = scanner.getPositionIncrement(); if (first && posinc == 0) { posinc = 1; // don't emit posinc=0 for the first token! } posIncrAtt.setPositionIncrement(posinc); typeAtt.setType(type); first = false; return true; }
Example 4
Source File: WikipediaTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
private void collapseAndSaveTokens(int tokenType, String type) throws IOException { //collapse StringBuilder buffer = new StringBuilder(32); int numAdded = scanner.setText(buffer); //TODO: how to know how much whitespace to add int theStart = scanner.yychar(); int lastPos = theStart + numAdded; int tmpTokType; int numSeen = 0; List<AttributeSource.State> tmp = new ArrayList<>(); setupSavedToken(0, type); tmp.add(captureState()); //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){ int currPos = scanner.yychar(); //append whitespace for (int i = 0; i < (currPos - lastPos); i++){ buffer.append(' '); } numAdded = scanner.setText(buffer); setupSavedToken(scanner.getPositionIncrement(), type); tmp.add(captureState()); numSeen++; lastPos = currPos + numAdded; } //trim the buffer // TODO: this is inefficient String s = buffer.toString().trim(); termAtt.setEmpty().append(s); offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length())); flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG); //The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos if (tmpTokType != WikipediaTokenizerImpl.YYEOF){ scanner.yypushback(scanner.yylength()); } tokens = tmp.iterator(); }
Example 5
Source File: WordDelimiterFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override protected void swap(int i, int j) { AttributeSource.State tmp = buffered[i]; buffered[i] = buffered[j]; buffered[j] = tmp; int tmp2 = startOff[i]; startOff[i] = startOff[j]; startOff[j] = tmp2; tmp2 = posInc[i]; posInc[i] = posInc[j]; posInc[j] = tmp2; }
Example 6
Source File: PreAnalyzedField.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public final boolean incrementToken() { if (!it.hasNext()) { return false; } AttributeSource.State state = it.next(); restoreState(state.clone()); // TODO: why can't I lookup the OffsetAttribute up in ctor instead? lastEndOffset = addAttribute(OffsetAttribute.class).endOffset(); return true; }
Example 7
Source File: Zemberek2DeASCIIfyFilterFactory.java From lucene-solr-analysis-turkish with Apache License 2.0 | 5 votes |
private boolean createToken(String synonym, AttributeSource.State current) { restoreState(current); termAttribute.setEmpty().append(synonym); typeAtt.setType(DEASCII_TOKEN_TYPE); posIncrAtt.setPositionIncrement(0); return true; }
Example 8
Source File: LookaheadTokenFilter.java From lucene-solr with Apache License 2.0 | 4 votes |
public void add(AttributeSource.State state) { inputTokens.add(state); }
Example 9
Source File: LookaheadTokenFilter.java From lucene-solr with Apache License 2.0 | 4 votes |
public AttributeSource.State nextState() { assert nextRead < inputTokens.size(); return inputTokens.get(nextRead++); }
Example 10
Source File: TestDocumentWriter.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Indexes "a 5 a a" through a filter that injects a synonym token "b" after
 * every real token (posInc 0) and puts a payload only on the first position,
 * then verifies positions and payloads of term "a" via a PostingsEnum.
 *
 * @throws IOException on index I/O failure
 */
public void testTokenReuse() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new TokenFilter(tokenizer) {
        boolean first = true;               // true until the first token is seen
        AttributeSource.State state;        // non-null => emit synonym "b" next

        @Override
        public boolean incrementToken() throws IOException {
          if (state != null) {
            // Emit the pending synonym at the same position, no payload.
            restoreState(state);
            payloadAtt.setPayload(null);
            posIncrAtt.setPositionIncrement(0);
            termAtt.setEmpty().append("b");
            state = null;
            return true;
          }
          boolean hasNext = input.incrementToken();
          if (!hasNext) return false;
          // A digit token sets its own position increment (e.g. "5" => 5).
          if (Character.isDigit(termAtt.buffer()[0])) {
            posIncrAtt.setPositionIncrement(termAtt.buffer()[0] - '0');
          }
          if (first) {
            // set payload on first position only
            payloadAtt.setPayload(new BytesRef(new byte[]{100}));
            first = false;
          }
          // index a "synonym" for every token
          state = captureState();
          return true;
        }

        @Override
        public void reset() throws IOException {
          super.reset();
          first = true;
          state = null;
        }

        final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
        final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
      });
    }
  };

  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  doc.add(newTextField("f1", "a 5 a a", Field.Store.YES));
  writer.addDocument(doc);
  writer.commit();
  SegmentCommitInfo info = writer.newestSegment();
  writer.close();
  SegmentReader reader = new SegmentReader(info, Version.LATEST.major, newIOContext(random()));

  PostingsEnum termPositions = MultiTerms.getTermPostingsEnum(reader, "f1", new BytesRef("a"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  int freq = termPositions.freq();
  assertEquals(3, freq);
  // Positions of "a": 0 (has payload), then 6 and 7 (no payload); the digit
  // token "5" advanced the position by 5.
  assertEquals(0, termPositions.nextPosition());
  assertNotNull(termPositions.getPayload());
  assertEquals(6, termPositions.nextPosition());
  assertNull(termPositions.getPayload());
  assertEquals(7, termPositions.nextPosition());
  assertNull(termPositions.getPayload());
  reader.close();
}
Example 11
Source File: SimplePreAnalyzedParser.java From lucene-solr with Apache License 2.0 | 4 votes |
private static AttributeSource.State createState(AttributeSource a, Tok state, int tokenEnd) { a.clearAttributes(); CharTermAttribute termAtt = a.addAttribute(CharTermAttribute.class); char[] tokChars = state.token.toString().toCharArray(); termAtt.copyBuffer(tokChars, 0, tokChars.length); int tokenStart = tokenEnd - state.token.length(); for (Entry<String, String> e : state.attr.entrySet()) { String k = e.getKey(); if (k.equals("i")) { // position increment int incr = Integer.parseInt(e.getValue()); PositionIncrementAttribute posIncr = a.addAttribute(PositionIncrementAttribute.class); posIncr.setPositionIncrement(incr); } else if (k.equals("s")) { tokenStart = Integer.parseInt(e.getValue()); } else if (k.equals("e")) { tokenEnd = Integer.parseInt(e.getValue()); } else if (k.equals("y")) { TypeAttribute type = a.addAttribute(TypeAttribute.class); type.setType(e.getValue()); } else if (k.equals("f")) { FlagsAttribute flags = a.addAttribute(FlagsAttribute.class); int f = Integer.parseInt(e.getValue(), 16); flags.setFlags(f); } else if (k.equals("p")) { PayloadAttribute p = a.addAttribute(PayloadAttribute.class); byte[] data = hexToBytes(e.getValue()); if (data != null && data.length > 0) { p.setPayload(new BytesRef(data)); } } else { // unknown attribute } } // handle offset attr OffsetAttribute offset = a.addAttribute(OffsetAttribute.class); offset.setOffset(tokenStart, tokenEnd); State resState = a.captureState(); a.clearAttributes(); return resState; }