Java Code Examples for org.apache.lucene.index.IndexableField#tokenStream()
The following examples show how to use org.apache.lucene.index.IndexableField#tokenStream().
Each example is taken from an open-source project; the source file and license are noted above the code.
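Before the project examples, here is a minimal sketch of the basic call pattern: build a TokenStream from a field with IndexableField#tokenStream(Analyzer, TokenStream) and drain it with the standard reset / incrementToken / end contract. The class name, field name, sample text, and analyzer choice are illustrative assumptions, not taken from any of the projects below.

// A minimal sketch; names and setup are hypothetical.
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexableField;

import java.io.IOException;

public class TokenStreamSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        Document doc = new Document();
        doc.add(new TextField("body", "A quick example sentence", Field.Store.NO));

        for (IndexableField field : doc.getFields()) {
            // Passing null as the reuse argument asks the field to build a fresh TokenStream.
            try (TokenStream stream = field.tokenStream(analyzer, null)) {
                if (stream == null) {
                    continue; // non-tokenized fields may return null
                }
                CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();                      // must be called before incrementToken()
                while (stream.incrementToken()) {
                    System.out.println(termAtt.toString());
                }
                stream.end();                        // consume end-of-stream attributes
            }
        }
        analyzer.close();
    }
}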
Example 1
Source File: SingleDocumentPercolatorIndex.java From Elasticsearch with Apache License 2.0
@Override
public void prepare(PercolateContext context, ParsedDocument parsedDocument) {
    MemoryIndex memoryIndex = cache.get();
    for (IndexableField field : parsedDocument.rootDoc().getFields()) {
        if (field.fieldType().indexOptions() == IndexOptions.NONE && field.name().equals(UidFieldMapper.NAME)) {
            continue;
        }
        try {
            Analyzer analyzer = context.mapperService().documentMapper(parsedDocument.type()).mappers().indexAnalyzer();
            // TODO: instead of passing null here, we can have a CTL<Map<String,TokenStream>> and pass previous,
            // like the indexer does
            try (TokenStream tokenStream = field.tokenStream(analyzer, null)) {
                if (tokenStream != null) {
                    memoryIndex.addField(field.name(), tokenStream, field.boost());
                }
            }
        } catch (Exception e) {
            throw new ElasticsearchException("Failed to create token stream for [" + field.name() + "]", e);
        }
    }
    context.initialize(new DocEngineSearcher(memoryIndex), parsedDocument);
}
Example 2
Source File: ReadTokensTask.java From lucene-solr with Apache License 2.0
@Override
public int doLogic() throws Exception {
    List<IndexableField> fields = doc.getFields();
    Analyzer analyzer = getRunData().getAnalyzer();
    int tokenCount = 0;
    for (final IndexableField field : fields) {
        if (field.fieldType().indexOptions() == IndexOptions.NONE ||
            field.fieldType().tokenized() == false) {
            continue;
        }
        final TokenStream stream = field.tokenStream(analyzer, null);
        // reset the TokenStream to the first token
        stream.reset();
        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        while (stream.incrementToken()) {
            termAtt.getBytesRef();
            tokenCount++;
        }
        stream.end();
        stream.close();
    }
    totalTokenCount += tokenCount;
    return tokenCount;
}
Example 3
Source File: SimpleNaiveBayesDocumentClassifier.java From lucene-solr with Apache License 2.0
/**
 * This method performs the analysis for the seed document and extracts the boosts if present.
 * This is done only one time for the seed document.
 *
 * @param inputDocument         the seed unseen document
 * @param fieldName2tokensArray a map that associates to a field name the list of token arrays for all its values
 * @param fieldName2boost       a map that associates the boost to the field
 * @throws IOException If there is a low-level I/O error
 */
private void analyzeSeedDocument(Document inputDocument, Map<String, List<String[]>> fieldName2tokensArray,
                                 Map<String, Float> fieldName2boost) throws IOException {
    for (int i = 0; i < textFieldNames.length; i++) {
        String fieldName = textFieldNames[i];
        float boost = 1;
        List<String[]> tokenizedValues = new LinkedList<>();
        if (fieldName.contains("^")) {
            String[] field2boost = fieldName.split("\\^");
            fieldName = field2boost[0];
            boost = Float.parseFloat(field2boost[1]);
        }
        IndexableField[] fieldValues = inputDocument.getFields(fieldName);
        for (IndexableField fieldValue : fieldValues) {
            TokenStream fieldTokens = fieldValue.tokenStream(field2analyzer.get(fieldName), null);
            String[] fieldTokensArray = getTokenArray(fieldTokens);
            tokenizedValues.add(fieldTokensArray);
        }
        fieldName2tokensArray.put(fieldName, tokenizedValues);
        fieldName2boost.put(fieldName, boost);
        textFieldNames[i] = fieldName;
    }
}
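The getTokenArray helper called above is not shown in this excerpt. A sketch of what such a helper typically looks like follows: it drains the TokenStream returned by tokenStream() into a String[] using the reset / incrementToken / end / close contract. This is an illustrative reconstruction, not necessarily the exact code from SimpleNaiveBayesDocumentClassifier.

// Hedged sketch of a getTokenArray-style helper (assumed signature).
private String[] getTokenArray(TokenStream tokenizedText) throws IOException {
    List<String> tokens = new ArrayList<>();
    CharTermAttribute termAtt = tokenizedText.addAttribute(CharTermAttribute.class);
    tokenizedText.reset();
    while (tokenizedText.incrementToken()) {
        tokens.add(termAtt.toString());
    }
    tokenizedText.end();
    tokenizedText.close();
    return tokens.toArray(new String[0]);
}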