org.apache.lucene.util.AttributeFactory Java Examples
The following examples show how to use org.apache.lucene.util.AttributeFactory.
Each example is taken from an open-source project; the source file, project, and license are noted above the code.
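As background for the examples below, here is a minimal, self-contained sketch of the usual pattern: obtain a factory (most code simply uses AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY), hand it to a Tokenizer, and read tokens through an attribute. This assumes a recent Lucene analyzers-common on the classpath; the input string is arbitrary.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class AttributeFactoryDemo {
  public static void main(String[] args) throws IOException {
    // The default factory creates the standard attribute implementations.
    AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(factory);
    tokenizer.setReader(new StringReader("hello attribute factory"));
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(termAtt.toString());
    }
    tokenizer.end();
    tokenizer.close();
  }
}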
Example #1
Source File: TestConcatenatingTokenStream.java From lucene-solr with Apache License 2.0
public void testInconsistentAttributes() throws IOException {
  AttributeFactory factory = newAttributeFactory();

  final MockTokenizer first = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
  first.setReader(new StringReader("first words "));
  first.addAttribute(PayloadAttribute.class);
  final MockTokenizer second = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
  second.setReader(new StringReader("second words"));
  second.addAttribute(FlagsAttribute.class);

  TokenStream ts = new ConcatenatingTokenStream(first, second);
  assertTrue(ts.hasAttribute(FlagsAttribute.class));
  assertTrue(ts.hasAttribute(PayloadAttribute.class));

  assertTokenStreamContents(ts,
      new String[] { "first", "words", "second", "words" },
      new int[] { 0, 6, 12, 19, },
      new int[] { 5, 11, 18, 24, });
}
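The design point this test exercises: ConcatenatingTokenStream takes the union of its inputs' attributes, so the PayloadAttribute registered only on first and the FlagsAttribute registered only on second are both visible on the concatenated stream.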
Example #2
Source File: SegmentationIcuTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testHugeTerm2() throws Exception {
  StringBuilder sb = new StringBuilder();
  for (int i = 0; i < 40960; i++) {
    sb.append('a');
  }
  String input = sb.toString();
  IcuTokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
      new DefaultIcuTokenizerConfig(false, true));
  tokenizer.setReader(new StringReader(input));
  char token[] = new char[4096];
  Arrays.fill(token, 'a');
  String expectedToken = new String(token);
  String expected[] = {
      expectedToken, expectedToken, expectedToken, expectedToken, expectedToken,
      expectedToken, expectedToken, expectedToken, expectedToken, expectedToken
  };
  assertTokenStreamContents(tokenizer, expected);
}
Example #3
Source File: QueryParserImpl.java From AdSearch_Endpoints with Apache License 2.0
@Override
public List<String> parseQuery(String queryStr) {
  // tokenize queryStr, remove stop word, stemming
  List<String> tokens = new ArrayList<String>();
  AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
  Tokenizer tokenizer = new StandardTokenizer(factory);
  tokenizer.setReader(new StringReader(queryStr));
  CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
  TokenStream tokenStream = new StopFilter(tokenizer, stopWords);
  // StringBuilder sb = new StringBuilder();
  CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
  try {
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      String term = charTermAttribute.toString();
      tokens.add(term);
      // sb.append(term + " ");
    }
    tokenStream.end();
    tokenStream.close();
    tokenizer.close();
  } catch (IOException e) {
    e.printStackTrace();
  }
  // System.out.println("QU=" + sb.toString());
  return tokens;
}
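A quick usage sketch (hedged: the surrounding QueryParserImpl class and a no-arg constructor are assumed). Note that despite the comment in the source, this implementation only removes stop words; no stemmer is attached.

QueryParserImpl parser = new QueryParserImpl();
// "the" is in EnglishAnalyzer's default stop set, so it is dropped:
List<String> tokens = parser.parseQuery("the quick brown fox");
// tokens -> [quick, brown, fox]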
Example #4
Source File: MMSegTokenizerFactory.java From mmseg4j-solr with Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
  MMSegTokenizer tokenizer = tokenizerLocal.get();
  if (tokenizer == null) {
    tokenizer = newTokenizer();
  }
  return tokenizer;
}
Example #5
Source File: NumericTokenizer.java From Elasticsearch with Apache License 2.0
/** Make this tokenizer get attributes from the delegate token stream. */
private static final AttributeFactory delegatingAttributeFactory(final AttributeSource source) {
  return new AttributeFactory() {
    @Override
    public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
      return (AttributeImpl) source.addAttribute(attClass);
    }
  };
}
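What the delegating factory buys you, as a hedged sketch (WhitespaceTokenizer stands in here for the numeric tokenizer): every attribute the inner tokenizer asks for is materialized on the outer AttributeSource, so both sides observe the same attribute instances.

AttributeSource source = new AttributeSource();
Tokenizer inner = new WhitespaceTokenizer(delegatingAttributeFactory(source));
CharTermAttribute innerTerm = inner.addAttribute(CharTermAttribute.class);
CharTermAttribute outerTerm = source.addAttribute(CharTermAttribute.class);
// Both handles point at the same CharTermAttributeImpl, so state is shared.
assert innerTerm == outerTerm;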
Example #6
Source File: PathHierarchyTokenizerFactory.java From lucene-solr with Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
  if (reverse) {
    return new ReversePathHierarchyTokenizer(factory, delimiter, replacement, skip);
  }
  return new PathHierarchyTokenizer(factory, delimiter, replacement, skip);
}
Example #7
Source File: IcuTokenizerCJKTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private static Analyzer create() {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      return new TokenStreamComponents(new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
          new DefaultIcuTokenizerConfig(true, true)));
    }
  };
}
Example #8
Source File: OpenNLPTokenizer.java From jate with GNU Lesser General Public License v3.0
public OpenNLPTokenizer(AttributeFactory factory, SentenceDetector sentenceOp,
                        opennlp.tools.tokenize.Tokenizer tokenizerOp, ParagraphChunker paragraphOp) {
  super(factory);
  termAtt.resizeBuffer(DEFAULT_BUFFER_SIZE);
  if (sentenceOp == null && tokenizerOp == null) {
    throw new IllegalArgumentException("OpenNLPTokenizer: need one or both of Sentence Detector and Tokenizer");
  }
  this.sentenceOp = sentenceOp;
  this.tokenizerOp = tokenizerOp;
  this.paragraphOp = paragraphOp;
}
Example #9
Source File: SegmentationIcuTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private static Analyzer createAnalyzer() {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
          new DefaultIcuTokenizerConfig(false, true));
      TokenFilter filter = new IcuNormalizerFilter(tokenizer,
          Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
      return new TokenStreamComponents(tokenizer, filter);
    }
  };
}
Example #10
Source File: MtasTokenizerFactory.java From mtas with Apache License 2.0
@Override
public MtasTokenizer create(AttributeFactory factory) {
  MtasTokenizer tokenizer = null;
  try {
    tokenizer = create(factory, null);
  } catch (IOException e) {
    log.error(e);
  }
  return tokenizer;
}
Example #11
Source File: JapaneseTokenizer.java From lucene-solr with Apache License 2.0
/**
 * Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.
 *
 * @param factory the AttributeFactory to use
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer(AttributeFactory factory, UserDictionary userDictionary,
                         boolean discardPunctuation, Mode mode) {
  this(factory,
       TokenInfoDictionary.getInstance(),
       UnknownDictionary.getInstance(),
       ConnectionCosts.getInstance(),
       userDictionary, discardPunctuation, true, mode);
}
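A construction sketch for this entry point (hedged: the argument values are illustrative defaults): no user dictionary, punctuation discarded, search-mode segmentation.

Tokenizer tokenizer = new JapaneseTokenizer(
    AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
    null,                          // no user dictionary
    true,                          // discard punctuation
    JapaneseTokenizer.Mode.SEARCH);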
Example #12
Source File: MMSegTokenizerFactory.java From jstarcraft-nlp with Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
  MmsegTokenizer tokenizer = tokenizerLocal.get();
  if (tokenizer == null) {
    tokenizer = newTokenizer();
  }
  return tokenizer;
}
Example #13
Source File: JapaneseTokenizerFactory.java From lucene-solr with Apache License 2.0
@Override
public JapaneseTokenizer create(AttributeFactory factory) {
  JapaneseTokenizer t = new JapaneseTokenizer(factory, userDictionary, discardPunctuation,
      discardCompoundToken, mode);
  if (nbestExamples != null) {
    nbestCost = Math.max(nbestCost, t.calcNBestCost(nbestExamples));
  }
  t.setNBestCost(nbestCost);
  return t;
}
Example #14
Source File: OpenNLPTokenizerFactory.java From lucene-solr with Apache License 2.0
@Override
public OpenNLPTokenizer create(AttributeFactory factory) {
  try {
    NLPSentenceDetectorOp sentenceOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
    NLPTokenizerOp tokenizerOp = OpenNLPOpsFactory.getTokenizer(tokenizerModelFile);
    return new OpenNLPTokenizer(factory, sentenceOp, tokenizerOp);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
Example #15
Source File: WhitespaceTokenizerFactory.java From lucene-solr with Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
  switch (rule) {
    case RULE_JAVA:
      return new WhitespaceTokenizer(factory, maxTokenLen);
    case RULE_UNICODE:
      return new UnicodeWhitespaceTokenizer(factory, maxTokenLen);
    default:
      throw new AssertionError();
  }
}
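Driving this factory programmatically, as a hedged sketch (the rule and maxTokenLen keys mirror the factory's configuration options; depending on the Lucene version a luceneMatchVersion argument may also be expected):

Map<String, String> args = new HashMap<>();
args.put("rule", "unicode");      // RULE_UNICODE -> UnicodeWhitespaceTokenizer
args.put("maxTokenLen", "256");
WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(args);
Tokenizer tokenizer = factory.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);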
Example #16
Source File: ThaiTokenizer.java From lucene-solr with Apache License 2.0
/** Creates a new ThaiTokenizer, supplying the AttributeFactory */
public ThaiTokenizer(AttributeFactory factory) {
  super(factory, (BreakIterator) sentenceProto.clone());
  if (!DBBI_AVAILABLE) {
    throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
  }
  wordBreaker = (BreakIterator) proto.clone();
}
Example #17
Source File: SegmentationIcuTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testHugeDoc() throws Exception {
  StringBuilder sb = new StringBuilder();
  char whitespace[] = new char[4094];
  Arrays.fill(whitespace, ' ');
  sb.append(whitespace);
  sb.append("testing 1234");
  String input = sb.toString();
  IcuTokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
      new DefaultIcuTokenizerConfig(false, true));
  tokenizer.setReader(new StringReader(input));
  assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
Example #18
Source File: SimplePatternTokenizer.java From lucene-solr with Apache License 2.0
/** Runs a pre-built automaton. */
public SimplePatternTokenizer(AttributeFactory factory, Automaton dfa) {
  super(factory);

  // we require user to do this up front because it is a possibly very costly operation, and user
  // may be creating us frequently, not realizing this ctor is otherwise trappy
  if (dfa.isDeterministic() == false) {
    throw new IllegalArgumentException("please determinize the incoming automaton first");
  }

  runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
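A usage sketch for the constructor above (assuming Lucene's RegExp and Operations automaton utilities): build the automaton once, determinize it if needed, and reuse it across tokenizer instances.

Automaton automaton = new RegExp("[a-zA-Z0-9]+").toAutomaton();
if (automaton.isDeterministic() == false) {
  automaton = Operations.determinize(automaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
Tokenizer tokenizer = new SimplePatternTokenizer(
    AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, automaton);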
Example #19
Source File: Test2BTerms.java From lucene-solr with Apache License 2.0
public MyTokenStream(Random random, int tokensPerDoc) {
  super(new MyAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY));
  this.tokensPerDoc = tokensPerDoc;
  addAttribute(TermToBytesRefAttribute.class);
  bytes.length = TOKEN_LEN;
  this.random = random;
  nextSave = TestUtil.nextInt(random, 500000, 1000000);
}
Example #20
Source File: TestConcatenatingTokenStream.java From lucene-solr with Apache License 2.0
public void testBasic() throws IOException {
  AttributeFactory factory = newAttributeFactory();

  final MockTokenizer first = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
  first.setReader(new StringReader("first words "));
  final MockTokenizer second = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
  second.setReader(new StringReader("second words"));
  final MockTokenizer third = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
  third.setReader(new StringReader(" third words"));

  TokenStream ts = new ConcatenatingTokenStream(first, second, new EmptyTokenStream(), third);

  assertTokenStreamContents(ts,
      new String[] { "first", "words", "second", "words", "third", "words" },
      new int[] { 0, 6, 12, 19, 25, 31 },
      new int[] { 5, 11, 18, 24, 30, 36 });

  // test re-use
  first.setReader(new StringReader("first words "));
  second.setReader(new StringReader("second words"));
  third.setReader(new StringReader(" third words"));
  assertTokenStreamContents(ts,
      new String[] { "first", "words", "second", "words", "third", "words" },
      new int[] { 0, 6, 12, 19, 25, 31 },
      new int[] { 5, 11, 18, 24, 30, 36 },
      new int[] { 1, 1, 1, 1, 1, 1 });
}
Example #21
Source File: SimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0
/** Runs a pre-built automaton. */
public SimplePatternSplitTokenizer(AttributeFactory factory, Automaton dfa) {
  super(factory);

  // we require user to do this up front because it is a possibly very costly operation, and user
  // may be creating us frequently, not realizing this ctor is otherwise trappy
  if (dfa.isDeterministic() == false) {
    throw new IllegalArgumentException("please determinize the incoming automaton first");
  }

  runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
Example #22
Source File: KoreanTokenizer.java From lucene-solr with Apache License 2.0
/**
 * <p>Create a new KoreanTokenizer supplying a custom system dictionary and unknown dictionary.
 * This constructor provides an entry point for users that want to construct custom language models
 * that can be used as input to {@link org.apache.lucene.analysis.ko.util.DictionaryBuilder}.</p>
 *
 * @param factory the AttributeFactory to use
 * @param systemDictionary a custom known token dictionary
 * @param unkDictionary a custom unknown token dictionary
 * @param connectionCosts custom token transition costs
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param mode Decompound mode.
 * @param outputUnknownUnigrams if true outputs unigrams for unknown words.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @lucene.experimental
 */
public KoreanTokenizer(AttributeFactory factory, TokenInfoDictionary systemDictionary,
                       UnknownDictionary unkDictionary, ConnectionCosts connectionCosts,
                       UserDictionary userDictionary, DecompoundMode mode,
                       boolean outputUnknownUnigrams, boolean discardPunctuation) {
  super(factory);
  this.dictionary = systemDictionary;
  this.fst = dictionary.getFST();
  this.unkDictionary = unkDictionary;
  this.characterDefinition = unkDictionary.getCharacterDefinition();
  this.costs = connectionCosts;
  this.userDictionary = userDictionary;
  fstReader = fst.getBytesReader();
  if (userDictionary != null) {
    userFST = userDictionary.getFST();
    userFSTReader = userFST.getBytesReader();
  } else {
    userFST = null;
    userFSTReader = null;
  }
  this.mode = mode;
  this.outputUnknownUnigrams = outputUnknownUnigrams;
  this.discardPunctuation = discardPunctuation;
  buffer.reset(this.input);
  resetState();
  dictionaryMap.put(Type.KNOWN, dictionary);
  dictionaryMap.put(Type.UNKNOWN, unkDictionary);
  dictionaryMap.put(Type.USER, userDictionary);
}
Example #23
Source File: CJKBigramFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
@Before
public void up() {
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
          new DefaultIcuTokenizerConfig(false, true));
      TokenStream result = new CJKBigramFilter(source);
      return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET));
    }
  };
}
Example #24
Source File: OpenNLPTokenizer.java From lucene-solr with Apache License 2.0
public OpenNLPTokenizer(AttributeFactory factory, NLPSentenceDetectorOp sentenceOp,
                        NLPTokenizerOp tokenizerOp) throws IOException {
  super(factory, new OpenNLPSentenceBreakIterator(sentenceOp));
  if (sentenceOp == null || tokenizerOp == null) {
    throw new IllegalArgumentException("OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
  }
  this.sentenceOp = sentenceOp;
  this.tokenizerOp = tokenizerOp;
}
Example #25
Source File: NGramTokenizer.java From lucene-solr with Apache License 2.0
NGramTokenizer(AttributeFactory factory, int minGram, int maxGram, boolean edgesOnly) {
  super(factory);
  init(minGram, maxGram, edgesOnly);
}
Example #26
Source File: ClassicTokenizer.java From lucene-solr with Apache License 2.0
/**
 * Creates a new ClassicTokenizer with a given {@link org.apache.lucene.util.AttributeFactory}
 */
public ClassicTokenizer(AttributeFactory factory) {
  super(factory);
  init();
}
Example #27
Source File: FieldAnalysisRequestHandlerTest.java From lucene-solr with Apache License 2.0
public CustomTokenizer(AttributeFactory factory) {
  super(factory);
  addAttributeImpl(new CustomFlagsAttributeImpl());
  charAtt = addAttribute(CharTermAttribute.class);
  customAtt = addAttribute(FlagsAttribute.class);
}
Example #28
Source File: ExactTokenizerFactory.java From crushpaper with GNU Affero General Public License v3.0
@Override
public Tokenizer create(AttributeFactory factory, Reader input) {
  return new ExactTokenizer(factory, input);
}
Example #29
Source File: WikipediaTokenizerFactory.java From lucene-solr with Apache License 2.0
@Override
public WikipediaTokenizer create(AttributeFactory factory) {
  return new WikipediaTokenizer(factory, tokenOutput, untokenizedTypes);
}
Example #30
Source File: MockBytesAnalyzer.java From lucene-solr with Apache License 2.0
@Override
protected AttributeFactory attributeFactory(String fieldName) {
  return MockUTF16TermAttributeImpl.UTF16_TERM_ATTRIBUTE_FACTORY;
}
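This overrides Analyzer.attributeFactory(String), the hook that decides which factory an analyzer's token streams use. A hedged sketch of the same idea in user code, where MyTermAttributeImpl is a hypothetical AttributeImpl subclass: AttributeFactory.getStaticImplementation wraps a delegate factory and instantiates the given class for the attribute interfaces that class implements.

Analyzer analyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    return new TokenStreamComponents(new WhitespaceTokenizer(attributeFactory(fieldName)));
  }

  @Override
  protected AttributeFactory attributeFactory(String fieldName) {
    // Delegate everything to the default factory, except attributes
    // implemented by MyTermAttributeImpl (hypothetical).
    return AttributeFactory.getStaticImplementation(
        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, MyTermAttributeImpl.class);
  }
};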