Java Code Examples for org.apache.lucene.analysis.MockTokenizer#setReader()

The following examples show how to use org.apache.lucene.analysis.MockTokenizer#setReader(). All examples are taken from the lucene-solr project; the source file is noted above each snippet.
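Before the examples, here is a minimal, self-contained sketch of the pattern they all share: construct the tokenizer, call setReader() to supply the input (this must happen before reset()), then run the usual reset/incrementToken/end/close lifecycle. The class name and input string below are illustrative, and the sketch assumes the lucene-test-framework jar (which provides MockTokenizer) is on the classpath.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class MockTokenizerSetReaderSketch {
  public static void main(String[] args) throws IOException {
    // Whitespace-splitting mock tokenizer; the second argument toggles lowercasing.
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    // setReader() supplies the input and must be called before reset().
    tokenizer.setReader(new StringReader("Hello MockTokenizer world"));

    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(termAtt.toString());
    }
    tokenizer.end();
    tokenizer.close();
  }
}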
Example 1
Source File: TestSynonymMapFilter.java    From lucene-solr with Apache License 2.0 (6 votes)
public void testOutputHangsOffEnd() throws Exception {
  b = new SynonymMap.Builder(true);
  final boolean keepOrig = false;
  // b hangs off the end (no input token under it):
  add("a", "a b", keepOrig);
  tokensIn = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  tokensIn.setReader(new StringReader("a"));
  tokensIn.reset();
  assertTrue(tokensIn.incrementToken());
  assertFalse(tokensIn.incrementToken());
  tokensIn.end();
  tokensIn.close();

  tokensOut = new SynonymFilter(tokensIn,
                                b.build(),
                                true);
  termAtt = tokensOut.addAttribute(CharTermAttribute.class);
  posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
  offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
  posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);

  // Make sure endOffset inherits from previous input token:
  verify("a", "a b:1");
}
 
Example 2
Source File: TestTeeSinkTokenFilter.java    From lucene-solr with Apache License 2.0 (6 votes)
public void testMultipleSources() throws Exception {
  final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer1.toString()));
  final TokenStream source1 = new CachingTokenFilter(tee1);

  tee1.addAttribute(CheckClearAttributesAttribute.class);

  MockTokenizer tokenizer = new MockTokenizer(tee1.getAttributeFactory(), MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(buffer2.toString()));
  final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(tokenizer);
  final TokenStream source2 = tee2;

  assertTokenStreamContents(source1, tokens1);
  assertTokenStreamContents(source2, tokens2);

  TokenStream lowerCasing = new LowerCaseFilter(source1);
  String[] lowerCaseTokens = new String[tokens1.length];
  for (int i = 0; i < tokens1.length; i++)
    lowerCaseTokens[i] = tokens1[i].toLowerCase(Locale.ROOT);
  assertTokenStreamContents(lowerCasing, lowerCaseTokens);
}
 
Example 3
Source File: TestCompoundWordTokenFilter.java    From lucene-solr with Apache License 2.0 (6 votes)
public void testReset() throws Exception {
  CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz",
      "Aufgabe", "Überwachung");

  MockTokenizer wsTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wsTokenizer.setEnableChecks(false); // we will reset in a strange place
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
      wsTokenizer, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
  
  CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
  assertTrue(tf.incrementToken());
  assertEquals("Rind", termAtt.toString());
  tf.end();
  tf.close();
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
 
Example 4
Source File: TestCzechStemmer.java    From lucene-solr with Apache License 2.0 (5 votes)
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("hole");
  final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  in.setReader(new StringReader("hole desek"));
  CzechStemFilter filter = new CzechStemFilter(new SetKeywordMarkerFilter(
      in, set));
  assertTokenStreamContents(filter, new String[] { "hole", "desk" });
}
 
Example 5
Source File: TestLimitTokenCountFilterFactory.java    From lucene-solr with Apache License 2.0 (5 votes)
public void test() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(reader);
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = tokenizer;
    stream = tokenFilterFactory("LimitTokenCount",
        LimitTokenCountFilterFactory.MAX_TOKEN_COUNT_KEY, "3",
        LimitTokenCountFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
    ).create(stream);
    assertTokenStreamContents(stream, new String[]{"A1", "B2", "C3"});
  }
}
 
Example 6
Source File: TestConcatenateGraphFilterFactory.java    From lucene-solr with Apache License 2.0 (5 votes)
public void test() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    final String input = "A1 B2 A1 D4 C3";
    Reader reader = new StringReader(input);
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(reader);
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = tokenizer;
    stream = tokenFilterFactory("ConcatenateGraph",
        "tokenSeparator", "\u001F"
    ).create(stream);
    assertTokenStreamContents(stream, new String[]{input.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)});
  }
}
 
Example 7
Source File: TestConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0 (5 votes)
@Test
public void testValidNumberOfExpansions() throws IOException {
  SynonymMap.Builder builder = new SynonymMap.Builder(true);
  for (int i = 0; i < 256; i++) {
    builder.add(new CharsRef("" + (i+1)), new CharsRef("" + (1000 + (i+1))), true);
  }
  StringBuilder valueBuilder = new StringBuilder();
  for (int i = 0 ; i < 8 ; i++) {
    valueBuilder.append(i+1);
    valueBuilder.append(" ");
  }
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  tokenizer.setReader(new StringReader(valueBuilder.toString()));
  @SuppressWarnings("deprecation")
  SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);

  int count;
  try (ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter)) {
    stream.reset();
    ConcatenateGraphFilter.BytesRefBuilderTermAttribute attr = stream.addAttribute(ConcatenateGraphFilter.BytesRefBuilderTermAttribute.class);
    count = 0;
    while (stream.incrementToken()) {
      count++;
      assertNotNull(attr.getBytesRef());
      assertTrue(attr.getBytesRef().length > 0);
    }
  }
  assertEquals(count, 256);
}
 
Example 8
Source File: TestCapitalizationFilter.java    From lucene-solr with Apache License 2.0 (5 votes)
static void assertCapitalizesToKeyword(String input, String expected,
    boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
    Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
    int maxTokenLength) throws IOException {
  final MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
  tokenizer.setReader(new StringReader(input));
  assertCapitalizesTo(tokenizer,
      new String[] { expected }, onlyFirstWord, keep, forceFirstLetter, okPrefix,
      minWordLength, maxWordCount, maxTokenLength);    
}
 
Example 9
Source File: TestCapitalizationFilter.java    From lucene-solr with Apache License 2.0 (5 votes)
static void assertCapitalizesTo(String input, String expected[],
    boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
    Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
    int maxTokenLength) throws IOException {
  final MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(input));
  assertCapitalizesTo(tokenizer,
      expected, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength,
      maxWordCount, maxTokenLength);
}
 
Example 10
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0 (5 votes)
/**
 * Test CommonGramsQueryFilter when first and last words are stopwords.
 */
public void testFirstAndLastStopWord() throws Exception {
  final String input = "the of";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "the_of" });
}
 
Example 11
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0 (5 votes)
/**
 * Test CommonGramsQueryFilter in the case of a single word query
 */
public void testOneWordQuery() throws Exception {
  final String input = "monster";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "monster" });
}
 
Example 12
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0 (5 votes)
/**
 * Test CommonGramsQueryFilter in the case of a single (stop)word query
 */
public void testOneWordQueryStopWord() throws Exception {
  final String input = "the";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "the" });
}
 
Example 13
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0 (5 votes)
/**
 * Test CommonGramsQueryFilter in the case that the first word is a stopword
 */
public void testFirstWordisStopWord() throws Exception {
  final String input = "the dog";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "the_dog" });
}
 
Example 14
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0 (5 votes)
/**
 * Test that CommonGramsFilter works correctly in case-sensitive mode
 */
public void testCaseSensitive() throws Exception {
  final String input = "How The s a brown s cow d like A B thing?";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  TokenFilter cgf = new CommonGramsFilter(wt, commonWords);
  assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
      "s_a", "a", "a_brown", "brown", "brown_s", "s", "s_cow", "cow",
      "cow_d", "d", "d_like", "like", "A", "B", "thing?"});
}
 
Example 15
Source File: TestBulgarianStemmer.java    From lucene-solr with Apache License 2.0 (5 votes)
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("строеве");
  MockTokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenStream.setReader(new StringReader("строевете строеве"));

  BulgarianStemFilter filter = new BulgarianStemFilter(
      new SetKeywordMarkerFilter(tokenStream, set));
  assertTokenStreamContents(filter, new String[] { "строй", "строеве" });
}
 
Example 16
Source File: TestSynonymMapFilter.java    From lucene-solr with Apache License 2.0 (5 votes)
public void testBasic2() throws Exception {
  b = new SynonymMap.Builder(true);
  final boolean keepOrig = false;
  add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
  add("bbb", "bbbb1 bbbb2", keepOrig);
  tokensIn = new MockTokenizer(MockTokenizer.WHITESPACE,
                               true);
  tokensIn.setReader(new StringReader("a"));
  tokensIn.reset();
  assertTrue(tokensIn.incrementToken());
  assertFalse(tokensIn.incrementToken());
  tokensIn.end();
  tokensIn.close();

  tokensOut = new SynonymFilter(tokensIn,
                                   b.build(),
                                   true);
  termAtt = tokensOut.addAttribute(CharTermAttribute.class);
  posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
  posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);
  offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);

  if (keepOrig) {
    verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold");
    verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold");
  } else {
    verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold");
    verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold");
  }
}
 
Example 17
Source File: SolrTestCaseJ4.java    From lucene-solr with Apache License 2.0 (4 votes)
protected static MockTokenizer whitespaceMockTokenizer(Reader input) throws IOException {
  MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  mockTokenizer.setReader(input);
  return mockTokenizer;
}
 
Example 18
Source File: TestIndexWriter.java    From lucene-solr with Apache License 2.0 (4 votes)
public void testIndexStoreCombos() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
  byte[] b = new byte[50];
  for(int i=0;i<50;i++)
    b[i] = (byte) (i+77);

  Document doc = new Document();

  FieldType customType = new FieldType(StoredField.TYPE);
  customType.setTokenized(true);

  Field f = new Field("binary", b, 10, 17, customType);
  // TODO: this is evil, changing the type after creating the field:
  customType.setIndexOptions(IndexOptions.DOCS);
  final MockTokenizer doc1field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc1field1.setReader(new StringReader("doc1field1"));
  f.setTokenStream(doc1field1);

  FieldType customType2 = new FieldType(TextField.TYPE_STORED);

  Field f2 = newField("string", "value", customType2);
  final MockTokenizer doc1field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc1field2.setReader(new StringReader("doc1field2"));
  f2.setTokenStream(doc1field2);
  doc.add(f);
  doc.add(f2);
  w.addDocument(doc);

  // add 2 docs to test in-memory merging
  final MockTokenizer doc2field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc2field1.setReader(new StringReader("doc2field1"));
  f.setTokenStream(doc2field1);
  final MockTokenizer doc2field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc2field2.setReader(new StringReader("doc2field2"));
  f2.setTokenStream(doc2field2);
  w.addDocument(doc);

  // force segment flush so we can force a segment merge with doc3 later.
  w.commit();

  final MockTokenizer doc3field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc3field1.setReader(new StringReader("doc3field1"));
  f.setTokenStream(doc3field1);
  final MockTokenizer doc3field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc3field2.setReader(new StringReader("doc3field2"));
  f2.setTokenStream(doc3field2);

  w.addDocument(doc);
  w.commit();
  w.forceMerge(1);   // force segment merge.
  w.close();

  IndexReader ir = DirectoryReader.open(dir);
  Document doc2 = ir.document(0);
  IndexableField f3 = doc2.getField("binary");
  b = f3.binaryValue().bytes;
  assertTrue(b != null);
  assertEquals(17, b.length, 17);
  assertEquals(87, b[0]);

  assertTrue(ir.document(0).getField("binary").binaryValue()!=null);
  assertTrue(ir.document(1).getField("binary").binaryValue()!=null);
  assertTrue(ir.document(2).getField("binary").binaryValue()!=null);

  assertEquals("value", ir.document(0).get("string"));
  assertEquals("value", ir.document(1).get("string"));
  assertEquals("value", ir.document(2).get("string"));


  // test that the terms were indexed.
  assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc1field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc2field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc3field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc1field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc2field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc3field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

  ir.close();
  dir.close();

}
 
Example 19
Source File: SolrTestCaseJ4.java    From lucene-solr with Apache License 2.0 (4 votes)
protected static MockTokenizer whitespaceMockTokenizer(String input) throws IOException {
  MockTokenizer mockTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  mockTokenizer.setReader(new StringReader(input));
  return mockTokenizer;
}
 
Example 20
Source File: TestSynonymMapFilter.java    From lucene-solr with Apache License 2.0 (4 votes)
public void testRandom() throws Exception {
  
  final int alphabetSize = TestUtil.nextInt(random(), 2, 7);

  final int docLen = atLeast(3000);
  //final int docLen = 50;

  final String document = getRandomString('a', alphabetSize, docLen);

  if (VERBOSE) {
    System.out.println("TEST: doc=" + document);
  }

  final int numSyn = atLeast(5);
  //final int numSyn = 2;

  final Map<String,OneSyn> synMap = new HashMap<>();
  final List<OneSyn> syns = new ArrayList<>();
  final boolean dedup = random().nextBoolean();
  if (VERBOSE) {
    System.out.println("  dedup=" + dedup);
  }
  b = new SynonymMap.Builder(dedup);
  for(int synIDX=0;synIDX<numSyn;synIDX++) {
    final String synIn = getRandomString('a', alphabetSize, TestUtil.nextInt(random(), 1, 5)).trim();
    OneSyn s = synMap.get(synIn);
    if (s == null) {
      s = new OneSyn();
      s.in = synIn;
      syns.add(s);
      s.out = new ArrayList<>();
      synMap.put(synIn, s);
      s.keepOrig = random().nextBoolean();
    }
    final String synOut = getRandomString('0', 10, TestUtil.nextInt(random(), 1, 5)).trim();
    s.out.add(synOut);
    add(synIn, synOut, s.keepOrig);
    if (VERBOSE) {
      System.out.println("  syns[" + synIDX + "] = " + s.in + " -> " + s.out + " keepOrig=" + s.keepOrig);
    }
  }

  tokensIn = new MockTokenizer(MockTokenizer.WHITESPACE,
                               true);
  tokensIn.setReader(new StringReader("a"));
  tokensIn.reset();
  assertTrue(tokensIn.incrementToken());
  assertFalse(tokensIn.incrementToken());
  tokensIn.end();
  tokensIn.close();

  tokensOut = new SynonymFilter(tokensIn,
                                   b.build(),
                                   true);
  termAtt = tokensOut.addAttribute(CharTermAttribute.class);
  posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
  posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);
  offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);

  if (dedup) {
    pruneDups(syns);
  }

  final String expected = slowSynMatcher(document, syns, 5);

  if (VERBOSE) {
    System.out.println("TEST: expected=" + expected);
  }

  verify(document, expected);
}