Java Code Examples for org.apache.lucene.analysis.MockTokenizer#SIMPLE

The following examples show how to use org.apache.lucene.analysis.MockTokenizer#SIMPLE. All of them come from the lucene-solr project; the source file is noted above each example.
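
Before the examples, a minimal sketch of what the constant selects (this snippet is not from lucene-solr; the method name and input string are illustrative). MockTokenizer.SIMPLE makes the tokenizer keep runs of letters and drop everything else, and the second constructor argument (true) lowercases the output. MockTokenizer lives in Lucene's test framework (lucene-test-framework), so it is intended for tests rather than production analysis chains.

public void demoSimpleTokenizer() throws IOException {
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
  tokenizer.setReader(new StringReader("Hello, Lucene 8!"));
  CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
  tokenizer.reset();
  while (tokenizer.incrementToken()) {
    System.out.println(term.toString()); // prints "hello", then "lucene"; the digit is dropped
  }
  tokenizer.end();
  tokenizer.close();
}
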
Example 1
Source File: TestDirectSpellChecker.java    From lucene-solr with Apache License 2.0
public void testTransposition2() throws Exception {
  DirectSpellChecker spellChecker = new DirectSpellChecker();
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);

  for (int i = 0; i < 20; i++) {
    Document doc = new Document();
    doc.add(newTextField("numbers", English.intToEnglish(i), Field.Store.NO));
    writer.addDocument(doc);
  }

  IndexReader ir = writer.getReader();

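  // "seevntene" should be corrected to "seventeen": the typo is two transpositions away (hence the test name)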
  SuggestWord[] similar = spellChecker.suggestSimilar(new Term(
      "numbers", "seevntene"), 2, ir,
      SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
  assertEquals(1, similar.length);
  assertEquals("seventeen", similar[0].string);
  
  IOUtils.close(ir, writer, dir, analyzer);
}
 
Example 2
Source File: TestSynonymMapFilter.java    From lucene-solr with Apache License 2.0
public void testDoKeepOrig() throws Exception {
  b = new SynonymMap.Builder(true);
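  // keepOrig=true: the original tokens "a" and "b" stay in the stream alongside the synonym "foo"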
  add("a b", "foo", true);

  final SynonymMap map = b.build();

  final Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
      return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
    }
  };

  assertAnalyzesTo(analyzer, "a b c",
                   new String[] {"a", "foo", "b", "c"},
                   new int[] {0, 0, 2, 4},
                   new int[] {1, 3, 3, 5},
                   null,
                   new int[] {1, 0, 1, 1},
                   new int[] {1, 2, 1, 1},
                   true);
  checkAnalysisConsistency(random(), analyzer, false, "a b c");
  analyzer.close();
}
 
Example 3
Source File: TestUniqueTermCount.java    From lucene-solr with Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  dir = newDirectory();
  MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  IndexWriterConfig config = newIndexWriterConfig(analyzer);
  config.setMergePolicy(newLogMergePolicy());
  config.setSimilarity(new TestSimilarity());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);
  Document doc = new Document();
  Field foo = newTextField("foo", "", Field.Store.NO);
  doc.add(foo);
  for (int i = 0; i < 100; i++) {
    foo.setStringValue(addValue());
    writer.addDocument(doc);
  }
  reader = writer.getReader();
  writer.close();
}
 
Example 4
Source File: TestSynonymMapFilter.java    From lucene-solr with Apache License 2.0
public void testDontKeepOrig() throws Exception {
  b = new SynonymMap.Builder(true);
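  // keepOrig=false: the synonym "foo" replaces the original tokens "a" and "b"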
  add("a b", "foo", false);

  final SynonymMap map = b.build();

  final Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
      return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
    }
  };

  assertAnalyzesTo(analyzer, "a b c",
                   new String[] {"foo", "c"},
                   new int[] {0, 4},
                   new int[] {3, 5},
                   null,
                   new int[] {1, 1},
                   new int[] {1, 1},
                   true);
  checkAnalysisConsistency(random(), analyzer, false, "a b c");
  analyzer.close();
}
 
Example 5
Source File: HighlighterTest.java    From lucene-solr with Apache License 2.0
@Override
public TokenStreamComponents createComponents(String arg0) {
  Tokenizer stream = new MockTokenizer(MockTokenizer.SIMPLE, true);
  stream.addAttribute(CharTermAttribute.class);
  stream.addAttribute(PositionIncrementAttribute.class);
  stream.addAttribute(OffsetAttribute.class);
  return new TokenStreamComponents(stream, new SynonymTokenizer(stream, synonyms));
}
 
Example 6
Source File: TestExtendableQueryParser.java    From lucene-solr with Apache License 2.0
public QueryParser getParser(Analyzer a, Extensions extensions)
    throws Exception {
  if (a == null)
    a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  QueryParser qp = extensions == null ? new ExtendableQueryParser(
      getDefaultField(), a) : new ExtendableQueryParser(
      getDefaultField(), a, extensions);
  qp.setDefaultOperator(QueryParserBase.OR_OPERATOR);
  qp.setSplitOnWhitespace(splitOnWhitespace);
  return qp;
}
 
Example 7
Source File: TestSpanFirstQuery.java    From lucene-solr with Apache License 2.0
public void testStartPositions() throws Exception {
  Directory dir = newDirectory();
  
  // mimic StopAnalyzer
  CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|a|of").toAutomaton());
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
  
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
  Document doc = new Document();
  doc.add(newTextField("field", "the quick brown fox", Field.Store.NO));
  writer.addDocument(doc);
  Document doc2 = new Document();
  doc2.add(newTextField("field", "quick brown fox", Field.Store.NO));
  writer.addDocument(doc2);
  
  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  
  // user queries on "starts-with quick"
  SpanQuery sfq = spanFirstQuery(spanTermQuery("field", "quick"), 1);
  assertEquals(1, searcher.search(sfq, 10).totalHits.value);
  
  // user queries on "starts-with the quick"
  SpanQuery include = spanFirstQuery(spanTermQuery("field", "quick"), 2);
  sfq = spanNotQuery(include, sfq);
  assertEquals(1, searcher.search(sfq, 10).totalHits.value);
  
  writer.close();
  reader.close();
  dir.close();
}
 
Example 8
Source File: TestConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0
@Test
public void testSeparator() throws IOException {
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.SIMPLE, true);
  String input = "...mykeyword.another.keyword.";
  tokenStream.setReader(new StringReader(input));
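  // MockTokenizer.SIMPLE strips the dots, so three tokens reach the filter below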
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, ' ', false, 100); // separator ' ' instead of the default \u001F
  assertTokenStreamContents(stream, new String[] {"mykeyword another keyword"}, null, null, new int[] { 1 });
}
 
Example 9
Source File: TestSynonymGraphFilter.java    From lucene-solr with Apache License 2.0
public void testRandomGraphAfter() throws Exception {
  final int numIters = atLeast(3);
  for (int i = 0; i < numIters; i++) {
    SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean());
    final int numEntries = atLeast(10);
    for (int j = 0; j < numEntries; j++) {
      add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean());
    }
    final SynonymMap map = b.build();
    final boolean ignoreCase = random().nextBoolean();
    final boolean doFlatten = random().nextBoolean();
    
    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
        TokenStream syns = new SynonymGraphFilter(tokenizer, map, ignoreCase);
        TokenStream graph = new MockGraphTokenFilter(random(), syns);
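        // FlattenGraphFilter squashes the token graph into a flat stream, as an indexing chain would need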
        if (doFlatten) {
          graph = new FlattenGraphFilter(graph);
        }
        return new TokenStreamComponents(tokenizer, graph);
      }
    };

    checkRandomData(random(), analyzer, 100);
    analyzer.close();
  }
}
 
Example 10
Source File: TestPortugueseStemFilter.java    From lucene-solr with Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.SIMPLE, true);
      return new TokenStreamComponents(source, new PortugueseStemFilter(source));
    }
  };
}
 
Example 11
Source File: TestQPHelper.java    From lucene-solr with Apache License 2.0
public Query getQueryDOA(String query, Analyzer a) throws Exception {
  if (a == null)
    a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  StandardQueryParser qp = new StandardQueryParser();
  qp.setAnalyzer(a);
  qp.setDefaultOperator(StandardQueryConfigHandler.Operator.AND);

  return qp.parse(query, "field");

}
 
Example 12
Source File: QueryParserTestBase.java    From lucene-solr with Apache License 2.0
public Query getQueryDOA(String query, Analyzer a)
  throws Exception {
  if (a == null)
    a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  CommonQueryParserConfiguration qp = getParserConfig(a);
  setDefaultOperatorAND(qp);
  return getQuery(query, qp);
}
 
Example 13
Source File: TestUnifiedHighlighterStrictPhrases.java    From lucene-solr with Apache License 2.0
@Before
public void doBefore() throws IOException {
  indexAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); // whitespace, punctuation, lowercase
  indexAnalyzer.setPositionIncrementGap(3); // more than default
  dir = newDirectory();
  indexWriter = new RandomIndexWriter(random(), dir, indexAnalyzer);
}
 
Example 14
Source File: TestQPHelper.java    From lucene-solr with Apache License 2.0
public StandardQueryParser getParser(Analyzer a) throws Exception {
  if (a == null)
    a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  StandardQueryParser qp = new StandardQueryParser();
  qp.setAnalyzer(a);

  qp.setDefaultOperator(StandardQueryConfigHandler.Operator.OR);

  return qp;

}
 
Example 15
Source File: TestSynonymMapFilter.java    From lucene-solr with Apache License 2.0
/** simple random test like testRandom2, but for larger docs
 */
public void testRandomHuge() throws Exception {
  Random random = random();
  final int numIters = atLeast(3);
  for (int i = 0; i < numIters; i++) {
    b = new SynonymMap.Builder(random.nextBoolean());
    final int numEntries = atLeast(10);
    if (VERBOSE) {
      System.out.println("TEST: iter=" + i + " numEntries=" + numEntries);
    }
    for (int j = 0; j < numEntries; j++) {
      add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
    }
    final SynonymMap map = b.build();
    final boolean ignoreCase = random.nextBoolean();
    
    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
      }
    };

    checkRandomData(random, analyzer, 100, 1024);
    analyzer.close();
  }
}
 
Example 16
Source File: TestPayloadFilteredInterval.java    From lucene-solr with Apache License 2.0
public void testPayloadFilteredInterval() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tok = new MockTokenizer(MockTokenizer.SIMPLE, true);
      return new TokenStreamComponents(tok, new SimplePayloadFilter(tok));
    }
  };

  Directory directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
      newIndexWriterConfig(analyzer)
          .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));

  Document doc = new Document();
  doc.add(newTextField("field", "a sentence with words repeated words words quite often words", Field.Store.NO));
  writer.addDocument(doc);
  IndexReader reader = writer.getReader();
  writer.close();

  // SimplePayloadFilter stores a payload for each term at position n containing
  // the bytes 'pos:n'
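  // "words" occurs at positions 3, 5, 6 and 9, so the lambda below (payload must
  // not end with "5") keeps positions 3, 6 and 9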

  IntervalsSource source = Intervals.term("words", b -> b.utf8ToString().endsWith("5") == false);
  assertEquals("PAYLOAD_FILTERED(words)", source.toString());

  IntervalIterator it = source.intervals("field", reader.leaves().get(0));

  assertEquals(0, it.nextDoc());
  assertEquals(3, it.nextInterval());
  assertEquals(6, it.nextInterval());
  assertEquals(9, it.nextInterval());
  assertEquals(IntervalIterator.NO_MORE_INTERVALS, it.nextInterval());

  MatchesIterator mi = source.matches("field", reader.leaves().get(0), 0);
  assertNotNull(mi);
  assertTrue(mi.next());
  assertEquals(3, mi.startPosition());
  assertTrue(mi.next());
  assertEquals(6, mi.startPosition());
  assertTrue(mi.next());
  assertEquals(9, mi.startPosition());
  assertFalse(mi.next());

  reader.close();
  directory.close();
}
 
Example 17
Source File: TestUnifiedHighlighterTermIntervals.java    From lucene-solr with Apache License 2.0
@Before
public void doBefore() throws IOException {
  indexAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); // whitespace, punctuation, lowercase
  dir = newDirectory();
}
 
Example 18
Source File: TestUnifiedHighlighterRanking.java    From lucene-solr with Apache License 2.0
/**
 * Indexes a bunch of gibberish and then highlights top(n).
 * Asserts that the top(n) highlights are a subset of top(n+1), up to some max N.
 */
// TODO: this only tests single-valued fields. we should also index multiple values per field!
public void testRanking() throws Exception {
  // number of documents: we will check each one
  final int numDocs = atLeast(20);
  // number of top-N snippets, we will check 1 .. N
  final int maxTopN = 3;
  // maximum number of elements to put in a sentence.
  final int maxSentenceLength = 10;
  // maximum number of sentences in a document
  final int maxNumSentences = 20;

  Directory dir = newDirectory();
  indexAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);
  Document document = new Document();
  Field id = new StringField("id", "", Field.Store.NO);
  Field body = new Field("body", "", fieldType);
  document.add(id);
  document.add(body);

  for (int i = 0; i < numDocs; i++) {
    StringBuilder bodyText = new StringBuilder();
    int numSentences = TestUtil.nextInt(random(), 1, maxNumSentences);
    for (int j = 0; j < numSentences; j++) {
      bodyText.append(newSentence(random(), maxSentenceLength));
    }
    body.setStringValue(bodyText.toString());
    id.setStringValue(Integer.toString(i));
    iw.addDocument(document);
  }

  IndexReader ir = iw.getReader();
  IndexSearcher searcher = newSearcher(ir);
  for (int i = 0; i < numDocs; i++) {
    checkDocument(searcher, i, maxTopN);
  }
  iw.close();
  ir.close();
  dir.close();
}
 
Example 19
Source File: TestUnifiedHighlighterMTQ.java    From lucene-solr with Apache License 2.0
@Before
public void doBefore() throws IOException {
  dir = newDirectory();
  indexAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); // whitespace, punctuation, lowercase
}
 
Example 20
Source File: TestQPHelper.java    From lucene-solr with Apache License 2.0
/** Filters MockTokenizer with QPTestFilter. */
@Override
public final TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
  return new TokenStreamComponents(tokenizer, new QPTestFilter(tokenizer));
}