org.apache.lucene.analysis.StopFilter#makeStopSet

Source File: TestFreeTextSuggester.java From lucene-solr with Apache License 2.0

6 votes

public void testEndingHole() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field) {
        Tokenizer tokenizer = new MockTokenizer();
        CharArraySet stopSet = StopFilter.makeStopSet("of");
        return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
      }
    };

  Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(
      new Input("wizard of oz", 50)
  );
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("wizard _ oz/1.00",
               toString(sug.lookup("wizard of", 10)));

  // Falls back to unigram model, with backoff 0.4 times
  // prop 0.5:
  assertEquals("oz/0.20",
               toString(sug.lookup("wizard o", 10)));
  a.close();
}

Source File: TestFreeTextSuggester.java From lucene-solr with Apache License 2.0

6 votes

public void testTwoEndingHoles() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field) {
        Tokenizer tokenizer = new MockTokenizer();
        CharArraySet stopSet = StopFilter.makeStopSet("of");
        return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
      }
    };

  Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(
      new Input("wizard of of oz", 50)
  );
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("",
               toString(sug.lookup("wizard of of", 10)));
  a.close();
}

Source File: TestSuggestStopFilter.java From lucene-solr with Apache License 2.0

6 votes

public void testEndNotStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "to"},
                            new int[] {0, 3},
                            new int[] {2, 5},
                            null,
                            new int[] {1, 1},
                            null,
                            5,
                            new boolean[] {false, true},
                            true);
}

Source File: TestSuggestStopFilter.java From lucene-solr with Apache License 2.0

6 votes

public void testEndIsStopWord() throws Exception {
                            
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            6,
                            new boolean[] {false},
                            true);
}

Source File: TestSuggestStopFilter.java From lucene-solr with Apache License 2.0

6 votes

public void testMidStopWord() throws Exception {
                            
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "school"},
                            new int[] {0, 6},
                            new int[] {2, 12},
                            null,
                            new int[] {1, 2},
                            null,
                            12,
                            new boolean[] {false, false},
                            true);
}

Source File: TestSuggestStopFilter.java From lucene-solr with Apache License 2.0

6 votes

public void testMultipleStopWords() throws Exception {
                            
  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to a the school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] { "go", "school" },
                            new int[] {0, 12},
                            new int[] {2, 18},
                            null,
                            new int[] {1, 4},
                            null,
                            18,
                            new boolean[] {false, false},
                            true);
}

Source File: TestSuggestStopFilter.java From lucene-solr with Apache License 2.0

6 votes

public void testMultipleStopWordsEnd() throws Exception {
                            
  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to a the"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] { "go", "the"},
                            new int[] {0, 8},
                            new int[] {2, 11},
                            null,
                            new int[] {1, 3},
                            null,
                            11,
                            new boolean[] {false, true},
                            true);
}

Source File: TestSuggestStopFilter.java From lucene-solr with Apache License 2.0

6 votes

public void testMultipleStopWordsEnd2() throws Exception {
                            
  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to a the "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] { "go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            12,
                            new boolean[] {false},
                            true);
}

Source File: TestConcatenateGraphFilter.java From lucene-solr with Apache License 2.0

6 votes

@Test
  public void testWithStopword() throws Exception {
    for (boolean preservePosInc : new boolean[]{true, false}) {
      Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
      String input = "a mykeyword a keyword"; //LUCENE-8344 add "a"
      tokenStream.setReader(new StringReader(input));
      TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
      ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10);
      CharsRefBuilder builder = new CharsRefBuilder();
      if (preservePosInc) {
        builder.append(SEP_LABEL);
      }
      builder.append("mykeyword");
      builder.append(SEP_LABEL);
      if (preservePosInc) {
        builder.append(SEP_LABEL);
      }
      builder.append("keyword");
//      if (preservePosInc) { LUCENE-8344 uncomment
//        builder.append(SEP_LABEL);
//      }
      assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()});
    }
  }

Source File: TestConcatenateGraphFilter.java From lucene-solr with Apache License 2.0

5 votes

@Test
public void testSeparatorWithStopWords() throws IOException {
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  String input = "A B C D E F J H";
  tokenStream.setReader(new StringReader(input));
  TokenStream tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("A", "D", "E", "J"));
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenFilter, '-', false, 100);

  assertTokenStreamContents(stream, new String[] {"B-C-F-H"}, null, null, new int[] { 1 });
}

Source File: TestConcatenateGraphFilter.java From lucene-solr with Apache License 2.0

5 votes

@Test
public void testSeparatorWithStopWordsAndPreservePositionIncrements() throws IOException {
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  String input = "A B C D E F J H";
  tokenStream.setReader(new StringReader(input));
  TokenStream tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("A", "D", "E", "J"));
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenFilter, '-', true, 100);

  assertTokenStreamContents(stream, new String[] {"-B-C---F--H"}, null, null, new int[] { 1 });
}

Java Code Examples for org.apache.lucene.analysis.StopFilter#makeStopSet()