org.apache.lucene.analysis.Analyzer Java Examples
The following examples show how to use
org.apache.lucene.analysis.Analyzer.
Each example notes its source file, originating project, and license.
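Before the individual examples, here is a minimal sketch (not taken from any of the projects below) of the core Analyzer workflow most of them build on: obtain a TokenStream from the analyzer, add a CharTermAttribute, reset the stream, iterate with incrementToken(), then end and close it. The class name, the field name "body", and the sample text are illustrative placeholders.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzerBasics {
  public static void main(String[] args) throws IOException {
    // Analyzer is Closeable, so try-with-resources releases its cached components.
    try (Analyzer analyzer = new StandardAnalyzer()) {
      // "body" is a placeholder field name; analysis can differ per field in real schemas.
      TokenStream stream = analyzer.tokenStream("body", "The Quick Brown Fox");
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();                        // required before the first incrementToken()
      while (stream.incrementToken()) {
        System.out.println(term.toString()); // one lowercased token per iteration
      }
      stream.end();                          // records final offset state
      stream.close();                        // releases the stream for reuse
    }
  }
}

Many of the test examples below use MockAnalyzer in place of StandardAnalyzer; it plays the same role but with randomized tokenization suited to Lucene's test framework.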
Example #1
Source File: TestDirectSpellChecker.java From lucene-solr with Apache License 2.0
public void testTransposition2() throws Exception {
  DirectSpellChecker spellChecker = new DirectSpellChecker();
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);

  for (int i = 0; i < 20; i++) {
    Document doc = new Document();
    doc.add(newTextField("numbers", English.intToEnglish(i), Field.Store.NO));
    writer.addDocument(doc);
  }
  IndexReader ir = writer.getReader();

  SuggestWord[] similar = spellChecker.suggestSimilar(new Term("numbers", "seevntene"), 2, ir,
      SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
  assertEquals(1, similar.length);
  assertEquals("seventeen", similar[0].string);

  IOUtils.close(ir, writer, dir, analyzer);
}
Example #2
Source File: TestQPHelper.java From lucene-solr with Apache License 2.0
public void testBoost() throws Exception {
  CharacterRunAutomaton stopSet = new CharacterRunAutomaton(Automata.makeString("on"));
  Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
  StandardQueryParser qp = new StandardQueryParser();
  qp.setAnalyzer(oneStopAnalyzer);

  Query q = qp.parse("on^1.0", "field");
  assertNotNull(q);

  q = qp.parse("\"hello\"^2.0", "field");
  assertNotNull(q);
  assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);

  q = qp.parse("hello^2.0", "field");
  assertNotNull(q);
  assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);

  q = qp.parse("\"on\"^1.0", "field");
  assertNotNull(q);

  StandardQueryParser qp2 = new StandardQueryParser();
  qp2.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));

  q = qp2.parse("the^3", "field");
  // "the" is a stop word so the result is an empty query:
  assertNotNull(q);
  assertMatchNoDocsQuery(q);
  assertFalse(q instanceof BoostQuery);
}
Example #3
Source File: MinHashTest.java From minhash with Apache License 2.0
public void test_calculate_1bit_256funcs_seed0() throws IOException {
  final int hashBit = 1;
  final int seed = 0;
  final int num = 256;
  final Analyzer minhashAnalyzer = MinHash.createAnalyzer(hashBit, seed, num);
  final StringBuilder[] texts = createTexts();
  final byte[][] data = createMinHashes(minhashAnalyzer, texts);
  assertEquals(1.0f, MinHash.compare(data[0], data[0]));
  assertEquals(0.90625f, MinHash.compare(data[0], data[1]));
  assertEquals(0.82421875f, MinHash.compare(data[0], data[2]));
  assertEquals(0.76953125f, MinHash.compare(data[0], data[3]));
  assertEquals(0.703125f, MinHash.compare(data[0], data[4]));
  assertEquals(0.625f, MinHash.compare(data[0], data[5]));
  assertEquals(0.6015625f, MinHash.compare(data[0], data[6]));
  assertEquals(0.55078125f, MinHash.compare(data[0], data[7]));
  assertEquals(0.53125f, MinHash.compare(data[0], data[8]));
  assertEquals(0.51171875f, MinHash.compare(data[0], data[9]));
}
Example #4
Source File: UnstemmedGermanNormalizationTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTwo() throws Exception {
  String source = "So wird's was: das Elasticsearch-Buch erscheint beim O'Reilly-Verlag.";
  String[] expected = {
      "wird's",
      "elasticsearch-buch",
      "elasticsearchbuch",
      "erscheint",
      "o'reilly-verlag",
      "o'reillyverlag"
  };
  String resource = "unstemmed.json";
  Settings settings = Settings.builder()
      .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
      .build();
  ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"), settings,
      new BundlePlugin(Settings.EMPTY), new CommonAnalysisPlugin());
  Analyzer analyzer = analysis.indexAnalyzers.get("default");
  assertTokenStreamContents(analyzer.tokenStream(null, new StringReader(source)), expected);
}
Example #5
Source File: TestPhraseQuery.java From lucene-solr with Apache License 2.0
public void testPhraseQueryWithStopAnalyzer() throws Exception {
  Directory directory = newDirectory();
  Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(stopAnalyzer));
  Document doc = new Document();
  doc.add(newTextField("field", "the stop words are here", Field.Store.YES));
  writer.addDocument(doc);
  IndexReader reader = writer.getReader();
  writer.close();

  IndexSearcher searcher = newSearcher(reader);

  // valid exact phrase query
  PhraseQuery query = new PhraseQuery("field", "stop", "words");
  ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
  assertEquals(1, hits.length);
  QueryUtils.check(random(), query, searcher);

  reader.close();
  directory.close();
}
Example #6
Source File: LanguagePrefixedTokenStream.java From SearchServices with GNU Lesser General Public License v3.0
/**
 * Returns the {@link Analyzer} associated with the given language.
 * The proper {@link Analyzer} is retrieved from the first field type not null in the following list:
 *
 * <ul>
 *   <li>highlighted_text_ + locale (e.g. highlighted_text_en)</li>
 *   <li>text_ + locale (e.g. text_en)</li>
 *   <li>text___ (text general field)</li>
 * </ul>
 *
 * @param language the language code.
 * @return the {@link Analyzer} associated with the given language.
 */
Analyzer analyzer(String language) {
  FieldType localisedFieldType = ofNullable(indexSchema.getFieldTypeByName(highlightingFieldTypeName(language)))
      .orElseGet(() -> indexSchema.getFieldTypeByName(localisedFieldTypeName(language)));

  FieldType targetFieldType = ofNullable(localisedFieldType)
      .orElseGet(() -> indexSchema.getFieldTypeByName(FALLBACK_TEXT_FIELD_TYPE_NAME));

  switch (mode) {
    case QUERY:
      return targetFieldType.getQueryAnalyzer();
    case INDEX:
    default:
      return targetFieldType.getIndexAnalyzer();
  }
}
Example #7
Source File: LuceneDocumentRetrievalExecutor.java From bioasq with Apache License 2.0
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
  super.initialize(context);
  hits = UimaContextHelper.getConfigParameterIntValue(context, "hits", 100);
  // query constructor
  constructor = UimaContextHelper.createObjectFromConfigParameter(context, "query-string-constructor",
      "query-string-constructor-params", BooleanBagOfPhraseQueryStringConstructor.class,
      QueryStringConstructor.class);
  // lucene
  Analyzer analyzer = UimaContextHelper.createObjectFromConfigParameter(context, "query-analyzer",
      "query-analyzer-params", StandardAnalyzer.class, Analyzer.class);
  String[] fields = UimaContextHelper.getConfigParameterStringArrayValue(context, "fields");
  parser = new MultiFieldQueryParser(fields, analyzer);
  String index = UimaContextHelper.getConfigParameterStringValue(context, "index");
  try {
    reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
  } catch (IOException e) {
    throw new ResourceInitializationException(e);
  }
  searcher = new IndexSearcher(reader);
  idFieldName = UimaContextHelper.getConfigParameterStringValue(context, "id-field", null);
  titleFieldName = UimaContextHelper.getConfigParameterStringValue(context, "title-field", null);
  textFieldName = UimaContextHelper.getConfigParameterStringValue(context, "text-field", null);
  uriPrefix = UimaContextHelper.getConfigParameterStringValue(context, "uri-prefix", null);
}
Example #8
Source File: right_IndexWriter_1.42.java From gumtree-spoon-ast-diff with Apache License 2.0
private IndexWriter(Directory d, Analyzer a, final boolean create, boolean closeDir) throws IOException {
  this.closeDir = closeDir;
  directory = d;
  analyzer = a;

  Lock writeLock = directory.makeLock(IndexWriter.WRITE_LOCK_NAME);
  if (!writeLock.obtain(WRITE_LOCK_TIMEOUT)) // obtain write lock
    throw new IOException("Index locked for write: " + writeLock);
  this.writeLock = writeLock;                // save it

  synchronized (directory) {                 // in- & inter-process sync
    new Lock.With(directory.makeLock(IndexWriter.COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) {
      public Object doBody() throws IOException {
        if (create)
          segmentInfos.write(directory);
        else
          segmentInfos.read(directory);
        return null;
      }
    }.run();
  }
}
Example #9
Source File: IcuAnalysisTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testDefaultsIcuAnalysis() throws IOException {
  TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY,
      new BundlePlugin(Settings.EMPTY));

  CharFilterFactory charFilterFactory = analysis.charFilter.get("icu_normalizer");
  assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class));

  TokenizerFactory tf = analysis.tokenizer.get("icu_tokenizer");
  assertThat(tf, instanceOf(IcuTokenizerFactory.class));

  TokenFilterFactory filterFactory = analysis.tokenFilter.get("icu_normalizer");
  assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));

  filterFactory = analysis.tokenFilter.get("icu_folding");
  assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));

  filterFactory = analysis.tokenFilter.get("icu_transform");
  assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));

  Analyzer analyzer = analysis.indexAnalyzers.get("icu_collation");
  assertThat(analyzer, instanceOf(NamedAnalyzer.class));
}
Example #10
Source File: TestDocValuesIndexing.java From lucene-solr with Apache License 2.0
public void testAddSortedTwice() throws IOException {
  Analyzer analyzer = new MockAnalyzer(random());

  Directory directory = newDirectory();
  // we don't use RandomIndexWriter because it might add more docvalues than we expect
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  iwc.setMergePolicy(newLogMergePolicy());
  IndexWriter iwriter = new IndexWriter(directory, iwc);
  Document doc = new Document();
  doc.add(new SortedDocValuesField("dv", new BytesRef("foo!")));
  iwriter.addDocument(doc);

  doc.add(new SortedDocValuesField("dv", new BytesRef("bar!")));
  expectThrows(IllegalArgumentException.class, () -> {
    iwriter.addDocument(doc);
  });

  IndexReader ir = iwriter.getReader();
  assertEquals(1, ir.numDocs());
  ir.close();
  iwriter.close();
  directory.close();
}
Example #11
Source File: PayloadUtils.java From lucene-solr with Apache License 2.0
public static String getPayloadEncoder(FieldType fieldType) {
  // TODO: support custom payload encoding fields too somehow - maybe someone has a custom component
  // that encodes payloads as floats
  String encoder = null;
  Analyzer a = fieldType.getIndexAnalyzer();
  if (a instanceof TokenizerChain) {
    // examine the indexing analysis chain for DelimitedPayloadTokenFilterFactory or NumericPayloadTokenFilterFactory
    TokenizerChain tc = (TokenizerChain) a;
    TokenFilterFactory[] factories = tc.getTokenFilterFactories();
    for (TokenFilterFactory factory : factories) {
      if (factory instanceof DelimitedPayloadTokenFilterFactory) {
        encoder = factory.getOriginalArgs().get(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR);
        break;
      }
      if (factory instanceof NumericPayloadTokenFilterFactory) {
        // encodes using `PayloadHelper.encodeFloat(payload)`
        encoder = "float";
        break;
      }
    }
  }
  return encoder;
}
Example #12
Source File: WildcardQueryNodeProcessor.java From lucene-solr with Apache License 2.0
@Override
protected QueryNode postProcessNode(QueryNode node) throws QueryNodeException {
  // the old Lucene Parser ignores FuzzyQueryNode that are also PrefixWildcardQueryNode or WildcardQueryNode
  // we do the same here, also ignore empty terms
  if (node instanceof FieldQueryNode || node instanceof FuzzyQueryNode) {
    FieldQueryNode fqn = (FieldQueryNode) node;
    CharSequence text = fqn.getText();

    // do not process wildcards for TermRangeQueryNode children and
    // QuotedFieldQueryNode to reproduce the old parser behavior
    if (fqn.getParent() instanceof TermRangeQueryNode
        || fqn instanceof QuotedFieldQueryNode
        || text.length() <= 0) {
      // Ignore empty terms
      return node;
    }

    // Code below simulates the old lucene parser behavior for wildcards
    if (isWildcard(text)) {
      Analyzer analyzer = getQueryConfigHandler().get(ConfigurationKeys.ANALYZER);
      if (analyzer != null) {
        text = analyzeWildcard(analyzer, fqn.getFieldAsString(), text.toString());
      }

      if (isPrefixWildcard(text)) {
        return new PrefixWildcardQueryNode(fqn.getField(), text, fqn.getBegin(), fqn.getEnd());
      } else {
        return new WildcardQueryNode(fqn.getField(), text, fqn.getBegin(), fqn.getEnd());
      }
    }
  }
  return node;
}
Example #13
Source File: TestTeeSinkTokenFilter.java From lucene-solr with Apache License 2.0
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  TokenStream tokenStream = analyzer.tokenStream("field", "abcd ");
  TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
  TokenStream sink = tee.newSinkTokenStream();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorOffsets(true);
  ft.setStoreTermVectorPositions(true);
  Field f1 = new Field("field", tee, ft);
  Field f2 = new Field("field", sink, ft);
  doc.add(f1);
  doc.add(f2);
  w.addDocument(doc);
  w.close();

  IndexReader r = DirectoryReader.open(dir);
  Terms vector = r.getTermVectors(0).terms("field");
  assertEquals(1, vector.size());
  TermsEnum termsEnum = vector.iterator();
  termsEnum.next();
  assertEquals(2, termsEnum.totalTermFreq());
  PostingsEnum positions = termsEnum.postings(null, PostingsEnum.ALL);
  assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(2, positions.freq());
  positions.nextPosition();
  assertEquals(0, positions.startOffset());
  assertEquals(4, positions.endOffset());
  positions.nextPosition();
  assertEquals(8, positions.startOffset());
  assertEquals(12, positions.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
  r.close();
  dir.close();
  analyzer.close();
}
Example #14
Source File: TestSynonymGraphFilter.java From lucene-solr with Apache License 2.0
public void testBasicNotKeepOrigOneOutput() throws Exception {
  SynonymMap.Builder b = new SynonymMap.Builder();
  add(b, "a b", "x", false);

  Analyzer a = getAnalyzer(b, true);
  assertAnalyzesTo(a, "c a b",
      new String[] {"c", "x"},
      new int[] {0, 2},
      new int[] {1, 5},
      new String[] {"word", "SYNONYM"},
      new int[] {1, 1},
      new int[] {1, 1});
  a.close();
}
Example #15
Source File: TestPorterStemFilter.java From lucene-solr with Apache License 2.0
public void testEmptyTerm() throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new PorterStemFilter(tokenizer));
    }
  };
  checkOneTerm(a, "", "");
  a.close();
}
Example #16
Source File: TestNorwegianMinimalStemFilter.java From lucene-solr with Apache License 2.0
public void testEmptyTerm() throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new NorwegianMinimalStemFilter(tokenizer));
    }
  };
  checkOneTerm(a, "", "");
  a.close();
}
Example #17
Source File: HTMLStripCharFilterTest.java From lucene-solr with Apache License 2.0
public void testRandomBrokenHTML() throws Exception {
  int maxNumElements = 10000;
  String text = TestUtil.randomHtmlishString(random(), maxNumElements);
  Analyzer a = newTestAnalyzer();
  checkAnalysisConsistency(random(), a, random().nextBoolean(), text);
  a.close();
}
Example #18
Source File: EdismaxQueryConverter.java From solr-researcher with Apache License 2.0
protected String[] analyze(String text, Analyzer analyzer) throws IOException {
  List<String> result = new ArrayList<String>();
  TokenStream stream = analyzer.tokenStream("", new StringReader(text));
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    result.add(new String(termAtt.buffer(), 0, termAtt.length()));
  }
  stream.end();
  stream.close();
  return result.toArray(new String[result.size()]);
}
Example #19
Source File: SynonymTokenFilterFactory.java From crate with Apache License 2.0
protected Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
                                        List<TokenFilterFactory> tokenFilters) {
  return new CustomAnalyzer("synonyms", tokenizer,
      charFilters.toArray(new CharFilterFactory[0]),
      tokenFilters.stream()
          .map(TokenFilterFactory::getSynonymFilter)
          .toArray(TokenFilterFactory[]::new));
}
Example #20
Source File: TestBrazilianAnalyzer.java From lucene-solr with Apache License 2.0
public void testReusableTokenStream() throws Exception {
  Analyzer a = new BrazilianAnalyzer();
  checkReuse(a, "boa", "boa");
  checkReuse(a, "boainain", "boainain");
  checkReuse(a, "boas", "boas");
  checkReuse(a, "bôas", "boas"); // removes diacritic: different from Snowball Portuguese
  a.close();
}
Example #21
Source File: DocumentDictionaryTest.java From lucene-solr with Apache License 2.0
@Test
public void testMultiValuedField() throws IOException {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer);
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
  List<Suggestion> suggestions = indexMultiValuedDocuments(atLeast(1000), writer);
  writer.commit();
  writer.close();

  IndexReader ir = DirectoryReader.open(dir);
  Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME,
      CONTEXT_FIELD_NAME);
  InputIterator inputIterator = dictionary.getEntryIterator();
  BytesRef f;
  Iterator<Suggestion> suggestionsIter = suggestions.iterator();
  while ((f = inputIterator.next()) != null) {
    Suggestion nextSuggestion = suggestionsIter.next();
    assertTrue(f.equals(nextSuggestion.term));
    long weight = nextSuggestion.weight;
    assertEquals(inputIterator.weight(), (weight != -1) ? weight : 0);
    assertEquals(inputIterator.payload(), nextSuggestion.payload);
    assertTrue(inputIterator.contexts().equals(nextSuggestion.contexts));
  }
  assertFalse(suggestionsIter.hasNext());
  IOUtils.close(ir, analyzer, dir);
}
Example #22
Source File: TestSuggestField.java From lucene-solr with Apache License 2.0
@Test
@Slow
public void testDupSuggestFieldValues() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
  final int num = Math.min(1000, atLeast(100));
  int[] weights = new int[num];
  for (int i = 0; i < num; i++) {
    Document document = new Document();
    weights[i] = random().nextInt(Integer.MAX_VALUE);
    document.add(new SuggestField("suggest_field", "abc", weights[i]));
    iw.addDocument(document);

    if (usually()) {
      iw.commit();
    }
  }

  DirectoryReader reader = iw.getReader();
  Entry[] expectedEntries = new Entry[num];
  Arrays.sort(weights);
  for (int i = 1; i <= num; i++) {
    expectedEntries[i - 1] = new Entry("abc", weights[num - i]);
  }

  SuggestIndexSearcher suggestIndexSearcher = new SuggestIndexSearcher(reader);
  PrefixCompletionQuery query = new PrefixCompletionQuery(analyzer, new Term("suggest_field", "abc"));
  TopSuggestDocs lookupDocs = suggestIndexSearcher.suggest(query, num, false);
  assertSuggestions(lookupDocs, expectedEntries);

  reader.close();
  iw.close();
}
Example #23
Source File: AnalyzersTest.java From russianmorphology with Apache License 2.0
@Test
public void shouldGiveCorrectWordsForRussian() throws IOException {
  Analyzer morphlogyAnalyzer = new RussianAnalyzer();
  String answerPath = "/russian/russian-analyzer-answer.txt";
  String testPath = "/russian/russian-analyzer-data.txt";
  testAnalayzer(morphlogyAnalyzer, answerPath, testPath);
}
Example #24
Source File: SearchImpl.java From lucene-solr with Apache License 2.0
private Query parseByClassicParser(String expression, String defField, Analyzer analyzer,
                                   QueryParserConfig config) {
  QueryParser parser = new QueryParser(defField, analyzer);

  switch (config.getDefaultOperator()) {
    case OR:
      parser.setDefaultOperator(QueryParser.Operator.OR);
      break;
    case AND:
      parser.setDefaultOperator(QueryParser.Operator.AND);
      break;
  }

  parser.setSplitOnWhitespace(config.isSplitOnWhitespace());
  parser.setAutoGenerateMultiTermSynonymsPhraseQuery(config.isAutoGenerateMultiTermSynonymsPhraseQuery());
  parser.setAutoGeneratePhraseQueries(config.isAutoGeneratePhraseQueries());
  parser.setEnablePositionIncrements(config.isEnablePositionIncrements());
  parser.setAllowLeadingWildcard(config.isAllowLeadingWildcard());
  parser.setDateResolution(config.getDateResolution());
  parser.setFuzzyMinSim(config.getFuzzyMinSim());
  parser.setFuzzyPrefixLength(config.getFuzzyPrefixLength());
  parser.setLocale(config.getLocale());
  parser.setTimeZone(config.getTimeZone());
  parser.setPhraseSlop(config.getPhraseSlop());

  try {
    return parser.parse(expression);
  } catch (ParseException e) {
    throw new LukeException(String.format(Locale.ENGLISH, "Failed to parse query expression: %s", expression), e);
  }
}
Example #25
Source File: AnalyzingSuggesterTest.java From lucene-solr with Apache License 2.0
public void testQueueExhaustion() throws Exception {
  Analyzer a = new MockAnalyzer(random());
  Directory tempDir = getDirectory();
  AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a, a,
      AnalyzingSuggester.EXACT_FIRST, 256, -1, true);

  suggester.build(new InputArrayIterator(new Input[] {
      new Input("a", 2),
      new Input("a b c", 3),
      new Input("a c a", 1),
      new Input("a c b", 1),
  }));

  suggester.lookup("a", false, 4);
  IOUtils.close(a, tempDir);
}
Example #26
Source File: TestArmenianAnalyzer.java From lucene-solr with Apache License 2.0
/** test use of exclusion set */
public void testExclude() throws IOException {
  CharArraySet exclusionSet = new CharArraySet(asSet("արծիվներ"), false);
  Analyzer a = new ArmenianAnalyzer(ArmenianAnalyzer.getDefaultStopSet(), exclusionSet);
  checkOneTerm(a, "արծիվներ", "արծիվներ");
  checkOneTerm(a, "արծիվ", "արծ");
  a.close();
}
Example #27
Source File: SegmentationIcuTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testJapanese() throws Exception {
  Analyzer a = createAnalyzer();
  assertAnalyzesTo(a, "仮名遣い カタカナ",
      new String[] { "仮", "名", "遣", "い", "カタカナ" },
      new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
  destroyAnalzyer(a);
}
Example #28
Source File: TestCatalanAnalyzer.java From lucene-solr with Apache License 2.0
/** test use of exclusion set */
public void testExclude() throws IOException {
  CharArraySet exclusionSet = new CharArraySet(asSet("llengües"), false);
  Analyzer a = new CatalanAnalyzer(CatalanAnalyzer.getDefaultStopSet(), exclusionSet);
  checkOneTerm(a, "llengües", "llengües");
  checkOneTerm(a, "llengua", "llengu");
  a.close();
}
Example #29
Source File: StandardIndexManager.java From nifi with Apache License 2.0
private IndexWriterCount createWriter(final File indexDirectory) throws IOException {
  final List<Closeable> closeables = new ArrayList<>();
  final Directory directory = FSDirectory.open(indexDirectory.toPath());
  closeables.add(directory);

  try {
    final Analyzer analyzer = new StandardAnalyzer();
    closeables.add(analyzer);

    final IndexWriterConfig config = new IndexWriterConfig(analyzer);

    final ConcurrentMergeScheduler mergeScheduler = new ConcurrentMergeScheduler();
    final int mergeThreads = repoConfig.getConcurrentMergeThreads();
    mergeScheduler.setMaxMergesAndThreads(mergeThreads, mergeThreads);
    config.setMergeScheduler(mergeScheduler);

    final IndexWriter indexWriter = new IndexWriter(directory, config);
    final EventIndexWriter eventIndexWriter = new LuceneEventIndexWriter(indexWriter, indexDirectory);

    final IndexWriterCount writerCount = new IndexWriterCount(eventIndexWriter, analyzer, directory, 1, false);
    logger.debug("Providing new index writer for {}", indexDirectory);
    return writerCount;
  } catch (final IOException ioe) {
    for (final Closeable closeable : closeables) {
      try {
        closeable.close();
      } catch (final IOException ioe2) {
        ioe.addSuppressed(ioe2);
      }
    }
    throw ioe;
  }
}
Example #30
Source File: MapperQueryParser.java From Elasticsearch with Apache License 2.0
private Query getPrefixQuerySingle(String field, String termStr) throws ParseException {
  currentFieldType = null;
  Analyzer oldAnalyzer = getAnalyzer();
  try {
    currentFieldType = parseContext.fieldMapper(field);
    if (currentFieldType != null) {
      if (!forcedAnalyzer) {
        setAnalyzer(parseContext.getSearchAnalyzer(currentFieldType));
      }
      Query query = null;
      if (currentFieldType.useTermQueryWithQueryString()) {
        query = currentFieldType.prefixQuery(termStr, multiTermRewriteMethod, parseContext);
      }
      if (query == null) {
        query = getPossiblyAnalyzedPrefixQuery(currentFieldType.names().indexName(), termStr);
      }
      return query;
    }
    return getPossiblyAnalyzedPrefixQuery(field, termStr);
  } catch (RuntimeException e) {
    if (settings.lenient()) {
      return null;
    }
    throw e;
  } finally {
    setAnalyzer(oldAnalyzer);
  }
}