Java Code Examples for com.ibm.icu.text.Normalizer2#getInstance()
The following examples show how to use com.ibm.icu.text.Normalizer2#getInstance(). Each example notes its source file, the open-source project it comes from, and that project's license.
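Before the project examples, a minimal standalone sketch of the call itself may help (the class name and sample input are illustrative, not taken from any project below): passing null as the data argument tells ICU to load its own bundled data for the named normalization form.

import com.ibm.icu.text.Normalizer2;

public class GetInstanceSketch {
  public static void main(String[] args) {
    // null data stream = use ICU's built-in "nfkc_cf" data; COMPOSE selects the composing form.
    Normalizer2 n2 = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    // NFKC_Casefold composes, case-folds, and removes default-ignorable code points,
    // so this should print "°c" (compare Example 6 below).
    System.out.println(n2.normalize("℃"));
  }
}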
Example 1
Source File: ICUNormalizer2CharFilterFactory.java (from lucene-solr, Apache License 2.0)

/** Creates a new ICUNormalizer2CharFilterFactory */
public ICUNormalizer2CharFilterFactory(Map<String,String> args) {
  super(args);
  String form = get(args, "form", "nfkc_cf");
  String mode = get(args, "mode", Arrays.asList("compose", "decompose"), "compose");
  Normalizer2 normalizer = Normalizer2.getInstance(null, form,
      "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
  String filter = get(args, "filter");
  if (filter != null) {
    UnicodeSet set = new UnicodeSet(filter);
    if (!set.isEmpty()) {
      set.freeze();
      normalizer = new FilteredNormalizer2(normalizer, set);
    }
  }
  if (!args.isEmpty()) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
  this.normalizer = normalizer;
}
Example 2
Source File: ICUNormalizer2FilterFactory.java (from lucene-solr, Apache License 2.0)

/** Creates a new ICUNormalizer2FilterFactory */
public ICUNormalizer2FilterFactory(Map<String,String> args) {
  super(args);
  String form = get(args, "form", "nfkc_cf");
  String mode = get(args, "mode", Arrays.asList("compose", "decompose"), "compose");
  Normalizer2 normalizer = Normalizer2.getInstance(null, form,
      "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
  String filter = get(args, "filter");
  if (filter != null) {
    UnicodeSet set = new UnicodeSet(filter);
    if (!set.isEmpty()) {
      set.freeze();
      normalizer = new FilteredNormalizer2(normalizer, set);
    }
  }
  if (!args.isEmpty()) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
  this.normalizer = normalizer;
}
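Examples 1 and 2 share the same pattern: look up a named normalizer, and when a filter set is supplied, wrap it in a FilteredNormalizer2 so that only code points inside the set are normalized. A condensed standalone sketch of that pattern follows; the filter set and sample string are made up for illustration.

import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;

public class FilteredNormalizerSketch {
  public static void main(String[] args) {
    Normalizer2 base = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    // Hypothetical filter: normalize everything except U+00DF (ß); characters outside
    // the set pass through the FilteredNormalizer2 unchanged.
    UnicodeSet set = new UnicodeSet("[^ß]");
    set.freeze();
    Normalizer2 normalizer = new FilteredNormalizer2(base, set);
    // Fullwidth letters should be folded to "abc", while "ß" is left alone
    // instead of being case-folded to "ss".
    System.out.println(normalizer.normalize("Straße ＡＢＣ"));
  }
}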
Example 3
Source File: TestICUNormalizer2Filter.java (from lucene-solr, Apache License 2.0)

public void testAlternate() throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(
          tokenizer,
          /* specify nfc with decompose to get nfd */
          Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE)));
    }
  };

  // decompose EAcute into E + combining Acute
  assertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" });
  a.close();
}
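The comment in Example 3 highlights a detail of getInstance(): ICU ships data named "nfc", "nfkc", and "nfkc_cf", and NFD is obtained by loading the "nfc" data in DECOMPOSE mode. A small standalone check of that behaviour (the class name is illustrative):

import com.ibm.icu.text.Normalizer2;

public class NfdSketch {
  public static void main(String[] args) {
    // "nfc" data in DECOMPOSE mode behaves as NFD.
    Normalizer2 nfd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
    String decomposed = nfd.normalize("\u00E9");            // é
    System.out.println(decomposed.equals("\u0065\u0301"));  // true: e + combining acute accent
  }
}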
Example 4
Source File: TestICUNormalizer2CharFilter.java (from lucene-solr, Apache License 2.0)

public void testNormalization() throws IOException {
  String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
  Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
  String expectedOutput = normalizer.normalize(input);

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);
  char[] tempBuff = new char[10];
  StringBuilder output = new StringBuilder();
  while (true) {
    int length = reader.read(tempBuff);
    if (length == -1) {
      break;
    }
    output.append(tempBuff, 0, length);
    assertEquals(output.toString(),
        normalizer.normalize(input.substring(0, reader.correctOffset(output.length()))));
  }

  assertEquals(expectedOutput, output.toString());
}
Example 5
Source File: TestICUNormalizer2CharFilter.java (from lucene-solr, Apache License 2.0)

public void testTokenStream() throws IOException {
  // '℃', '№', '㈱', '㌘', 'サ'+'<<', 'ソ'+'<<', '㌰'+'<<'
  String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙";

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
      Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenStream.setReader(reader);

  assertTokenStreamContents(tokenStream,
      new String[] {"°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ"},
      new int[] {0, 2, 4, 6, 8, 11, 14},
      new int[] {1, 3, 5, 7, 10, 13, 16},
      input.length());
}
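The expected tokens in Example 5 come straight from the NFKC mappings of the compatibility characters in the input; those mappings can be checked directly against the normalizer, without the char filter or tokenizer. A standalone sketch (the class name is illustrative):

import com.ibm.icu.text.Normalizer2;

public class NfkcMappingSketch {
  public static void main(String[] args) {
    Normalizer2 nfkc = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE);
    // One input character can expand to several output characters, which is why
    // ICUNormalizer2CharFilter has to correct token offsets.
    System.out.println(nfkc.normalize("℃"));  // °C
    System.out.println(nfkc.normalize("№"));  // No
    System.out.println(nfkc.normalize("㈱")); // (株)
  }
}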
Example 6
Source File: TestICUNormalizer2CharFilter.java (from lucene-solr, Apache License 2.0)

public void testTokenStream2() throws IOException {
  // '㌰'+'<<', '5', '℃', '№', '㈱', '㌘', 'サ'+'<<', 'ソ'+'<<'
  String input = "㌰゙5℃№㈱㌘ザゾ";

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
      Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new NGramTokenizer(newAttributeFactory(), 1, 1);
  tokenStream.setReader(reader);

  assertTokenStreamContents(tokenStream,
      new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
      new int[] {0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
      new int[] {1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
      input.length());
}
Example 7
Source File: TestICUNormalizer2CharFilter.java (from lucene-solr, Apache License 2.0)

public void testMassiveLigature() throws IOException {
  String input = "\uFDFA";

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
      Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenStream.setReader(reader);

  assertTokenStreamContents(tokenStream,
      new String[] {"صلى", "الله", "عليه", "وسلم"},
      new int[] {0, 0, 0, 0},
      new int[] {0, 0, 0, 1},
      input.length());
}
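Example 7 exercises an extreme one-to-many mapping: under NFKC_Casefold the single code point U+FDFA expands to a four-word Arabic phrase, which the whitespace tokenizer then splits into four tokens whose offsets all fall back into the one-character input. The expansion can be observed directly on the normalizer (standalone sketch, illustrative class name):

import com.ibm.icu.text.Normalizer2;

public class LigatureSketch {
  public static void main(String[] args) {
    Normalizer2 nfkcCf = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    // U+FDFA (ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM) expands to "صلى الله عليه وسلم".
    System.out.println(nfkcCf.normalize("\uFDFA"));
  }
}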
Example 8
Source File: IcuNormalizerCharFilterFactory.java (from elasticsearch-plugin-bundle, GNU Affero General Public License v3.0)

public IcuNormalizerCharFilterFactory(IndexSettings indexSettings, Environment environment,
                                      String name, Settings settings) {
  super(indexSettings, name);
  Normalizer2 base = Normalizer2.getInstance(getNormalizationResource(settings),
      getNormalizationName(settings), getNormalizationMode(settings));
  String unicodeSetFilter = settings.get("unicode_set_filter");
  this.normalizer = unicodeSetFilter != null ?
      new FilteredNormalizer2(base, new UnicodeSet(unicodeSetFilter).freeze()) : base;
}
Example 9
Source File: IcuNormalizerTokenFilterFactory.java (from elasticsearch-plugin-bundle, GNU Affero General Public License v3.0)

public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment,
                                       String name, Settings settings) {
  super(indexSettings, name, settings);
  Normalizer2 base = Normalizer2.getInstance(getNormalizationResource(settings),
      getNormalizationName(settings), getNormalizationMode(settings));
  String unicodeSetFilter = settings.get("unicode_set_filter");
  this.normalizer = unicodeSetFilter != null ?
      new FilteredNormalizer2(base, new UnicodeSet(unicodeSetFilter).freeze()) : base;
}
Example 10
Source File: SegmentationIcuTokenizerTests.java (from elasticsearch-plugin-bundle, GNU Affero General Public License v3.0)

private static Analyzer createAnalyzer() {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
          new DefaultIcuTokenizerConfig(false, true));
      TokenFilter filter = new IcuNormalizerFilter(tokenizer,
          Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
      return new TokenStreamComponents(tokenizer, filter);
    }
  };
}
Example 11
Source File: IcuNormalizerFilterTests.java (from elasticsearch-plugin-bundle, GNU Affero General Public License v3.0)

public void testAlternate() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new IcuNormalizerFilter(
          tokenizer,
          Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE)));
    }
  };
  assertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" });
  a.close();
}
Example 12
Source File: IcuNormalizerFilterTests.java (from elasticsearch-plugin-bundle, GNU Affero General Public License v3.0)

public void testEmptyTerm() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new IcuNormalizerFilter(tokenizer,
          Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)));
    }
  };
  checkOneTerm(a, "", "");
  a.close();
}
Example 13
Source File: Normalizer2Factory.java (from oacc-core, Apache License 2.0)

public static Normalizer2 getNFCInstance() {
  return Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
}
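Example 13 wraps the lookup in a small factory method. Note that newer ICU4J releases (the convenience getters were added around ICU 49) also expose Normalizer2.getNFCInstance(), which should return an equivalent normalizer to the named lookup; a quick equivalence check (illustrative class name, assumes such an ICU4J version):

import com.ibm.icu.text.Normalizer2;

public class NfcGetterSketch {
  public static void main(String[] args) {
    Normalizer2 viaName = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
    // Assumes an ICU4J version that provides the convenience getters (ICU 49+).
    Normalizer2 viaGetter = Normalizer2.getNFCInstance();
    System.out.println(viaName.normalize("e\u0301").equals(viaGetter.normalize("e\u0301"))); // true
  }
}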
Example 14
Source File: ICUNormalizer2CharFilter.java (from lucene-solr, Apache License 2.0)

/**
 * Create a new Normalizer2CharFilter that combines NFKC normalization, Case
 * Folding, and removes Default Ignorables (NFKC_Casefold)
 */
public ICUNormalizer2CharFilter(Reader in) {
  this(in, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
}
Example 15
Source File: ICUNormalizer2Filter.java (from lucene-solr, Apache License 2.0)

/**
 * Create a new Normalizer2Filter that combines NFKC normalization, Case
 * Folding, and removes Default Ignorables (NFKC_Casefold)
 */
public ICUNormalizer2Filter(TokenStream input) {
  this(input, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
}
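Examples 14 and 15 both default to the "nfkc_cf" form. On ICU4J versions that ship the convenience getters (ICU 49+), the same normalizer can also be obtained with Normalizer2.getNFKCCasefoldInstance(); a brief sketch (illustrative class name and sample string):

import com.ibm.icu.text.Normalizer2;

public class NfkcCasefoldGetterSketch {
  public static void main(String[] args) {
    Normalizer2 viaName = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    Normalizer2 viaGetter = Normalizer2.getNFKCCasefoldInstance();
    // Both should apply the same NFKC normalization, case folding,
    // and default-ignorable removal.
    System.out.println(viaGetter.normalize("Grüße").equals(viaName.normalize("Grüße"))); // true
  }
}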