com.ibm.icu.text.Transliterator Java Examples

Source File: ICUTransformFilter.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Builds a filter that runs every token of the wrapped stream through a transliterator.
 * <p>
 * Note that this constructor may call {@code setFilter} on the supplied transliterator
 * (see the optimization below), so the argument can be modified as a side effect.
 *
 * @param input {@link TokenStream} whose tokens are transformed.
 * @param transform Transliterator applied to the token text.
 */
@SuppressWarnings("deprecation")
public ICUTransformFilter(TokenStream input, Transliterator transform) {
  super(input);
  this.transform = transform;

  // Shortcut ("cheating", but a large speed-up): a rule-based transliterator that has no
  // filter yet gets its own source set installed as the filter. Package-private ICU APIs
  // could probably do better than this.
  final boolean unfilteredRuleBased =
      transform.getFilter() == null && transform instanceof com.ibm.icu.text.RuleBasedTransliterator;
  if (unfilteredRuleBased) {
    final UnicodeSet handledChars = transform.getSourceSet();
    if (handledChars != null && !handledChars.isEmpty()) {
      transform.setFilter(handledChars);
    }
  }
}

Source File: IcuTransformTokenFilterFactory.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

5 votes

/**
 * Builds the transliterator for this factory from the given settings.
 * <p>
 * Settings read: {@code id} (default {@code "Null"}), {@code dir} (default
 * {@code "forward"}; any other value selects the reverse direction), {@code rules}
 * (optional; when present the transliterator is created from these rules instead of
 * being looked up by id) and {@code unicodeSetFilter} (optional; installed as a frozen
 * {@link UnicodeSet} filter).
 */
public IcuTransformTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name,
                                      Settings settings) {
    super(indexSettings, name, settings);
    final String id = settings.get("id", "Null");
    final String direction = settings.get("dir", "forward");
    final int dir;
    if ("forward".equals(direction)) {
        dir = Transliterator.FORWARD;
    } else {
        dir = Transliterator.REVERSE;
    }

    final String rules = settings.get("rules");
    if (rules == null) {
        this.transliterator = Transliterator.getInstance(id, dir);
    } else {
        this.transliterator = Transliterator.createFromRules(id, rules, dir);
    }

    final String filterPattern = settings.get("unicodeSetFilter");
    if (filterPattern != null) {
        transliterator.setFilter(new UnicodeSet(filterPattern).freeze());
    }
}

Source File: TestICUTransformFilter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Runs the empty string through a keyword tokenizer wrapped in an "Any-Latin"
 * {@code ICUTransformFilter} and expects the empty term back.
 */
public void testEmptyTerm() throws IOException {
  // try-with-resources: the analyzer is closed even when the check below throws,
  // which the original trailing a.close() did not guarantee.
  try (Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, Transliterator.getInstance("Any-Latin")));
    }
  }) {
    checkOneTerm(a, "", "");
  }
}

Source File: TestICUTransformFilter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Blasts random strings through an analyzer made of a whitespace {@code MockTokenizer}
 * followed by an "Any-Latin" {@code ICUTransformFilter}. A single transliterator
 * instance is shared by every component the analyzer creates.
 */
public void testRandomStrings() throws Exception {
  final Transliterator transform = Transliterator.getInstance("Any-Latin");
  // try-with-resources: the analyzer is closed even when checkRandomData throws,
  // which the original trailing a.close() did not guarantee.
  try (Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, transform));
    }
  }) {
    checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
  }
}

Source File: TestICUTransformFilter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * A rule-based transliterator whose only rule targets a supplementary-plane character
 * starts without a filter; after an {@code ICUTransformFilter} is constructed around it,
 * its filter must equal the set holding just that character.
 */
public void testOptimizerSurrogate() throws Exception {
  // single rule: CJK UNIFIED IDEOGRAPH-20087 becomes an x
  final String ruleText = "\\U00020087 > x;";
  final Transliterator translit = Transliterator.createFromRules("test", ruleText, Transliterator.FORWARD);
  assertTrue(null == translit.getFilter());

  final KeywordTokenizer tokenizer = new KeywordTokenizer();
  tokenizer.setReader(new StringReader(""));
  // built only for its effect on translit; the filter object itself is discarded
  new ICUTransformFilter(tokenizer, translit);

  final UnicodeSet expectedFilter = new UnicodeSet("[\\U00020087]");
  assertTrue(translit.getFilter().equals(expectedFilter));
}

Source File: TestICUTransformFilter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * A rule-based transliterator with rules over 'a' and 'b' starts without a filter;
 * after an {@code ICUTransformFilter} is constructed around it, its filter must equal
 * the set {@code [ab]}.
 */
public void testOptimizer() throws Exception {
  // two rules: a's become b's, b's become c's
  final String ruleText = "a > b; b > c;";
  final Transliterator translit = Transliterator.createFromRules("test", ruleText, Transliterator.FORWARD);
  assertTrue(null == translit.getFilter());

  final KeywordTokenizer tokenizer = new KeywordTokenizer();
  tokenizer.setReader(new StringReader(""));
  // built only for its effect on translit; the filter object itself is discarded
  new ICUTransformFilter(tokenizer, translit);

  final UnicodeSet expectedFilter = new UnicodeSet("[ab]");
  assertTrue(translit.getFilter().equals(expectedFilter));
}

Source File: UtilityExtensions.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Debugging aid: renders {@code input} together with the offsets in {@code pos} by
 * delegating to the {@code StringBuffer}-appending overload, then passes the result
 * through {@code com.ibm.icu.impl.Utility.escape}.
 *
 * @param input the text being transliterated
 * @param pos the position whose context and start/limit offsets are shown
 * @return the escaped, formatted text
 */
public static String formatInput(ReplaceableString input,
                                 Transliterator.Position pos) {
    final StringBuffer formatted = formatInput(new StringBuffer(), input, pos);
    final String unescaped = formatted.toString();
    return com.ibm.icu.impl.Utility.escape(unescaped);
}

Source File: UtilityExtensions.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Debugging aid: appends a rendering of {@code input} and {@code pos} to
 * {@code appendTo}.
 * <p>
 * When the offsets are consistent ({@code 0 <= contextStart <= start <= limit <=
 * contextLimit <= input.length()}) the appended text has the form
 * <code>{bbb|ccc|ddd}</code>: the braces mark the context start and limit and the bars
 * mark the start and limit. Text outside the context is not emitted. Otherwise an
 * "INVALID Position" message listing the four offsets and the input is appended.
 *
 * @param appendTo buffer that receives the rendering
 * @param input the text being transliterated
 * @param pos the position whose offsets are rendered
 * @return {@code appendTo}, to allow chaining
 */
public static StringBuffer formatInput(StringBuffer appendTo,
                                       ReplaceableString input,
                                       Transliterator.Position pos) {
    final boolean offsetsConsistent =
        0 <= pos.contextStart &&
        pos.contextStart <= pos.start &&
        pos.start <= pos.limit &&
        pos.limit <= pos.contextLimit &&
        pos.contextLimit <= input.length();

    if (!offsetsConsistent) {
        // Assemble the complaint first so the target buffer is touched exactly once.
        final String complaint = new StringBuilder("INVALID Position {cs=")
            .append(pos.contextStart)
            .append(", s=").append(pos.start)
            .append(", l=").append(pos.limit)
            .append(", cl=").append(pos.contextLimit)
            .append("} on ").append(String.valueOf(input))
            .toString();
        appendTo.append(complaint);
        return appendTo;
    }

    final String preContext = input.substring(pos.contextStart, pos.start);
    final String active = input.substring(pos.start, pos.limit);
    final String postContext = input.substring(pos.limit, pos.contextLimit);

    appendTo.append('{').append(preContext);
    appendTo.append('|').append(active).append('|').append(postContext);
    appendTo.append('}');
    return appendTo;
}

Source File: IcuNormalizer.java From enkan with Eclipse Public License 1.0

5 votes

/**
 * Creates a normalizer backed by the ICU transliterator registered under the given ID.
 *
 * @param translitId ID passed to {@code Transliterator.getInstance}
 * @throws MisconfigurationException when ICU rejects the ID with an
 *         {@code IllegalArgumentException}; the original exception is kept as the cause
 */
public IcuNormalizer(String translitId) {
    try {
        this.transliterator = Transliterator.getInstance(translitId);
    } catch (IllegalArgumentException badId) {
        throw new MisconfigurationException("ILLEGAL_TRANSILIT_ID", translitId, badId);
    }
}

Source File: ICUTransformFilterFactory.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Creates a new ICUTransformFilterFactory.
 * <p>
 * Reads the mandatory {@code id} argument and the optional {@code direction} argument
 * ({@code "forward"} or {@code "reverse"}, default {@code "forward"}), then looks up the
 * matching transliterator.
 *
 * @param args factory arguments
 * @throws IllegalArgumentException if {@code args} is not empty once the known
 *         parameters have been read (presumably {@code require}/{@code get} remove the
 *         keys they consume; verify against the base class)
 */
public ICUTransformFilterFactory(Map<String,String> args) {
  super(args);
  final String id = require(args, "id");
  final String direction = get(args, "direction", Arrays.asList("forward", "reverse"), "forward", false);

  final int dir;
  if ("forward".equals(direction)) {
    dir = Transliterator.FORWARD;
  } else {
    dir = Transliterator.REVERSE;
  }
  transliterator = Transliterator.getInstance(id, dir);

  final boolean leftovers = !args.isEmpty();
  if (leftovers) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
}

Source File: TestICUTransformFilter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Runs a handful of stock ICU transliterators through {@code checkToken}, one
 * input/expected pair each.
 */
public void testBasicFunctionality() throws Exception {
  // each row: transliterator ID, input text, expected output
  final String[][] cases = {
    { "Traditional-Simplified", "簡化字", "简化字" },
    { "Katakana-Hiragana", "ヒラガナ", "ひらがな" },
    { "Fullwidth-Halfwidth", "アルアノリウ", "ｱﾙｱﾉﾘｳ" },
    { "Any-Latin", "Αλφαβητικός Κατάλογος", "Alphabētikós Katálogos" },
    { "NFD; [:Nonspacing Mark:] Remove", "Alphabētikós Katálogos", "Alphabetikos Katalogos" },
    { "Han-Latin", "中国", "zhōng guó" },
  };
  for (String[] row : cases) {
    checkToken(Transliterator.getInstance(row[0]), row[1], row[2]);
  }
}

Source File: TransliterationTest.java From fitnotifications with Apache License 2.0

4 votes

/** "Any-Latin" should romanize the Russian word for "alphabet" as "alfavit". */
@Test
public void transliterateRussian() throws Exception {
    final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
    assertEquals("alfavit", anyLatin.transform("алфавит"));
}

Source File: TestICUTransformFilter.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Feeds {@code input} as one keyword token through an {@code ICUTransformFilter} built
 * on {@code transform} and asserts that the stream contains exactly {@code expected}.
 */
private void checkToken(Transliterator transform, String input, String expected) throws IOException {
  final KeywordTokenizer tokenizer = new KeywordTokenizer();
  tokenizer.setReader(new StringReader(input));

  final TokenStream filtered = new ICUTransformFilter(tokenizer, transform);
  final String[] expectedTerms = { expected };
  assertTokenStreamContents(filtered, expectedTerms);
}

Source File: TestICUTransformFilter.java From lucene-solr with Apache License 2.0

4 votes

/**
 * The compound "Traditional-Simplified; CaseFold" transliterator must still case-fold
 * plain ASCII input.
 */
public void testOptimizer2() throws Exception {
  final Transliterator compound = Transliterator.getInstance("Traditional-Simplified; CaseFold");
  checkToken(compound, "ABCDE", "abcde");
}

Source File: TestICUTransformFilter.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Custom rules with a before-context: the expected output shows the 'a' that follows
 * 'c' turning into 'b' and the remaining 'a' turning into 'd'.
 */
public void testCustomFunctionality2() throws Exception {
  // "c { a > b" applies only to an 'a' preceded by 'c'; "a > d" covers every other 'a'
  final String ruleText = "c { a > b; a > d;";
  final Transliterator contextual = Transliterator.createFromRules("test", ruleText, Transliterator.FORWARD);
  checkToken(contextual, "caa", "cbd");
}

Source File: TestICUTransformFilter.java From lucene-solr with Apache License 2.0

4 votes

/** Custom rules without context: a's become b's and b's become c's. */
public void testCustomFunctionality() throws Exception {
  final String ruleText = "a > b; b > c;";
  final Transliterator shifter = Transliterator.createFromRules("test", ruleText, Transliterator.FORWARD);
  checkToken(shifter, "abacadaba", "bcbcbdbcb");
}

Source File: UtilityExtensions.java From fitnotifications with Apache License 2.0

4 votes

/**
 * Convenience overload for callers holding a plain {@code Replaceable}: casts
 * {@code input} to {@code ReplaceableString} and delegates to the overload that takes
 * one.
 *
 * @throws ClassCastException if {@code input} is not a {@code ReplaceableString}
 */
public static StringBuffer formatInput(StringBuffer appendTo,
                                       Replaceable input,
                                       Transliterator.Position pos) {
    final ReplaceableString text = (ReplaceableString) input;
    return formatInput(appendTo, text, pos);
}

Source File: UtilityExtensions.java From fitnotifications with Apache License 2.0

4 votes

/**
 * Convenience overload for callers holding a plain {@code Replaceable}: casts
 * {@code input} to {@code ReplaceableString} and delegates to the overload that takes
 * one.
 *
 * @throws ClassCastException if {@code input} is not a {@code ReplaceableString}
 */
public static String formatInput(Replaceable input,
                                 Transliterator.Position pos) {
    final ReplaceableString text = (ReplaceableString) input;
    return formatInput(text, pos);
}

Source File: TransliterationTest.java From fitnotifications with Apache License 2.0

4 votes

/** "Any-Latin" should romanize the Chinese sample as tone-marked pinyin. */
@Test
public void transliterateChinese() throws Exception {
    final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
    assertEquals("zì mǔ", anyLatin.transform("字母"));
}

Source File: TransliterationTest.java From fitnotifications with Apache License 2.0

4 votes

/** "Any-Latin" should romanize the Korean sample as "alpabes". */
@Test
public void transliterateKorean() throws Exception {
    final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
    assertEquals("alpabes", anyLatin.transform("알파벳"));
}

Source File: TransliterationTest.java From fitnotifications with Apache License 2.0

4 votes

/** "Any-Latin" should romanize the Arabic sample to the expected Latin string. */
@Test
public void transliterateArabic() throws Exception {
    final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
    assertEquals("ạlạ̉bjdyẗ", anyLatin.transform("الأبجدية"));
}

Source File: TransliterationTest.java From fitnotifications with Apache License 2.0

4 votes

/** Thai sample for "Any-Latin"; currently disabled through {@code @Ignore}. */
@Test
@Ignore(value = "Transliterating Thai is not supported")
public void transliterateThai() throws Exception {
    final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
    assertEquals("tạw xạks̄ʹr", anyLatin.transform("ตัวอักษร"));
}

Source File: TransliterationTest.java From fitnotifications with Apache License 2.0

4 votes

/** "Any-Latin" should romanize the Greek sample, keeping the macron and accent. */
@Test
public void transliterateGreek() throws Exception {
    final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
    assertEquals("Alphabētikós", anyLatin.transform("Αλφαβητικός"));
}

Source File: TransliterationTest.java From fitnotifications with Apache License 2.0

4 votes

/** "Any-Latin" should romanize the katakana sample as "arufabetto". */
@Test
public void transliterateJapanese() throws Exception {
    final Transliterator anyLatin = Transliterator.getInstance("Any-Latin");
    assertEquals("arufabetto", anyLatin.transform("アルファベット"));
}

Source File: SlugService.java From mapr-music with Apache License 2.0

4 votes

/**
 * Injection constructor: stores the two DAOs and obtains the transliterator identified
 * by {@code ICU4J_TRANSLITERATOR_ID}.
 *
 * @param artistDao DAO bound under the name "artistDao"
 * @param albumDao DAO bound under the name "albumDao"
 */
@Inject
public SlugService(@Named("artistDao") MaprDbDao<Artist> artistDao, @Named("albumDao") MaprDbDao<Album> albumDao) {
    // collaborators supplied by the container
    this.artistDao = artistDao;
    this.albumDao = albumDao;

    // transliterator looked up once, at construction time
    this.transliterator = Transliterator.getInstance(ICU4J_TRANSLITERATOR_ID);
}

Source File: IcuTransformTokenFilter.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

2 votes

/**
 * Wraps a token stream so that its text is transformed by the given transliterator.
 * The transliterator is stored as supplied.
 *
 * @param input {@link TokenStream} to wrap.
 * @param transliterator Transliterator used to transform the text.
 */
public IcuTransformTokenFilter(TokenStream input, Transliterator transliterator) {
    super(input);

    // kept as-is; this constructor does not touch the transliterator's configuration
    this.transliterator = transliterator;
}

Source File: BeanUtils.java From apiman with Apache License 2.0

2 votes

/**
 * Derives a bean id from a bean name: the name is transliterated with the ICU
 * "Any-Latin; Latin-ASCII" transform and the result is passed through
 * {@code removeNonWord}.
 *
 * @param name the bean name
 * @return the generated id
 */
public static final String idFromName(String name) {
    final String asciiName = Transliterator
            .getInstance("Any-Latin; Latin-ASCII") //$NON-NLS-1$
            .transliterate(name);
    return removeNonWord(asciiName);
}

com.ibm.icu.text.Transliterator Java Examples