org.apache.lucene.analysis.Tokenizer#getAttribute

Source File: HanLpTokenizerFactoryTestCase.java From jstarcraft-nlp with Apache License 2.0

6 votes

/**
 * Builds a tokenizer through {@link HanLpTokenizerFactory} with
 * {@code enableTraditionalChineseMode} switched on, feeds it a traditional
 * Chinese sample and prints, for every token, its start/end offset, position
 * increment, term text and type.
 *
 * @throws Exception if the tokenizer cannot be created, consumed or closed
 */
@Test
public void testCreate() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLpTokenizerFactory(args);

    String text = "大衛貝克漢不僅僅是名著名球員，球場以外，其妻為前" + "辣妹合唱團成員維多利亞·碧咸，亦由於他擁有" + "突出外表、百變髮型及正面的形象，以至自己" + "品牌的男士香水等商品，及長期擔任運動品牌" + "Adidas的代言人，因此對大眾傳播媒介和時尚界" + "等方面都具很大的影響力，在足球圈外所獲得的" + "認受程度可謂前所未見。";

    // try-with-resources closes the tokenizer even if tokenization throws.
    try (Tokenizer tokenizer = factory.create(null)) {
        // A token stream updates the same attribute instances on every
        // incrementToken(), so one lookup before the loop is sufficient.
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // Offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // Position increment (distance from the previous token)
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // Token type (part of speech)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);

        tokenizer.setReader(new StringReader(text));
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
        // Complete the consumer workflow: end() after the last token, then close().
        tokenizer.end();
    }
}

Source File: HanLpQueryAnalyzerTestCase.java From jstarcraft-nlp with Apache License 2.0

6 votes

/**
 * Tokenizes a short traditional Chinese question with a tokenizer created by
 * {@link HanLpTokenizerFactory} (traditional-Chinese mode and normalization
 * both enabled) and prints each token's offsets, position increment, term
 * text and type.
 *
 * @throws Exception if the tokenizer cannot be created, consumed or closed
 */
@Test
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLpTokenizerFactory factory = new HanLpTokenizerFactory(args);
    String text = "會辦台星保證最低價的原因？";

    // try-with-resources closes the tokenizer even if tokenization throws.
    try (Tokenizer tokenizer = factory.create()) {
        // The stream reuses its attribute instances for every token, so the
        // lookups are done once instead of once per token.
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // Offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // Position increment (distance from the previous token)
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // Token type (part of speech)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);

        tokenizer.setReader(new StringReader(text));
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
        // Complete the consumer workflow: end() after the last token, then close().
        tokenizer.end();
    }
}

Source File: NlpSegmenterTestCase.java From jstarcraft-nlp with Apache License 2.0

6 votes

/**
 * Runs the segmenter returned by {@code getSegmenter()} over a mixed
 * Chinese/English sentence and checks, case-insensitively, that every term
 * equals the slice of the input delimited by the token's offsets.
 *
 * @throws Exception if the segmenter cannot be consumed or closed
 */
@Test
public void testSegmenter() throws Exception {
    String text = "中华人民共和国(People's Republic of China),简称'中国'";

    // try-with-resources closes the segmenter even when an assertion fails.
    try (Tokenizer segmenter = getSegmenter()) {
        // Term text
        CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
        // Offsets
        OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);
        // Position increment and token type are not inspected below; the
        // lookups are kept only so the test still fails when the segmenter
        // does not expose them.
        segmenter.getAttribute(PositionIncrementAttribute.class);
        segmenter.getAttribute(TypeAttribute.class);

        segmenter.setReader(new StringReader(text));
        segmenter.reset();
        while (segmenter.incrementToken()) {
            LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset()));
            // The source text is the expected value, the emitted term the actual one.
            String expected = text.substring(offset.startOffset(), offset.endOffset());
            Assert.assertEquals(expected.toLowerCase(), term.toString().toLowerCase());
        }
        // Complete the consumer workflow: end() after the last token, then close().
        segmenter.end();
    }
}

Source File: HanLPAnalyzerTest.java From hanlp-lucene-plugin with Apache License 2.0

6 votes

/**
 * Tokenizes a short traditional Chinese question with a tokenizer created by
 * {@link HanLPTokenizerFactory} (traditional-Chinese mode and normalization
 * both enabled) and prints each token's offsets, position increment, term
 * text and type.
 *
 * @throws Exception if the tokenizer cannot be created, consumed or closed
 */
public void testIssue() throws Exception
{
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args);
    String text = "會辦台星保證最低價的原因？";

    // try-with-resources closes the tokenizer even if tokenization throws.
    try (Tokenizer tokenizer = factory.create())
    {
        // The stream reuses its attribute instances for every token, so the
        // lookups are done once instead of once per token.
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // Offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // Position increment (distance from the previous token)
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // Token type (part of speech)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);

        tokenizer.setReader(new StringReader(text));
        tokenizer.reset();
        while (tokenizer.incrementToken())
        {
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
        // Complete the consumer workflow: end() after the last token, then close().
        tokenizer.end();
    }
}

Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Uses a split pattern of one hundred 'a' characters followed by 'b' against
 * an input of two hundred 'a' characters, and expects the whole input to come
 * back as a single token.
 */
public void testBigLookahead() throws Exception {
  // Split pattern: "a" x 100, then "b".
  StringBuilder pattern = new StringBuilder();
  for (int count = 0; count < 100; count++) {
    pattern.append('a');
  }
  pattern.append('b');
  Tokenizer tokenizer = new SimplePatternSplitTokenizer(pattern.toString());
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);

  // Input: "a" x 200, which contains no 'b' at all.
  StringBuilder input = new StringBuilder();
  for (int count = 0; count < 200; count++) {
    input.append('a');
  }
  String text = input.toString();

  tokenizer.setReader(new StringReader(text));
  tokenizer.reset();
  assertTrue(tokenizer.incrementToken());
  assertEquals(text, term.toString());
  assertFalse(tokenizer.incrementToken());
}

Source File: URLTokenizer.java From elasticsearch-analysis-url with Apache License 2.0

5 votes

/**
 * Resets the given {@link Tokenizer}, drains it, and turns every term it
 * emits into a {@link Token}.
 *
 * @param part the url part passed to each created {@link Token}
 * @param tokenizer the tokenizer to consume; it is reset before reading
 * @param start value added to both the start and end offset of every term
 * @return the created tokens, in the order the tokenizer produced them
 * @throws IOException if the tokenizer fails while being reset or advanced
 */
private List<Token> tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
    List<Token> result = new ArrayList<>();
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        String term = tokenizer.getAttribute(CharTermAttribute.class).toString();
        OffsetAttribute span = tokenizer.getAttribute(OffsetAttribute.class);
        // Shift the tokenizer-relative offsets by the caller-supplied start.
        int from = start + span.startOffset();
        int to = start + span.endOffset();
        result.add(new Token(term, part, from, to));
    }
    return result;
}

Source File: HanLPTokenizerFactoryTest.java From hanlp-lucene-plugin with Apache License 2.0

5 votes

/**
 * Builds a tokenizer through {@link HanLPTokenizerFactory} with
 * {@code enableTraditionalChineseMode} switched on, feeds it a traditional
 * Chinese sample and prints, for every token, its start/end offset, position
 * increment, term text and type.
 *
 * @throws Exception if the tokenizer cannot be created, consumed or closed
 */
public void testCreate() throws Exception
{
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLPTokenizerFactory(args);

    String text = "大衛貝克漢不僅僅是名著名球員，球場以外，其妻為前" +
                  "辣妹合唱團成員維多利亞·碧咸，亦由於他擁有" +
                  "突出外表、百變髮型及正面的形象，以至自己" +
                  "品牌的男士香水等商品，及長期擔任運動品牌" +
                  "Adidas的代言人，因此對大眾傳播媒介和時尚界" +
                  "等方面都具很大的影響力，在足球圈外所獲得的" +
                  "認受程度可謂前所未見。";

    // try-with-resources closes the tokenizer even if tokenization throws.
    try (Tokenizer tokenizer = factory.create(null))
    {
        // A token stream updates the same attribute instances on every
        // incrementToken(), so one lookup before the loop is sufficient.
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // Offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // Position increment (distance from the previous token)
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // Token type (part of speech)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);

        tokenizer.setReader(new StringReader(text));
        tokenizer.reset();
        while (tokenizer.incrementToken())
        {
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
        // Complete the consumer workflow: end() after the last token, then close().
        tokenizer.end();
    }
}

Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Splits "aaabbb" on the pattern "a+": expects the single token "bbb" and,
 * after {@code end()}, a final end offset of 6.
 */
public void testEndOffset() throws Exception {
  Tokenizer tokenizer = new SimplePatternSplitTokenizer("a+");
  tokenizer.setReader(new StringReader("aaabbb"));
  OffsetAttribute offsets = tokenizer.getAttribute(OffsetAttribute.class);
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
  tokenizer.reset();

  // Exactly one token is expected.
  boolean hasFirst = tokenizer.incrementToken();
  assertTrue(hasFirst);
  assertEquals("bbb", term.toString());
  boolean hasSecond = tokenizer.incrementToken();
  assertFalse(hasSecond);

  // The end offset is checked only after end() has been called.
  tokenizer.end();
  assertEquals(6, offsets.endOffset());
}

Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Splits "bbab" on the pattern "a*" and expects the terms "bb" and "b" with
 * start offsets 0 and 3 and end offsets 2 and 4.
 */
public void testEmptyStringPatternOneMatch() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("a*");
  // No local attribute lookup: the original one was never read, and the
  // expected terms and offsets are passed to assertTokenStreamContents.
  t.setReader(new StringReader("bbab"));
  assertTokenStreamContents(t,
                            new String[] {"bb", "b"},
                            new int[] {0, 3},
                            new int[] {2, 4});
}

Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Splits "a c   " on runs of whitespace and expects only "a" and "c", with
 * start offsets 0 and 2 and end offsets 1 and 3; the trailing spaces must
 * not produce a token.
 */
public void testTrailingNonToken() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
  // No local attribute lookup: the original one was never read, and the
  // expected terms and offsets are passed to assertTokenStreamContents.
  t.setReader(new StringReader("a c   "));
  assertTokenStreamContents(t,
                            new String[] {"a", "c"},
                            new int[] {0, 2},
                            new int[] {1, 3});
}

Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Splits "    a c" on runs of whitespace and expects only "a" and "c", with
 * start offsets 4 and 6 and end offsets 5 and 7; the leading spaces must
 * not produce a token.
 */
public void testLeadingNonToken() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
  // No local attribute lookup: the original one was never read, and the
  // expected terms and offsets are passed to assertTokenStreamContents.
  t.setReader(new StringReader("    a c"));
  assertTokenStreamContents(t,
                            new String[] {"a", "c"},
                            new int[] {4, 6},
                            new int[] {5, 7});
}

Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Splits "a \tb   c" with the multi-character whitespace pattern
 * "[ \t\r\n]*" and expects "a", "b", "c" with start offsets 0, 3, 7 and end
 * offsets 1, 4, 8.
 */
public void testSplitMultiCharWhitespace() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
  // No local attribute lookup: the original one was never read, and the
  // expected terms and offsets are passed to assertTokenStreamContents.
  t.setReader(new StringReader("a \tb   c"));
  assertTokenStreamContents(t,
                            new String[] {"a", "b", "c"},
                            new int[] {0, 3, 7},
                            new int[] {1, 4, 8});
}

Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Splits "a \tb   c" with the single-character whitespace pattern
 * "[ \t\r\n]" and expects "a", "b", "c" with start offsets 0, 3, 7 and end
 * offsets 1, 4, 8.
 */
public void testSplitSingleCharWhitespace() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]");
  // No local attribute lookup: the original one was never read, and the
  // expected terms and offsets are passed to assertTokenStreamContents.
  t.setReader(new StringReader("a \tb   c"));
  assertTokenStreamContents(t,
                            new String[] {"a", "b", "c"},
                            new int[] {0, 3, 7},
                            new int[] {1, 4, 8});
}

Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Splits "bbb" on the pattern "a*" and expects the whole input back as the
 * only token.
 */
public void testEmptyStringPatternNoMatch() throws Exception {
  Tokenizer tokenizer = new SimplePatternSplitTokenizer("a*");
  tokenizer.setReader(new StringReader("bbb"));
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
  tokenizer.reset();

  // One token, then end of stream.
  boolean hasFirst = tokenizer.incrementToken();
  assertTrue(hasFirst);
  assertEquals("bbb", term.toString());
  boolean hasSecond = tokenizer.incrementToken();
  assertFalse(hasSecond);
}

Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Splits a random, non-empty unicode string on the pattern ".*" and expects
 * the tokenizer to produce no token at all.
 */
public void testNoTokens() throws Exception {
  Tokenizer t = new SimplePatternSplitTokenizer(".*");
  // No attribute lookup is needed: this test only checks that no token is produced.
  String s;
  // Draw random strings until a non-empty one comes up.
  do {
    s = TestUtil.randomUnicodeString(random());
  } while (s.length() == 0);
  t.setReader(new StringReader(s));
  t.reset();
  assertFalse(t.incrementToken());
}

Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Matches "a+" against "aaabbb": expects the single token "aaa" and, after
 * {@code end()}, a final end offset of 6.
 */
public void testEndOffset() throws Exception {
  Tokenizer tokenizer = new SimplePatternTokenizer("a+");
  tokenizer.setReader(new StringReader("aaabbb"));
  OffsetAttribute offsets = tokenizer.getAttribute(OffsetAttribute.class);
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
  tokenizer.reset();

  // Exactly one token is expected.
  boolean hasFirst = tokenizer.incrementToken();
  assertTrue(hasFirst);
  assertEquals("aaa", term.toString());
  boolean hasSecond = tokenizer.incrementToken();
  assertFalse(hasSecond);

  // The end offset is checked only after end() has been called.
  tokenizer.end();
  assertEquals(6, offsets.endOffset());
}

Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Matches "a*" against "bbab" and expects "a" as the only token.
 */
public void testEmptyStringPatternOneMatch() throws Exception {
  Tokenizer tokenizer = new SimplePatternTokenizer("a*");
  tokenizer.setReader(new StringReader("bbab"));
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
  tokenizer.reset();

  // One token, then end of stream.
  boolean hasFirst = tokenizer.incrementToken();
  assertTrue(hasFirst);
  assertEquals("a", term.toString());
  boolean hasSecond = tokenizer.incrementToken();
  assertFalse(hasSecond);
}

Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Matches ".*" against a random, non-empty unicode string and expects the
 * first token to be the entire input.
 */
public void testOneToken() throws Exception {
  Tokenizer tokenizer = new SimplePatternTokenizer(".*");
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);

  // Draw random strings until a non-empty one comes up.
  String input;
  do {
    input = TestUtil.randomUnicodeString(random());
  } while (input.length() == 0);

  tokenizer.setReader(new StringReader(input));
  tokenizer.reset();
  boolean hasToken = tokenizer.incrementToken();
  assertTrue(hasToken);
  assertEquals(input, term.toString());
}

Java Code Examples for org.apache.lucene.analysis.Tokenizer#getAttribute()