org.apache.lucene.analysis.Tokenizer#reset

Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Splits a run of 200 'a' characters on a pattern made of 100 'a' characters
 * followed by 'b', and expects the whole input back as one single token.
 */
public void testBigLookahead() throws Exception {
  // Pattern text: one hundred 'a' characters and then a single 'b'.
  StringBuilder patternText = new StringBuilder();
  int appended = 0;
  while (appended < 100) {
    patternText.append('a');
    appended++;
  }
  patternText.append('b');

  Tokenizer tokenizer = new SimplePatternSplitTokenizer(patternText.toString());
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);

  // Input text: two hundred 'a' characters, with no 'b' anywhere.
  StringBuilder inputText = new StringBuilder();
  appended = 0;
  while (appended < 200) {
    inputText.append('a');
    appended++;
  }
  String input = inputText.toString();

  tokenizer.setReader(new StringReader(input));
  tokenizer.reset();
  assertTrue(tokenizer.incrementToken());
  assertEquals(input, term.toString());
  assertFalse(tokenizer.incrementToken());
}

Source File: QueryAutoFilteringComponent.java From query-autofiltering-component with Apache License 2.0

6 votes

/**
 * Runs the tokenizer obtained from {@code getTokenizerImpl(input)} to exhaustion and
 * returns the characters of every emitted term.
 *
 * <p>The stream is always finished with {@code end()} and released with {@code close()},
 * as the TokenStream consumer workflow requires, even when tokenization fails part-way.
 *
 * @param input the text to tokenize (presumably already bound to the tokenizer by
 *              {@code getTokenizerImpl} — no reader is set here)
 * @return one {@code char[]} per token, in emission order, each sized to the term length
 * @throws IOException if the tokenizer fails while resetting, reading, ending or closing
 */
private ArrayList<char[]> tokenize( String input ) throws IOException {

  Log.debug( "tokenize '" + input + "'" );
  ArrayList<char[]> tokens = new ArrayList<char[]>( );
  Tokenizer tk = getTokenizerImpl( input );

  try {
    CharTermAttribute term = tk.addAttribute( CharTermAttribute.class );
    tk.reset( );
    while (tk.incrementToken( ) ) {
      // The term buffer is reused between tokens and may be longer than the term,
      // so copy exactly term.length() characters into a fresh array.
      int bufLen = term.length();
      char[] copy = new char[ bufLen ];
      System.arraycopy(term.buffer( ), 0, copy, 0, bufLen );
      tokens.add( copy );
    }
    // Signal end-of-stream so the tokenizer can perform its end-of-stream work.
    tk.end( );
  } finally {
    // Release the underlying reader; without this the tokenizer leaks its input.
    tk.close( );
  }

  return tokens;
}

Source File: QueryAutoFilteringComponent.java From query-autofiltering-component with Apache License 2.0

6 votes

/**
 * Tokenizes {@code input} with the tokenizer returned by {@code getTokenizerImpl(input)}
 * and collects a private copy of each term's characters.
 *
 * <p>After the last token the stream is ended and then closed in a {@code finally}
 * block, so the tokenizer's input is released even if an exception is thrown.
 *
 * @param input the text to tokenize (presumably attached to the tokenizer inside
 *              {@code getTokenizerImpl} — this method does not call {@code setReader})
 * @return the token character arrays in the order the tokenizer produced them
 * @throws IOException if resetting, advancing, ending or closing the tokenizer fails
 */
private ArrayList<char[]> tokenize( String input ) throws IOException {

    Log.debug( "tokenize '" + input + "'" );
    ArrayList<char[]> tokens = new ArrayList<char[]>( );
    Tokenizer tk = getTokenizerImpl( input );

    try {
      CharTermAttribute term = tk.addAttribute( CharTermAttribute.class );
      tk.reset( );
      while (tk.incrementToken( ) ) {
        // Copy only the valid prefix of the shared term buffer.
        int bufLen = term.length();
        char[] copy = new char[ bufLen ];
        System.arraycopy(term.buffer( ), 0, copy, 0, bufLen );
        tokens.add( copy );
      }
      // Complete the consumer workflow before releasing the stream.
      tk.end( );
    } finally {
      // Always close, otherwise the tokenizer keeps holding its reader.
      tk.close( );
    }

    return tokens;
}

Source File: HanLPAnalyzerTest.java From hanlp-lucene-plugin with Apache License 2.0

6 votes

/**
 * Creates a tokenizer from a factory configured with traditional-Chinese mode and
 * normalization turned on, then prints offsets, position increment, term and type
 * for every token of a fixed sample sentence.
 */
public void testIssue() throws Exception
{
    Map<String, String> factoryArgs = new TreeMap<>();
    factoryArgs.put("enableTraditionalChineseMode", "true");
    factoryArgs.put("enableNormalization", "true");
    Tokenizer stream = new HanLPTokenizerFactory(factoryArgs).create();

    String sample = "會辦台星保證最低價的原因？";
    stream.setReader(new StringReader(sample));
    stream.reset();

    for (boolean more = stream.incrementToken(); more; more = stream.incrementToken())
    {
        // Term text
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        // Character offsets
        OffsetAttribute offsets = stream.getAttribute(OffsetAttribute.class);
        // Position increment
        PositionIncrementAttribute posInc = stream.getAttribute(PositionIncrementAttribute.class);
        // Part of speech
        TypeAttribute pos = stream.getAttribute(TypeAttribute.class);

        int begin = offsets.startOffset();
        int end = offsets.endOffset();
        int increment = posInc.getPositionIncrement();
        System.out.printf("[%d:%d %d] %s/%s\n", begin, end, increment, termAtt, pos.type());
    }
}

Source File: NlpSegmenterTestCase.java From jstarcraft-nlp with Apache License 2.0

6 votes

/**
 * Feeds a mixed Chinese/English sentence to the segmenter and checks, for every
 * token, that the lower-cased term equals the lower-cased slice of the input
 * delimited by the token's start and end offsets.
 */
@Test
public void testSegmenter() throws Exception {
    String text = "中华人民共和国(People's Republic of China),简称'中国'";
    Tokenizer segmenter = getSegmenter();
    segmenter.setReader(new StringReader(text));
    segmenter.reset();

    while (true) {
        if (!segmenter.incrementToken()) {
            break;
        }
        // Term text
        CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
        // Character offsets
        OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);
        // Position increment (looked up but not checked here)
        PositionIncrementAttribute position = segmenter.getAttribute(PositionIncrementAttribute.class);
        // Part of speech (looked up but not checked here)
        TypeAttribute type = segmenter.getAttribute(TypeAttribute.class);

        int begin = offset.startOffset();
        int end = offset.endOffset();
        LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, begin, end));

        String actual = term.toString().toLowerCase();
        String slice = text.substring(begin, end).toLowerCase();
        Assert.assertEquals(actual, slice);
    }
}

Source File: MeCabKoStandardTokenizerTest.java From mecab-ko-lucene-analyzer with Apache License 2.0

6 votes

/**
 * Tokenizes two short Korean sentences with the same tokenizer instance and compares
 * the string rendering produced by {@code tokenizerToString} with the expected output.
 */
@Test
public void testShortSentence() throws Exception {
  final String expectedFirst =
      "꽃:N:NNG:null:1:1:0:1,배달:N:NNG:null:1:1:1:3,"
      + "꽃:N:NNG:null:1:1:4:5,꽃망울:COMPOUND:Compound:null:0:2:4:7,"
      + "망울:N:NNG:null:1:1:5:7,오토바이:N:NNG:null:1:1:8:12,";
  final String expectedSecond =
      "소설:N:NNG:null:1:1:0:2,무궁:N:NNG:null:1:1:3:5,"
      + "무궁화:COMPOUND:Compound:null:0:2:3:6,화:N:NNG:null:1:1:5:6,"
      + "꽃이:EOJEOL:NNG+JKS:null:1:1:6:8,꽃:N:NNG:null:0:1:6:7,"
      + "피었습니다:EOJEOL:VV+EP+EF:null:1:1:9:14,";

  // First sentence: supplied when the tokenizer is created.
  Tokenizer stream = createTokenizer(new StringReader("꽃배달 꽃망울 오토바이"), 2);
  assertEquals(expectedFirst, tokenizerToString(stream));

  // Second sentence: same instance, reset and given a new reader.
  stream.reset();
  stream.setReader(new StringReader("소설 무궁화꽃이 피었습니다."));
  assertEquals(expectedSecond, tokenizerToString(stream));
  stream.close();
}

Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Uses a pattern of 100 'a' characters followed by 'b' against an input of 200 'a'
 * characters and expects the tokenizer to produce no token at all.
 */
public void testBigLookahead() throws Exception {
  // Pattern text: one hundred 'a' characters and then a single 'b'.
  StringBuilder patternText = new StringBuilder();
  int appended = 0;
  while (appended < 100) {
    patternText.append('a');
    appended++;
  }
  patternText.append('b');
  Tokenizer tokenizer = new SimplePatternTokenizer(patternText.toString());

  // Input text: two hundred 'a' characters, with no 'b' anywhere.
  StringBuilder inputText = new StringBuilder();
  appended = 0;
  while (appended < 200) {
    inputText.append('a');
    appended++;
  }

  tokenizer.setReader(new StringReader(inputText.toString()));
  tokenizer.reset();
  assertFalse(tokenizer.incrementToken());
}

Source File: TestOpenNLPTokenizerFactory.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Exercises repeated reset/close/setReader cycles on one tokenizer created by
 * {@code OpenNLPTokenizerFactory} and checks, after each of two cycles, that the
 * stream contents for {@code SENTENCES} match {@code SENTENCES_punc}.
 *
 * <p>The order of the calls below is the subject of the test; do not reorder them.
 */
@Test
public void testClose() throws IOException {
  // Factory arguments: names of the sentence and tokenizer model resources.
  Map<String,String> args = new HashMap<String,String>() {{ put("sentenceModel", "en-test-sent.bin");
                                                            put("tokenizerModel", "en-test-tokenizer.bin"); }};
  OpenNLPTokenizerFactory factory = new OpenNLPTokenizerFactory(args);
  // Presumably loads the model files from the test classpath — confirm against the factory.
  factory.inform(new ClasspathResourceLoader(getClass()));

  Tokenizer ts = factory.create(newAttributeFactory());
  ts.setReader(new StringReader(SENTENCES));

  // First cycle: reset and close immediately, without consuming any token.
  ts.reset();
  ts.close();
  // Second cycle: reuse the closed tokenizer with a fresh reader.
  // NOTE(review): reset() is called before setReader() here; assertTokenStreamContents
  // presumably resets the stream again itself — confirm.
  ts.reset();
  ts.setReader(new StringReader(SENTENCES));
  assertTokenStreamContents(ts, SENTENCES_punc);
  // Third cycle: close and reuse once more; the same contents are expected.
  ts.close();
  ts.reset();
  ts.setReader(new StringReader(SENTENCES));
  assertTokenStreamContents(ts, SENTENCES_punc);
}

Source File: HanLpQueryAnalyzerTestCase.java From jstarcraft-nlp with Apache License 2.0

6 votes

/**
 * Builds a tokenizer from a factory with traditional-Chinese mode and normalization
 * enabled and prints, for each token of a fixed sentence, its offsets, position
 * increment, term and type.
 */
@Test
public void testIssue() throws Exception {
    Map<String, String> factoryArgs = new TreeMap<>();
    factoryArgs.put("enableTraditionalChineseMode", "true");
    factoryArgs.put("enableNormalization", "true");
    Tokenizer stream = new HanLpTokenizerFactory(factoryArgs).create();

    String sample = "會辦台星保證最低價的原因？";
    stream.setReader(new StringReader(sample));
    stream.reset();

    for (boolean more = stream.incrementToken(); more; more = stream.incrementToken()) {
        // Term text
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        // Character offsets
        OffsetAttribute offsets = stream.getAttribute(OffsetAttribute.class);
        // Position increment
        PositionIncrementAttribute posInc = stream.getAttribute(PositionIncrementAttribute.class);
        // Part of speech
        TypeAttribute pos = stream.getAttribute(TypeAttribute.class);

        int begin = offsets.startOffset();
        int end = offsets.endOffset();
        int increment = posInc.getPositionIncrement();
        System.out.printf("[%d:%d %d] %s/%s\n", begin, end, increment, termAtt, pos.type());
    }
}

Source File: HanLpTokenizerFactoryTestCase.java From jstarcraft-nlp with Apache License 2.0

6 votes

/**
 * Creates a tokenizer through the {@code TokenizerFactory} interface with
 * traditional-Chinese mode enabled and prints offsets, position increment, term and
 * type for every token of a long sample paragraph.
 */
@Test
public void testCreate() throws Exception {
    Map<String, String> factoryArgs = new TreeMap<>();
    factoryArgs.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLpTokenizerFactory(factoryArgs);
    Tokenizer stream = factory.create(null);

    String paragraph = "大衛貝克漢不僅僅是名著名球員，球場以外，其妻為前"
            + "辣妹合唱團成員維多利亞·碧咸，亦由於他擁有"
            + "突出外表、百變髮型及正面的形象，以至自己"
            + "品牌的男士香水等商品，及長期擔任運動品牌"
            + "Adidas的代言人，因此對大眾傳播媒介和時尚界"
            + "等方面都具很大的影響力，在足球圈外所獲得的"
            + "認受程度可謂前所未見。";
    stream.setReader(new StringReader(paragraph));
    stream.reset();

    for (boolean more = stream.incrementToken(); more; more = stream.incrementToken()) {
        // Term text
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        // Character offsets
        OffsetAttribute offsets = stream.getAttribute(OffsetAttribute.class);
        // Position increment
        PositionIncrementAttribute posInc = stream.getAttribute(PositionIncrementAttribute.class);
        // Part of speech
        TypeAttribute pos = stream.getAttribute(TypeAttribute.class);

        int begin = offsets.startOffset();
        int end = offsets.endOffset();
        int increment = posInc.getPositionIncrement();
        System.out.printf("[%d:%d %d] %s/%s\n", begin, end, increment, termAtt, pos.type());
    }
}

Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * With the pattern {@code a*} and the input {@code "bbab"}, expects exactly one
 * token, {@code "a"}.
 */
public void testEmptyStringPatternOneMatch() throws Exception {
  final String input = "bbab";
  Tokenizer tokenizer = new SimplePatternTokenizer("a*");
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);

  tokenizer.setReader(new StringReader(input));
  tokenizer.reset();

  boolean first = tokenizer.incrementToken();
  assertTrue(first);
  assertEquals("a", term.toString());

  boolean second = tokenizer.incrementToken();
  assertFalse(second);
}

Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * With the pattern {@code a+} and the input {@code "aaabbb"}, expects the single
 * token {@code "aaa"} and, after {@code end()}, an end offset of 6.
 */
public void testEndOffset() throws Exception {
  final String input = "aaabbb";
  Tokenizer tokenizer = new SimplePatternTokenizer("a+");
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsets = tokenizer.getAttribute(OffsetAttribute.class);

  tokenizer.setReader(new StringReader(input));
  tokenizer.reset();

  boolean first = tokenizer.incrementToken();
  assertTrue(first);
  assertEquals("aaa", term.toString());

  boolean second = tokenizer.incrementToken();
  assertFalse(second);

  // The end offset is only checked after the stream has been ended.
  tokenizer.end();
  assertEquals(6, offsets.endOffset());
}

Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * With the pattern {@code .*}, expects a random non-empty Unicode string to come
 * back as the first token unchanged.
 */
public void testOneToken() throws Exception {
  Tokenizer tokenizer = new SimplePatternTokenizer(".*");
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);

  // Keep drawing random strings until a non-empty one turns up.
  String input;
  do {
    input = TestUtil.randomUnicodeString(random());
  } while (input.length() <= 0);

  tokenizer.setReader(new StringReader(input));
  tokenizer.reset();
  assertTrue(tokenizer.incrementToken());
  assertEquals(input, term.toString());
}

Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * With the split pattern {@code .*}, expects a random non-empty Unicode string to
 * yield no tokens.
 */
public void testNoTokens() throws Exception {
  Tokenizer tokenizer = new SimplePatternSplitTokenizer(".*");
  // Looked up as in the sibling tests; its value is not inspected here.
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);

  // Keep drawing random strings until a non-empty one turns up.
  String input;
  do {
    input = TestUtil.randomUnicodeString(random());
  } while (input.length() <= 0);

  tokenizer.setReader(new StringReader(input));
  tokenizer.reset();
  assertFalse(tokenizer.incrementToken());
}

Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * With the split pattern {@code a*} and the input {@code "bbb"}, expects the whole
 * input back as the only token.
 */
public void testEmptyStringPatternNoMatch() throws Exception {
  final String input = "bbb";
  Tokenizer tokenizer = new SimplePatternSplitTokenizer("a*");
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);

  tokenizer.setReader(new StringReader(input));
  tokenizer.reset();

  boolean first = tokenizer.incrementToken();
  assertTrue(first);
  assertEquals("bbb", term.toString());

  boolean second = tokenizer.incrementToken();
  assertFalse(second);
}

Source File: TestSimplePatternSplitTokenizer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * With the split pattern {@code a+} and the input {@code "aaabbb"}, expects the
 * single token {@code "bbb"} and, after {@code end()}, an end offset of 6.
 */
public void testEndOffset() throws Exception {
  final String input = "aaabbb";
  Tokenizer tokenizer = new SimplePatternSplitTokenizer("a+");
  CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsets = tokenizer.getAttribute(OffsetAttribute.class);

  tokenizer.setReader(new StringReader(input));
  tokenizer.reset();

  boolean first = tokenizer.incrementToken();
  assertTrue(first);
  assertEquals("bbb", term.toString());

  boolean second = tokenizer.incrementToken();
  assertFalse(second);

  // The end offset is only checked after the stream has been ended.
  tokenizer.end();
  assertEquals(6, offsets.endOffset());
}

Source File: HanLPTokenizerFactoryTest.java From hanlp-lucene-plugin with Apache License 2.0

5 votes

/**
 * Creates a tokenizer through the {@code TokenizerFactory} interface with
 * traditional-Chinese mode enabled and prints offsets, position increment, term and
 * type for every token of a long sample paragraph.
 */
public void testCreate() throws Exception
{
    Map<String, String> factoryArgs = new TreeMap<>();
    factoryArgs.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLPTokenizerFactory(factoryArgs);
    Tokenizer stream = factory.create(null);

    String paragraph = "大衛貝克漢不僅僅是名著名球員，球場以外，其妻為前"
            + "辣妹合唱團成員維多利亞·碧咸，亦由於他擁有"
            + "突出外表、百變髮型及正面的形象，以至自己"
            + "品牌的男士香水等商品，及長期擔任運動品牌"
            + "Adidas的代言人，因此對大眾傳播媒介和時尚界"
            + "等方面都具很大的影響力，在足球圈外所獲得的"
            + "認受程度可謂前所未見。";
    stream.setReader(new StringReader(paragraph));
    stream.reset();

    for (boolean more = stream.incrementToken(); more; more = stream.incrementToken())
    {
        // Term text
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        // Character offsets
        OffsetAttribute offsets = stream.getAttribute(OffsetAttribute.class);
        // Position increment
        PositionIncrementAttribute posInc = stream.getAttribute(PositionIncrementAttribute.class);
        // Part of speech
        TypeAttribute pos = stream.getAttribute(TypeAttribute.class);

        int begin = offsets.startOffset();
        int end = offsets.endOffset();
        int increment = posInc.getPositionIncrement();
        System.out.printf("[%d:%d %d] %s/%s\n", begin, end, increment, termAtt, pos.type());
    }
}

Source File: URLTokenizer.java From elasticsearch-analysis-url with Apache License 2.0

5 votes

/**
 * Resets the given {@link Tokenizer}, drains it, and turns every emitted term into a
 * {@link Token} whose offsets are shifted by {@code start}.
 * @param part the url part recorded on each created {@link Token}
 * @param tokenizer the tokenizer to reset and read tokens from
 * @param start value added to both the start and end offset of each token
 * @return the created tokens, in the order the tokenizer produced them
 * @throws IOException if resetting or advancing the tokenizer fails
 */
private List<Token> tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
    final List<Token> result = new ArrayList<>();
    tokenizer.reset();
    for (boolean more = tokenizer.incrementToken(); more; more = tokenizer.incrementToken()) {
        final String text = tokenizer.getAttribute(CharTermAttribute.class).toString();
        final OffsetAttribute span = tokenizer.getAttribute(OffsetAttribute.class);
        // Shift the tokenizer-relative offsets by the caller-supplied base.
        final int begin = start + span.startOffset();
        final int finish = start + span.endOffset();
        result.add(new Token(text, part, begin, finish));
    }
    return result;
}

Source File: TestSimplePatternTokenizer.java From lucene-solr with Apache License 2.0

4 votes

/**
 * With the pattern {@code a*} and the input {@code "bbb"}, expects no tokens.
 */
public void testEmptyStringPatternNoMatch() throws Exception {
  final String input = "bbb";
  Tokenizer tokenizer = new SimplePatternTokenizer("a*");
  tokenizer.setReader(new StringReader(input));
  tokenizer.reset();

  boolean produced = tokenizer.incrementToken();
  assertFalse(produced);
}

Source File: NGramTokenizerTest.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Checks an {@code NGramTokenizer} against a brute-force enumeration of the grams
 * expected for {@code s}.
 *
 * <p>For every (start, end) range of code points that the loops below accept, the
 * tokenizer must emit one token whose code points, position increment, position
 * length and offsets match; once the enumeration is finished the tokenizer must be
 * exhausted.
 *
 * @param minGram       minimum gram length, in code points
 * @param maxGram       maximum gram length, in code points
 * @param s             the text to tokenize
 * @param nonTokenChars characters the tokenizer under test treats as non-token characters
 * @param edgesOnly     passed to the tokenizer; when true the enumeration also skips
 *                      ranges whose preceding code point is a token character
 * @throws IOException if the tokenizer fails
 */
static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException {
  // convert the string to code points
  final int[] codePoints = toCodePoints(s);
  // offsets[i] is the char (UTF-16) offset at which code point i starts;
  // offsets[codePoints.length] is the total char count of all code points.
  final int[] offsets = new int[codePoints.length + 1];
  for (int i = 0; i < codePoints.length; ++i) {
    offsets[i+1] = offsets[i] + Character.charCount(codePoints[i]);
  }
  // Tokenizer under test: a character is a token char iff it does not occur in nonTokenChars.
  final Tokenizer grams = new NGramTokenizer(minGram, maxGram, edgesOnly) {
    @Override
    protected boolean isTokenChar(int chr) {
      return nonTokenChars.indexOf(chr) < 0;
    }
  };
  grams.setReader(new StringReader(s));
  final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
  final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
  final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
  grams.reset();
  // Enumerate candidate grams by start position, then by increasing end position;
  // the tokenizer is expected to emit the accepted ones in this same order.
  for (int start = 0; start < codePoints.length; ++start) {
    nextGram:
    for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
      // isTokenChar(String, int) is a helper defined outside this block; presumably it
      // mirrors the override above — confirm.
      if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
        // not on an edge
        continue nextGram;
      }
      // Skip any candidate range that contains a non-token character.
      for (int j = start; j < end; ++j) {
        if (!isTokenChar(nonTokenChars, codePoints[j])) {
          continue nextGram;
        }
      }
      // This candidate is expected: the next emitted token must match it exactly.
      assertTrue(grams.incrementToken());
      assertArrayEquals(ArrayUtil.copyOfSubArray(codePoints, start, end), toCodePoints(termAtt));
      assertEquals(1, posIncAtt.getPositionIncrement());
      assertEquals(1, posLenAtt.getPositionLength());
      assertEquals(offsets[start], offsetAtt.startOffset());
      assertEquals(offsets[end], offsetAtt.endOffset());
    }
  }
  // No further tokens; after end() both offsets must equal the input length in chars.
  assertFalse(grams.incrementToken());
  grams.end();
  assertEquals(s.length(), offsetAtt.startOffset());
  assertEquals(s.length(), offsetAtt.endOffset());
}

Java Code Examples for org.apache.lucene.analysis.Tokenizer#reset()