org.apache.lucene.analysis.tokenattributes.TypeAttribute Java Examples
The following examples show how to use
org.apache.lucene.analysis.tokenattributes.TypeAttribute.
Each example is taken from an open-source project; the originating project, source file, and license are noted above each snippet.
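Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the pattern most of them share: request the TypeAttribute from a TokenStream before consuming it, then read type() for each token while iterating. The analyzer choice and the field name "field" are illustrative assumptions; StandardTokenizer reports types such as "<ALPHANUM>" and "<NUM>".

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class TypeAttributeDemo {
    public static void main(String[] args) throws IOException {
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("field", "Lucene 9 rocks")) {
            // Attributes must be requested before the stream is consumed.
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // StandardTokenizer assigns token types such as "<ALPHANUM>" and "<NUM>".
                System.out.println(termAtt + " -> " + typeAtt.type());
            }
            stream.end();
        }
    }
}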
Example #1
Source File: TestNumericTokenStream.java From lucene-solr with Apache License 2.0
public void testIntStream() throws Exception {
    @SuppressWarnings("resource")
    final LegacyNumericTokenStream stream = new LegacyNumericTokenStream().setIntValue(ivalue);
    final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    assertNotNull(bytesAtt);
    final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
    assertNotNull(typeAtt);
    final LegacyNumericTokenStream.LegacyNumericTermAttribute numericAtt = stream.getAttribute(LegacyNumericTokenStream.LegacyNumericTermAttribute.class);
    assertNotNull(numericAtt);
    stream.reset();
    assertEquals(32, numericAtt.getValueSize());
    for (int shift = 0; shift < 32; shift += LegacyNumericUtils.PRECISION_STEP_DEFAULT) {
        assertTrue("New token is available", stream.incrementToken());
        assertEquals("Shift value wrong", shift, numericAtt.getShift());
        assertEquals("Term is incorrectly encoded", ivalue & ~((1 << shift) - 1), LegacyNumericUtils.prefixCodedToInt(bytesAtt.getBytesRef()));
        assertEquals("Term raw value is incorrectly encoded", ((long) ivalue) & ~((1L << shift) - 1L), numericAtt.getRawValue());
        assertEquals("Type incorrect", (shift == 0) ? LegacyNumericTokenStream.TOKEN_TYPE_FULL_PREC : LegacyNumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
    }
    assertFalse("More tokens available", stream.incrementToken());
    stream.end();
    stream.close();
}
Example #2
Source File: HanLPAnalyzerTest.java From hanlp-lucene-plugin with Apache License 2.0
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";
    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #3
Source File: HanLPAnalyzerTest.java From hanlp-lucene-plugin with Apache License 2.0
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #4
Source File: PathTokenFilterTest.java From SearchServices with GNU Lesser General Public License v3.0
public void testAttributesAfterStreamEnd() throws IOException {
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
    ts.setReader(reader);

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);

    // PathTokenFilter.end() will be called after all tokens consumed.
    tokenise(ts, new String[]{"uri1", "one"});

    // Check attributes cleaned up
    assertEquals("", termAtt.toString());
    assertEquals("word", typeAtt.type()); // the default
    assertEquals(0, posIncAtt.getPositionIncrement());
    // Final offset...
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());
}
Example #5
Source File: HanLPTokenizerTest.java From hanlp-lucene-plugin with Apache License 2.0
public void testPinyinTokenFilter() throws Exception {
    Map<String, String> args = new HashMap<>();
    args.put("original", "true");
    args.put("pinyin", "false");
    args.put("pinyinFirstChar", "true");
    HanLPPinyinTokenFilterFactory factory = new HanLPPinyinTokenFilterFactory(args);
    TokenStream tokenStream = factory.create(tokenizer);
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #6
Source File: HanLPIndexAnalyzerTest.java From hanlp-lucene-plugin with Apache License 2.0
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPIndexAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #7
Source File: URLTokenFilter.java From elasticsearch-analysis-url with Apache License 2.0
/**
 * Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
 * will be passed along to the tokenizer.
 * @param input a string to be tokenized
 * @return a list of tokens extracted from the input string
 * @throws IOException
 */
private List<Token> tokenize(String input) throws IOException {
    List<Token> tokens = new ArrayList<>();
    URLTokenizer tokenizer = new URLTokenizer();
    // create a copy of the parts list to avoid ConcurrentModificationException when sorting
    tokenizer.setParts(new ArrayList<>(parts));
    tokenizer.setUrlDecode(urlDeocde);
    tokenizer.setTokenizeHost(tokenizeHost);
    tokenizer.setTokenizePath(tokenizePath);
    tokenizer.setTokenizeQuery(tokenizeQuery);
    tokenizer.setAllowMalformed(allowMalformed || passthrough);
    tokenizer.setTokenizeMalformed(tokenizeMalformed);
    tokenizer.setReader(new StringReader(input));
    tokenizer.reset();

    String term;
    URLPart part;
    OffsetAttribute offset;
    while (tokenizer.incrementToken()) {
        term = tokenizer.getAttribute(CharTermAttribute.class).toString();
        part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
        offset = tokenizer.getAttribute(OffsetAttribute.class);
        tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
    }
    return tokens;
}
Example #8
Source File: ShingleFilterTest.java From lucene-solr with Apache License 2.0
public void testReset() throws Exception {
    Tokenizer wsTokenizer = new WhitespaceTokenizer();
    wsTokenizer.setReader(new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
    assertTokenStreamContents(filter,
        new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
        new int[]{0,0,7,7,14,14,19},
        new int[]{6,13,13,18,18,27,27},
        new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
        new int[]{1,0,1,0,1,0,1}
    );
    wsTokenizer.setReader(new StringReader("please divide this sentence"));
    assertTokenStreamContents(filter,
        new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
        new int[]{0,0,7,7,14,14,19},
        new int[]{6,13,13,18,18,27,27},
        new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
        new int[]{1,0,1,0,1,0,1}
    );
}
Example #9
Source File: TestSnowball.java From lucene-solr with Apache License 2.0
public void testFilterTokens() throws Exception {
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
    FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);

    filter.incrementToken();

    assertEquals("accent", termAtt.toString());
    assertEquals(2, offsetAtt.startOffset());
    assertEquals(7, offsetAtt.endOffset());
    assertEquals("wrd", typeAtt.type());
    assertEquals(3, posIncAtt.getPositionIncrement());
    assertEquals(77, flagsAtt.getFlags());
    assertEquals(new BytesRef(new byte[]{0,1,2,3}), payloadAtt.getPayload());
}
Example #10
Source File: AnalysisRequestHandlerBase.java From lucene-solr with Apache License 2.0
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
    final List<AttributeSource> tokens = new ArrayList<>();
    final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
    // for backwards compatibility, add all "common" attributes
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(TypeAttribute.class);
    try {
        tokenStream.reset();
        int position = 0;
        while (tokenStream.incrementToken()) {
            position += posIncrAtt.getPositionIncrement();
            trackerAtt.setActPosition(position);
            tokens.add(tokenStream.cloneAttributes());
        }
        tokenStream.end(); // TODO should we capture?
    } catch (IOException ioe) {
        throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
    } finally {
        IOUtils.closeWhileHandlingException(tokenStream);
    }
    return tokens;
}
Example #11
Source File: SpellingQueryConverter.java From lucene-solr with Apache License 2.0
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); // overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}
Example #12
Source File: TokenStreamAssertions.java From elasticsearch-analysis-openkoreantext with Apache License 2.0
public static void assertTokenStream(TokenStream tokenStream, String[] expectedCharTerms, String[] expectedTypes, int[] expectedStartOffsets, int[] expectedEndOffsets) throws IOException {
    tokenStream.reset();
    int index = 0;
    while (tokenStream.incrementToken() == true) {
        assertEquals(expectedCharTerms[index], tokenStream.getAttribute(CharTermAttribute.class).toString());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[index], tokenStream.getAttribute(TypeAttribute.class).type());
        }
        OffsetAttribute offsets = tokenStream.getAttribute(OffsetAttribute.class);
        if (expectedStartOffsets != null) {
            assertEquals(expectedStartOffsets[index], offsets.startOffset());
        }
        if (expectedEndOffsets != null) {
            assertEquals(expectedEndOffsets[index], offsets.endOffset());
        }
        index++;
    }
    tokenStream.end();
}
Example #13
Source File: TransportExtendedAnalyzeAction.java From elasticsearch-extended-analyze with Apache License 2.0
private List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> processAnalysis(TokenStream stream, Set<String> includeAttributes, boolean shortAttrName, int lastPosition, int lastOffset) throws IOException {
    List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> tokens = new ArrayList<>();
    stream.reset();

    // and each token's output
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            lastPosition = lastPosition + increment;
        }
        tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
                lastOffset + offset.endOffset(), type.type(), extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
    }
    stream.end();
    return tokens;
}
Example #14
Source File: HanLpTokenizerTestCase.java From jstarcraft-nlp with Apache License 2.0
@Test
public void testPinyinTokenFilter() throws Exception {
    Map<String, String> args = new HashMap<>();
    args.put("original", "true");
    args.put("pinyin", "false");
    args.put("pinyinFirstChar", "true");
    HanLpPinyinTokenFilterFactory factory = new HanLpPinyinTokenFilterFactory(args);
    TokenStream tokenStream = factory.create(tokenizer);
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #15
Source File: TestNumericTokenStream.java From lucene-solr with Apache License 2.0
public void testLongStream() throws Exception {
    @SuppressWarnings("resource")
    final LegacyNumericTokenStream stream = new LegacyNumericTokenStream().setLongValue(lvalue);
    final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    assertNotNull(bytesAtt);
    final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
    assertNotNull(typeAtt);
    final LegacyNumericTokenStream.LegacyNumericTermAttribute numericAtt = stream.getAttribute(LegacyNumericTokenStream.LegacyNumericTermAttribute.class);
    assertNotNull(numericAtt);
    stream.reset();
    assertEquals(64, numericAtt.getValueSize());
    for (int shift = 0; shift < 64; shift += LegacyNumericUtils.PRECISION_STEP_DEFAULT) {
        assertTrue("New token is available", stream.incrementToken());
        assertEquals("Shift value wrong", shift, numericAtt.getShift());
        assertEquals("Term is incorrectly encoded", lvalue & ~((1L << shift) - 1L), LegacyNumericUtils.prefixCodedToLong(bytesAtt.getBytesRef()));
        assertEquals("Term raw value is incorrectly encoded", lvalue & ~((1L << shift) - 1L), numericAtt.getRawValue());
        assertEquals("Type incorrect", (shift == 0) ? LegacyNumericTokenStream.TOKEN_TYPE_FULL_PREC : LegacyNumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
    }
    assertFalse("More tokens available", stream.incrementToken());
    stream.end();
    stream.close();
}
Example #16
Source File: HanLpQueryAnalyzerTestCase.java From jstarcraft-nlp with Apache License 2.0
@Test
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLpTokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";
    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #17
Source File: HanLpQueryAnalyzerTestCase.java From jstarcraft-nlp with Apache License 2.0
@Test
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    try (Analyzer analyzer = new HanLpQueryAnalyzer("viterbi")) {
        TokenStream tokenStream = analyzer.tokenStream("field", text);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
            // offset
            OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
            // position increment
            PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
            // part of speech (token type)
            TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
    }
}
Example #18
Source File: HanLpTokenizerFactoryTestCase.java From jstarcraft-nlp with Apache License 2.0
@Test
public void testCreate() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);

    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前"
            + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有"
            + "突出外表、百變髮型及正面的形象,以至自己"
            + "品牌的男士香水等商品,及長期擔任運動品牌"
            + "Adidas的代言人,因此對大眾傳播媒介和時尚界"
            + "等方面都具很大的影響力,在足球圈外所獲得的"
            + "認受程度可謂前所未見。"));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #19
Source File: HanLpIndexAnalyzerTestCase.java From jstarcraft-nlp with Apache License 2.0
@Test
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    try (Analyzer analyzer = new HanLpIndexAnalyzer("viterbi")) {
        TokenStream tokenStream = analyzer.tokenStream("field", text);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
            // offset
            OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
            // position increment
            PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
            // part of speech (token type)
            TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
    }
}
Example #20
Source File: NlpSegmenterTestCase.java From jstarcraft-nlp with Apache License 2.0
@Test
public void testSegmenter() throws Exception {
    Tokenizer segmenter = getSegmenter();
    String text = "中华人民共和国(People's Republic of China),简称'中国'";
    segmenter.setReader(new StringReader(text));
    segmenter.reset();
    while (segmenter.incrementToken()) {
        // term
        CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute position = segmenter.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute type = segmenter.getAttribute(TypeAttribute.class);
        LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset()));
        Assert.assertEquals(term.toString().toLowerCase(), text.substring(offset.startOffset(), offset.endOffset()).toLowerCase());
    }
}
Example #21
Source File: LuceneUtil.java From antsdb with GNU Lesser General Public License v3.0
static void tokenize(String text, BiConsumer<String, String> lambda) {
    try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
        TokenStream stream = analyzer.tokenStream("", text);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        TypeAttribute type = stream.getAttribute(TypeAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            lambda.accept(type.type(), term.toString());
        }
    } catch (IOException x) {
        throw new RuntimeException(x);
    }
}
Example #22
Source File: BosonNLPTokenizer.java From elasticsearch-analysis-bosonnlp with Apache License 2.0
/**
 * Lucene constructor
 *
 * @throws UnirestException
 * @throws JSONException
 * @throws IOException
 */
public BosonNLPTokenizer(String URL, String BAT, int spaceMode, int oovLevel, int t2s, int specialCharConv)
        throws IOException, JSONException, UnirestException {
    super();
    // Add token offset attribute
    offsetAttr = addAttribute(OffsetAttribute.class);
    // Add token content attribute
    charTermAttr = addAttribute(CharTermAttribute.class);
    // Add token type attribute
    typeAttr = addAttribute(TypeAttribute.class);
    // Add token position attribute
    piAttr = addAttribute(PositionIncrementAttribute.class);
    // Create a new word segmenter to get tokens
    BosonSeg = new BosonNLPWordSegmenter(input, URL, BAT, spaceMode, oovLevel, t2s, specialCharConv);
}
Example #23
Source File: SimpleQueryConverter.java From lucene-solr with Apache License 2.0
@Override
public Collection<Token> convert(String origQuery) {
    Collection<Token> result = new HashSet<>();
    try (WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
         TokenStream ts = analyzer.tokenStream("", origQuery)) {
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            Token tok = new Token();
            tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            tok.setFlags(flagsAtt.getFlags());
            tok.setPayload(payloadAtt.getPayload());
            tok.setPositionIncrement(posIncAtt.getPositionIncrement());
            tok.setType(typeAtt.type());
            result.add(tok);
        }
        ts.end();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Example #24
Source File: HanLPTokenizerFactoryTest.java From hanlp-lucene-plugin with Apache License 2.0
public void testCreate() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);

    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前"
            + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有"
            + "突出外表、百變髮型及正面的形象,以至自己"
            + "品牌的男士香水等商品,及長期擔任運動品牌"
            + "Adidas的代言人,因此對大眾傳播媒介和時尚界"
            + "等方面都具很大的影響力,在足球圈外所獲得的"
            + "認受程度可謂前所未見。"));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #25
Source File: TokenizerRunTest.java From KOMORAN with Apache License 2.0
@Test
public void analyzeTest() throws Exception {
    String testSource = "저는 이번에 바람과 함께 사라지다를 봤어요";
    KomoranTokenizer tokenStream = new KomoranTokenizer(new Komoran(DEFAULT_MODEL.STABLE));
    tokenStream.setReader(new StringReader(testSource));
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        System.out.println(tokenStream.getAttribute(CharTermAttribute.class));
        System.out.println(tokenStream.getAttribute(TypeAttribute.class).type());
        System.out.println(tokenStream.getAttribute(OffsetAttribute.class).startOffset() + " : " + tokenStream.getAttribute(OffsetAttribute.class).endOffset());
    }
    tokenStream.end();
    tokenStream.close();
}
Example #26
Source File: HanLPTokenizerTest.java From hanlp-lucene-plugin with Apache License 2.0
public void testIncrementToken() throws Exception {
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offset
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech (token type)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #27
Source File: AnsjTokenizer.java From ansj4solr with Apache License 2.0
public AnsjTokenizer(Reader input, int analysisType, boolean removePunc) {
    super(input);
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(CharTermAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    positionIncrementAtt = addAttribute(PositionIncrementAttribute.class);
    this.analysisType = analysisType;
    this.removePunc = removePunc;
}
Example #28
Source File: IKTokenizer.java From es-ik with Apache License 2.0
public IKTokenizer(Reader in, DictionaryConfiguration configuration) {
    super(in);
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(CharTermAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    _IKImplement = new IKSegmenter(input, configuration);
}
Example #29
Source File: MMSegTokenizer.java From mmseg4j-solr with Apache License 2.0
public MMSegTokenizer(Seg seg) {
    this.seg = seg;
    termAtt = addAttribute(CharTermAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
}
Example #30
Source File: CutLetterDigitFilter.java From mmseg4j-solr with Apache License 2.0
public CutLetterDigitFilter(TokenStream input) {
    super(input);
    reusableToken = new PackedTokenAttributeImpl();
    termAtt = addAttribute(CharTermAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
}