org.apache.lucene.analysis.tokenattributes.OffsetAttribute Java Examples

Source File: FieldType.java From lucene-solr with Apache License 2.0

6 votes

@Override
public TokenStreamComponents createComponents(String fieldName) {
  Tokenizer ts = new Tokenizer() {
    final char[] cbuf = new char[maxChars];
    final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    final BytesTermAttribute bytesAtt = isPointField() ? addAttribute(BytesTermAttribute.class) : null;
    final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    @Override
    public boolean incrementToken() throws IOException {
      clearAttributes();
      int n = input.read(cbuf,0,maxChars);
      if (n<=0) return false;
      if (isPointField()) {
        BytesRef b = ((PointField)FieldType.this).toInternalByteRef(new String(cbuf, 0, n));
        bytesAtt.setBytesRef(b);
      } else {
        String s = toInternal(new String(cbuf, 0, n));
        termAtt.setEmpty().append(s);
      }
      offsetAtt.setOffset(correctOffset(0),correctOffset(n));
      return true;
    }
  };

  return new TokenStreamComponents(ts);
}

Source File: HanLpQueryAnalyzerTestCase.java From jstarcraft-nlp with Apache License 2.0

6 votes

@Test
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    try (Analyzer analyzer = new HanLpQueryAnalyzer("viterbi")) {
        TokenStream tokenStream = analyzer.tokenStream("field", text);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
            // 偏移量
            OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
            // 距离
            PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
            // 词性
            TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
    }
}

Source File: AnalysisRequestHandlerBase.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> tokens = new ArrayList<>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(position);
      tokens.add(tokenStream.cloneAttributes());
    }
    tokenStream.end(); // TODO should we capture?
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }

  return tokens;
}

Source File: Tagger.java From SolrTextTagger with Apache License 2.0

6 votes

public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
              TagClusterReducer tagClusterReducer, boolean skipAltTokens,
              boolean ignoreStopWords) throws IOException {
  this.terms = terms;
  this.liveDocs = liveDocs;
  this.tokenStream = tokenStream;
  this.skipAltTokens = skipAltTokens;
  this.ignoreStopWords = ignoreStopWords;
  byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
  posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
  taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
  tokenStream.reset();

  this.tagClusterReducer = tagClusterReducer;
}

Source File: HanLpTokenizerFactoryTestCase.java From jstarcraft-nlp with Apache License 2.0

6 votes

@Test
public void testCreate() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);

    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員，球場以外，其妻為前" + "辣妹合唱團成員維多利亞·碧咸，亦由於他擁有" + "突出外表、百變髮型及正面的形象，以至自己" + "品牌的男士香水等商品，及長期擔任運動品牌" + "Adidas的代言人，因此對大眾傳播媒介和時尚界" + "等方面都具很大的影響力，在足球圈外所獲得的" + "認受程度可謂前所未見。"));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // 偏移量
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // 距离
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // 词性
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}

Source File: URLTokenFilter.java From elasticsearch-analysis-url with Apache License 2.0

6 votes

/**
 * Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
 * will be passed along to the tokenizer.
 * @param input a string to be tokenized
 * @return a list of tokens extracted from the input string
 * @throws IOException
 */
private List<Token> tokenize(String input) throws IOException {
    List<Token> tokens = new ArrayList<>();
    URLTokenizer tokenizer = new URLTokenizer();
    // create a copy of the parts list to avoid ConcurrentModificationException when sorting
    tokenizer.setParts(new ArrayList<>(parts));
    tokenizer.setUrlDecode(urlDeocde);
    tokenizer.setTokenizeHost(tokenizeHost);
    tokenizer.setTokenizePath(tokenizePath);
    tokenizer.setTokenizeQuery(tokenizeQuery);
    tokenizer.setAllowMalformed(allowMalformed || passthrough);
    tokenizer.setTokenizeMalformed(tokenizeMalformed);
    tokenizer.setReader(new StringReader(input));
    tokenizer.reset();

    String term;
    URLPart part;
    OffsetAttribute offset;
    while (tokenizer.incrementToken()) {
        term = tokenizer.getAttribute(CharTermAttribute.class).toString();
        part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
        offset = tokenizer.getAttribute(OffsetAttribute.class);
        tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
    }
    return tokens;
}

Source File: SpellingQueryConverter.java From lucene-solr with Apache License 2.0

6 votes

protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
  TokenStream stream = analyzer.tokenStream("", text);
  // TODO: support custom attributes
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {      
    Token token = new Token();
    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    token.setOffset(offset + offsetAtt.startOffset(), 
                    offset + offsetAtt.endOffset());
    token.setFlags(flagsAttValue); //overwriting any flags already set...
    token.setType(typeAtt.type());
    token.setPayload(payloadAtt.getPayload());
    token.setPositionIncrement(posIncAtt.getPositionIncrement());
    result.add(token);
  }
  stream.end();
  stream.close();
}

Source File: NlpSegmenterTestCase.java From jstarcraft-nlp with Apache License 2.0

6 votes

@Test
public void testSegmenter() throws Exception {
    Tokenizer segmenter = getSegmenter();
    String text = "中华人民共和国(People's Republic of China),简称'中国'";
    segmenter.setReader(new StringReader(text));
    segmenter.reset();
    while (segmenter.incrementToken()) {
        // 词元
        CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
        // 偏移量
        OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);
        // 距离
        PositionIncrementAttribute position = segmenter.getAttribute(PositionIncrementAttribute.class);
        // 词性
        TypeAttribute type = segmenter.getAttribute(TypeAttribute.class);
        LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, offset.startOffset(), offset.endOffset()));
        Assert.assertEquals(term.toString().toLowerCase(), text.substring(offset.startOffset(), offset.endOffset()).toLowerCase());
    }
}

Source File: HanLpQueryAnalyzerTestCase.java From jstarcraft-nlp with Apache License 2.0

6 votes

@Test
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLpTokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因？";

    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // 偏移量
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // 距离
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // 词性
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}

Source File: LuceneToken.java From jstarcraft-nlp with Apache License 2.0

6 votes

public LuceneToken(TokenStream stream) {
    this.stream = stream;
    this.term = stream.getAttribute(CharTermAttribute.class);
    this.offset = stream.getAttribute(OffsetAttribute.class);
    try {
        this.flag = this.stream.incrementToken();
        if (!flag) {
            this.stream.close();
        }
    } catch (Exception exception) {
        try {
            this.stream.close();
        } catch (Exception throwable) {
        }
        throw new RuntimeException(exception);
    }
}

Source File: TestDuelingAnalyzers.java From lucene-solr with Apache License 2.0

6 votes

public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
  left.reset();
  right.reset();
  CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
  CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
  OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
  OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
  PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);
  
  while (left.incrementToken()) {
    assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
    assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
    assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
    assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
    assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
  };
  assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
  left.end();
  right.end();
  assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
  left.close();
  right.close();
}

Source File: DexterAnalyzer.java From dexter with Apache License 2.0

6 votes

public static void main(String[] args) throws IOException {
	String str = "<body>perchééééééééé";
	Analyzer anal = new DexterAnalyzer();
	TokenStream ts = anal.tokenStream("content", new StringReader(str));

	OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
	CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
	ts.reset();
	while (ts.incrementToken()) {
		System.out.println(termAtt.toString()
				.substring(0, termAtt.length()));
		System.out
				.println("token start offset: " + offsetAtt.startOffset());
		System.out.println("  token end offset: " + offsetAtt.endOffset());
	}
}

Source File: PlainHighlighter.java From Elasticsearch with Apache License 2.0

6 votes

private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName, String contents) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}

Source File: LuceneUtil.java From jasperreports with GNU Lesser General Public License v3.0

6 votes

protected String displayTokens(String text, String elementId) throws IOException {
	Analyzer analyzer = new LuceneSimpleAnalyzer(isCaseSensitive, removeAccents);;
	StringBuilder sb = new StringBuilder();
	sb.append(elementId).append(": ").append(text).append(": ");

	TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
	CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
	OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);

	tokenStream.reset();
	while (tokenStream.incrementToken()) {
		int startOffset = offsetAttribute.startOffset();
		int endOffset = offsetAttribute.endOffset();
		String term = charTermAttribute.toString();
		sb.append("[" + term + "](" + startOffset + "," + endOffset + ") ");
	}

	return sb.toString();
}

Source File: DemoTest.java From HongsCORE with MIT License

6 votes

public static void main(String[] args) throws IOException {
    Analyzer az = CustomAnalyzer.builder()
        //.withTokenizer("Standard")
        .withTokenizer("Name")
        .addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20")
        //.addTokenFilter("ICUTransform", "id", "Han-Latin;NFD;[[:NonspacingMark:][:Space:]] Remove")
        //.addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20")
        .build();

    StringReader      sr = new StringReader(args[0]);
    TokenStream       ts = az.tokenStream  ("" , sr);
    OffsetAttribute   oa = ts.addAttribute (OffsetAttribute.class);
    CharTermAttribute ta = ts.addAttribute (CharTermAttribute.class);

    try {
        ts.reset(); // Resets this stream to the beginning. (Required)
        while (ts.incrementToken()) {
            System.out.println(ta.toString() + "|" + ta.length()
                    + "[" + oa.startOffset() + "," + oa.endOffset() + "]");
        }
        ts.end(  ); // Perform end-of-stream operations, e.g. set the final offset.
    } finally {
        ts.close(); // Release resources associated with this stream.
    }

}

Source File: AutoPhrasingTokenFilter.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

6 votes

private void emit(char[] tokenChars) {
    char[] token = tokenChars;
    if (replaceWhitespaceWith != null) {
        token = replaceWhiteSpace(token);
    }
    CharTermAttribute termAttr = getTermAttribute();
    if (termAttr != null) {
        termAttr.setEmpty();
        termAttr.append(new StringBuilder().append(token));
    }
    OffsetAttribute offAttr = getOffsetAttribute();
    if (offAttr != null && offAttr.endOffset() >= token.length) {
        int start = offAttr.endOffset() - token.length;
        offAttr.setOffset(start, offAttr.endOffset());
    }
    PositionIncrementAttribute pia = getPositionIncrementAttribute();
    if (pia != null) {
        pia.setPositionIncrement(++positionIncr);
    }
    lastEmitted = token;
}

Source File: ConcatenatingTokenStream.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Emits the next token of the concatenation: takes it from the current source and moves on
 * to the following source whenever the current one has no more tokens.
 *
 * @return {@code true} if a token was copied into this stream, {@code false} once the last
 *         source has no more tokens
 * @throws IOException if a source stream fails while being advanced or ended
 */
@Override
public boolean incrementToken() throws IOException {
  boolean newSource = false;
  // Skip exhausted sources until one of them yields a token.
  while (sources[currentSource].incrementToken() == false) {
    // The last source is exhausted: the concatenated stream is finished.
    if (currentSource >= sources.length - 1)
      return false;
    // end() the finished source first, then read its position-increment and offset
    // attributes so they can be carried over to the following source's tokens.
    sources[currentSource].end();
    initialPositionIncrement = sourceIncrements[currentSource].getPositionIncrement();
    OffsetAttribute att = sourceOffsets[currentSource];
    // A source's offset attribute entry may be null; then it adds nothing to the shift.
    if (att != null)
      offsetIncrement += att.endOffset();
    currentSource++;
    newSource = true;
  }

  clearAttributes();
  // Copy the current source's attribute values into this stream's attributes.
  sources[currentSource].copyTo(this);
  // Shift the copied offsets by the accumulated end offsets of the sources already consumed.
  offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement, offsetAtt.endOffset() + offsetIncrement);
  if (newSource) {
    // First token after a source switch: add the increment read from the source just ended.
    int posInc = posIncAtt.getPositionIncrement();
    posIncAtt.setPositionIncrement(posInc + initialPositionIncrement);
  }

  return true;
}

Source File: PathTokenFilterTest.java From SearchServices with GNU Lesser General Public License v3.0

6 votes

public void testTokenizerReuse() throws IOException
{
    // We should be able to use the same Tokenizer twice.
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
    ts.setReader(reader);

    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    
    // First use
    tokenise(ts, new String[]{"uri1", "one"});
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());
    
    // Second use
    final String path2 = "/{uri1}one/uri2:two/";
    StringReader reader2 = new StringReader(path2);
    ts.setReader(reader2);
    tokenise(ts, new String[]{"uri1", "one", "uri2", "two"});
    assertEquals(path2.length(), offsetAtt.startOffset());
    assertEquals(path2.length(), offsetAtt.endOffset());
}

Source File: PathTokenFilterTest.java From SearchServices with GNU Lesser General Public License v3.0

6 votes

public void testAttributesAfterStreamEnd() throws IOException
{
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
    ts.setReader(reader);

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    
    // PathTokenFilter.end() will be called after all tokens consumed.
    tokenise(ts, new String[]{"uri1", "one"});
    
    // Check attributes cleaned up
    assertEquals("", termAtt.toString());
    assertEquals("word", typeAtt.type()); // the default
    assertEquals(0, posIncAtt.getPositionIncrement());
    // Final offset...
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());
}

Source File: HanLPAnalyzerTest.java From hanlp-lucene-plugin with Apache License 2.0

6 votes

public void testCreateComponents() throws Exception
{
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i)
    {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken())
    {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // 偏移量
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // 距离
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // 词性
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}

Source File: PinyinAnalysisTest.java From elasticsearch-analysis-lc-pinyin with Artistic License 2.0

6 votes

@Test
public void testSearch() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "重qing");

    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

    tokenStream.reset();
    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals(charTermAttribute.toString(), "重");
    Assert.assertEquals(offsetAttribute.startOffset(), 0);
    Assert.assertEquals(offsetAttribute.endOffset(), 1);
    Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1);

    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals(charTermAttribute.toString(), "qing");
    Assert.assertEquals(offsetAttribute.startOffset(), 1);
    Assert.assertEquals(offsetAttribute.endOffset(), 5);
    Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1);

    tokenStream.close();
}

Source File: PinyinFilterTest.java From elasticsearch-analysis-lc-pinyin with Artistic License 2.0

6 votes

public void testFullPinyinFilter() throws IOException {

        LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
        TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠");

        LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.full_pinyin);

        CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class);

        lcPinyinTokenFilter.reset();
        while (lcPinyinTokenFilter.incrementToken()) {
            System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + "," + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement());
        }
        lcPinyinTokenFilter.close();
    }

Source File: PinyinFilterTest.java From elasticsearch-analysis-lc-pinyin with Artistic License 2.0

6 votes

public void testFirstLetterFilter() throws IOException {

        LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
        TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠");

        LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.first_letter);

        CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class);

        lcPinyinTokenFilter.reset();
        while (lcPinyinTokenFilter.incrementToken()) {
            System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + "," + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement());
        }
        lcPinyinTokenFilter.close();
    }

Source File: MemoryIndex.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Convenience method; Creates and returns a token stream that generates a
 * token for each keyword in the given collection, "as is", without any
 * transforming text analysis. The resulting token stream can be fed into
 * {@link #addField(String, TokenStream)}, perhaps wrapped into another
 * {@link org.apache.lucene.analysis.TokenFilter}, as desired.
 * 
 * @param keywords
 *            the keywords to generate tokens for
 * @return the corresponding token stream
 */
public <T> TokenStream keywordTokenStream(final Collection<T> keywords) {
  // TODO: deprecate & move this method into AnalyzerUtil?
  if (keywords == null)
    throw new IllegalArgumentException("keywords must not be null");
  
  return new TokenStream() {
    private Iterator<T> iter = keywords.iterator();
    private int start = 0;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    
    @Override
    public boolean incrementToken() {
      if (!iter.hasNext()) return false;
      
      T obj = iter.next();
      if (obj == null) 
        throw new IllegalArgumentException("keyword must not be null");
      
      String term = obj.toString();
      clearAttributes();
      termAtt.setEmpty().append(term);
      offsetAtt.setOffset(start, start+termAtt.length());
      start += term.length() + 1; // separate words by 1 (blank) character
      return true;
    }
  };
}

Source File: HighlighterTest.java From lucene-solr with Apache License 2.0

5 votes

@Override
public TokenStreamComponents createComponents(String arg0) {
  Tokenizer stream = new MockTokenizer(MockTokenizer.SIMPLE, true);
  stream.addAttribute(CharTermAttribute.class);
  stream.addAttribute(PositionIncrementAttribute.class);
  stream.addAttribute(OffsetAttribute.class);
  return new TokenStreamComponents(stream, new SynonymTokenizer(stream, synonyms));
}

Source File: TestToken.java From jstarcraft-nlp with Apache License 2.0

5 votes

public static void main(String[] args) {

//        SynonymsLibrary.put(SynonymsLibrary.DEFAULT, "../../library/synonyms.dic");
//
//        DicLibrary.insert(DicLibrary.DEFAULT, "清华", "n", 2000);
//        DicLibrary.insert(DicLibrary.DEFAULT, "大学", "n", 2000);

        Map<String, String> map = new HashMap<String, String>();

        map.put("type", "base_ansj");
//        map.put(SynonymsLibrary.DEFAULT, SynonymsLibrary.DEFAULT);

        Analyzer ca = new AnsjAnalyzer(map);

        String content = "我爱北京天安门天安门上太阳升我美丽的清华大学";

        try {
            TokenStream tokenStream = ca.tokenStream(content, new StringReader(content));

            while (tokenStream.incrementToken()) {

                System.out.print(tokenStream.getAttribute(CharTermAttribute.class));
                System.out.print("\t");
                System.out.print(tokenStream.getAttribute(OffsetAttribute.class).startOffset());
                System.out.print("\t");
                System.out.print(tokenStream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
                System.out.print("\t");
                System.out.println(tokenStream.getAttribute(TypeAttribute.class).type());

            }
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        ca.close();
    }

Source File: MeCabKoTokenizer.java From mecab-ko-lucene-analyzer with Apache License 2.0

5 votes

/**
 * Registers every attribute this class works with via {@code addAttribute} and stores the
 * returned instances in the corresponding fields.
 */
// NOTE(review): presumably invoked once while the tokenizer is being constructed - confirm
// against the caller.
private void setAttributes() {
  // Core token attributes: term text, position increment/length, offsets and type.
  charTermAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  posLenAtt = addAttribute(PositionLengthAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
  typeAtt = addAttribute(TypeAttribute.class);
  // Additional attributes: part of speech and semantic class.
  posAtt = addAttribute(PartOfSpeechAttribute.class);
  semanticClassAtt = addAttribute(SemanticClassAttribute.class);
}

Source File: PreAnalyzedField.java From lucene-solr with Apache License 2.0

5 votes

@Override
public final boolean incrementToken() {
  if (!it.hasNext()) {
    return false;
  }
  
  AttributeSource.State state = it.next();
  restoreState(state.clone());
  // TODO: why can't I lookup the OffsetAttribute up in ctor instead?
  lastEndOffset = addAttribute(OffsetAttribute.class).endOffset();
  return true;
}

Source File: IKTokenizer.java From IKAnalyzer with Apache License 2.0

5 votes

/**
 * Lucene 3.5 Tokenizer适配器类构造函数
 *
 * @param in a {@link java.io.Reader} object.
 * @param useSmart a boolean.
 */
public IKTokenizer(Reader in , boolean useSmart){
    super(in);
    offsetAtt = addAttribute(OffsetAttribute.class);
    termAtt = addAttribute(CharTermAttribute.class);
	_IKImplement = new IKSegmenter(in , useSmart);
}

Source File: ConditionalTokenFilter.java From lucene-solr with Apache License 2.0

5 votes

@Override
public void end() throws IOException {
  if (endState == null) {
    super.end();
    endState = captureState();
  }
  else {
    restoreState(endState);
  }
  endOffset = getAttribute(OffsetAttribute.class).endOffset();
  if (lastTokenFiltered) {
    this.delegate.end();
    endState = captureState();
  }
}

org.apache.lucene.analysis.tokenattributes.OffsetAttribute Java Examples