org.wltea.analyzer.core.Lexeme Java Examples
The following examples show how to use
org.wltea.analyzer.core.Lexeme.
The examples are collected from open-source projects; the source file, originating project, and license are noted above each example.
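Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all share: wrap the input in a StringReader, feed it to an IKSegmenter, and call next() until it returns null, reading each Lexeme's text, positions, length, and type along the way. The sample text and the choice of smart mode (the boolean constructor argument) are illustrative assumptions, not part of any one project below.

import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class LexemeDemo {

    public static void main(String[] args) throws IOException {
        // true enables smart segmentation; false yields the finest-grained split
        IKSegmenter segmenter = new IKSegmenter(new StringReader("IK Analyzer 是一个开源的中文分词工具包"), true);
        Lexeme lexeme;
        // next() returns null once the input is exhausted
        while ((lexeme = segmenter.next()) != null) {
            System.out.printf("%s [%d,%d) length=%d type=%s%n",
                    lexeme.getLexemeText(),
                    lexeme.getBeginPosition(),
                    lexeme.getEndPosition(),
                    lexeme.getLength(),
                    lexeme.getLexemeTypeString());
        }
    }
}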
Example #1
Source File: IkTokenizer.java From jstarcraft-nlp with Apache License 2.0 | 6 votes |
@Override
public Iterable<IkToken> tokenize(CharSequence text) {
    try {
        // point the shared segmenter at the new input
        segmenter.reset(new StringReader(text.toString()));
        // drain the segmenter into a list of lexemes
        LinkedList<Lexeme> iterator = new LinkedList<>();
        while (true) {
            Lexeme lexeme = segmenter.next();
            if (lexeme != null) {
                iterator.add(lexeme);
            } else {
                break;
            }
        }
        IkToken iterable = new IkToken(iterator.iterator());
        return iterable;
    } catch (Exception exception) {
        throw new RuntimeException(exception);
    }
}
Example #2
Source File: StrUtils.java From Lottery with GNU General Public License v2.0 | 6 votes |
/**
 * @param keyword the source text
 * @param smart   whether to use smart segmentation
 * @return the segmented terms, joined with ','
 */
public static String getKeywords(String keyword, boolean smart) {
    StringReader reader = new StringReader(keyword);
    IKSegmenter iks = new IKSegmenter(reader, smart);
    StringBuilder buffer = new StringBuilder();
    try {
        Lexeme lexeme;
        while ((lexeme = iks.next()) != null) {
            buffer.append(lexeme.getLexemeText()).append(',');
        }
    } catch (IOException e) {
    }
    // drop the trailing ','
    if (buffer.length() > 0) {
        buffer.setLength(buffer.length() - 1);
    }
    return buffer.toString();
}
Example #3
Source File: IKTokenizer.java From ik-analyzer with GNU General Public License v3.0 | 6 votes |
@Override
public boolean incrementToken() throws IOException {
    // clear all lexeme attributes
    clearAttributes();
    Lexeme nextLexeme = ikimplement.next();
    if (nextLexeme != null) {
        // copy the Lexeme into the token attributes
        // set the lexeme text
        termAtt.append(nextLexeme.getLexemeText());
        // set the lexeme length
        termAtt.setLength(nextLexeme.getLength());
        // set the lexeme offsets
        offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
        // record the end position of this segmentation
        endPosition = nextLexeme.getEndPosition();
        // record the lexeme type
        typeAtt.setType(nextLexeme.getLexemeTypeString());
        // return true to signal that another lexeme is available
        return true;
    }
    // return false to signal that all lexemes have been emitted
    return false;
}
Example #4
Source File: ChineseTokenizer.java From RDMP1 with GNU General Public License v2.0 | 6 votes |
/**
 * Segments the content and returns the term counts as a LinkedHashMap.
 *
 * @param content the text to segment
 * @return Map<String, Long> mapping each term to its frequency
 */
public static Map<String, Long> segStr(String content) {
    // segment the text
    Reader input = new StringReader(content);
    // enable smart segmentation (this strongly affects segmentation accuracy)
    IKSegmenter iks = new IKSegmenter(input, true);
    Lexeme lexeme = null;
    Map<String, Long> words = new LinkedHashMap<String, Long>();
    try {
        while ((lexeme = iks.next()) != null) {
            if (words.containsKey(lexeme.getLexemeText())) {
                words.put(lexeme.getLexemeText(), words.get(lexeme.getLexemeText()) + 1);
            } else {
                words.put(lexeme.getLexemeText(), 1L);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return words;
}
Example #5
Source File: IKTokenizer.java From IKAnalyzer with Apache License 2.0 | 6 votes |
/** {@inheritDoc} */
@Override
public boolean incrementToken() throws IOException {
    // clear all lexeme attributes
    clearAttributes();
    Lexeme nextLexeme = _IKImplement.next();
    if (nextLexeme != null) {
        // copy the Lexeme into the token attributes
        // set the lexeme text
        termAtt.append(nextLexeme.getLexemeText());
        // set the lexeme length
        termAtt.setLength(nextLexeme.getLength());
        // set the lexeme offsets
        offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
        // record the end position of this segmentation
        finalOffset = nextLexeme.getEndPosition();
        // return true to signal that another lexeme is available
        return true;
    }
    // return false to signal that all lexemes have been emitted
    return false;
}
Example #6
Source File: IKTokenizer.java From es-ik with Apache License 2.0 | 6 votes |
@Override
public boolean incrementToken() throws IOException {
    // clear all lexeme attributes
    clearAttributes();
    Lexeme nextLexeme = _IKImplement.next();
    if (nextLexeme != null) {
        // copy the Lexeme into the token attributes
        // set the lexeme text
        termAtt.append(nextLexeme.getLexemeText());
        // set the lexeme length
        termAtt.setLength(nextLexeme.getLength());
        // set the lexeme offsets
        offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
        // record the end position of this segmentation
        endPosition = nextLexeme.getEndPosition();
        // record the lexeme type
        typeAtt.setType(nextLexeme.getLexemeTypeString());
        // return true to signal that another lexeme is available
        return true;
    }
    // return false to signal that all lexemes have been emitted
    return false;
}
Example #7
Source File: TokenizerAnalyzerUtils.java From JewelCrawler with GNU General Public License v3.0 | 6 votes |
public static String getAnalyzerResult(String input) {
    StringReader sr = new StringReader(input);
    IKSegmenter ik = new IKSegmenter(sr, true); // true enables smart segmentation
    Lexeme lex = null;
    List<String> stopWordsList = getStopWordsList();
    StringBuilder stringBuilder = new StringBuilder();
    try {
        while ((lex = ik.next()) != null) {
            // skip stop words
            if (stopWordsList.contains(lex.getLexemeText())) {
                continue;
            }
            stringBuilder.append(lex.getLexemeText()).append(Constants.BLANKSPACE);
        }
    } catch (IOException e) {
        e.printStackTrace();
        System.out.println("failed to parse input content");
    }
    return stringBuilder.toString();
}
Example #8
Source File: IKAnalyzer.java From hugegraph with Apache License 2.0 | 6 votes |
@Override
public Set<String> segment(String text) {
    Set<String> result = InsertionOrderUtil.newSet();
    IKSegmenter ik = new IKSegmenter(new StringReader(text), this.smartSegMode);
    try {
        Lexeme word = null;
        while ((word = ik.next()) != null) {
            result.add(word.getLexemeText());
        }
    } catch (Exception e) {
        throw new HugeException("IKAnalyzer segment text '%s' failed", e, text);
    }
    return result;
}
Example #9
Source File: IkTokenizer.java From jstarcraft-nlp with Apache License 2.0 | 6 votes |
@Override
public boolean incrementToken() throws IOException {
    // clear all lexeme attributes
    clearAttributes();
    Lexeme nextLexeme = _IKImplement.next();
    if (nextLexeme != null) {
        // copy the Lexeme into the token attributes
        // set the lexeme text
        termAttribute.append(nextLexeme.getLexemeText());
        // set the lexeme length
        termAttribute.setLength(nextLexeme.getLength());
        // set the lexeme offsets
        offsetAttribute.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
        // record the end position of this segmentation
        endPosition = nextLexeme.getEndPosition();
        // record the lexeme type
        typeAttribute.setType(nextLexeme.getLexemeTypeString());
        // return true to signal that another lexeme is available
        return true;
    }
    // return false to signal that all lexemes have been emitted
    return false;
}
Example #10
Source File: IKSegmenterTest.java From es-ik with Apache License 2.0 | 5 votes |
private void assertSegmenterCorrect(Lexeme nextLexeme, String lexemeText, int begin, int end, int length, String type) {
    Assert.assertEquals(nextLexeme.getLexemeText(), lexemeText);
    Assert.assertEquals(nextLexeme.getBeginPosition(), begin);
    Assert.assertEquals(nextLexeme.getEndPosition(), end);
    Assert.assertEquals(nextLexeme.getLength(), length);
    Assert.assertEquals(nextLexeme.getLexemeTypeString(), type);
}
Example #11
Source File: IKSegmenterTest.java From es-ik with Apache License 2.0 | 5 votes |
private void print(Lexeme nextLexeme) {
    System.out.println(nextLexeme.getLexemeText());
    System.out.println(nextLexeme.getBeginPosition());
    System.out.println(nextLexeme.getEndPosition());
    System.out.println(nextLexeme.getLength());
    System.out.println(nextLexeme.getLexemeTypeString());
}
Example #12
Source File: SWMCQueryBuilder.java From IKAnalyzer with Apache License 2.0 | 5 votes |
/**
 * Builds the SWMCQuery.
 *
 * @param fieldName a {@link java.lang.String} object.
 * @param keywords a {@link java.lang.String} object.
 * @param quickMode a boolean.
 * @return Lucene Query
 */
public static Query create(String fieldName, String keywords, boolean quickMode) {
    if (fieldName == null || keywords == null) {
        throw new IllegalArgumentException("fieldName and keywords must not be null");
    }
    // 1. segment the keywords
    List<Lexeme> lexemes = doAnalyze(keywords);
    // 2. build the SWMCQuery from the segmentation result
    Query _SWMCQuery = getSWMCQuery(fieldName, lexemes, quickMode);
    return _SWMCQuery;
}
Example #13
Source File: SWMCQueryBuilder.java From IKAnalyzer with Apache License 2.0 | 5 votes |
/**
 * Segments the keywords and returns the resulting lexeme list.
 *
 * @param keywords
 * @return
 */
private static List<Lexeme> doAnalyze(String keywords) {
    List<Lexeme> lexemes = new ArrayList<Lexeme>();
    IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords), true);
    try {
        Lexeme l = null;
        while ((l = ikSeg.next()) != null) {
            lexemes.add(l);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return lexemes;
}
Example #14
Source File: IKTokenizer.java From Elasticsearch-Tutorial-zh-CN with GNU General Public License v3.0 | 5 votes |
@Override
public boolean incrementToken() throws IOException {
    // clear all lexeme attributes
    clearAttributes();
    skippedPositions = 0;
    Lexeme nextLexeme = _IKImplement.next();
    if (nextLexeme != null) {
        posIncrAtt.setPositionIncrement(skippedPositions + 1);
        // copy the Lexeme into the token attributes
        // set the lexeme text
        termAtt.append(nextLexeme.getLexemeText());
        // set the lexeme length
        termAtt.setLength(nextLexeme.getLength());
        // set the lexeme offsets
        offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));
        // record the end position of this segmentation
        endPosition = nextLexeme.getEndPosition();
        // record the lexeme type
        typeAtt.setType(nextLexeme.getLexemeTypeString());
        // return true to signal that another lexeme is available
        return true;
    }
    // return false to signal that all lexemes have been emitted
    return false;
}
Example #15
Source File: SWMCQueryBuilder.java From ik-analyzer with GNU General Public License v3.0 | 5 votes |
/**
 * Builds the SWMCQuery.
 *
 * @param fieldName
 * @param keywords
 * @param quickMode
 *
 * @return Lucene Query
 */
public static Query create(String fieldName, String keywords, boolean quickMode) {
    if (fieldName == null || keywords == null) {
        throw new IllegalArgumentException("fieldName and keywords must not be null");
    }
    // 1. segment the keywords
    List<Lexeme> lexemes = doAnalyze(keywords);
    // 2. build the SWMCQuery from the segmentation result
    Query swmcQuery = getSWMCQuery(fieldName, lexemes, quickMode);
    return swmcQuery;
}
Example #16
Source File: SWMCQueryBuilder.java From ik-analyzer with GNU General Public License v3.0 | 5 votes |
/**
 * Segments the keywords and returns the resulting lexeme list.
 *
 * @param keywords
 *
 * @return
 */
private static List<Lexeme> doAnalyze(String keywords) {
    List<Lexeme> lexemes = new ArrayList<Lexeme>();
    IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords), true);
    try {
        Lexeme l;
        while ((l = ikSeg.next()) != null) {
            lexemes.add(l);
        }
    } catch (IOException e) {
        LOG.error("io error.", e);
    }
    return lexemes;
}
Example #17
Source File: IkToken.java From jstarcraft-nlp with Apache License 2.0 | 4 votes |
public IkToken(Iterator<Lexeme> iterator) {
    this.iterator = iterator;
}