com.hankcs.hanlp.seg.common.Term Java Examples
The following examples show how to use
com.hankcs.hanlp.seg.common.Term.
Each example is taken from an open source project; the source file, project, and license are noted above the code.
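For orientation, here is a minimal sketch (not taken from any of the projects below) showing the Term fields most of these examples rely on, word and nature, using HanLP's default segmenter. The class name TermDemo is made up for illustration.

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import java.util.List;

public class TermDemo {
    public static void main(String[] args) {
        // Each Term carries the surface form (word) and the part-of-speech tag (nature);
        // the offset field is populated when the segmenter tracks offsets
        // (see the enableOffset(true) call in the SegmentWrapper tests further down).
        List<Term> terms = HanLP.segment("商品和服务");
        for (Term term : terms) {
            System.out.println(term.word + "/" + term.nature);
        }
    }
}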
Example #1
Source File: SegmentWrapper.java From jstarcraft-nlp with Apache License 2.0 | 6 votes |
public Term next() throws IOException {
    if (iterator != null && iterator.hasNext()) return iterator.next();
    String line = readLine();
    if (line == null) return null;
    List<Term> termList = segment.seg(line);
    if (termList.size() == 0) return null;
    for (Term term : termList) {
        term.offset += offset;
    }
    offset += line.length();
    iterator = termList.iterator();
    return iterator.next();
}
Example #2
Source File: EmailSegment.java From hanlp-lucene-plugin with Apache License 2.0 | 6 votes |
@Override
protected List<Term> segSentence(char[] chars) {
    String text = new String(chars);
    final Matcher matcher = emailPattern.matcher(text);
    List<Term> resultList = new ArrayList<>();
    while (matcher.find()) {
        final int start = matcher.start();
        resultList.add(new Term(matcher.group(), Nature.nx) {{
            offset = start;
        }});
        final String uName = matcher.group(1);
        resultList.add(new Term(uName, Nature.nx) {{
            offset = start;
        }});
        resultList.add(new Term(matcher.group(2), Nature.nx) {{
            offset = start;
        }});
    }
    return resultList;
}
Example #3
Source File: TextRankSentence.java From AHANLP with Apache License 2.0 | 6 votes |
private TextRankSentence(String segType, List<String> sentenceList) {
    this.sentenceList = sentenceList;
    senWordList = new ArrayList<List<String>>(sentenceList.size());
    for (List<Term> senWords : Segment.splitWordInSentences(segType, sentenceList, true)) {
        senWordList.add(Segment.getWordList(senWords));
    }
    senNum = sentenceList.size();
    weight = new float[senNum][senNum];
    weight_sum = new float[senNum];
    SenRank = new float[senNum];
    top = new TreeMap<Float, Integer>(Collections.reverseOrder());
    initParam();
    calSenRanks();
    for (int i = 0; i < senNum; ++i) // sort sentences by rank value
        top.put(SenRank[i], i);
}
Example #4
Source File: TextRankSentence.java From AHANLP with Apache License 2.0 | 6 votes |
private TextRankSentence(String segType, String document, String splitReg) {
    sentenceList = splitSentence(document, splitReg);
    senWordList = new ArrayList<List<String>>(sentenceList.size());
    for (List<Term> senWords : Segment.splitWordInSentences(segType, sentenceList, true)) {
        senWordList.add(Segment.getWordList(senWords));
    }
    senNum = sentenceList.size();
    weight = new float[senNum][senNum];
    weight_sum = new float[senNum];
    SenRank = new float[senNum];
    top = new TreeMap<Float, Integer>(Collections.reverseOrder());
    initParam();
    calSenRanks();
    for (int i = 0; i < senNum; ++i) // sort sentences by rank value
        top.put(SenRank[i], i);
}
Example #5
Source File: DependencyParser.java From AHANLP with Apache License 2.0 | 5 votes |
/**
 * Get the dependency path for each word
 * @param segResult segmentation result
 * @param maxReturn maximum path length
 * @return list of dependency paths
 */
public static List<List<Term>> getWordPaths(List<Term> segResult, int maxReturn) {
    CoNLLWord[] wordArray = parse(segResult).getWordArray();
    List<List<Term>> wordPaths = new ArrayList<List<Term>>();
    for (CoNLLWord word : wordArray)
        wordPaths.add(getWordsInPath(word, maxReturn));
    return wordPaths;
}
Example #6
Source File: TextRankSummary.java From TextRank with Apache License 2.0 | 5 votes |
/**
 * One-call convenience interface
 * @param document target document
 * @param size number of key sentences wanted
 * @return list of key sentences
 */
public static List<String> getTopSentenceList(String document, int size) {
    List<String> sentenceList = spiltSentence(document);
    List<List<String>> docs = new ArrayList<List<String>>();
    for (String sentence : sentenceList) {
        List<Term> termList = HanLP.segment(sentence);
        List<String> wordList = new LinkedList<String>();
        for (Term term : termList) {
            if (shouldInclude(term)) {
                wordList.add(term.word);
            }
        }
        docs.add(wordList);
    }
    TextRankSummary textRankSummary = new TextRankSummary(docs);
    int[] topSentence = textRankSummary.getTopSentence(size);
    List<String> resultList = new LinkedList<String>();
    for (int i : topSentence) {
        resultList.add(sentenceList.get(i));
    }
    return resultList;
}
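A hypothetical call to this summarizer (class and method names exactly as defined above; the document text is a placeholder) could look like:

// Hypothetical usage sketch: pick the 3 most central sentences of a document.
String document = "...";  // any Chinese document text
List<String> summary = TextRankSummary.getTopSentenceList(document, 3);
for (String sentence : summary) {
    System.out.println(sentence);  // one key sentence per line
}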
Example #7
Source File: BaseAction.java From o2oa with GNU Affero General Public License v3.0 | 5 votes |
protected List<String> keys(String key) {
    List<String> os = new ArrayList<>();
    for (Term term : HanLP.segment(key)) {
        /* keep the field from getting too long */
        if (StringUtils.length(term.word) < 31) {
            os.add(StringUtils.lowerCase(term.word));
        }
    }
    return os;
}
Example #8
Source File: DependencyParser.java From AHANLP with Apache License 2.0 | 5 votes |
/**
 * Dependency parsing
 * @param segResult segmentation result
 * @param englishTag use English dependency labels
 * @return analysis result in CoNLL format
 */
public static CoNLLSentence parse(List<Term> segResult, boolean englishTag) {
    IDependencyParser parser = new NeuralNetworkDependencyParser();
    if (englishTag)
        parser.enableDeprelTranslator(false);
    return parser.parse(segResult);
}
Example #9
Source File: POSFilter.java From AHANLP with Apache License 2.0 | 5 votes |
/**
 * Keep only content (notional) words
 * @param segResult segmentation result
 */
public static void selectRealWords(List<Term> segResult) {
    List<String> selectPOS = Arrays.asList("n", "ns", "nr", "nt", "nz", "v", "vd", "vn", "a", "ad", "an", "d");
    for (int i = 0; i < segResult.size(); i++) {
        if (!selectPOS.contains(segResult.get(i).nature.toString())) {
            segResult.remove(i);
            i--;
        }
    }
}
Example #10
Source File: DKNLPBase.java From dk-fitting with Apache License 2.0 | 5 votes |
/**
 * Clustering
 *
 * @param documents documents to cluster; the key is the document id, the value is the document content
 * @param size      number of clusters to produce
 * @return cluster list; each cluster is a list of [document id]=[similarity] entries
 */
public static List<List<Map.Entry<String, Double>>> cluster(Map<String, String> documents, int size) {
    ClusterAnalyzer analyzer = new ClusterAnalyzer();
    analyzer.setTokenizer(new ITokenizer() {
        public String[] segment(final String text) {
            List<Term> termList = DKNLPBase.segment(text);
            ListIterator<Term> listIterator = termList.listIterator();
            while (listIterator.hasNext()) {
                if (CoreStopWordDictionary.shouldRemove(listIterator.next())) {
                    listIterator.remove();
                }
            }
            String[] termArray = new String[termList.size()];
            int i = 0;
            for (Term term : termList) {
                termArray[i] = term.word;
                ++i;
            }
            return termArray;
        }
    });
    for (Map.Entry<String, String> entry : documents.entrySet()) {
        analyzer.addDocument(entry.getKey(), entry.getValue());
    }
    return analyzer.clusters(size);
}
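A hypothetical caller, assuming DKNLPBase.cluster is available exactly as defined above (the document ids and contents below are placeholders):

// Hypothetical usage sketch for the cluster(...) helper above.
Map<String, String> documents = new HashMap<>();
documents.put("doc1", "...");  // document contents are placeholders
documents.put("doc2", "...");
documents.put("doc3", "...");
List<List<Map.Entry<String, Double>>> clusters = DKNLPBase.cluster(documents, 2);
for (List<Map.Entry<String, Double>> cluster : clusters) {
    System.out.println(cluster);  // each entry: [document id]=[similarity]
}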
Example #11
Source File: Segment.java From AHANLP with Apache License 2.0 | 5 votes |
/**
 * Segmentation
 * @param segType segmenter type (Standard or NLP)
 * @param content text
 * @param filterStopWord whether to filter out stop words
 * @return segmentation result
 */
public static List<Term> segment(String segType, String content, boolean filterStopWord) {
    List<Term> results = null;
    if ("Standard".equals(segType) || "标准分词".equals(segType)) {
        results = StandardSegment(content, filterStopWord);
    } else if ("NLP".equals(segType) || "NLP分词".equals(segType)) {
        results = NLPSegment(content, filterStopWord);
    } else {
        throw new IllegalArgumentException(String.format("非法参数 segType == %s", segType));
    }
    return results;
}
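A hypothetical call to this AHANLP helper (the sample sentence is made up; Segment refers to the AHANLP class above, not HanLP's own segmenter):

// Hypothetical usage sketch: standard segmentation with stop words filtered out.
List<Term> terms = Segment.segment("Standard", "商品和服务", true);
for (Term term : terms) {
    System.out.println(term.word + "/" + term.nature);
}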
Example #12
Source File: Segment.java From AHANLP with Apache License 2.0 | 5 votes |
/**
 * Segment and split into sentences
 * @param segType segmenter type (Standard or NLP)
 * @param shortest whether to split into the finest clauses (commas and semicolons also treated as delimiters)
 * @param content text
 * @param filterStopWord whether to filter out stop words
 * @return list of sentences, each sentence being a list of words
 */
public static List<List<Term>> seg2sentence(String segType, boolean shortest, String content, boolean filterStopWord) {
    List<List<Term>> results = null;
    if ("Standard".equals(segType) || "标准分词".equals(segType)) {
        results = StandardTokenizer.seg2sentence(content, shortest);
    } else if ("NLP".equals(segType) || "NLP分词".equals(segType)) {
        results = NLPTokenizer.seg2sentence(content, shortest);
    } else {
        throw new IllegalArgumentException(String.format("非法参数 segType == %s", segType));
    }
    if (filterStopWord)
        for (List<Term> res : results)
            CoreStopWordDictionary.apply(res);
    return results;
}
Example #13
Source File: Segment.java From AHANLP with Apache License 2.0 | 5 votes |
/**
 * Get the word list
 * @param termList segmentation result
 * @return list of words
 */
public static List<String> getWordList(List<Term> termList) {
    List<String> wordList = new ArrayList<String>();
    for (Term term : termList)
        wordList.add(term.word);
    return wordList;
}
Example #14
Source File: Segment.java From AHANLP with Apache License 2.0 | 5 votes |
/**
 * Get the part-of-speech (nature) list
 * @param termList segmentation result
 * @return list of natures
 */
public static List<String> getNatureList(List<Term> termList) {
    List<String> NatureList = new ArrayList<String>();
    for (Term term : termList)
        NatureList.add(term.nature.toString());
    return NatureList;
}
Example #15
Source File: TextRankKeyword.java From AHANLP with Apache License 2.0 | 5 votes |
/**
 * Get the segmentation result with stop words removed and only content words kept
 * @param segType segmenter, Standard or NLP
 * @param content text
 * @return list of words
 */
private static List<String> getSegResult(String segType, String content) {
    List<Term> segResult = null;
    if ("Standard".equals(segType) || "标准分词".equals(segType)) {
        segResult = Segment.StandardSegment(content, true);
    } else if ("NLP".equals(segType) || "NLP分词".equals(segType)) {
        segResult = Segment.NLPSegment(content, true);
    } else {
        throw new IllegalArgumentException(String.format("非法参数 segType == %s", segType));
    }
    POSFilter.selectRealWords(segResult);
    return Segment.getWordList(segResult);
}
Example #16
Source File: SegDemo.java From AHANLP with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
    String content = "目前，航空母舰主船体完成建造，动力、电力等主要系统设备安装到位。"
            + "出坞下水是航空母舰建设的重大节点之一，标志着我国自主设计建造航空母舰取得重大阶段性成果。"
            + "下一步，该航空母舰将按计划进行系统设备调试和舾装施工，并全面开展系泊试验。";
    // Standard segmentation
    List<Term> stdSegResult = AHANLP.StandardSegment(content);
    System.out.println("标准分词:\n" + stdSegResult);
    // NLP segmentation
    List<Term> nlpSegResult = AHANLP.NLPSegment(content);
    System.out.println("NLP分词:\n" + nlpSegResult);
    // Standard segmentation (stop words removed)
    stdSegResult = AHANLP.StandardSegment(content, true);
    List<String> stdWordList = AHANLP.getWordList(stdSegResult);
    System.out.println("标准分词(去停用词):\n" + stdWordList);
    // NLP segmentation (stop words removed)
    nlpSegResult = AHANLP.NLPSegment(content, true);
    List<String> nlpWordList = AHANLP.getWordList(nlpSegResult);
    System.out.println("NLP分词(去停用词):\n" + nlpWordList);
    // Standard segmentation (stop words removed, content words kept)
    stdSegResult = AHANLP.StandardSegment(content, true);
    //POSFilter.selectRealWords(stdSegResult);
    POSFilter.selectPOS(stdSegResult, Arrays.asList("n", "ns", "nr", "nt", "nz", "v", "vd", "vn", "a", "ad", "an", "d"));
    System.out.println("标准分词(去停用词,保留实词):\n" + AHANLP.getWordList(stdSegResult));
    // Segment into sentences and print each sentence (stop words removed)
    System.out.println("分词断句(去停用词):");
    List<List<Term>> results = AHANLP.seg2sentence("Standard", content, true);
    for (int i = 0; i < results.size(); i++)
        System.out.println((i + 1) + " : " + AHANLP.getWordList(results.get(i)));
    // Sentence splitting
    System.out.println("切分句子:");
    List<String> senList = AHANLP.splitSentence(content);
    for (int i = 0; i < senList.size(); i++)
        System.out.println((i + 1) + " : " + senList.get(i));
    // Segment each sentence in the list (stop words removed)
    System.out.println("对句子列表分词(去停用词):");
    List<List<Term>> senWordList = AHANLP.splitWordInSentences("Standard", senList, true);
    for (int i = 0; i < senWordList.size(); i++)
        System.out.println((i + 1) + " : " + senWordList.get(i));
}
Example #17
Source File: HANLPExtractor.java From Gather-Platform with GNU General Public License v3.0 | 5 votes |
/**
 * Extract named entities
 *
 * @param content article body
 * @return a map whose keys are the three natures nr, ns, nt and whose values are the corresponding word sets
 */
public Map<String, Set<String>> extractNamedEntity(String content) {
    List<Term> termList = segment.seg(content);
    Set<String> nrList = termList.stream().filter(term -> term.nature.startsWith("nr"))
            .map(term -> term.word).collect(Collectors.toSet());
    Set<String> nsList = termList.stream().filter(term -> term.nature.startsWith("ns"))
            .map(term -> term.word).collect(Collectors.toSet());
    Set<String> ntList = termList.stream().filter(term -> term.nature.startsWith("nt"))
            .map(term -> term.word).collect(Collectors.toSet());
    Map<String, Set<String>> namedEntity = Maps.newHashMap();
    namedEntity.put("nr", nrList);
    namedEntity.put("ns", nsList);
    namedEntity.put("nt", ntList);
    return namedEntity;
}
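A hypothetical usage sketch, assuming an HANLPExtractor instance whose segment field is already configured (the no-argument constructor here is an assumption for illustration). In HanLP's tag set, nr, ns, and nt correspond to person, place, and organization names.

// Hypothetical usage sketch: group recognized entities by nature tag.
HANLPExtractor extractor = new HANLPExtractor();  // constructor assumed for illustration
Map<String, Set<String>> entities = extractor.extractNamedEntity("...");  // article body placeholder
System.out.println("persons (nr): " + entities.get("nr"));
System.out.println("places (ns): " + entities.get("ns"));
System.out.println("organizations (nt): " + entities.get("nt"));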
Example #18
Source File: CoreStopWordDictionary.java From elasticsearch-analysis-hanlp with Apache License 2.0 | 5 votes |
public static void apply(List<Term> termList) {
    ListIterator listIterator = termList.listIterator();
    while (listIterator.hasNext()) {
        if (shouldRemove((Term) listIterator.next())) {
            listIterator.remove();
        }
    }
}
Example #19
Source File: HanLPTokenizer.java From elasticsearch-analysis-hanlp with Apache License 2.0 | 5 votes |
@Override
final public boolean incrementToken() throws IOException {
    clearAttributes();
    int position = 0;
    Term term;
    boolean unIncreased = true;
    do {
        term = segment.next();
        if (term == null) {
            totalOffset += segment.offset;
            return false;
        }
        if (TextUtility.isBlank(term.word)) {
            totalOffset += term.length();
            continue;
        }
        if (configuration.isEnablePorterStemming() && term.nature == Nature.nx) {
            term.word = stemmer.stem(term.word);
        }
        final Term copyTerm = term;
        if ((!this.configuration.isEnableStopDictionary()) || (!AccessController.doPrivileged(
                (PrivilegedAction<Boolean>) () -> CoreStopWordDictionary.shouldRemove(copyTerm)))) {
            position++;
            unIncreased = false;
        } else {
            totalOffset += term.length();
        }
    } while (unIncreased);
    positionAttr.setPositionIncrement(position);
    termAtt.setEmpty().append(term.word);
    offsetAtt.setOffset(correctOffset(term.offset), correctOffset(term.offset + term.word.length()));
    typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
    totalOffset += term.length();
    return true;
}
Example #20
Source File: TestSegmentWrapper.java From elasticsearch-analysis-hanlp with Apache License 2.0 | 5 votes |
@Test
public void test1() {
    StringReader reader = new StringReader("张三\n\n\n新买的手机");
    SegmentWrapper wrapper = new SegmentWrapper(reader, HanLP.newSegment().enableOffset(true));
    while (true) {
        Term term = wrapper.next();
        if (term == null) {
            break;
        }
        System.out.println(term.word + "\t" + term.nature + "\t" + term.offset + "\t" + term.length());
    }
}
Example #21
Source File: TestSegmentWrapper.java From elasticsearch-analysis-hanlp with Apache License 2.0 | 5 votes |
@Test
public void test2() throws IOException {
    StringReader reader = new StringReader("我的希望是希望张晚霞的背影被晚霞映红");
    SegmentWrapper wrapper = new SegmentWrapper(reader, new PerceptronLexicalAnalyzer());
    while (true) {
        Term term = wrapper.next();
        if (term == null) {
            break;
        }
        System.out.println(term.word + "\t" + term.nature + "\t" + term.offset + "\t" + term.length());
    }
}
Example #22
Source File: HanLPTokenizer.java From elasticsearch-analysis-hanlp with Apache License 2.0 | 5 votes |
@Override
public boolean incrementToken() throws IOException {
    clearAttributes();
    int position = 0;
    Term term;
    boolean un_increased = true;
    do {
        term = segment.next();
        if (term == null) {
            break;
        }
        if (enablePorterStemming && term.nature == Nature.nx) {
            term.word = stemmer.stem(term.word);
        }
        if (filter != null && filter.containsKey(term.word)) {
            continue;
        } else {
            ++position;
            un_increased = false;
        }
    } while (un_increased);
    if (term != null) {
        positionAttr.setPositionIncrement(position);
        termAtt.setEmpty().append(term.word);
        offsetAtt.setOffset(correctOffset(totalOffset + term.offset), correctOffset(totalOffset + term.offset + term.word.length()));
        typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
        return true;
    } else {
        totalOffset += segment.offset;
        return false;
    }
}
Example #23
Source File: SegmentWrapper.java From elasticsearch-analysis-hanlp with Apache License 2.0 | 5 votes |
public Term next() throws IOException {
    if (termArray != null && index < termArray.length) {
        return termArray[index++];
    }
    if (!scanner.hasNext()) {
        return null;
    }
    String line = scanner.next();
    while (isBlank(line)) {
        if (line == null) {
            return null;
        }
        offset += line.length() + 1;
        if (scanner.hasNext()) {
            line = scanner.next();
        } else {
            return null;
        }
    }
    List<Term> termList = segment.seg(line);
    if (termList.size() == 0) {
        return null;
    }
    termArray = termList.toArray(new Term[0]);
    for (Term term : termArray) {
        term.offset += offset;
    }
    index = 0;
    offset += line.length();
    return termArray[index++];
}
Example #24
Source File: HANLPExtractor.java From spider with GNU General Public License v3.0 | 5 votes |
/**
 * Extract named entities
 *
 * @param content article body
 * @return a map whose keys are the three natures nr, ns, nt and whose values are the corresponding word sets
 */
public Map<String, Set<String>> extractNamedEntity(String content) {
    List<Term> termList = segment.seg(content);
    Set<String> nrList = termList.stream().filter(term -> term.nature.startsWith("nr"))
            .map(term -> term.word).collect(Collectors.toSet());
    Set<String> nsList = termList.stream().filter(term -> term.nature.startsWith("ns"))
            .map(term -> term.word).collect(Collectors.toSet());
    Set<String> ntList = termList.stream().filter(term -> term.nature.startsWith("nt"))
            .map(term -> term.word).collect(Collectors.toSet());
    Map<String, Set<String>> namedEntity = Maps.newHashMap();
    namedEntity.put("nr", nrList);
    namedEntity.put("ns", nsList);
    namedEntity.put("nt", ntList);
    return namedEntity;
}
Example #25
Source File: Tokenizer.java From similarity with Apache License 2.0 | 5 votes |
public static List<Word> segment(String sentence) {
    List<Word> results = new ArrayList<>();
    /*// ansj_seg
    List<org.xm.ansj.domain.Term> termList = StandardSegmentation.parse(sentence).getTerms();//ansj
    results.addAll(termList
            .stream()
            .map(term -> new Word(term.getName(), term.getNature().natureStr))
            .collect(Collectors.toList())
    );*/
    /*//Xmnlp
    List<org.xm.xmnlp.seg.domain.Term> termList = Xmnlp.segment(sentence);
    results.addAll(termList
            .stream()
            .map(term -> new Word(term.word, term.getNature().name()))
            .collect(Collectors.toList())
    );*/
    // HanLP
    List<Term> termList = HanLP.segment(sentence);
    results.addAll(termList
            .stream()
            .map(term -> new Word(term.word, term.nature.name()))
            .collect(Collectors.toList())
    );
    return results;
}
Example #26
Source File: Tokenizer.java From similarity with Apache License 2.0 | 5 votes |
public static void fileSegment(Segment segment, String inputFilePath, String outputFilePath) {
    try {
        WordFreqStatistics.statistics(segment, inputFilePath);
        BufferedReader reader = IOUtil.newBufferedReader(inputFilePath);
        long allCount = 0;
        long lexCount = 0;
        long start = System.currentTimeMillis();
        String outPath = inputFilePath.replace(".txt", "") + "-Segment-Result.txt";
        if (outputFilePath != null && outputFilePath.trim().length() > 0)
            outPath = outputFilePath;
        FileOutputStream fos = new FileOutputStream(new File(outPath));
        String temp;
        while ((temp = reader.readLine()) != null) {
            List<Term> parse = segment.seg(temp);
            StringBuilder sb = new StringBuilder();
            for (Term term : parse) {
                sb.append(term.toString() + "\t");
                if (term.word.trim().length() > 0) {
                    allCount += term.length();
                    lexCount += 1;
                }
            }
            fos.write(sb.toString().trim().getBytes());
            fos.write("\n".getBytes());
        }
        fos.flush();
        fos.close();
        reader.close();
        long end = System.currentTimeMillis();
        System.out.println("segment result save:" + outPath);
        System.out.println("total " + allCount + " chars, " + lexCount + " words, spend" + (end - start) + "ms ");
    } catch (IOException e) {
        logger.error("IO error: " + e.getLocalizedMessage());
    }
}
Example #27
Source File: SegmentWrapper.java From hanlp-lucene-plugin with Apache License 2.0 | 5 votes |
public Term next() throws IOException {
    if (iterator != null && iterator.hasNext()) return iterator.next();
    String line = readLine();
    if (line == null) return null;
    List<Term> termList = segment.seg(line);
    if (termList.size() == 0) return null;
    for (Term term : termList) {
        term.offset += offset;
    }
    offset += line.length();
    iterator = termList.iterator();
    return iterator.next();
}
Example #28
Source File: Parser.java From antiplag with Apache License 2.0 | 5 votes |
public boolean parseFile(File dir, String file) {
    try {
        currentFile = file;
        String[] strs = FileIO.readFile(new File(dir, file), "utf-8");
        for (int line = 0; line < strs.length; line++) {
            if (strs[line].trim().length() < 1) { // skip blank lines
                continue;
            }
            List<Term> tokens = HanLP.segment(strs[line]);
            int col = 1;
            for (int j = 0; j < tokens.size(); j++) {
                Term token = tokens.get(j);
                struct.addToken(new DocToken(token.word, currentFile, line + 1, col, token.length(), this));
                col = col + tokens.get(j).word.length() + 1;
            }
        }
    } catch (Exception e) {
        getProgram().addError("Parsing Error in '" + file + e.getMessage());
        return false;
    }
    return true;
}
Example #29
Source File: Tokenizer.java From antiplag with Apache License 2.0 | 5 votes |
public static String segment(String text, String sep) {
    StringBuilder sb = new StringBuilder();
    HanLP.Config.Normalization = true; // normalize: traditional -> simplified, full-width -> half-width, upper case -> lower case
    List<Term> tokens = NotionalTokenizer.segment(text); // segment and drop stop words
    for (Term token : tokens) {
        sb.append(token.word + sep);
    }
    return sb.toString();
}
Example #30
Source File: HanLpTokenizer.java From jstarcraft-nlp with Apache License 2.0 | 5 votes |
@Override
public boolean incrementToken() throws IOException {
    clearAttributes();
    int position = 0;
    Term term;
    boolean un_increased = true;
    do {
        term = segment.next();
        if (term == null) {
            break;
        }
        if (TextUtility.isBlank(term.word)) {
            // filter out whitespace to improve indexing efficiency
            continue;
        }
        if (filter != null && filter.containsKey(term.word)) {
            continue;
        } else {
            ++position;
            un_increased = false;
        }
    } while (un_increased);
    if (term != null) {
        positionAttribute.setPositionIncrement(position);
        termAttribute.setEmpty().append(term.word);
        offsetAttribute.setOffset(correctOffset(totalOffset + term.offset), correctOffset(totalOffset + term.offset + term.word.length()));
        typeAttribute.setType(term.nature == null ? "null" : term.nature.toString());
        return true;
    } else {
        totalOffset += segment.offset;
        return false;
    }
}