Java Code Examples for org.apdplat.word.segmentation.SegmentationFactory#getSegmentation()
The following examples show how to use
org.apdplat.word.segmentation.SegmentationFactory#getSegmentation().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TextSimilarity.java From word with Apache License 2.0 | 6 votes |
/** * 对文本进行分词 * @param text 文本 * @return 分词结果 */ private List<Word> seg(String text){ if(text == null){ return Collections.emptyList(); } if(segmentation == null){ //延迟初始化 segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.MaxNgramScore); } List<Word> words = segmentation.seg(text); if(filterStopWord) { //停用词过滤 StopWord.filterStopWords(words); } return words; }
Example 2
Source File: WordTokenizer.java From jstarcraft-nlp with Apache License 2.0 | 5 votes |
/**
 * Creates a tokenizer that uses the segmentation algorithm with the given name.
 * <p>
 * If resolving the name or obtaining the segmenter throws any exception, the
 * tokenizer falls back to {@code BidirectionalMinimumMatching}.
 *
 * @param segmentationAlgorithm name of a {@code SegmentationAlgorithm} constant
 */
public WordTokenizer(String segmentationAlgorithm) {
    try {
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    } catch (Exception ignored) {
        // Deliberate best-effort: any failure selects the default algorithm.
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
Example 3
Source File: ChineseWordTokenizerFactory.java From word with Apache License 2.0 | 5 votes |
public ChineseWordTokenizerFactory(Map<String, String> args){ super(args); if(args != null){ String conf = args.get("conf"); if(conf != null){ //强制覆盖默认配置 WordConfTools.forceOverride(conf); }else{ LOGGER.info("没有指定conf参数"); } String algorithm = args.get("segAlgorithm"); if(algorithm != null){ try{ SegmentationAlgorithm segmentationAlgorithm = SegmentationAlgorithm.valueOf(algorithm); segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm); LOGGER.info("使用指定分词算法:"+algorithm); }catch(Exception e){ LOGGER.error("参数segAlgorithm指定的值错误:"+algorithm); LOGGER.error("参数segAlgorithm可指定的值有:"); for(SegmentationAlgorithm sa : SegmentationAlgorithm.values()){ LOGGER.error("\t"+sa.name()); } } }else{ LOGGER.info("没有指定segAlgorithm参数"); } } if(segmentation == null){ segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching); LOGGER.info("使用默认分词算法:"+SegmentationAlgorithm.BidirectionalMaximumMatching); } }
Example 4
Source File: ChineseWordTokenizer.java From word with Apache License 2.0 | 5 votes |
/**
 * Creates a tokenizer that uses the segmentation algorithm with the given name.
 * <p>
 * If resolving the name or obtaining the segmenter throws any exception, the
 * tokenizer falls back to {@code BidirectionalMinimumMatching}.
 *
 * @param segmentationAlgorithm name of a {@code SegmentationAlgorithm} constant
 */
public ChineseWordTokenizer(String segmentationAlgorithm) {
    try {
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    } catch (Exception ignored) {
        // Deliberate best-effort: any failure selects the default algorithm.
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
Example 5
Source File: WordAnalyzer.java From jstarcraft-nlp with Apache License 2.0 | 5 votes |
/**
 * Creates an analyzer that uses the segmentation algorithm with the given name.
 * <p>
 * If resolving the name or obtaining the segmenter throws any exception, the
 * analyzer falls back to {@code BidirectionalMinimumMatching}.
 *
 * @param segmentationAlgorithm name of a {@code SegmentationAlgorithm} constant
 */
public WordAnalyzer(String segmentationAlgorithm) {
    try {
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    } catch (Exception ignored) {
        // Deliberate best-effort: any failure selects the default algorithm.
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
Example 6
Source File: WordSegmentFactory.java From jstarcraft-nlp with Apache License 2.0 | 5 votes |
/**
 * Builds a segmenter from the given configuration map.
 * <p>
 * Every entry is first handed to {@code WordConfTools.set}; the algorithm is
 * then read from the {@code "algorithm"} key via {@code get}, with
 * {@code "FullSegmentation"} passed as the default.
 *
 * @param configurations configuration key/value pairs
 * @return the segmenter for the selected algorithm
 */
@Override
public Segmentation build(Map<String, String> configurations) {
    for (Entry<String, String> entry : configurations.entrySet()) {
        WordConfTools.set(entry.getKey(), entry.getValue());
    }
    String algorithmName = get(configurations, "algorithm", "FullSegmentation");
    return SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(algorithmName));
}
Example 7
Source File: WordSegmenterTestCase.java From jstarcraft-nlp with Apache License 2.0 | 5 votes |
@Override protected Tokenizer getSegmenter() { // 可以配置到word.local.conf // 保持标点符号 WordConfTools.set("keep.punctuation", "true"); // 保持空格 WordConfTools.set("keep.whitespace", "true"); Segmentation segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.FullSegmentation); WordTokenizer tokenizer = new WordTokenizer(segmentation); return tokenizer; }
Example 8
Source File: WordTokenizerTestCase.java From jstarcraft-nlp with Apache License 2.0 | 5 votes |
@Override protected NlpTokenizer<? extends NlpToken> getTokenizer() { // 保持标点符号 WordConfTools.set("keep.punctuation", "true"); // 保持空格 WordConfTools.set("keep.whitespace", "true"); Segmentation segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.FullSegmentation); return new WordTokenizer(segmentation); }
Example 9
Source File: ChineseWordAnalyzer.java From word with Apache License 2.0 | 5 votes |
/**
 * Creates an analyzer that uses the segmentation algorithm with the given name.
 * <p>
 * If resolving the name or obtaining the segmenter throws any exception, the
 * analyzer falls back to {@code BidirectionalMinimumMatching}.
 *
 * @param segmentationAlgorithm name of a {@code SegmentationAlgorithm} constant
 */
public ChineseWordAnalyzer(String segmentationAlgorithm) {
    try {
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    } catch (Exception ignored) {
        // Deliberate best-effort: any failure selects the default algorithm.
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
Example 10
Source File: TextSimilarity.java From word with Apache License 2.0 | 4 votes |
/**
 * Switches to the segmenter for the given algorithm and logs the algorithm's
 * description.
 *
 * @param segmentationAlgorithm the algorithm to use from now on
 */
public void setSegmentationAlgorithm(SegmentationAlgorithm segmentationAlgorithm) {
    this.segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
    LOGGER.info("设置分词算法为：" + segmentationAlgorithm.getDes());
}
Example 11
Source File: ChineseWordTokenizer.java From word with Apache License 2.0 | 4 votes |
/**
 * Creates a tokenizer backed by the segmenter for the given algorithm.
 *
 * @param segmentationAlgorithm the segmentation algorithm to use
 */
public ChineseWordTokenizer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
Example 12
Source File: ChineseWordTokenizer.java From word with Apache License 2.0 | 4 votes |
/**
 * Creates a tokenizer backed by the {@code BidirectionalMinimumMatching}
 * segmentation algorithm.
 */
public ChineseWordTokenizer() {
    this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
}
Example 13
Source File: ChineseWordAnalyzer.java From word with Apache License 2.0 | 4 votes |
/**
 * Creates an analyzer backed by the segmenter for the given algorithm.
 *
 * @param segmentationAlgorithm the segmentation algorithm to use
 */
public ChineseWordAnalyzer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
Example 14
Source File: Utils.java From word with Apache License 2.0 | 4 votes |
/** * * 对文件进行分词 * @param input 输入文件 * @param output 输出文件 * @param removeStopWords 是否移除停用词 * @param segmentationAlgorithm 分词算法 * @param fileSegmentationCallback 分词结果回调 * @throws Exception */ public static void seg(File input, File output, boolean removeStopWords, SegmentationAlgorithm segmentationAlgorithm, FileSegmentationCallback fileSegmentationCallback) throws Exception{ LOGGER.info("开始对文件进行分词:"+input.toString()); Segmentation segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm); float max=(float)Runtime.getRuntime().maxMemory()/1000000; float total=(float)Runtime.getRuntime().totalMemory()/1000000; float free=(float)Runtime.getRuntime().freeMemory()/1000000; String pre="执行之前剩余内存:"+max+"-"+total+"+"+free+"="+(max-total+free); //准备输出目录 if(!output.getParentFile().exists()){ output.getParentFile().mkdirs(); } try(BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(input),"utf-8")); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output),"utf-8"))){ long size = Files.size(input.toPath()); LOGGER.info("size:"+size); LOGGER.info("文件大小:"+(float)size/1024/1024+" MB"); int textLength=0; int progress=0; long start = System.currentTimeMillis(); String line = null; while((line = reader.readLine()) != null){ if("".equals(line.trim())){ writer.write("\n"); continue; } textLength += line.length(); List<Word> words = segmentation.seg(line); if(removeStopWords){ //停用词过滤 StopWord.filterStopWords(words); } if(words == null){ continue; } for(Word word : words){ if(fileSegmentationCallback != null) { fileSegmentationCallback.callback(word); } writer.write(word.getText()+" "); } writer.write("\n"); progress += line.length(); if( progress > 500000){ progress = 0; LOGGER.info("分词进度:"+(int)((float)textLength*2/size*100)+"%"); } } long cost = System.currentTimeMillis() - start; float rate = textLength/cost; LOGGER.info("字符数目:"+textLength); LOGGER.info("分词耗时:"+getTimeDes(cost)+" 毫秒"); 
LOGGER.info("分词速度:"+rate+" 字符/毫秒"); } max=(float)Runtime.getRuntime().maxMemory()/1000000; total=(float)Runtime.getRuntime().totalMemory()/1000000; free=(float)Runtime.getRuntime().freeMemory()/1000000; String post="执行之后剩余内存:"+max+"-"+total+"+"+free+"="+(max-total+free); LOGGER.info(pre); LOGGER.info(post); LOGGER.info("将文件 "+input.toString()+" 的分词结果保存到文件 "+output); }
Example 15
Source File: WordTokenizer.java From jstarcraft-nlp with Apache License 2.0 | 4 votes |
/**
 * Creates a tokenizer backed by the {@code BidirectionalMinimumMatching}
 * segmentation algorithm.
 */
public WordTokenizer() {
    this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
}
Example 16
Source File: WordAnalyzer.java From jstarcraft-nlp with Apache License 2.0 | 4 votes |
/**
 * Creates an analyzer backed by the segmenter for the given algorithm.
 *
 * @param segmentationAlgorithm the segmentation algorithm to use
 */
public WordAnalyzer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
Example 17
Source File: WordAnalyzer.java From jstarcraft-nlp with Apache License 2.0 | 4 votes |
/**
 * Creates an analyzer backed by the {@code BidirectionalMinimumMatching}
 * segmentation algorithm.
 */
public WordAnalyzer() {
    segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
}
Example 18
Source File: WordTokenizer.java From jstarcraft-nlp with Apache License 2.0 | 4 votes |
/**
 * Creates a tokenizer backed by the segmenter for the given algorithm.
 *
 * @param segmentationAlgorithm the segmentation algorithm to use
 */
public WordTokenizer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
Example 19
Source File: WordFrequencyStatistics.java From word with Apache License 2.0 | 2 votes |
/**
 * Sets the segmentation algorithm.
 *
 * @param segmentationAlgorithm the segmentation algorithm to use
 */
public void setSegmentationAlgorithm(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
Example 20
Source File: WordFrequencyStatistics.java From word with Apache License 2.0 | 2 votes |
/** * 构造函数 * @param resultPath 词频统计结果保存路径 * @param segmentationAlgorithm 分词算法,要符合 org.apdplat.word.segmentation.SegmentationAlgorithm 中的定义 */ public WordFrequencyStatistics(String resultPath, String segmentationAlgorithm){ this.resultPath = resultPath; this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm)); }