Java Code Examples for org.apdplat.word.segmentation.SegmentationFactory#getSegmentation()

The following examples show how to use org.apdplat.word.segmentation.SegmentationFactory#getSegmentation(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TextSimilarity.java    From word with Apache License 2.0 6 votes vote down vote up
/**
 * Segments the given text into words.
 * <p>
 * The segmenter is created on first use with the {@code MaxNgramScore} algorithm.
 * When {@code filterStopWord} is set, the result list is handed to
 * {@code StopWord.filterStopWords} before it is returned.
 *
 * @param text the text to segment; may be {@code null}
 * @return the segmentation result, or an empty list when {@code text} is {@code null}
 */
private List<Word> seg(String text){
    if(text == null){
        return Collections.emptyList();
    }
    // Lazy initialisation: build the segmenter only when it is first needed
    if(segmentation == null){
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.MaxNgramScore);
    }
    List<Word> result = segmentation.seg(text);
    if(!filterStopWord){
        return result;
    }
    // Stop-word filtering
    StopWord.filterStopWords(result);
    return result;
}
 
Example 2
Source File: WordTokenizer.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a tokenizer for the segmentation algorithm with the given name.
 * <p>
 * If resolving the name or obtaining the segmenter throws any {@code Exception},
 * the tokenizer falls back to {@code BidirectionalMinimumMatching}.
 *
 * @param segmentationAlgorithm name of a {@code SegmentationAlgorithm} constant
 */
public WordTokenizer(String segmentationAlgorithm) {
    try {
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    } catch (Exception ignored) {
        // Best effort: any failure selects the default algorithm instead
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
 
Example 3
Source File: ChineseWordTokenizerFactory.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the tokenizer factory from its configuration arguments.
 * <p>
 * Two optional arguments are read: {@code "conf"}, which is passed to
 * {@code WordConfTools.forceOverride}, and {@code "segAlgorithm"}, the name of the
 * {@code SegmentationAlgorithm} to use. When no segmenter has been set by the end
 * (argument missing, or resolving it threw), {@code BidirectionalMaximumMatching}
 * is used.
 *
 * @param args factory arguments; may be {@code null}
 */
public ChineseWordTokenizerFactory(Map<String, String> args){
    super(args);
    if(args != null){
        String override = args.get("conf");
        if(override == null){
            LOGGER.info("没有指定conf参数");
        }else{
            // Force-override the default configuration
            WordConfTools.forceOverride(override);
        }
        String algorithmName = args.get("segAlgorithm");
        if(algorithmName == null){
            LOGGER.info("没有指定segAlgorithm参数");
        }else{
            try{
                segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(algorithmName));
                LOGGER.info("使用指定分词算法:"+algorithmName);
            }catch(Exception e){
                // Report the rejected value together with every accepted one
                LOGGER.error("参数segAlgorithm指定的值错误:"+algorithmName);
                LOGGER.error("参数segAlgorithm可指定的值有:");
                for(SegmentationAlgorithm candidate : SegmentationAlgorithm.values()){
                    LOGGER.error("\t"+candidate.name());
                }
            }
        }
    }
    if(segmentation == null){
        // Nothing usable was configured: use the default algorithm
        SegmentationAlgorithm fallback = SegmentationAlgorithm.BidirectionalMaximumMatching;
        segmentation = SegmentationFactory.getSegmentation(fallback);
        LOGGER.info("使用默认分词算法:"+fallback);
    }
}
 
Example 4
Source File: ChineseWordTokenizer.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a tokenizer for the segmentation algorithm with the given name.
 * <p>
 * If resolving the name or obtaining the segmenter throws any {@code Exception},
 * the tokenizer falls back to {@code BidirectionalMinimumMatching}.
 *
 * @param segmentationAlgorithm name of a {@code SegmentationAlgorithm} constant
 */
public ChineseWordTokenizer(String segmentationAlgorithm) {
    try{
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    }catch(Exception ignored){
        // Best effort: any failure selects the default algorithm instead
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
 
Example 5
Source File: WordAnalyzer.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an analyzer for the segmentation algorithm with the given name.
 * <p>
 * If resolving the name or obtaining the segmenter throws any {@code Exception},
 * the analyzer falls back to {@code BidirectionalMinimumMatching}.
 *
 * @param segmentationAlgorithm name of a {@code SegmentationAlgorithm} constant
 */
public WordAnalyzer(String segmentationAlgorithm) {
    try {
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    } catch (Exception ignored) {
        // Best effort: any failure selects the default algorithm instead
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
 
Example 6
Source File: WordSegmentFactory.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a segmenter from the supplied configuration.
 * <p>
 * Every entry is first pushed into {@code WordConfTools}; the algorithm name is then
 * looked up under the {@code "algorithm"} key, with {@code "FullSegmentation"} passed
 * as the default.
 *
 * @param configurations configuration key/value pairs
 * @return the segmenter obtained from {@code SegmentationFactory}
 */
@Override
public Segmentation build(Map<String, String> configurations) {
    for (Entry<String, String> setting : configurations.entrySet()) {
        WordConfTools.set(setting.getKey(), setting.getValue());
    }

    String algorithmName = get(configurations, "algorithm", "FullSegmentation");
    SegmentationAlgorithm selected = SegmentationAlgorithm.valueOf(algorithmName);
    return SegmentationFactory.getSegmentation(selected);
}
 
Example 7
Source File: WordSegmenterTestCase.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Supplies the tokenizer under test: a {@code WordTokenizer} wrapping a
 * {@code FullSegmentation} segmenter, with punctuation and whitespace retention
 * switched on beforehand.
 *
 * @return the tokenizer to test
 */
@Override
protected Tokenizer getSegmenter() {
    // These settings could also be placed in word.local.conf
    // Keep punctuation
    WordConfTools.set("keep.punctuation", "true");
    // Keep whitespace
    WordConfTools.set("keep.whitespace", "true");
    return new WordTokenizer(SegmentationFactory.getSegmentation(SegmentationAlgorithm.FullSegmentation));
}
 
Example 8
Source File: WordTokenizerTestCase.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Supplies the tokenizer under test: a {@code WordTokenizer} wrapping a
 * {@code FullSegmentation} segmenter, with punctuation and whitespace retention
 * switched on beforehand.
 *
 * @return the tokenizer to test
 */
@Override
protected NlpTokenizer<? extends NlpToken> getTokenizer() {
    // Keep punctuation
    WordConfTools.set("keep.punctuation", "true");
    // Keep whitespace
    WordConfTools.set("keep.whitespace", "true");
    WordTokenizer tokenizer = new WordTokenizer(SegmentationFactory.getSegmentation(SegmentationAlgorithm.FullSegmentation));
    return tokenizer;
}
 
Example 9
Source File: ChineseWordAnalyzer.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an analyzer for the segmentation algorithm with the given name.
 * <p>
 * If resolving the name or obtaining the segmenter throws any {@code Exception},
 * the analyzer falls back to {@code BidirectionalMinimumMatching}.
 *
 * @param segmentationAlgorithm name of a {@code SegmentationAlgorithm} constant
 */
public ChineseWordAnalyzer(String segmentationAlgorithm) {
    try{
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    }catch(Exception ignored){
        // Best effort: any failure selects the default algorithm instead
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
 
Example 10
Source File: TextSimilarity.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * Replaces the current segmenter with one for the given algorithm and logs
 * the algorithm's description.
 *
 * @param segmentationAlgorithm the segmentation algorithm to switch to
 */
public void setSegmentationAlgorithm(SegmentationAlgorithm segmentationAlgorithm){
    this.segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
    String message = "设置分词算法为:"+segmentationAlgorithm.getDes();
    LOGGER.info(message);
}
 
Example 11
Source File: ChineseWordTokenizer.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a tokenizer backed by the segmenter for the given algorithm.
 *
 * @param segmentationAlgorithm the segmentation algorithm to use
 */
public ChineseWordTokenizer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
 
Example 12
Source File: ChineseWordTokenizer.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a tokenizer that uses the {@code BidirectionalMinimumMatching} algorithm.
 */
public ChineseWordTokenizer() {
    SegmentationAlgorithm defaultAlgorithm = SegmentationAlgorithm.BidirectionalMinimumMatching;
    this.segmentation = SegmentationFactory.getSegmentation(defaultAlgorithm);
}
 
Example 13
Source File: ChineseWordAnalyzer.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * Creates an analyzer backed by the segmenter for the given algorithm.
 *
 * @param segmentationAlgorithm the segmentation algorithm to use
 */
public ChineseWordAnalyzer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
 
Example 14
Source File: Utils.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * Segments a text file line by line and writes the words to an output file.
 * <p>
 * Each non-blank input line produces one output line in which every word is followed
 * by a single space; blank input lines are written as empty lines. Both files are
 * read and written as UTF-8. Memory figures before and after the run, the elapsed
 * time and the throughput are logged.
 *
 * @param input input file
 * @param output output file; a missing parent directory is created
 * @param removeStopWords whether to pass each line's words through the stop-word filter
 * @param segmentationAlgorithm segmentation algorithm
 * @param fileSegmentationCallback callback invoked for every word before it is written;
 *                                 may be {@code null}
 * @throws Exception if reading, segmenting or writing fails
 */
public static void seg(File input, File output, boolean removeStopWords, SegmentationAlgorithm segmentationAlgorithm, FileSegmentationCallback fileSegmentationCallback) throws Exception{
    LOGGER.info("开始对文件进行分词:"+input.toString());
    Segmentation segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
    float max=(float)Runtime.getRuntime().maxMemory()/1000000;
    float total=(float)Runtime.getRuntime().totalMemory()/1000000;
    float free=(float)Runtime.getRuntime().freeMemory()/1000000;
    String pre="执行之前剩余内存:"+max+"-"+total+"+"+free+"="+(max-total+free);
    // Prepare the output directory. getParentFile() returns null when the path
    // has no parent component (a bare file name), so guard against that.
    File outputDir = output.getParentFile();
    if(outputDir != null && !outputDir.exists()){
        outputDir.mkdirs();
    }
    try(BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(input),"utf-8"));
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output),"utf-8"))){
        long size = Files.size(input.toPath());
        LOGGER.info("size:"+size);
        LOGGER.info("文件大小:"+(float)size/1024/1024+" MB");
        int textLength=0;
        int progress=0;
        long start = System.currentTimeMillis();
        String line = null;
        while((line = reader.readLine()) != null){
            // Blank lines are preserved as empty lines in the output
            if("".equals(line.trim())){
                writer.write("\n");
                continue;
            }
            textLength += line.length();
            List<Word> words = segmentation.seg(line);
            // Check for null first so a null result never reaches the stop-word filter
            if(words == null){
                continue;
            }
            if(removeStopWords){
                // Stop-word filtering
                StopWord.filterStopWords(words);
            }
            for(Word word : words){
                if(fileSegmentationCallback != null) {
                    fileSegmentationCallback.callback(word);
                }
                writer.write(word.getText()+" ");
            }
            writer.write("\n");
            progress += line.length();
            // Report progress roughly every 500000 characters
            if( progress > 500000){
                progress = 0;
                // NOTE(review): the factor 2 presumably approximates bytes per character — confirm
                LOGGER.info("分词进度:"+(int)((float)textLength*2/size*100)+"%");
            }
        }
        long cost = System.currentTimeMillis() - start;
        // Floating-point division; elapsed time is clamped to 1 ms so a run that
        // finishes within the clock resolution cannot divide by zero.
        float rate = (float)textLength/Math.max(cost, 1L);
        LOGGER.info("字符数目:"+textLength);
        LOGGER.info("分词耗时:"+getTimeDes(cost)+" 毫秒");
        LOGGER.info("分词速度:"+rate+" 字符/毫秒");
    }
    max=(float)Runtime.getRuntime().maxMemory()/1000000;
    total=(float)Runtime.getRuntime().totalMemory()/1000000;
    free=(float)Runtime.getRuntime().freeMemory()/1000000;
    String post="执行之后剩余内存:"+max+"-"+total+"+"+free+"="+(max-total+free);
    LOGGER.info(pre);
    LOGGER.info(post);
    LOGGER.info("将文件 "+input.toString()+" 的分词结果保存到文件 "+output);
}
 
Example 15
Source File: WordTokenizer.java    From jstarcraft-nlp with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a tokenizer that uses the {@code BidirectionalMinimumMatching} algorithm.
 */
public WordTokenizer() {
    SegmentationAlgorithm defaultAlgorithm = SegmentationAlgorithm.BidirectionalMinimumMatching;
    this.segmentation = SegmentationFactory.getSegmentation(defaultAlgorithm);
}
 
Example 16
Source File: WordAnalyzer.java    From jstarcraft-nlp with Apache License 2.0 4 votes vote down vote up
/**
 * Creates an analyzer backed by the segmenter for the given algorithm.
 *
 * @param segmentationAlgorithm the segmentation algorithm to use
 */
public WordAnalyzer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
 
Example 17
Source File: WordAnalyzer.java    From jstarcraft-nlp with Apache License 2.0 4 votes vote down vote up
/**
 * Creates an analyzer that uses the {@code BidirectionalMinimumMatching} algorithm.
 */
public WordAnalyzer() {
    SegmentationAlgorithm defaultAlgorithm = SegmentationAlgorithm.BidirectionalMinimumMatching;
    this.segmentation = SegmentationFactory.getSegmentation(defaultAlgorithm);
}
 
Example 18
Source File: WordTokenizer.java    From jstarcraft-nlp with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a tokenizer backed by the segmenter for the given algorithm.
 *
 * @param segmentationAlgorithm the segmentation algorithm to use
 */
public WordTokenizer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
 
Example 19
Source File: WordFrequencyStatistics.java    From word with Apache License 2.0 2 votes vote down vote up
/**
 * Sets the segmentation algorithm by replacing the current segmenter with
 * the one obtained for the given algorithm.
 *
 * @param segmentationAlgorithm the segmentation algorithm
 */
public void setSegmentationAlgorithm(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
 
Example 20
Source File: WordFrequencyStatistics.java    From word with Apache License 2.0 2 votes vote down vote up
/**
 * 构造函数
 * @param resultPath 词频统计结果保存路径
 * @param segmentationAlgorithm 分词算法,要符合 org.apdplat.word.segmentation.SegmentationAlgorithm 中的定义
 */
public WordFrequencyStatistics(String resultPath, String segmentationAlgorithm){
    this.resultPath = resultPath;
    this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
}