org.apdplat.word.segmentation.SegmentationFactory#getSegmentation

Source File: TextSimilarity.java From word with Apache License 2.0

6 votes

/**
 * Segments the given text into words.
 *
 * <p>The segmentation instance is created on first use with the
 * {@code MaxNgramScore} algorithm when none has been set yet. When
 * {@code filterStopWord} is enabled the segmented list is passed to
 * {@code StopWord.filterStopWords} before being returned.
 *
 * @param text the text to segment; may be {@code null}
 * @return the list returned by the segmenter, or an empty list when
 *         {@code text} is {@code null}
 */
private List<Word> seg(String text){
    if(null == text){
        return Collections.emptyList();
    }
    // Lazy initialisation: only build the segmenter once it is actually needed.
    if(null == segmentation){
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.MaxNgramScore);
    }
    final List<Word> segmented = segmentation.seg(text);
    if(!filterStopWord){
        return segmented;
    }
    // Stop-word filtering; the return value of the helper is not used here,
    // so it presumably filters the list in place (confirm in StopWord).
    StopWord.filterStopWords(segmented);
    return segmented;
}

Source File: WordTokenizer.java From jstarcraft-nlp with Apache License 2.0

5 votes

/**
 * Creates a tokenizer for the segmentation algorithm with the given name.
 *
 * <p>If resolving the name or creating the segmentation throws any
 * {@link Exception}, the tokenizer falls back to
 * {@code BidirectionalMinimumMatching}.
 *
 * @param segmentationAlgorithm name of a {@code SegmentationAlgorithm} constant
 */
public WordTokenizer(String segmentationAlgorithm) {
    try {
        this.segmentation = SegmentationFactory.getSegmentation(
                SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    } catch (Exception ignored) {
        // Best effort: an unusable name silently selects the default algorithm.
        this.segmentation = SegmentationFactory.getSegmentation(
                SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}

Source File: ChineseWordTokenizerFactory.java From word with Apache License 2.0

5 votes

/**
 * Creates the factory from its argument map.
 *
 * <p>Two optional arguments are read when {@code args} is not {@code null}:
 * <ul>
 *   <li>{@code conf} - passed to {@code WordConfTools.forceOverride};</li>
 *   <li>{@code segAlgorithm} - name of a {@code SegmentationAlgorithm}
 *       constant used to create the segmentation.</li>
 * </ul>
 * An unusable {@code segAlgorithm} value is logged together with the list of
 * valid names. Whenever no segmentation has been set by the end of the
 * constructor, {@code BidirectionalMaximumMatching} is used.
 *
 * @param args factory arguments; may be {@code null}
 */
public ChineseWordTokenizerFactory(Map<String, String> args){
    super(args);
    if(args != null){
        String conf = args.get("conf");
        if(conf == null){
            LOGGER.info("没有指定conf参数");
        }else{
            // Force the supplied configuration over the defaults.
            WordConfTools.forceOverride(conf);
        }
        String algorithm = args.get("segAlgorithm");
        if(algorithm == null){
            LOGGER.info("没有指定segAlgorithm参数");
        }else{
            try{
                segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(algorithm));
                LOGGER.info("使用指定分词算法："+algorithm);
            }catch(Exception e){
                // Report the bad value and enumerate every accepted name.
                LOGGER.error("参数segAlgorithm指定的值错误："+algorithm);
                LOGGER.error("参数segAlgorithm可指定的值有：");
                for(SegmentationAlgorithm candidate : SegmentationAlgorithm.values()){
                    LOGGER.error("\t"+candidate.name());
                }
            }
        }
    }
    // Nothing usable was configured: fall back to the default algorithm.
    if(segmentation == null){
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching);
        LOGGER.info("使用默认分词算法："+SegmentationAlgorithm.BidirectionalMaximumMatching);
    }
}

Source File: ChineseWordTokenizer.java From word with Apache License 2.0

5 votes

/**
 * Creates a tokenizer for the segmentation algorithm with the given name.
 *
 * <p>Any {@link Exception} raised while resolving the name or creating the
 * segmentation results in {@code BidirectionalMinimumMatching} being used.
 *
 * @param segmentationAlgorithm name of a {@code SegmentationAlgorithm} constant
 */
public ChineseWordTokenizer(String segmentationAlgorithm) {
    try{
        this.segmentation = SegmentationFactory.getSegmentation(
                SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    }catch(Exception ignored){
        // Deliberate fallback to the default algorithm.
        this.segmentation = SegmentationFactory.getSegmentation(
                SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}

Source File: WordAnalyzer.java From jstarcraft-nlp with Apache License 2.0

5 votes

/**
 * Creates an analyzer for the segmentation algorithm with the given name.
 *
 * <p>When the name cannot be resolved, or the segmentation cannot be created
 * (any {@link Exception}), {@code BidirectionalMinimumMatching} is used.
 *
 * @param segmentationAlgorithm name of a {@code SegmentationAlgorithm} constant
 */
public WordAnalyzer(String segmentationAlgorithm) {
    try {
        this.segmentation = SegmentationFactory.getSegmentation(
                SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    } catch (Exception ignored) {
        // Fall back to the default algorithm rather than failing construction.
        this.segmentation = SegmentationFactory.getSegmentation(
                SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}

Source File: WordSegmentFactory.java From jstarcraft-nlp with Apache License 2.0

5 votes

/**
 * Builds a segmentation from the given configuration.
 *
 * <p>Every entry of {@code configurations} is first forwarded to
 * {@code WordConfTools.set}. The algorithm name is then looked up through
 * {@code get(configurations, "algorithm", "FullSegmentation")} (presumably
 * the configured value with {@code "FullSegmentation"} as default - confirm
 * against the helper) and resolved with {@code SegmentationAlgorithm.valueOf}.
 *
 * @param configurations key/value settings; must not be {@code null}
 * @return the segmentation created for the selected algorithm
 */
@Override
public Segmentation build(Map<String, String> configurations) {
    // Push every setting into the word configuration before creating the segmenter.
    for (Entry<String, String> setting : configurations.entrySet()) {
        WordConfTools.set(setting.getKey(), setting.getValue());
    }

    SegmentationAlgorithm selected = SegmentationAlgorithm.valueOf(get(configurations, "algorithm", "FullSegmentation"));
    return SegmentationFactory.getSegmentation(selected);
}

Source File: WordSegmenterTestCase.java From jstarcraft-nlp with Apache License 2.0

5 votes

/**
 * Supplies the tokenizer under test: a {@code WordTokenizer} backed by the
 * {@code FullSegmentation} algorithm, with punctuation and whitespace kept.
 *
 * @return a new tokenizer instance
 */
@Override
protected Tokenizer getSegmenter() {
    // These settings could alternatively live in word.local.conf.
    WordConfTools.set("keep.punctuation", "true"); // keep punctuation
    WordConfTools.set("keep.whitespace", "true");  // keep whitespace
    Segmentation fullSegmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.FullSegmentation);
    return new WordTokenizer(fullSegmentation);
}

Source File: WordTokenizerTestCase.java From jstarcraft-nlp with Apache License 2.0

5 votes

/**
 * Supplies the tokenizer under test: a {@code WordTokenizer} backed by the
 * {@code FullSegmentation} algorithm, with punctuation and whitespace kept.
 *
 * @return a new tokenizer instance
 */
@Override
protected NlpTokenizer<? extends NlpToken> getTokenizer() {
    WordConfTools.set("keep.punctuation", "true"); // keep punctuation
    WordConfTools.set("keep.whitespace", "true");  // keep whitespace
    Segmentation fullSegmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.FullSegmentation);
    WordTokenizer tokenizer = new WordTokenizer(fullSegmentation);
    return tokenizer;
}

Source File: ChineseWordAnalyzer.java From word with Apache License 2.0

5 votes

/**
 * Creates an analyzer for the segmentation algorithm with the given name.
 *
 * <p>Any {@link Exception} raised while resolving the name or creating the
 * segmentation results in {@code BidirectionalMinimumMatching} being used.
 *
 * @param segmentationAlgorithm name of a {@code SegmentationAlgorithm} constant
 */
public ChineseWordAnalyzer(String segmentationAlgorithm) {
    try{
        this.segmentation = SegmentationFactory.getSegmentation(
                SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    }catch(Exception ignored){
        // Deliberate fallback to the default algorithm.
        this.segmentation = SegmentationFactory.getSegmentation(
                SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}

Source File: TextSimilarity.java From word with Apache License 2.0

4 votes

/**
 * Replaces the segmentation with one created for the given algorithm and
 * logs the algorithm's description.
 *
 * @param segmentationAlgorithm the algorithm to switch to
 */
public void setSegmentationAlgorithm(SegmentationAlgorithm segmentationAlgorithm){
    this.segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
    String message = "设置分词算法为："+segmentationAlgorithm.getDes();
    LOGGER.info(message);
}

Source File: ChineseWordTokenizer.java From word with Apache License 2.0

4 votes

/**
 * Creates a tokenizer that uses the given segmentation algorithm.
 *
 * @param segmentationAlgorithm the algorithm handed to {@code SegmentationFactory}
 */
public ChineseWordTokenizer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}

Source File: ChineseWordTokenizer.java From word with Apache License 2.0

4 votes

/**
 * Creates a tokenizer that uses the {@code BidirectionalMinimumMatching}
 * segmentation algorithm.
 */
public ChineseWordTokenizer() {
    this.segmentation = SegmentationFactory.getSegmentation(
            SegmentationAlgorithm.BidirectionalMinimumMatching);
}

Source File: ChineseWordAnalyzer.java From word with Apache License 2.0

4 votes

/**
 * Creates an analyzer that uses the given segmentation algorithm.
 *
 * @param segmentationAlgorithm the algorithm handed to {@code SegmentationFactory}
 */
public ChineseWordAnalyzer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}

Source File: Utils.java From word with Apache License 2.0

4 votes

/**
 * Segments a text file line by line and writes the result to another file.
 *
 * <p>Each non-blank input line produces one output line containing the text
 * of every word followed by a single space. Input lines that are blank after
 * trimming are written as empty lines. Lines for which the segmenter returns
 * {@code null} produce no output. Both files are read and written as UTF-8.
 * Memory figures before and after the run, the character count, the elapsed
 * time and the throughput are logged.
 *
 * @param input the file to segment
 * @param output the file that receives the segmented text; its parent
 *        directory is created when it does not exist
 * @param removeStopWords whether stop words are filtered from each line
 * @param segmentationAlgorithm the segmentation algorithm to use
 * @param fileSegmentationCallback invoked once for every word written;
 *        may be {@code null}
 * @throws Exception if reading, writing or segmentation fails
 */
public static void seg(File input, File output, boolean removeStopWords, SegmentationAlgorithm segmentationAlgorithm, FileSegmentationCallback fileSegmentationCallback) throws Exception{
    LOGGER.info("开始对文件进行分词："+input.toString());
    Segmentation segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
    float max=(float)Runtime.getRuntime().maxMemory()/1000000;
    float total=(float)Runtime.getRuntime().totalMemory()/1000000;
    float free=(float)Runtime.getRuntime().freeMemory()/1000000;
    String pre="执行之前剩余内存:"+max+"-"+total+"+"+free+"="+(max-total+free);
    // Prepare the output directory. getParentFile() returns null for a bare
    // file name (no directory component), in which case there is nothing to create.
    File outputDir = output.getParentFile();
    if(outputDir != null && !outputDir.exists()){
        outputDir.mkdirs();
    }
    try(BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(input),"utf-8"));
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output),"utf-8"))){
        long size = Files.size(input.toPath());
        LOGGER.info("size:"+size);
        LOGGER.info("文件大小："+(float)size/1024/1024+" MB");
        // long, so the running character count cannot overflow on very large files.
        long textLength=0;
        int progress=0;
        long start = System.currentTimeMillis();
        String line = null;
        while((line = reader.readLine()) != null){
            if("".equals(line.trim())){
                writer.write("\n");
                continue;
            }
            textLength += line.length();
            List<Word> words = segmentation.seg(line);
            // Check for null before the list is handed to any other code.
            if(words == null){
                continue;
            }
            if(removeStopWords){
                // Stop-word filtering
                StopWord.filterStopWords(words);
            }
            for(Word word : words){
                if(fileSegmentationCallback != null) {
                    fileSegmentationCallback.callback(word);
                }
                writer.write(word.getText()+" ");
            }
            writer.write("\n");
            progress += line.length();
            // Report progress roughly every 500000 characters.
            if( progress > 500000){
                progress = 0;
                // NOTE(review): the factor 2 looks like an assumed bytes-per-character
                // ratio used to compare a character count with a byte size - confirm.
                LOGGER.info("分词进度："+(int)((float)textLength*2/size*100)+"%");
            }
        }
        long cost = System.currentTimeMillis() - start;
        // Floating-point division, with the elapsed time clamped to 1 ms so that a
        // run finishing within the clock resolution does not divide by zero.
        float rate = (float)textLength/Math.max(cost, 1L);
        LOGGER.info("字符数目："+textLength);
        LOGGER.info("分词耗时："+getTimeDes(cost)+" 毫秒");
        LOGGER.info("分词速度："+rate+" 字符/毫秒");
    }
    max=(float)Runtime.getRuntime().maxMemory()/1000000;
    total=(float)Runtime.getRuntime().totalMemory()/1000000;
    free=(float)Runtime.getRuntime().freeMemory()/1000000;
    String post="执行之后剩余内存:"+max+"-"+total+"+"+free+"="+(max-total+free);
    LOGGER.info(pre);
    LOGGER.info(post);
    LOGGER.info("将文件 "+input.toString()+" 的分词结果保存到文件 "+output);
}

Source File: WordTokenizer.java From jstarcraft-nlp with Apache License 2.0

4 votes

/**
 * Creates a tokenizer that uses the {@code BidirectionalMinimumMatching}
 * segmentation algorithm.
 */
public WordTokenizer() {
    this.segmentation = SegmentationFactory.getSegmentation(
            SegmentationAlgorithm.BidirectionalMinimumMatching);
}

Source File: WordAnalyzer.java From jstarcraft-nlp with Apache License 2.0

4 votes

/**
 * Creates an analyzer that uses the given segmentation algorithm.
 *
 * @param segmentationAlgorithm the algorithm handed to {@code SegmentationFactory}
 */
public WordAnalyzer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}

Source File: WordAnalyzer.java From jstarcraft-nlp with Apache License 2.0

4 votes

/**
 * Creates an analyzer that uses the {@code BidirectionalMinimumMatching}
 * segmentation algorithm.
 */
public WordAnalyzer() {
    segmentation = SegmentationFactory.getSegmentation(
            SegmentationAlgorithm.BidirectionalMinimumMatching);
}

Source File: WordTokenizer.java From jstarcraft-nlp with Apache License 2.0

4 votes

/**
 * Creates a tokenizer that uses the given segmentation algorithm.
 *
 * @param segmentationAlgorithm the algorithm handed to {@code SegmentationFactory}
 */
public WordTokenizer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}

Source File: WordFrequencyStatistics.java From word with Apache License 2.0

2 votes

/**
 * Sets the segmentation algorithm.
 *
 * <p>The segmentation currently held is replaced by the one
 * {@code SegmentationFactory} returns for the given algorithm.
 *
 * @param segmentationAlgorithm the segmentation algorithm to use
 */
public void setSegmentationAlgorithm(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}

Source File: WordFrequencyStatistics.java From word with Apache License 2.0

2 votes

/**
 * 构造函数
 * @param resultPath 词频统计结果保存路径
 * @param segmentationAlgorithm 分词算法，要符合 org.apdplat.word.segmentation.SegmentationAlgorithm 中的定义
 */
public WordFrequencyStatistics(String resultPath, String segmentationAlgorithm){
    this.resultPath = resultPath;
    this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
}

Java Code Examples for org.apdplat.word.segmentation.SegmentationFactory#getSegmentation()