Search in sources :

Example 1 with StopRecognition

use of org.ansj.recognition.impl.StopRecognition in project ansj_seg by NLPchina.

In the class AppTest, method main.

/**
 * Demo entry point: builds a stop-word filter from an in-memory word list and
 * runs the same sample sentence through each analyzer type.
 *
 * @param args unused
 * @throws IOException if reading the in-memory stop-word list fails
 */
public static void main(String[] args) throws IOException {
    // One stop word per line.
    String stopDicStr = "6\n7\n龙";
    StopRecognition testFilter = new StopRecognition();
    BufferedReader br = new BufferedReader(new StringReader(stopDicStr));
    String temp = null;
    while ((temp = br.readLine()) != null) {
        testFilter.insertStopWords(temp);
    }
    List<StopRecognition> filters = new ArrayList<StopRecognition>();
    filters.add(testFilter);

    String text = "龙虎胶囊 6 * 7cm";
    // A StringReader is a single-pass stream: once one analyzer has consumed it,
    // any later analyzer sharing the same instance would see end-of-stream.
    // Give every analyzer its own reader over the same text.
    parse(new IndexAnalysis(new StringReader(text)), filters);
    parse(new ToAnalysis(new StringReader(text)), filters);
    parse(new DicAnalysis(new StringReader(text)), filters);
    parse(new NlpAnalysis(new StringReader(text)), filters);
    parse(new BaseAnalysis(new StringReader(text)), filters);
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) BufferedReader(java.io.BufferedReader) StringReader(java.io.StringReader) ArrayList(java.util.ArrayList) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis)

Example 2 with StopRecognition

use of org.ansj.recognition.impl.StopRecognition in project ansj_seg by NLPchina.

In the class AnsjAnalyzer, method getTokenizer.

/**
 * Builds a tokenizer configured from a string-keyed option map.
 *
 * <p>Options read from {@code args}:
 * <ul>
 *   <li>{@code "type"} - name of an {@code AnsjAnalyzer.TYPE} constant; defaults to
 *       {@code base_ansj} when absent</li>
 *   <li>{@code CrfLibrary.DEFAULT} - CRF model key (only read for {@code nlp_ansj})</li>
 *   <li>{@code DicLibrary.DEFAULT} - comma-separated user dictionary keys</li>
 *   <li>{@code StopLibrary.DEFAULT} - comma-separated stop-word library keys</li>
 *   <li>{@code SynonymsLibrary.DEFAULT} - comma-separated synonym library keys</li>
 *   <li>{@code AmbiguityLibrary.DEFAULT} - ambiguity dictionary key</li>
 *   <li>{@code "isNameRecognition"}, {@code "isNumRecognition"},
 *       {@code "isQuantifierRecognition"}, {@code "isRealName"} - boolean switches
 *       parsed with {@link Boolean#valueOf(String)}</li>
 * </ul>
 *
 * @param reader text source passed to the analysis via {@code resetContent};
 *               skipped when {@code null}
 * @param args   option map described above; dereferenced unconditionally, so it
 *               must not be {@code null}
 * @return a new {@code AnsjTokenizer} wrapping the configured analysis; the filter
 *         and synonym lists passed to it are {@code null} when the corresponding
 *         option is blank
 * @throws IllegalArgumentException if {@code "type"} is not the name of a
 *         {@code TYPE} constant (raised by {@code TYPE.valueOf})
 */
public static Tokenizer getTokenizer(Reader reader, Map<String, String> args) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("to create tokenizer " + args);
    }
    Analysis analysis = null;
    String temp = null;
    String type = args.get("type");
    if (type == null) {
        type = AnsjAnalyzer.TYPE.base_ansj.name();
    }
    switch(AnsjAnalyzer.TYPE.valueOf(type)) {
        case base_ansj:
            analysis = new BaseAnalysis();
            break;
        case index_ansj:
            analysis = new IndexAnalysis();
            break;
        case dic_ansj:
            analysis = new DicAnalysis();
            break;
        case query_ansj:
            analysis = new ToAnalysis();
            break;
        case nlp_ansj:
            analysis = new NlpAnalysis();
            if (StringUtil.isNotBlank(temp = args.get(CrfLibrary.DEFAULT))) {
                // Trim the key, as is done for the other library lookups below.
                ((NlpAnalysis) analysis).setCrfModel(CrfLibrary.get(temp.trim()));
            }
            break;
        default:
            analysis = new BaseAnalysis();
    }
    if (reader != null) {
        analysis.resetContent(reader);
    }
    if (StringUtil.isNotBlank(temp = args.get(DicLibrary.DEFAULT))) {
        // User-defined dictionaries.
        String[] split = temp.split(",");
        Forest[] forests = new Forest[split.length];
        for (int i = 0; i < forests.length; i++) {
            if (StringUtil.isBlank(split[i])) {
                // A blank entry leaves its slot in the array as null.
                continue;
            }
            // Trim so that "a, b" resolves "b", matching the stop/synonym handling.
            forests[i] = DicLibrary.get(split[i].trim());
        }
        analysis.setForests(forests);
    }
    List<StopRecognition> filters = null;
    if (StringUtil.isNotBlank(temp = args.get(StopLibrary.DEFAULT))) {
        // Stop-word libraries; keys that resolve to null are skipped.
        String[] split = temp.split(",");
        filters = new ArrayList<StopRecognition>();
        for (String key : split) {
            StopRecognition stop = StopLibrary.get(key.trim());
            if (stop != null)
                filters.add(stop);
        }
    }
    List<SynonymsRecgnition> synonyms = null;
    if (StringUtil.isNotBlank(temp = args.get(SynonymsLibrary.DEFAULT))) {
        // Synonym dictionaries; keys that resolve to null are skipped.
        String[] split = temp.split(",");
        synonyms = new ArrayList<SynonymsRecgnition>();
        for (String key : split) {
            SmartForest<List<String>> sf = SynonymsLibrary.get(key.trim());
            if (sf != null)
                synonyms.add(new SynonymsRecgnition(sf));
        }
    }
    if (StringUtil.isNotBlank(temp = args.get(AmbiguityLibrary.DEFAULT))) {
        // Ambiguity dictionary.
        analysis.setAmbiguityForest(AmbiguityLibrary.get(temp.trim()));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNameRecognition"))) {
        // Whether person-name recognition is enabled.
        analysis.setIsNameRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNumRecognition"))) {
        // Whether number recognition is enabled.
        analysis.setIsNumRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isQuantifierRecognition"))) {
        // Whether quantifier recognition is enabled.
        analysis.setIsQuantifierRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isRealName"))) {
        // Whether the original characters are preserved.
        analysis.setIsRealName(Boolean.valueOf(temp));
    }
    return new AnsjTokenizer(analysis, filters, synonyms);
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) SynonymsRecgnition(org.ansj.recognition.impl.SynonymsRecgnition) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) Analysis(org.ansj.splitWord.Analysis) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) Forest(org.nlpcn.commons.lang.tire.domain.Forest) SmartForest(org.nlpcn.commons.lang.tire.domain.SmartForest) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) ArrayList(java.util.ArrayList) List(java.util.List) AnsjTokenizer(org.ansj.lucene.util.AnsjTokenizer)

Example 3 with StopRecognition

use of org.ansj.recognition.impl.StopRecognition in project ansj_seg by NLPchina.

In the class AnsjAnalyzer, method getTokenizer.

/**
 * Builds a tokenizer configured from a string-keyed option map.
 *
 * <p>Options read from {@code args}:
 * <ul>
 *   <li>{@code "type"} - name of an {@code AnsjAnalyzer.TYPE} constant; defaults to
 *       {@code base_ansj} when absent</li>
 *   <li>{@code CrfLibrary.DEFAULT} - CRF model key (only read for {@code nlp_ansj})</li>
 *   <li>{@code DicLibrary.DEFAULT} - comma-separated user dictionary keys</li>
 *   <li>{@code StopLibrary.DEFAULT} - comma-separated stop-word library keys</li>
 *   <li>{@code SynonymsLibrary.DEFAULT} - comma-separated synonym library keys</li>
 *   <li>{@code AmbiguityLibrary.DEFAULT} - ambiguity dictionary key</li>
 *   <li>{@code "isNameRecognition"}, {@code "isNumRecognition"},
 *       {@code "isQuantifierRecognition"}, {@code "isRealName"} - boolean switches
 *       parsed with {@link Boolean#valueOf(String)}</li>
 * </ul>
 *
 * @param reader text source passed to the analysis via {@code resetContent};
 *               skipped when {@code null}
 * @param args   option map described above; dereferenced unconditionally, so it
 *               must not be {@code null}
 * @return a new {@code AnsjTokenizer} wrapping the configured analysis; the filter
 *         and synonym lists passed to it are {@code null} when the corresponding
 *         option is blank
 * @throws IllegalArgumentException if {@code "type"} is not the name of a
 *         {@code TYPE} constant (raised by {@code TYPE.valueOf})
 */
public static Tokenizer getTokenizer(Reader reader, Map<String, String> args) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("to create tokenizer " + args);
    }
    Analysis analysis = null;
    String temp = null;
    String type = args.get("type");
    if (type == null) {
        type = AnsjAnalyzer.TYPE.base_ansj.name();
    }
    switch(AnsjAnalyzer.TYPE.valueOf(type)) {
        case base_ansj:
            analysis = new BaseAnalysis();
            break;
        case index_ansj:
            analysis = new IndexAnalysis();
            break;
        case dic_ansj:
            analysis = new DicAnalysis();
            break;
        case query_ansj:
            analysis = new ToAnalysis();
            break;
        case nlp_ansj:
            analysis = new NlpAnalysis();
            if (StringUtil.isNotBlank(temp = args.get(CrfLibrary.DEFAULT))) {
                // Trim the key, as is done for the other library lookups below.
                ((NlpAnalysis) analysis).setCrfModel(CrfLibrary.get(temp.trim()));
            }
            break;
        default:
            analysis = new BaseAnalysis();
    }
    if (reader != null) {
        analysis.resetContent(reader);
    }
    if (StringUtil.isNotBlank(temp = args.get(DicLibrary.DEFAULT))) {
        // User-defined dictionaries.
        String[] split = temp.split(",");
        Forest[] forests = new Forest[split.length];
        for (int i = 0; i < forests.length; i++) {
            if (StringUtil.isBlank(split[i])) {
                // A blank entry leaves its slot in the array as null.
                continue;
            }
            // Trim so that "a, b" resolves "b", matching the stop/synonym handling.
            forests[i] = DicLibrary.get(split[i].trim());
        }
        analysis.setForests(forests);
    }
    List<StopRecognition> filters = null;
    if (StringUtil.isNotBlank(temp = args.get(StopLibrary.DEFAULT))) {
        // Stop-word libraries; keys that resolve to null are skipped.
        String[] split = temp.split(",");
        filters = new ArrayList<StopRecognition>();
        for (String key : split) {
            StopRecognition stop = StopLibrary.get(key.trim());
            if (stop != null)
                filters.add(stop);
        }
    }
    List<SynonymsRecgnition> synonyms = null;
    if (StringUtil.isNotBlank(temp = args.get(SynonymsLibrary.DEFAULT))) {
        // Synonym dictionaries; keys that resolve to null are skipped.
        String[] split = temp.split(",");
        synonyms = new ArrayList<SynonymsRecgnition>();
        for (String key : split) {
            SmartForest<List<String>> sf = SynonymsLibrary.get(key.trim());
            if (sf != null)
                synonyms.add(new SynonymsRecgnition(sf));
        }
    }
    if (StringUtil.isNotBlank(temp = args.get(AmbiguityLibrary.DEFAULT))) {
        // Ambiguity dictionary.
        analysis.setAmbiguityForest(AmbiguityLibrary.get(temp.trim()));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNameRecognition"))) {
        // Whether person-name recognition is enabled.
        analysis.setIsNameRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNumRecognition"))) {
        // Whether number recognition is enabled.
        analysis.setIsNumRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isQuantifierRecognition"))) {
        // Whether quantifier recognition is enabled.
        analysis.setIsQuantifierRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isRealName"))) {
        // Whether the original characters are preserved.
        analysis.setIsRealName(Boolean.valueOf(temp));
    }
    return new AnsjTokenizer(analysis, filters, synonyms);
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) SynonymsRecgnition(org.ansj.recognition.impl.SynonymsRecgnition) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) Analysis(org.ansj.splitWord.Analysis) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) Forest(org.nlpcn.commons.lang.tire.domain.Forest) SmartForest(org.nlpcn.commons.lang.tire.domain.SmartForest) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) ArrayList(java.util.ArrayList) List(java.util.List) AnsjTokenizer(org.ansj.lucene.util.AnsjTokenizer)

Example 4 with StopRecognition

use of org.ansj.recognition.impl.StopRecognition in project ansj_seg by NLPchina.

In the class StopLibrary, method get.

/**
 * Returns the stop-word recognizer registered under the given key.
 *
 * <p>When the key has no entry in {@code STOP} but is present in
 * {@code MyStaticValue.ENV}, the ENV value is registered through
 * {@code putIfAbsent} and the lookup is retried. When an entry exists but its
 * recognizer has not been set yet, the result of {@code init(key, kv)} is returned.
 *
 * @param key name the stop library is registered under
 * @return the recognizer for {@code key}; {@code null} (after a warning is logged)
 *         when the key is in neither {@code STOP} nor {@code MyStaticValue.ENV}
 */
public static StopRecognition get(String key) {
    KV<String, StopRecognition> entry = STOP.get(key);
    if (entry != null) {
        StopRecognition cached = entry.getV();
        // Entry registered but not initialised yet: build it on first use.
        return cached != null ? cached : init(key, entry);
    }
    if (!MyStaticValue.ENV.containsKey(key)) {
        LOG.warn("STOP " + key + " not found in config ");
        return null;
    }
    // Known to the environment config only: register it, then look it up again.
    putIfAbsent(key, MyStaticValue.ENV.get(key));
    return get(key);
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition)

Example 5 with StopRecognition

use of org.ansj.recognition.impl.StopRecognition in project ansj_seg by NLPchina.

In the class StopLibrary, method init.

/**
 * Loads the stop library described by {@code kv} and stores the result in it.
 *
 * <p>The resource located by {@code kv.getK()} is read as UTF-8, one entry per
 * non-blank line. Each line is trimmed and split on tabs: a second field of
 * {@code "nature"} registers the first field as a stop nature, {@code "regex"}
 * registers it as a stop regex, and anything else (or no second field) registers
 * it as a stop word.
 *
 * <p>The method is {@code synchronized} and first re-checks {@code kv.getV()},
 * returning the existing recognizer if one is already set.
 *
 * @param key name the library is registered under in {@code STOP}; removed from
 *            {@code STOP} if loading fails
 * @param kv  entry whose key is the resource path and whose value receives the
 *            loaded recognizer
 * @return the loaded (or already present) recognizer, or {@code null} if any
 *         exception occurred while loading (the error is logged)
 */
private static synchronized StopRecognition init(String key, KV<String, StopRecognition> kv) {
    StopRecognition existing = kv.getV();
    if (existing != null) {
        return existing;
    }
    try {
        StopRecognition loaded = new StopRecognition();
        LOG.debug("begin init FILTER !");
        long begin = System.currentTimeMillis();
        try (BufferedReader in = IOUtil.getReader(PathToStream.stream(kv.getK()), "UTF-8")) {
            String line;
            while ((line = in.readLine()) != null) {
                if (!StringUtil.isNotBlank(line)) {
                    continue;
                }
                String[] fields = StringUtil.trim(line).split("\t");
                // The optional second column selects the kind of stop entry.
                if (fields.length != 1 && "nature".equals(fields[1])) {
                    loaded.insertStopNatures(fields[0]);
                } else if (fields.length != 1 && "regex".equals(fields[1])) {
                    loaded.insertStopRegexes(fields[0]);
                } else {
                    loaded.insertStopWords(fields[0]);
                }
            }
        }
        LOG.info("load stop use time:" + (System.currentTimeMillis() - begin) + " path is : " + kv.getK());
        kv.setV(loaded);
        return loaded;
    } catch (Exception e) {
        LOG.error("Init Stop library error :" + e.getMessage() + ", path: " + kv.getK());
        // Drop the registration so the failed entry is not left behind.
        STOP.remove(key);
        return null;
    }
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition) BufferedReader(java.io.BufferedReader)

Aggregations

StopRecognition (org.ansj.recognition.impl.StopRecognition)10 ArrayList (java.util.ArrayList)4 BaseAnalysis (org.ansj.splitWord.analysis.BaseAnalysis)4 DicAnalysis (org.ansj.splitWord.analysis.DicAnalysis)4 IndexAnalysis (org.ansj.splitWord.analysis.IndexAnalysis)4 NlpAnalysis (org.ansj.splitWord.analysis.NlpAnalysis)4 ToAnalysis (org.ansj.splitWord.analysis.ToAnalysis)4 BufferedReader (java.io.BufferedReader)3 StringReader (java.io.StringReader)2 List (java.util.List)2 AnsjTokenizer (org.ansj.lucene.util.AnsjTokenizer)2 SynonymsRecgnition (org.ansj.recognition.impl.SynonymsRecgnition)2 Analysis (org.ansj.splitWord.Analysis)2 Forest (org.nlpcn.commons.lang.tire.domain.Forest)2 SmartForest (org.nlpcn.commons.lang.tire.domain.SmartForest)2