Search in sources:

Example 1 with IndexAnalysis

use of org.ansj.splitWord.analysis.IndexAnalysis in project ansj_seg by NLPchina.

The method main of the class AppTest.

/**
 * Demo entry point: loads a small inline stop-word list into a {@code StopRecognition}
 * filter and passes the same sample text through each analyzer implementation via
 * {@code parse}.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if reading the inline stop-word list fails
 */
public static void main(String[] args) throws IOException {
    String stopDicStr = "6\n7\n龙";
    StopRecognition testFilter = new StopRecognition();
    BufferedReader br = new BufferedReader(new StringReader(stopDicStr));
    try {
        // One stop word per line.
        String temp = null;
        while ((temp = br.readLine()) != null) {
            testFilter.insertStopWords(temp);
        }
    } finally {
        br.close();
    }
    List<StopRecognition> filters = new ArrayList<StopRecognition>();
    filters.add(testFilter);
    String text = "龙虎胶囊 6 * 7cm";
    for (int i = 0; i < 1; i++) {
        // A Reader can only be consumed once, so every analyzer gets its own reader
        // over the sample text instead of sharing a single instance.
        // NOTE(review): assumes parse(...) drains the analysis it is given; with a
        // shared reader only the first analyzer would then see any input.
        parse(new IndexAnalysis(new StringReader(text)), filters);
        parse(new ToAnalysis(new StringReader(text)), filters);
        parse(new DicAnalysis(new StringReader(text)), filters);
        parse(new NlpAnalysis(new StringReader(text)), filters);
        parse(new BaseAnalysis(new StringReader(text)), filters);
    }
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) BufferedReader(java.io.BufferedReader) StringReader(java.io.StringReader) ArrayList(java.util.ArrayList) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis)

Example 2 with IndexAnalysis

use of org.ansj.splitWord.analysis.IndexAnalysis in project ansj_seg by NLPchina.

The method getTokenizer of the class AnsjAnalyzer.

/**
 * Creates a tokenizer backed by the analysis implementation selected in {@code args}.
 *
 * <p>Keys read from {@code args}:
 * <ul>
 *   <li>{@code "type"} - name of an {@code AnsjAnalyzer.TYPE} constant; {@code base_ansj}
 *       is used when the key is absent</li>
 *   <li>{@code CrfLibrary.DEFAULT} - CRF model key, only consulted for {@code nlp_ansj}</li>
 *   <li>{@code DicLibrary.DEFAULT} - comma-separated user dictionary keys</li>
 *   <li>{@code StopLibrary.DEFAULT} - comma-separated stop-word dictionary keys</li>
 *   <li>{@code SynonymsLibrary.DEFAULT} - comma-separated synonym dictionary keys</li>
 *   <li>{@code AmbiguityLibrary.DEFAULT} - ambiguity dictionary key</li>
 *   <li>{@code "isNameRecognition"}, {@code "isNumRecognition"},
 *       {@code "isQuantifierRecognition"}, {@code "isRealName"} - boolean switches parsed
 *       with {@code Boolean.valueOf}</li>
 * </ul>
 * Blank or missing values leave the corresponding setting untouched.
 *
 * @param reader source of the text to analyse; when {@code null} no content is set
 * @param args   tokenizer options; must not be {@code null}
 * @return a new {@code AnsjTokenizer} wrapping the configured analysis; the filter and
 *         synonym lists passed to it are {@code null} when their keys are not configured
 * @throws IllegalArgumentException if {@code "type"} is present but is not the name of an
 *         {@code AnsjAnalyzer.TYPE} constant
 */
public static Tokenizer getTokenizer(Reader reader, Map<String, String> args) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("to create tokenizer " + args);
    }
    Analysis analysis = null;
    String temp = null;
    String type = args.get("type");
    if (type == null) {
        type = AnsjAnalyzer.TYPE.base_ansj.name();
    }
    switch(AnsjAnalyzer.TYPE.valueOf(type)) {
        case base_ansj:
            analysis = new BaseAnalysis();
            break;
        case index_ansj:
            analysis = new IndexAnalysis();
            break;
        case dic_ansj:
            analysis = new DicAnalysis();
            break;
        case query_ansj:
            analysis = new ToAnalysis();
            break;
        case nlp_ansj:
            analysis = new NlpAnalysis();
            if (StringUtil.isNotBlank(temp = args.get(CrfLibrary.DEFAULT))) {
                // Trim like the other library lookups so padded keys still resolve.
                ((NlpAnalysis) analysis).setCrfModel(CrfLibrary.get(temp.trim()));
            }
            break;
        default:
            analysis = new BaseAnalysis();
    }
    if (reader != null) {
        analysis.resetContent(reader);
    }
    if (StringUtil.isNotBlank(temp = args.get(DicLibrary.DEFAULT))) {
        // User-defined dictionaries
        String[] split = temp.split(",");
        Forest[] forests = new Forest[split.length];
        for (int i = 0; i < forests.length; i++) {
            if (StringUtil.isBlank(split[i])) {
                // Blank entries keep a null slot, exactly as before.
                continue;
            }
            // Trim so that "a, b" resolves "b" rather than " b", matching the stop-word
            // and synonym key handling below.
            forests[i] = DicLibrary.get(split[i].trim());
        }
        analysis.setForests(forests);
    }
    List<StopRecognition> filters = null;
    if (StringUtil.isNotBlank(temp = args.get(StopLibrary.DEFAULT))) {
        // Stop-word dictionaries
        String[] split = temp.split(",");
        filters = new ArrayList<StopRecognition>();
        for (String key : split) {
            StopRecognition stop = StopLibrary.get(key.trim());
            if (stop != null) {
                filters.add(stop);
            }
        }
    }
    List<SynonymsRecgnition> synonyms = null;
    if (StringUtil.isNotBlank(temp = args.get(SynonymsLibrary.DEFAULT))) {
        // Synonym dictionaries
        String[] split = temp.split(",");
        synonyms = new ArrayList<SynonymsRecgnition>();
        for (String key : split) {
            SmartForest<List<String>> sf = SynonymsLibrary.get(key.trim());
            if (sf != null) {
                synonyms.add(new SynonymsRecgnition(sf));
            }
        }
    }
    if (StringUtil.isNotBlank(temp = args.get(AmbiguityLibrary.DEFAULT))) {
        // Ambiguity dictionary
        analysis.setAmbiguityForest(AmbiguityLibrary.get(temp.trim()));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNameRecognition"))) {
        // Whether person-name recognition is enabled
        analysis.setIsNameRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNumRecognition"))) {
        // Whether number recognition is enabled
        analysis.setIsNumRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isQuantifierRecognition"))) {
        // Whether quantifier recognition is enabled
        analysis.setIsQuantifierRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isRealName"))) {
        // Whether the original characters are kept
        analysis.setIsRealName(Boolean.valueOf(temp));
    }
    return new AnsjTokenizer(analysis, filters, synonyms);
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) SynonymsRecgnition(org.ansj.recognition.impl.SynonymsRecgnition) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) Analysis(org.ansj.splitWord.Analysis) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) Forest(org.nlpcn.commons.lang.tire.domain.Forest) SmartForest(org.nlpcn.commons.lang.tire.domain.SmartForest) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) ArrayList(java.util.ArrayList) List(java.util.List) AnsjTokenizer(org.ansj.lucene.util.AnsjTokenizer)

Example 3 with IndexAnalysis

use of org.ansj.splitWord.analysis.IndexAnalysis in project ansj_seg by NLPchina.

The method getTokenizer of the class AnsjAnalyzer.

/**
 * Creates a tokenizer backed by the analysis implementation selected in {@code args}.
 *
 * <p>Keys read from {@code args}:
 * <ul>
 *   <li>{@code "type"} - name of an {@code AnsjAnalyzer.TYPE} constant; {@code base_ansj}
 *       is used when the key is absent</li>
 *   <li>{@code CrfLibrary.DEFAULT} - CRF model key, only consulted for {@code nlp_ansj}</li>
 *   <li>{@code DicLibrary.DEFAULT} - comma-separated user dictionary keys</li>
 *   <li>{@code StopLibrary.DEFAULT} - comma-separated stop-word dictionary keys</li>
 *   <li>{@code SynonymsLibrary.DEFAULT} - comma-separated synonym dictionary keys</li>
 *   <li>{@code AmbiguityLibrary.DEFAULT} - ambiguity dictionary key</li>
 *   <li>{@code "isNameRecognition"}, {@code "isNumRecognition"},
 *       {@code "isQuantifierRecognition"}, {@code "isRealName"} - boolean switches parsed
 *       with {@code Boolean.valueOf}</li>
 * </ul>
 * Blank or missing values leave the corresponding setting untouched.
 *
 * @param reader source of the text to analyse; when {@code null} no content is set
 * @param args   tokenizer options; must not be {@code null}
 * @return a new {@code AnsjTokenizer} wrapping the configured analysis; the filter and
 *         synonym lists passed to it are {@code null} when their keys are not configured
 * @throws IllegalArgumentException if {@code "type"} is present but is not the name of an
 *         {@code AnsjAnalyzer.TYPE} constant
 */
public static Tokenizer getTokenizer(Reader reader, Map<String, String> args) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("to create tokenizer " + args);
    }
    Analysis analysis = null;
    String temp = null;
    String type = args.get("type");
    if (type == null) {
        type = AnsjAnalyzer.TYPE.base_ansj.name();
    }
    switch(AnsjAnalyzer.TYPE.valueOf(type)) {
        case base_ansj:
            analysis = new BaseAnalysis();
            break;
        case index_ansj:
            analysis = new IndexAnalysis();
            break;
        case dic_ansj:
            analysis = new DicAnalysis();
            break;
        case query_ansj:
            analysis = new ToAnalysis();
            break;
        case nlp_ansj:
            analysis = new NlpAnalysis();
            if (StringUtil.isNotBlank(temp = args.get(CrfLibrary.DEFAULT))) {
                // Trim like the other library lookups so padded keys still resolve.
                ((NlpAnalysis) analysis).setCrfModel(CrfLibrary.get(temp.trim()));
            }
            break;
        default:
            analysis = new BaseAnalysis();
    }
    if (reader != null) {
        analysis.resetContent(reader);
    }
    if (StringUtil.isNotBlank(temp = args.get(DicLibrary.DEFAULT))) {
        // User-defined dictionaries
        String[] split = temp.split(",");
        Forest[] forests = new Forest[split.length];
        for (int i = 0; i < forests.length; i++) {
            if (StringUtil.isBlank(split[i])) {
                // Blank entries keep a null slot, exactly as before.
                continue;
            }
            // Trim so that "a, b" resolves "b" rather than " b", matching the stop-word
            // and synonym key handling below.
            forests[i] = DicLibrary.get(split[i].trim());
        }
        analysis.setForests(forests);
    }
    List<StopRecognition> filters = null;
    if (StringUtil.isNotBlank(temp = args.get(StopLibrary.DEFAULT))) {
        // Stop-word dictionaries
        String[] split = temp.split(",");
        filters = new ArrayList<StopRecognition>();
        for (String key : split) {
            StopRecognition stop = StopLibrary.get(key.trim());
            if (stop != null) {
                filters.add(stop);
            }
        }
    }
    List<SynonymsRecgnition> synonyms = null;
    if (StringUtil.isNotBlank(temp = args.get(SynonymsLibrary.DEFAULT))) {
        // Synonym dictionaries
        String[] split = temp.split(",");
        synonyms = new ArrayList<SynonymsRecgnition>();
        for (String key : split) {
            SmartForest<List<String>> sf = SynonymsLibrary.get(key.trim());
            if (sf != null) {
                synonyms.add(new SynonymsRecgnition(sf));
            }
        }
    }
    if (StringUtil.isNotBlank(temp = args.get(AmbiguityLibrary.DEFAULT))) {
        // Ambiguity dictionary
        analysis.setAmbiguityForest(AmbiguityLibrary.get(temp.trim()));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNameRecognition"))) {
        // Whether person-name recognition is enabled
        analysis.setIsNameRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNumRecognition"))) {
        // Whether number recognition is enabled
        analysis.setIsNumRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isQuantifierRecognition"))) {
        // Whether quantifier recognition is enabled
        analysis.setIsQuantifierRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isRealName"))) {
        // Whether the original characters are kept
        analysis.setIsRealName(Boolean.valueOf(temp));
    }
    return new AnsjTokenizer(analysis, filters, synonyms);
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) SynonymsRecgnition(org.ansj.recognition.impl.SynonymsRecgnition) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) Analysis(org.ansj.splitWord.Analysis) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) Forest(org.nlpcn.commons.lang.tire.domain.Forest) SmartForest(org.nlpcn.commons.lang.tire.domain.SmartForest) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) ArrayList(java.util.ArrayList) List(java.util.List) AnsjTokenizer(org.ansj.lucene.util.AnsjTokenizer)

Example 4 with IndexAnalysis

use of org.ansj.splitWord.analysis.IndexAnalysis in project ansj_seg by NLPchina.

The method main of the class AppTest.

/**
 * Demo entry point: loads a small inline stop-word list into a {@code StopRecognition}
 * filter and passes the same sample text through each analyzer implementation via
 * {@code parse}.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if reading the inline stop-word list fails
 */
public static void main(String[] args) throws IOException {
    String stopDicStr = "6\n7\n龙";
    StopRecognition testFilter = new StopRecognition();
    BufferedReader br = new BufferedReader(new StringReader(stopDicStr));
    try {
        // One stop word per line.
        String temp = null;
        while ((temp = br.readLine()) != null) {
            testFilter.insertStopWords(temp);
        }
    } finally {
        br.close();
    }
    List<StopRecognition> filters = new ArrayList<StopRecognition>();
    filters.add(testFilter);
    String text = "龙虎胶囊 6 * 7cm";
    for (int i = 0; i < 1; i++) {
        // A Reader can only be consumed once, so every analyzer gets its own reader
        // over the sample text instead of sharing a single instance.
        // NOTE(review): assumes parse(...) drains the analysis it is given; with a
        // shared reader only the first analyzer would then see any input.
        parse(new IndexAnalysis(new StringReader(text)), filters);
        parse(new ToAnalysis(new StringReader(text)), filters);
        parse(new DicAnalysis(new StringReader(text)), filters);
        parse(new NlpAnalysis(new StringReader(text)), filters);
        parse(new BaseAnalysis(new StringReader(text)), filters);
    }
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) BufferedReader(java.io.BufferedReader) StringReader(java.io.StringReader) ArrayList(java.util.ArrayList) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis)

Aggregations

ArrayList (java.util.ArrayList)4 StopRecognition (org.ansj.recognition.impl.StopRecognition)4 BaseAnalysis (org.ansj.splitWord.analysis.BaseAnalysis)4 DicAnalysis (org.ansj.splitWord.analysis.DicAnalysis)4 IndexAnalysis (org.ansj.splitWord.analysis.IndexAnalysis)4 NlpAnalysis (org.ansj.splitWord.analysis.NlpAnalysis)4 ToAnalysis (org.ansj.splitWord.analysis.ToAnalysis)4 BufferedReader (java.io.BufferedReader)2 StringReader (java.io.StringReader)2 List (java.util.List)2 AnsjTokenizer (org.ansj.lucene.util.AnsjTokenizer)2 SynonymsRecgnition (org.ansj.recognition.impl.SynonymsRecgnition)2 Analysis (org.ansj.splitWord.Analysis)2 Forest (org.nlpcn.commons.lang.tire.domain.Forest)2 SmartForest (org.nlpcn.commons.lang.tire.domain.SmartForest)2