
Example 1 with BaseAnalysis

Use of org.ansj.splitWord.analysis.BaseAnalysis in project ansj_seg by NLPchina.

From the class AppTest, method main.

public static void main(String[] args) throws IOException {
    // build a stop filter from an in-memory dictionary, one entry per line
    String stopDicStr = "6\n7\n龙";
    StopRecognition testFilter = new StopRecognition();
    BufferedReader br = new BufferedReader(new StringReader(stopDicStr));
    String temp = null;
    while ((temp = br.readLine()) != null) {
        testFilter.insertStopWords(temp);
    }
    List<StopRecognition> filters = new ArrayList<StopRecognition>();
    filters.add(testFilter);
    String text = "龙虎胶囊 6 * 7cm";
    for (int i = 0; i < 1; i++) {
        // give each analyzer its own reader; a StringReader is exhausted after one pass
        parse(new IndexAnalysis(new StringReader(text)), filters);
        parse(new ToAnalysis(new StringReader(text)), filters);
        parse(new DicAnalysis(new StringReader(text)), filters);
        parse(new NlpAnalysis(new StringReader(text)), filters);
        parse(new BaseAnalysis(new StringReader(text)), filters);
    }
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) BufferedReader(java.io.BufferedReader) StringReader(java.io.StringReader) ArrayList(java.util.ArrayList) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis)
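
The parse(...) helper that AppTest calls is not shown in this excerpt. For reference, a minimal, self-contained way to exercise the same stop list is the static parse API together with Result.recognition(...); the sketch below assumes the ansj 5.x org.ansj.domain.Result API, and the class name StopFilterSketch is only for illustration — it is not the test's actual helper.

import org.ansj.domain.Result;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.splitWord.analysis.ToAnalysis;

public class StopFilterSketch {

    public static void main(String[] args) {
        // same stop entries as in AppTest: drop the tokens "6", "7" and "龙"
        StopRecognition filter = new StopRecognition();
        filter.insertStopWords("6");
        filter.insertStopWords("7");
        filter.insertStopWords("龙");

        // parse with the static convenience API, then apply the stop filter to the result
        // (assumes Result.recognition(...) from ansj 5.x)
        Result result = ToAnalysis.parse("龙虎胶囊 6 * 7cm").recognition(filter);
        System.out.println(result);
    }
}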

Example 2 with BaseAnalysis

Use of org.ansj.splitWord.analysis.BaseAnalysis in project ansj_seg by NLPchina.

From the class AnsjAnalyzer, method getTokenizer.

/**
	 * Obtains a tokenizer for the given reader and configuration.
	 * 
	 * @param reader source text to tokenize
	 * @param args   configuration map (analysis type, dictionaries, recognition switches)
	 * @return the configured tokenizer
	 */
public static Tokenizer getTokenizer(Reader reader, Map<String, String> args) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("to create tokenizer " + args);
    }
    Analysis analysis = null;
    String temp = null;
    String type = args.get("type");
    if (type == null) {
        type = AnsjAnalyzer.TYPE.base_ansj.name();
    }
    switch(AnsjAnalyzer.TYPE.valueOf(type)) {
        case base_ansj:
            analysis = new BaseAnalysis();
            break;
        case index_ansj:
            analysis = new IndexAnalysis();
            break;
        case dic_ansj:
            analysis = new DicAnalysis();
            break;
        case query_ansj:
            analysis = new ToAnalysis();
            break;
        case nlp_ansj:
            analysis = new NlpAnalysis();
            if (StringUtil.isNotBlank(temp = args.get(CrfLibrary.DEFAULT))) {
                ((NlpAnalysis) analysis).setCrfModel(CrfLibrary.get(temp));
            }
            break;
        default:
            analysis = new BaseAnalysis();
    }
    if (reader != null) {
        analysis.resetContent(reader);
    }
    if (StringUtil.isNotBlank(temp = args.get(DicLibrary.DEFAULT))) {
        // user-defined dictionaries: comma-separated keys looked up in DicLibrary
        String[] split = temp.split(",");
        Forest[] forests = new Forest[split.length];
        for (int i = 0; i < forests.length; i++) {
            if (StringUtil.isBlank(split[i])) {
                continue;
            }
            forests[i] = DicLibrary.get(split[i]);
        }
        analysis.setForests(forests);
    }
    List<StopRecognition> filters = null;
    if (StringUtil.isNotBlank(temp = args.get(StopLibrary.DEFAULT))) {
        // user-defined stop-word lists: comma-separated keys looked up in StopLibrary
        String[] split = temp.split(",");
        filters = new ArrayList<StopRecognition>();
        for (String key : split) {
            StopRecognition stop = StopLibrary.get(key.trim());
            if (stop != null)
                filters.add(stop);
        }
    }
    List<SynonymsRecgnition> synonyms = null;
    if (StringUtil.isNotBlank(temp = args.get(SynonymsLibrary.DEFAULT))) {
        // synonym dictionaries: comma-separated keys looked up in SynonymsLibrary
        String[] split = temp.split(",");
        synonyms = new ArrayList<SynonymsRecgnition>();
        for (String key : split) {
            SmartForest<List<String>> sf = SynonymsLibrary.get(key.trim());
            if (sf != null)
                synonyms.add(new SynonymsRecgnition(sf));
        }
    }
    if (StringUtil.isNotBlank(temp = args.get(AmbiguityLibrary.DEFAULT))) {
        // ambiguity dictionary
        analysis.setAmbiguityForest(AmbiguityLibrary.get(temp.trim()));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNameRecognition"))) {
        // whether to enable person-name recognition
        analysis.setIsNameRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNumRecognition"))) {
        // whether to enable number recognition
        analysis.setIsNumRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isQuantifierRecognition"))) {
        // whether to enable quantifier recognition
        analysis.setIsQuantifierRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isRealName"))) {
        // whether to keep the original characters (isRealName)
        analysis.setIsRealName(Boolean.valueOf(temp));
    }
    return new AnsjTokenizer(analysis, filters, synonyms);
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition) SynonymsRecgnition(org.ansj.recognition.impl.SynonymsRecgnition) Analysis(org.ansj.splitWord.Analysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) Forest(org.nlpcn.commons.lang.tire.domain.Forest) SmartForest(org.nlpcn.commons.lang.tire.domain.SmartForest) ArrayList(java.util.ArrayList) List(java.util.List) AnsjTokenizer(org.ansj.lucene.util.AnsjTokenizer)
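
A minimal sketch of calling getTokenizer directly. The keys it reads ("type" and the boolean switches) come from the method body above; the concrete values and the wrapper class GetTokenizerSketch are placeholders, and the AnsjAnalyzer import is left as a comment because its package depends on which Lucene plugin module is in use.

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
// plus the AnsjAnalyzer class from the Lucene plugin module in use (package varies by Lucene version)

public class GetTokenizerSketch {

    public static void main(String[] args) throws Exception {
        Map<String, String> conf = new HashMap<String, String>();
        // one of base_ansj, index_ansj, query_ansj, dic_ansj, nlp_ansj; defaults to base_ansj
        conf.put("type", "index_ansj");
        // optional switches read by getTokenizer
        conf.put("isNameRecognition", "true");
        conf.put("isQuantifierRecognition", "true");

        Tokenizer tokenizer = AnsjAnalyzer.getTokenizer(new StringReader("龙虎胶囊 6 * 7cm"), conf);
        // the returned AnsjTokenizer follows the usual Lucene TokenStream contract
        // (reset / incrementToken / end / close); token consumption is omitted here
        System.out.println(tokenizer.getClass().getName());
        tokenizer.close();
    }
}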

Example 3 with BaseAnalysis

Use of org.ansj.splitWord.analysis.BaseAnalysis in project ansj_seg by NLPchina.

From the class FileDemo, method main.

public static void main(String[] args) throws IOException {
    // MyStaticValue.isRealName = true;
    BufferedReader reader = IOUtil.getReader("/home/ansj/temp/360baikeData/360tag_all.txt", "utf-8");
    // warm-up parse so dictionary loading is not counted in the timed run below
    ToAnalysis.parse("test 123 孙");
    Analysis na = new BaseAnalysis(reader);
    long start = System.currentTimeMillis();
    int allCount = 0;
    Term term = null;
    // stream terms; print a progress line whenever the offset hits a multiple of 10000
    while ((term = na.next()) != null) {
        if (term.getOffe() % 10000 == 0)
            System.out.println(term.getOffe() + "\t" + term.getName());
        allCount += term.getName().length();
        // stop after roughly 30 million characters
        if (allCount > 30000000) {
            break;
        }
    }
    long end = System.currentTimeMillis();
    System.out.println(end - start);
    System.out.println("total " + allCount + " characters, processed per second: " + (allCount * 1000.0 / (end - start)));
}
Also used : Analysis(org.ansj.splitWord.Analysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) BufferedReader(java.io.BufferedReader) Term(org.ansj.domain.Term)
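
The same streaming pattern, sketched without the local file path: feed BaseAnalysis any Reader and walk the Term stream with next(). The class name StreamingSketch and the sample text are placeholders.

import java.io.IOException;
import java.io.StringReader;

import org.ansj.domain.Term;
import org.ansj.splitWord.Analysis;
import org.ansj.splitWord.analysis.BaseAnalysis;

public class StreamingSketch {

    public static void main(String[] args) throws IOException {
        // any Reader works; FileDemo streams a large file via IOUtil.getReader(...) instead
        Analysis analysis = new BaseAnalysis(new StringReader("龙虎胶囊 6 * 7cm test 123"));

        Term term;
        while ((term = analysis.next()) != null) {
            // offset of the term in the input, followed by its surface form
            System.out.println(term.getOffe() + "\t" + term.getName());
        }
    }
}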

Aggregations

BaseAnalysis (org.ansj.splitWord.analysis.BaseAnalysis) 3
ToAnalysis (org.ansj.splitWord.analysis.ToAnalysis) 3
ArrayList (java.util.ArrayList) 2
StopRecognition (org.ansj.recognition.impl.StopRecognition) 2
DicAnalysis (org.ansj.splitWord.analysis.DicAnalysis) 2
IndexAnalysis (org.ansj.splitWord.analysis.IndexAnalysis) 2
NlpAnalysis (org.ansj.splitWord.analysis.NlpAnalysis) 2
BufferedReader (java.io.BufferedReader) 2
Analysis (org.ansj.splitWord.Analysis) 2
StringReader (java.io.StringReader) 1
List (java.util.List) 1
AnsjTokenizer (org.ansj.lucene.util.AnsjTokenizer) 1
SynonymsRecgnition (org.ansj.recognition.impl.SynonymsRecgnition) 1
Forest (org.nlpcn.commons.lang.tire.domain.Forest) 1
SmartForest (org.nlpcn.commons.lang.tire.domain.SmartForest) 1
Term (org.ansj.domain.Term) 1