Search in sources :

Example 1 with ToAnalysis

use of org.ansj.splitWord.analysis.ToAnalysis in project ansj_seg by NLPchina.

the class AppTest method main.

/**
 * Feeds one sample sentence to each analyzer type, passing a small stop-word filter along.
 *
 * <p>The stop words ("6", "7", "龙") are read line by line from an in-memory string and
 * inserted into a single {@code StopRecognition}; that filter is handed to the sibling
 * {@code parse} helper together with each analyzer.
 *
 * @param args unused
 * @throws IOException if reading the in-memory stop-word list fails
 */
public static void main(String[] args) throws IOException {
    String stopDicStr = "6\n7\n龙";
    StopRecognition testFilter = new StopRecognition();
    BufferedReader br = new BufferedReader(new StringReader(stopDicStr));
    try {
        String line;
        while ((line = br.readLine()) != null) {
            testFilter.insertStopWords(line);
        }
    } finally {
        br.close();
    }
    List<StopRecognition> filters = new ArrayList<StopRecognition>();
    filters.add(testFilter);

    String text = "龙虎胶囊 6 * 7cm";
    // A Reader is a single-pass stream: sharing one instance would leave it drained after
    // the first analyzer, so every analyzer is given its own reader over the same text.
    parse(new IndexAnalysis(new StringReader(text)), filters);
    parse(new ToAnalysis(new StringReader(text)), filters);
    parse(new DicAnalysis(new StringReader(text)), filters);
    parse(new NlpAnalysis(new StringReader(text)), filters);
    parse(new BaseAnalysis(new StringReader(text)), filters);
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) BufferedReader(java.io.BufferedReader) StringReader(java.io.StringReader) ArrayList(java.util.ArrayList) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis)

Example 2 with ToAnalysis

use of org.ansj.splitWord.analysis.ToAnalysis in project ansj_seg by NLPchina.

the class AnsjAnalyzer method getTokenizer.

/**
 * Creates a tokenizer configured from a string-keyed option map.
 *
 * <p>Options read by this method:
 * <ul>
 *   <li>{@code "type"} - name of an {@code AnsjAnalyzer.TYPE} constant selecting the analysis
 *       implementation; when absent, {@code base_ansj} is used. A name that is not a constant
 *       of that enum makes {@code TYPE.valueOf} throw {@link IllegalArgumentException}.</li>
 *   <li>{@code CrfLibrary.DEFAULT} - CRF model key, only consulted for {@code nlp_ansj}.</li>
 *   <li>{@code DicLibrary.DEFAULT} - comma-separated user dictionary keys.</li>
 *   <li>{@code StopLibrary.DEFAULT} - comma-separated stop-word dictionary keys.</li>
 *   <li>{@code SynonymsLibrary.DEFAULT} - comma-separated synonym dictionary keys.</li>
 *   <li>{@code AmbiguityLibrary.DEFAULT} - ambiguity dictionary key.</li>
 *   <li>{@code "isNameRecognition"}, {@code "isNumRecognition"},
 *       {@code "isQuantifierRecognition"}, {@code "isRealName"} - boolean switches parsed
 *       with {@link Boolean#valueOf(String)}.</li>
 * </ul>
 * Blank option values are ignored.
 *
 * @param reader content to analyse; when {@code null} the analysis content is left unset
 * @param args   option map as described above; must not be {@code null}
 * @return a new {@code AnsjTokenizer} wrapping the configured analysis; the stop-word and
 *         synonym lists passed to it are {@code null} when the respective option is blank
 */
public static Tokenizer getTokenizer(Reader reader, Map<String, String> args) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("to create tokenizer " + args);
    }
    Analysis analysis = null;
    String temp = null;
    String type = args.get("type");
    if (type == null) {
        type = AnsjAnalyzer.TYPE.base_ansj.name();
    }
    switch(AnsjAnalyzer.TYPE.valueOf(type)) {
        case base_ansj:
            analysis = new BaseAnalysis();
            break;
        case index_ansj:
            analysis = new IndexAnalysis();
            break;
        case dic_ansj:
            analysis = new DicAnalysis();
            break;
        case query_ansj:
            analysis = new ToAnalysis();
            break;
        case nlp_ansj:
            analysis = new NlpAnalysis();
            if (StringUtil.isNotBlank(temp = args.get(CrfLibrary.DEFAULT))) {
                ((NlpAnalysis) analysis).setCrfModel(CrfLibrary.get(temp));
            }
            break;
        default:
            // Only reachable for TYPE constants without a dedicated case above.
            analysis = new BaseAnalysis();
    }
    if (reader != null) {
        analysis.resetContent(reader);
    }
    if (StringUtil.isNotBlank(temp = args.get(DicLibrary.DEFAULT))) {
        // User-defined dictionaries
        String[] split = temp.split(",");
        Forest[] forests = new Forest[split.length];
        for (int i = 0; i < forests.length; i++) {
            if (StringUtil.isBlank(split[i])) {
                // Slot stays null for an empty entry such as "a,,b".
                continue;
            }
            // Trim like the stop-word/synonym/ambiguity keys below, so "a, b" resolves "b".
            forests[i] = DicLibrary.get(split[i].trim());
        }
        analysis.setForests(forests);
    }
    List<StopRecognition> filters = null;
    if (StringUtil.isNotBlank(temp = args.get(StopLibrary.DEFAULT))) {
        // Stop-word dictionaries
        String[] split = temp.split(",");
        filters = new ArrayList<StopRecognition>();
        for (String key : split) {
            StopRecognition stop = StopLibrary.get(key.trim());
            if (stop != null) {
                filters.add(stop);
            }
        }
    }
    List<SynonymsRecgnition> synonyms = null;
    if (StringUtil.isNotBlank(temp = args.get(SynonymsLibrary.DEFAULT))) {
        // Synonym dictionaries
        String[] split = temp.split(",");
        synonyms = new ArrayList<SynonymsRecgnition>();
        for (String key : split) {
            SmartForest<List<String>> sf = SynonymsLibrary.get(key.trim());
            if (sf != null) {
                synonyms.add(new SynonymsRecgnition(sf));
            }
        }
    }
    if (StringUtil.isNotBlank(temp = args.get(AmbiguityLibrary.DEFAULT))) {
        // Ambiguity dictionary
        analysis.setAmbiguityForest(AmbiguityLibrary.get(temp.trim()));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNameRecognition"))) {
        // Whether person-name recognition is enabled
        analysis.setIsNameRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNumRecognition"))) {
        // Whether number recognition is enabled
        analysis.setIsNumRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isQuantifierRecognition"))) {
        // Whether quantifier recognition is enabled
        analysis.setIsQuantifierRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isRealName"))) {
        // Whether the original characters are kept
        analysis.setIsRealName(Boolean.valueOf(temp));
    }
    return new AnsjTokenizer(analysis, filters, synonyms);
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) SynonymsRecgnition(org.ansj.recognition.impl.SynonymsRecgnition) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) Analysis(org.ansj.splitWord.Analysis) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) Forest(org.nlpcn.commons.lang.tire.domain.Forest) SmartForest(org.nlpcn.commons.lang.tire.domain.SmartForest) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) ArrayList(java.util.ArrayList) List(java.util.List) AnsjTokenizer(org.ansj.lucene.util.AnsjTokenizer)

Example 3 with ToAnalysis

use of org.ansj.splitWord.analysis.ToAnalysis in project ansj_seg by NLPchina.

the class AnsjAnalyzer method getTokenizer.

/**
 * Creates a tokenizer configured from a string-keyed option map.
 *
 * <p>Options read by this method:
 * <ul>
 *   <li>{@code "type"} - name of an {@code AnsjAnalyzer.TYPE} constant selecting the analysis
 *       implementation; when absent, {@code base_ansj} is used. A name that is not a constant
 *       of that enum makes {@code TYPE.valueOf} throw {@link IllegalArgumentException}.</li>
 *   <li>{@code CrfLibrary.DEFAULT} - CRF model key, only consulted for {@code nlp_ansj}.</li>
 *   <li>{@code DicLibrary.DEFAULT} - comma-separated user dictionary keys.</li>
 *   <li>{@code StopLibrary.DEFAULT} - comma-separated stop-word dictionary keys.</li>
 *   <li>{@code SynonymsLibrary.DEFAULT} - comma-separated synonym dictionary keys.</li>
 *   <li>{@code AmbiguityLibrary.DEFAULT} - ambiguity dictionary key.</li>
 *   <li>{@code "isNameRecognition"}, {@code "isNumRecognition"},
 *       {@code "isQuantifierRecognition"}, {@code "isRealName"} - boolean switches parsed
 *       with {@link Boolean#valueOf(String)}.</li>
 * </ul>
 * Blank option values are ignored.
 *
 * @param reader content to analyse; when {@code null} the analysis content is left unset
 * @param args   option map as described above; must not be {@code null}
 * @return a new {@code AnsjTokenizer} wrapping the configured analysis; the stop-word and
 *         synonym lists passed to it are {@code null} when the respective option is blank
 */
public static Tokenizer getTokenizer(Reader reader, Map<String, String> args) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("to create tokenizer " + args);
    }
    Analysis analysis = null;
    String temp = null;
    String type = args.get("type");
    if (type == null) {
        type = AnsjAnalyzer.TYPE.base_ansj.name();
    }
    switch(AnsjAnalyzer.TYPE.valueOf(type)) {
        case base_ansj:
            analysis = new BaseAnalysis();
            break;
        case index_ansj:
            analysis = new IndexAnalysis();
            break;
        case dic_ansj:
            analysis = new DicAnalysis();
            break;
        case query_ansj:
            analysis = new ToAnalysis();
            break;
        case nlp_ansj:
            analysis = new NlpAnalysis();
            if (StringUtil.isNotBlank(temp = args.get(CrfLibrary.DEFAULT))) {
                ((NlpAnalysis) analysis).setCrfModel(CrfLibrary.get(temp));
            }
            break;
        default:
            // Only reachable for TYPE constants without a dedicated case above.
            analysis = new BaseAnalysis();
    }
    if (reader != null) {
        analysis.resetContent(reader);
    }
    if (StringUtil.isNotBlank(temp = args.get(DicLibrary.DEFAULT))) {
        // User-defined dictionaries
        String[] split = temp.split(",");
        Forest[] forests = new Forest[split.length];
        for (int i = 0; i < forests.length; i++) {
            if (StringUtil.isBlank(split[i])) {
                // Slot stays null for an empty entry such as "a,,b".
                continue;
            }
            // Trim like the stop-word/synonym/ambiguity keys below, so "a, b" resolves "b".
            forests[i] = DicLibrary.get(split[i].trim());
        }
        analysis.setForests(forests);
    }
    List<StopRecognition> filters = null;
    if (StringUtil.isNotBlank(temp = args.get(StopLibrary.DEFAULT))) {
        // Stop-word dictionaries
        String[] split = temp.split(",");
        filters = new ArrayList<StopRecognition>();
        for (String key : split) {
            StopRecognition stop = StopLibrary.get(key.trim());
            if (stop != null) {
                filters.add(stop);
            }
        }
    }
    List<SynonymsRecgnition> synonyms = null;
    if (StringUtil.isNotBlank(temp = args.get(SynonymsLibrary.DEFAULT))) {
        // Synonym dictionaries
        String[] split = temp.split(",");
        synonyms = new ArrayList<SynonymsRecgnition>();
        for (String key : split) {
            SmartForest<List<String>> sf = SynonymsLibrary.get(key.trim());
            if (sf != null) {
                synonyms.add(new SynonymsRecgnition(sf));
            }
        }
    }
    if (StringUtil.isNotBlank(temp = args.get(AmbiguityLibrary.DEFAULT))) {
        // Ambiguity dictionary
        analysis.setAmbiguityForest(AmbiguityLibrary.get(temp.trim()));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNameRecognition"))) {
        // Whether person-name recognition is enabled
        analysis.setIsNameRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNumRecognition"))) {
        // Whether number recognition is enabled
        analysis.setIsNumRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isQuantifierRecognition"))) {
        // Whether quantifier recognition is enabled
        analysis.setIsQuantifierRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isRealName"))) {
        // Whether the original characters are kept
        analysis.setIsRealName(Boolean.valueOf(temp));
    }
    return new AnsjTokenizer(analysis, filters, synonyms);
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) SynonymsRecgnition(org.ansj.recognition.impl.SynonymsRecgnition) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) Analysis(org.ansj.splitWord.Analysis) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) Forest(org.nlpcn.commons.lang.tire.domain.Forest) SmartForest(org.nlpcn.commons.lang.tire.domain.SmartForest) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) ArrayList(java.util.ArrayList) List(java.util.List) AnsjTokenizer(org.ansj.lucene.util.AnsjTokenizer)

Example 4 with ToAnalysis

use of org.ansj.splitWord.analysis.ToAnalysis in project ansj_seg by NLPchina.

the class SpeedTest method main.

/**
 * Rough throughput measurement: streams a corpus file through {@code ToAnalysis} term by term,
 * sums the lengths of the term names, then prints the elapsed milliseconds and the number of
 * characters processed per second.
 *
 * @param args unused
 * @throws IOException if the corpus file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    // Run one parse before the clock starts (presumably a warm-up so one-time
    // initialisation is not counted - confirm against ToAnalysis.parse).
    ToAnalysis.parse("test---aaaa中国孙健测试");
    BufferedReader reader = IOUtil.getReader("/home/ansj/data/allSportsArticle", IOUtil.UTF8);
    try {
        long start = System.currentTimeMillis();
        long allCount = 0;
        ToAnalysis toAnalysis = new ToAnalysis(reader);
        Term term;
        while ((term = toAnalysis.next()) != null) {
            allCount += term.getName().length();
        }
        long end = System.currentTimeMillis();
        long elapsedMillis = end - start;
        System.out.println(elapsedMillis);
        // Clamp to 1 ms so a very fast run cannot divide by zero.
        System.out.println("共 " + allCount + " 个字符,每秒处理了:" + (allCount * 1000 / Math.max(1L, elapsedMillis)));
    } finally {
        reader.close();
    }
}
Also used : BufferedReader(java.io.BufferedReader) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) Term(org.ansj.domain.Term)

Example 5 with ToAnalysis

use of org.ansj.splitWord.analysis.ToAnalysis in project ansj_seg by NLPchina.

the class AppTest method main.

/**
 * Feeds one sample sentence to each analyzer type, passing a small stop-word filter along.
 *
 * <p>The stop words ("6", "7", "龙") are read line by line from an in-memory string and
 * inserted into a single {@code StopRecognition}; that filter is handed to the sibling
 * {@code parse} helper together with each analyzer.
 *
 * @param args unused
 * @throws IOException if reading the in-memory stop-word list fails
 */
public static void main(String[] args) throws IOException {
    String stopDicStr = "6\n7\n龙";
    StopRecognition testFilter = new StopRecognition();
    BufferedReader br = new BufferedReader(new StringReader(stopDicStr));
    try {
        String line;
        while ((line = br.readLine()) != null) {
            testFilter.insertStopWords(line);
        }
    } finally {
        br.close();
    }
    List<StopRecognition> filters = new ArrayList<StopRecognition>();
    filters.add(testFilter);

    String text = "龙虎胶囊 6 * 7cm";
    // A Reader is a single-pass stream: sharing one instance would leave it drained after
    // the first analyzer, so every analyzer is given its own reader over the same text.
    parse(new IndexAnalysis(new StringReader(text)), filters);
    parse(new ToAnalysis(new StringReader(text)), filters);
    parse(new DicAnalysis(new StringReader(text)), filters);
    parse(new NlpAnalysis(new StringReader(text)), filters);
    parse(new BaseAnalysis(new StringReader(text)), filters);
}
Also used : StopRecognition(org.ansj.recognition.impl.StopRecognition) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) BufferedReader(java.io.BufferedReader) StringReader(java.io.StringReader) ArrayList(java.util.ArrayList) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis)

Aggregations

ToAnalysis (org.ansj.splitWord.analysis.ToAnalysis)5 ArrayList (java.util.ArrayList)4 StopRecognition (org.ansj.recognition.impl.StopRecognition)4 BaseAnalysis (org.ansj.splitWord.analysis.BaseAnalysis)4 DicAnalysis (org.ansj.splitWord.analysis.DicAnalysis)4 IndexAnalysis (org.ansj.splitWord.analysis.IndexAnalysis)4 NlpAnalysis (org.ansj.splitWord.analysis.NlpAnalysis)4 BufferedReader (java.io.BufferedReader)3 StringReader (java.io.StringReader)2 List (java.util.List)2 AnsjTokenizer (org.ansj.lucene.util.AnsjTokenizer)2 SynonymsRecgnition (org.ansj.recognition.impl.SynonymsRecgnition)2 Analysis (org.ansj.splitWord.Analysis)2 Forest (org.nlpcn.commons.lang.tire.domain.Forest)2 SmartForest (org.nlpcn.commons.lang.tire.domain.SmartForest)2 Term (org.ansj.domain.Term)1