Example 1 with SmartForest

Use of org.nlpcn.commons.lang.tire.domain.SmartForest in project ansj_seg by NLPchina.

From the class AnsjAnalyzer, method getTokenizer.

/**
	 * Obtain a tokenizer.
	 * 
	 * @param reader the input to tokenize
	 * @param args   tokenizer options (type, dictionaries, recognition switches)
	 * @return the configured Tokenizer
	 */
public static Tokenizer getTokenizer(Reader reader, Map<String, String> args) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("to create tokenizer " + args);
    }
    Analysis analysis = null;
    String temp = null;
    String type = args.get("type");
    if (type == null) {
        type = AnsjAnalyzer.TYPE.base_ansj.name();
    }
    switch(AnsjAnalyzer.TYPE.valueOf(type)) {
        case base_ansj:
            analysis = new BaseAnalysis();
            break;
        case index_ansj:
            analysis = new IndexAnalysis();
            break;
        case dic_ansj:
            analysis = new DicAnalysis();
            break;
        case query_ansj:
            analysis = new ToAnalysis();
            break;
        case nlp_ansj:
            analysis = new NlpAnalysis();
            if (StringUtil.isNotBlank(temp = args.get(CrfLibrary.DEFAULT))) {
                ((NlpAnalysis) analysis).setCrfModel(CrfLibrary.get(temp));
            }
            break;
        default:
            analysis = new BaseAnalysis();
    }
    if (reader != null) {
        analysis.resetContent(reader);
    }
    if (StringUtil.isNotBlank(temp = args.get(DicLibrary.DEFAULT))) {
        // user-defined dictionaries
        String[] split = temp.split(",");
        Forest[] forests = new Forest[split.length];
        for (int i = 0; i < forests.length; i++) {
            if (StringUtil.isBlank(split[i])) {
                continue;
            }
            forests[i] = DicLibrary.get(split[i]);
        }
        analysis.setForests(forests);
    }
    List<StopRecognition> filters = null;
    if (StringUtil.isNotBlank(temp = args.get(StopLibrary.DEFAULT))) {
        // user-defined stop-word dictionaries
        String[] split = temp.split(",");
        filters = new ArrayList<StopRecognition>();
        for (String key : split) {
            StopRecognition stop = StopLibrary.get(key.trim());
            if (stop != null)
                filters.add(stop);
        }
    }
    List<SynonymsRecgnition> synonyms = null;
    if (StringUtil.isNotBlank(temp = args.get(SynonymsLibrary.DEFAULT))) {
        // synonym dictionaries
        String[] split = temp.split(",");
        synonyms = new ArrayList<SynonymsRecgnition>();
        for (String key : split) {
            SmartForest<List<String>> sf = SynonymsLibrary.get(key.trim());
            if (sf != null)
                synonyms.add(new SynonymsRecgnition(sf));
        }
    }
    if (StringUtil.isNotBlank(temp = args.get(AmbiguityLibrary.DEFAULT))) {
        // ambiguity dictionary
        analysis.setAmbiguityForest(AmbiguityLibrary.get(temp.trim()));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNameRecognition"))) {
        // whether to enable person-name recognition
        analysis.setIsNameRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isNumRecognition"))) {
        // whether to enable number recognition
        analysis.setIsNumRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isQuantifierRecognition"))) {
        // whether to enable quantifier recognition
        analysis.setIsQuantifierRecognition(Boolean.valueOf(temp));
    }
    if (StringUtil.isNotBlank(temp = args.get("isRealName"))) {
        // whether to keep the original characters
        analysis.setIsRealName(Boolean.valueOf(temp));
    }
    return new AnsjTokenizer(analysis, filters, synonyms);
}
Also used : Analysis(org.ansj.splitWord.Analysis) BaseAnalysis(org.ansj.splitWord.analysis.BaseAnalysis) IndexAnalysis(org.ansj.splitWord.analysis.IndexAnalysis) DicAnalysis(org.ansj.splitWord.analysis.DicAnalysis) ToAnalysis(org.ansj.splitWord.analysis.ToAnalysis) NlpAnalysis(org.ansj.splitWord.analysis.NlpAnalysis) StopRecognition(org.ansj.recognition.impl.StopRecognition) SynonymsRecgnition(org.ansj.recognition.impl.SynonymsRecgnition) Forest(org.nlpcn.commons.lang.tire.domain.Forest) SmartForest(org.nlpcn.commons.lang.tire.domain.SmartForest) AnsjTokenizer(org.ansj.lucene.util.AnsjTokenizer) ArrayList(java.util.ArrayList) List(java.util.List)
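
For orientation, here is a minimal sketch of how this factory might be called. Only the option keys ("type", "isNameRecognition", "isQuantifierRecognition") and the getTokenizer signature come from the code above; the class name GetTokenizerSketch, the sample text, and the AnsjAnalyzer import path (which depends on the Lucene plug-in module in use) are illustrative assumptions.

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.ansj.lucene5.AnsjAnalyzer; // assumption: the package depends on the plug-in module (lucene5 shown)

public class GetTokenizerSketch {

    public static void main(String[] args) throws Exception {
        // options read by getTokenizer above; everything except "type" is optional
        Map<String, String> options = new HashMap<String, String>();
        options.put("type", "dic_ansj"); // base_ansj, index_ansj, dic_ansj, query_ansj or nlp_ansj
        options.put("isNameRecognition", "true"); // enable person-name recognition
        options.put("isQuantifierRecognition", "false"); // disable quantifier recognition

        Tokenizer tokenizer = AnsjAnalyzer.getTokenizer(new StringReader("这是一个测试"), options);
        // the returned Tokenizer is then consumed through the usual Lucene TokenStream cycle
        // (reset() / incrementToken() / end() / close()); here it is only closed again
        tokenizer.close();
    }
}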

Example 2 with SmartForest

Use of org.nlpcn.commons.lang.tire.domain.SmartForest in project ansj_seg by NLPchina.

From the class TagContent, method tagContent.

public String tagContent(List<Keyword> keyWords, String content) {
    // build a trie of lower-cased keywords mapped to their scores
    SmartForest<Double> sf = new SmartForest<Double>();
    for (Keyword keyWord : keyWords) {
        sf.add(keyWord.getName().toLowerCase(), keyWord.getScore());
    }
    // scan the lower-cased content for keyword matches
    SmartGetWord<Double> sgw = new SmartGetWord<Double>(sf, content.toLowerCase());
    int beginOffe = 0;
    String temp = null;
    StringBuilder sb = new StringBuilder();
    while ((temp = sgw.getFrontWords()) != null) {
        // copy the text before the match, then wrap the match with beginTag/endTag
        sb.append(content.substring(beginOffe, sgw.offe));
        sb.append(beginTag);
        sb.append(content.substring(sgw.offe, sgw.offe + temp.length()));
        sb.append(endTag);
        beginOffe = sgw.offe + temp.length();
    }
    // append the remainder after the last match
    if (beginOffe <= content.length() - 1) {
        sb.append(content.substring(beginOffe, content.length()));
    }
    return sb.toString();
}
Also used : SmartForest(org.nlpcn.commons.lang.tire.domain.SmartForest) Keyword(org.ansj.app.keyword.Keyword) SmartGetWord(org.nlpcn.commons.lang.tire.SmartGetWord)
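
The same SmartForest/SmartGetWord pattern can be exercised on its own. Below is a minimal sketch that uses only the calls appearing in tagContent above (add, the SmartGetWord constructor, getFrontWords, and the offe offset field); the keywords, scores, and sample text are made up for illustration.

import org.nlpcn.commons.lang.tire.SmartGetWord;
import org.nlpcn.commons.lang.tire.domain.SmartForest;

public class SmartForestScanSketch {

    public static void main(String[] args) {
        // build a small trie mapping keywords to scores, as tagContent does
        SmartForest<Double> forest = new SmartForest<Double>();
        forest.add("分词", 1.0);
        forest.add("全文检索", 0.8);

        String content = "ansj分词可以用于全文检索场景";
        SmartGetWord<Double> scanner = new SmartGetWord<Double>(forest, content);

        // getFrontWords() yields each keyword hit in order; offe is its offset in content
        String hit;
        while ((hit = scanner.getFrontWords()) != null) {
            System.out.println(hit + " @ " + scanner.offe);
        }
    }
}

Because all keywords live in one trie, a single left-to-right pass over the text finds every hit, instead of matching each keyword against the content separately.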

Example 3 with SmartForest

Use of org.nlpcn.commons.lang.tire.domain.SmartForest in project ansj_seg by NLPchina.

From the class SummaryComputer, method explan.

/**
	 * Compute the summary.
	 * 
	 * @param keywords the ranked keywords
	 * @param content  the text to summarize
	 * @return the computed Summary
	 */
private Summary explan(List<Keyword> keywords, String content) {
    SmartForest<Double> sf = new SmartForest<Double>();
    for (Keyword keyword : keywords) {
        sf.add(keyword.getName(), keyword.getScore());
    }
    // split the content into sentences first
    List<Sentence> sentences = toSentenceList(content.toCharArray());
    for (Sentence sentence : sentences) {
        computeScore(sentence, sf);
    }
    // find the starting sentence whose window of following sentences (totalling at least len
    // characters) scores highest; the window score is weighted by the number of distinct keywords hit
    double maxScore = 0;
    int maxIndex = 0;
    MapCount<String> mc = new MapCount<>();
    for (int i = 0; i < sentences.size(); i++) {
        double tempScore = sentences.get(i).score;
        int tempLength = sentences.get(i).value.length();
        mc.addAll(sentences.get(i).mc.get());
        if (tempLength >= len) {
            tempScore = tempScore * mc.get().size();
            if (maxScore < tempScore) {
                maxScore = tempScore;
                maxIndex = i;
                continue;
            }
            mc.get().clear();
        }
        for (int j = i + 1; j < sentences.size(); j++) {
            tempScore += sentences.get(j).score;
            tempLength += sentences.get(j).value.length();
            mc.addAll(sentences.get(j).mc.get());
            if (tempLength >= len) {
                tempScore = tempScore * mc.get().size();
                if (maxScore < tempScore) {
                    maxScore = tempScore;
                    maxIndex = i;
                }
                mc.get().clear();
                break;
            }
        }
        if (tempLength < len) {
            tempScore = tempScore * mc.get().size();
            if (maxScore < tempScore) {
                maxScore = tempScore;
                maxIndex = i;
                break;
            }
            mc.get().clear();
        }
    }
    // assemble the summary starting from the best-scoring sentence
    StringBuilder sb = new StringBuilder();
    for (int i = maxIndex; i < sentences.size(); i++) {
        sb.append(sentences.get(i).value);
        if (sb.length() > len) {
            break;
        }
    }
    String summaryStr = sb.toString();
    if (isSplitSummary && sb.length() > len) {
        // trim to roughly len display columns: characters below 256 count as half-width
        double value = len;
        StringBuilder newSummary = new StringBuilder();
        char c = 0;
        for (int i = 0; i < sb.length(); i++) {
            c = sb.charAt(i);
            if (c < 256) {
                value -= 0.5;
            } else {
                value -= 1;
            }
            if (value < 0) {
                break;
            }
            newSummary.append(c);
        }
        summaryStr = newSummary.toString();
    }
    return new Summary(keywords, summaryStr);
}
Also used : Keyword(org.ansj.app.keyword.Keyword) MapCount(org.nlpcn.commons.lang.util.MapCount) SmartForest(org.nlpcn.commons.lang.tire.domain.SmartForest) Summary(org.ansj.app.summary.pojo.Summary)
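
The tail of explan shortens the summary by display width rather than raw character count: characters below 256 are charged half a unit, all other characters a full unit. A self-contained restatement of that rule follows; the class name, method name, and sample values are illustrative, not part of the project.

public class WidthTrimSketch {

    /**
     * Trim text to roughly maxWidth display columns, counting characters below 256
     * as half-width, mirroring the trimming loop at the end of explan above.
     */
    static String trimToWidth(String text, int maxWidth) {
        double budget = maxWidth;
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            budget -= (c < 256) ? 0.5 : 1; // narrow characters cost half a unit, wide characters a full unit
            if (budget < 0) {
                break;
            }
            out.append(c);
        }
        return out.toString();
    }

    public static void main(String[] args) {
        System.out.println(trimToWidth("ansj 摘要测试文本 summary", 8));
    }
}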

Aggregations

SmartForest (org.nlpcn.commons.lang.tire.domain.SmartForest) 4
ArrayList (java.util.ArrayList) 2
List (java.util.List) 2
Keyword (org.ansj.app.keyword.Keyword) 2
AnsjTokenizer (org.ansj.lucene.util.AnsjTokenizer) 2
StopRecognition (org.ansj.recognition.impl.StopRecognition) 2
SynonymsRecgnition (org.ansj.recognition.impl.SynonymsRecgnition) 2
Analysis (org.ansj.splitWord.Analysis) 2
BaseAnalysis (org.ansj.splitWord.analysis.BaseAnalysis) 2
DicAnalysis (org.ansj.splitWord.analysis.DicAnalysis) 2
IndexAnalysis (org.ansj.splitWord.analysis.IndexAnalysis) 2
NlpAnalysis (org.ansj.splitWord.analysis.NlpAnalysis) 2
ToAnalysis (org.ansj.splitWord.analysis.ToAnalysis) 2
Forest (org.nlpcn.commons.lang.tire.domain.Forest) 2
Summary (org.ansj.app.summary.pojo.Summary) 1
SmartGetWord (org.nlpcn.commons.lang.tire.SmartGetWord) 1
MapCount (org.nlpcn.commons.lang.util.MapCount) 1