
Example 1 with MapCount

Use of org.nlpcn.commons.lang.util.MapCount in project ansj_seg by NLPchina.

From class Model, method writeModel.

/**
	 * Serialize this model to disk.
	 * 
	 * @param path the output file path
	 */
public void writeModel(String path) {
    try (FileOutputStream fso = new FileOutputStream(path);
            ObjectOutputStream oos = new ObjectOutputStream(new GZIPOutputStream(fso))) {
        oos.writeUTF(CRFModel.version);
        oos.writeObject(status);
        oos.writeObject(config.getTemplate());
        Map<String, float[]> map = featureTree.toMap();
        MapCount<Integer> mc = new MapCount<Integer>();
        for (float[] v : map.values()) {
            mc.add(v.length);
        }
        for (Entry<Integer, Double> entry : mc.get().entrySet()) {
            int win = entry.getKey();
            // vector width
            oos.writeInt(win);
            // number of feature vectors with this width
            oos.writeInt(entry.getValue().intValue());
            for (Entry<String, float[]> e : map.entrySet()) {
                if (e.getValue().length == win) {
                    oos.writeUTF(e.getKey());
                    float[] value = e.getValue();
                    for (int i = 0; i < win; i++) {
                        oos.writeFloat(value[i]);
                    }
                }
            }
        }
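        // a zero width and zero count terminate the feature blocks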
        oos.writeInt(0);
        oos.writeInt(0);
        oos.flush();
    } catch (FileNotFoundException e) {
        logger.warn("文件没有找到", e);
    } catch (IOException e) {
        logger.warn("IO异常", e);
    }
}
Also used : FileNotFoundException(java.io.FileNotFoundException) MapCount(org.nlpcn.commons.lang.util.MapCount) IOException(java.io.IOException) ObjectOutputStream(java.io.ObjectOutputStream) GZIPOutputStream(java.util.zip.GZIPOutputStream) FileOutputStream(java.io.FileOutputStream)
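
writeModel stores the feature weights grouped by vector width: for each distinct width it writes the width, the number of vectors of that width, and then every matching feature key followed by its floats, ending with a zero width and zero count. For orientation, a reader that mirrors exactly this layout could look like the sketch below. This is only an illustrative counterpart derived from the writer above, not the actual ansj_seg loader; the class and method names are made up, and the objects returned by readObject() are left untyped because their concrete classes are not shown here.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.GZIPInputStream;

// Hypothetical reader mirroring the layout produced by writeModel above (sketch only).
public class ModelLayoutReader {

    public static Map<String, float[]> readFeatureMap(String path) throws IOException, ClassNotFoundException {
        Map<String, float[]> map = new HashMap<>();
        try (ObjectInputStream ois = new ObjectInputStream(new GZIPInputStream(new FileInputStream(path)))) {
            String version = ois.readUTF();     // matches oos.writeUTF(CRFModel.version)
            Object status = ois.readObject();   // matches oos.writeObject(status); concrete type not shown above
            Object template = ois.readObject(); // matches oos.writeObject(config.getTemplate())
            while (true) {
                int win = ois.readInt();        // vector width
                int count = ois.readInt();      // number of feature vectors with this width
                if (win == 0 && count == 0) {
                    // the two trailing zero ints written by writeModel mark the end
                    break;
                }
                for (int i = 0; i < count; i++) {
                    String key = ois.readUTF();
                    float[] value = new float[win];
                    for (int j = 0; j < win; j++) {
                        value[j] = ois.readFloat();
                    }
                    map.put(key, value);
                }
            }
        }
        return map;
    }
}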

Example 2 with MapCount

Use of org.nlpcn.commons.lang.util.MapCount in project ansj_seg by NLPchina.

From class NlpAnalysis, method getResult.

@Override
protected List<Term> getResult(final Graph graph) {
    Merger merger = new Merger() {

        @Override
        public List<Term> merger() {
            if (learn == null) {
                learn = new LearnTool();
            }
            graph.walkPath();
            learn.learn(graph, splitWord, forests);
            // person-name recognition
            if (graph.hasPerson && isNameRecognition) {
                // Asian person-name recognition
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // foreign person-name recognition
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }
            if (splitWord != null) {
                MapCount<String> mc = new MapCount<String>();
                // segment with the CRF model
                List<String> words = splitWord.cut(graph.chars);
                Term tempTerm = null;
                int tempOff = 0;
                if (words.size() > 0) {
                    String word = words.get(0);
                    if (!isRuleWord(word)) {
                        mc.add("始##始" + TAB + word, CRF_WEIGHT);
                    }
                }
                for (String word : words) {
                    // try to fetch the term natures (POS) from the dictionaries
                    TermNatures termNatures = new NatureRecognition(forests).getTermNatures(word);
                    Term term = null;
                    if (termNatures != TermNatures.NULL) {
                        term = new Term(word, tempOff, termNatures);
                    } else {
                        term = new Term(word, tempOff, TermNatures.NW);
                        term.setNewWord(true);
                    }
                    // advance the character offset
                    tempOff += word.length();
                    if (isRuleWord(word)) {
                        // if the word is a rule word, drop it and reset the previous term
                        tempTerm = null;
                        continue;
                    }
                    if (term.isNewWord()) {
                        // try to guess the part of speech
                        termNatures = NatureRecognition.guessNature(word);
                        term.updateTermNaturesAndNature(termNatures);
                    }
                    TermUtil.insertTerm(graph.terms, term, InsertTermType.SCORE_ADD_SORT);
                    // be conservative about words that are not in the dictionary
                    if (tempTerm != null && !tempTerm.isNewWord() && !term.isNewWord()) {
                        mc.add(tempTerm.getName() + TAB + word, CRF_WEIGHT);
                    }
                    tempTerm = term;
                    if (term.isNewWord()) {
                        learn.addTerm(new NewWord(word, Nature.NW));
                    }
                }
                if (tempTerm != null && !tempTerm.isNewWord()) {
                    mc.add(tempTerm.getName() + TAB + "末##末", CRF_WEIGHT);
                }
                graph.walkPath(mc.get());
            } else {
                LOG.warn("not find any crf model, make sure your config right? ");
            }
            // number recognition
            if (graph.hasNum && isNumRecognition) {
                new NumRecognition().recognition(graph.terms);
            }
            // part-of-speech tagging
            List<Term> result = getResult();
            // recognition against user-defined dictionaries
            new UserDefineRecognition(InsertTermType.SCORE_ADD_SORT, forests).recognition(graph.terms);
            graph.rmLittlePath();
            graph.walkPathByScore();
            // new-word discovery
            new NewWordRecognition(learn).recognition(graph.terms);
            graph.walkPathByScore();
            // re-fetch the optimal path after the optimizations above
            result = getResult();
            // activate dictionary entries
            for (Term term : result) {
                learn.active(term.getName());
            }
            setRealName(graph, result);
            return result;
        }

        private List<Term> getResult() {
            List<Term> result = new ArrayList<Term>();
            int length = graph.terms.length - 1;
            for (int i = 0; i < length; i++) {
                if (graph.terms[i] == null) {
                    continue;
                }
                result.add(graph.terms[i]);
            }
            return result;
        }
    };
    return merger.merger();
}
Also used : TermNatures(org.ansj.domain.TermNatures) ArrayList(java.util.ArrayList) MapCount(org.nlpcn.commons.lang.util.MapCount) NewWordRecognition(org.ansj.recognition.arrimpl.NewWordRecognition) Term(org.ansj.domain.Term) AsianPersonRecognition(org.ansj.recognition.arrimpl.AsianPersonRecognition) NumRecognition(org.ansj.recognition.arrimpl.NumRecognition) ForeignPersonRecognition(org.ansj.recognition.arrimpl.ForeignPersonRecognition) UserDefineRecognition(org.ansj.recognition.arrimpl.UserDefineRecognition) NatureRecognition(org.ansj.recognition.impl.NatureRecognition) LearnTool(org.ansj.dic.LearnTool) NewWord(org.ansj.domain.NewWord)
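
Here MapCount<String> collects weighted bigram keys of the form previousWord + TAB + currentWord (with 始##始 and 末##末 as sentence-boundary markers), and the resulting map biases graph.walkPath(mc.get()). The stand-alone sketch below only illustrates how those keys are built: it uses a plain HashMap in place of MapCount, invents a CRF_WEIGHT value, and skips the rule-word and new-word filtering that the real merger applies.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Toy illustration of the bigram-weight map built from a CRF segmentation (sketch only).
public class CrfBigramSketch {

    private static final String TAB = "\t";        // same separator as the snippet
    private static final double CRF_WEIGHT = -0.5; // hypothetical value; the real constant lives in NlpAnalysis

    public static Map<String, Double> bigramWeights(List<String> words) {
        Map<String, Double> mc = new HashMap<>();  // stands in for MapCount<String>
        String prev = "始##始";                    // begin-of-sentence marker used by the snippet
        for (String word : words) {
            mc.merge(prev + TAB + word, CRF_WEIGHT, Double::sum);
            prev = word;
        }
        mc.merge(prev + TAB + "末##末", CRF_WEIGHT, Double::sum); // end-of-sentence marker
        return mc;
    }

    public static void main(String[] args) {
        // e.g. a CRF segmentation into three tokens
        System.out.println(bigramWeights(Arrays.asList("我", "爱", "北京")));
    }
}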

Example 3 with MapCount

Use of org.nlpcn.commons.lang.util.MapCount in project ansj_seg by NLPchina.

From class SummaryComputer, method explan.

/**
	 * Compute the summary.
	 * 
	 * @param keywords the ranked keywords
	 * @param content the full text to summarize
	 * @return the computed summary
	 */
private Summary explan(List<Keyword> keywords, String content) {
    SmartForest<Double> sf = new SmartForest<Double>();
    for (Keyword keyword : keywords) {
        sf.add(keyword.getName(), keyword.getScore());
    }
    // split the content into sentences first
    List<Sentence> sentences = toSentenceList(content.toCharArray());
    for (Sentence sentence : sentences) {
        computeScore(sentence, sf);
    }
    double maxScore = 0;
    int maxIndex = 0;
    MapCount<String> mc = new MapCount<>();
    for (int i = 0; i < sentences.size(); i++) {
        double tempScore = sentences.get(i).score;
        int tempLength = sentences.get(i).value.length();
        mc.addAll(sentences.get(i).mc.get());
        if (tempLength >= len) {
            tempScore = tempScore * mc.get().size();
            if (maxScore < tempScore) {
                maxScore = tempScore;
                maxIndex = i;
                continue;
            }
            mc.get().clear();
        }
        for (int j = i + 1; j < sentences.size(); j++) {
            tempScore += sentences.get(j).score;
            tempLength += sentences.get(j).value.length();
            mc.addAll(sentences.get(j).mc.get());
            if (tempLength >= len) {
                tempScore = tempScore * mc.get().size();
                if (maxScore < tempScore) {
                    maxScore = tempScore;
                    maxIndex = i;
                }
                mc.get().clear();
                break;
            }
        }
        if (tempLength < len) {
            tempScore = tempScore * mc.get().size();
            if (maxScore < tempScore) {
                maxScore = tempScore;
                maxIndex = i;
                break;
            }
            mc.get().clear();
        }
    }
    StringBuilder sb = new StringBuilder();
    for (int i = maxIndex; i < sentences.size(); i++) {
        sb.append(sentences.get(i).value);
        if (sb.length() > len) {
            break;
        }
    }
    String summaryStr = sb.toString();
    if (isSplitSummary && sb.length() > len) {
        double value = len;
        StringBuilder newSummary = new StringBuilder();
        char c = 0;
        for (int i = 0; i < sb.length(); i++) {
            c = sb.charAt(i);
            if (c < 256) {
                value -= 0.5;
            } else {
                value -= 1;
            }
            if (value < 0) {
                break;
            }
            newSummary.append(c);
        }
        summaryStr = newSummary.toString();
    }
    return new Summary(keywords, summaryStr);
}
Also used : Keyword(org.ansj.app.keyword.Keyword) MapCount(org.nlpcn.commons.lang.util.MapCount) SmartForest(org.nlpcn.commons.lang.tire.domain.SmartForest) Summary(org.ansj.app.summary.pojo.Summary)
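
The final block of explan() trims the chosen sentences to an approximate display width: characters below code point 256 cost half a unit, everything else a full unit, and the loop stops once the budget is spent. Pulled out into a stand-alone helper (the class and method names are hypothetical; the rule itself is taken directly from the loop above), the same logic reads:

// Stand-alone version of the trimming rule used at the end of explan() (sketch only).
public class SummaryTrimSketch {

    public static String trimToDisplayLength(String text, double len) {
        double remaining = len;
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            remaining -= (c < 256) ? 0.5 : 1.0; // half-width for single-byte chars, full width otherwise
            if (remaining < 0) {
                break;
            }
            out.append(c);
        }
        return out.toString();
    }

    public static void main(String[] args) {
        // a mixed ASCII/CJK string trimmed to a display width of 6 units
        System.out.println(trimToDisplayLength("ansj_seg 摘要示例", 6));
    }
}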

Aggregations

MapCount (org.nlpcn.commons.lang.util.MapCount)3 FileNotFoundException (java.io.FileNotFoundException)1 FileOutputStream (java.io.FileOutputStream)1 IOException (java.io.IOException)1 ObjectOutputStream (java.io.ObjectOutputStream)1 ArrayList (java.util.ArrayList)1 GZIPOutputStream (java.util.zip.GZIPOutputStream)1 Keyword (org.ansj.app.keyword.Keyword)1 Summary (org.ansj.app.summary.pojo.Summary)1 LearnTool (org.ansj.dic.LearnTool)1 NewWord (org.ansj.domain.NewWord)1 Term (org.ansj.domain.Term)1 TermNatures (org.ansj.domain.TermNatures)1 AsianPersonRecognition (org.ansj.recognition.arrimpl.AsianPersonRecognition)1 ForeignPersonRecognition (org.ansj.recognition.arrimpl.ForeignPersonRecognition)1 NewWordRecognition (org.ansj.recognition.arrimpl.NewWordRecognition)1 NumRecognition (org.ansj.recognition.arrimpl.NumRecognition)1 UserDefineRecognition (org.ansj.recognition.arrimpl.UserDefineRecognition)1 NatureRecognition (org.ansj.recognition.impl.NatureRecognition)1 SmartForest (org.nlpcn.commons.lang.tire.domain.SmartForest)1