Search in sources :

Example 1 with NewWord

use of org.ansj.domain.NewWord in project ansj_seg by NLPchina.

In class NlpAnalysis, method getResult.

/**
 * Builds the final term list for the given graph by running an anonymous {@code Merger}.
 *
 * The merger walks the graph, feeds it to the learning tool, optionally runs person-name
 * recognition, merges in the CRF segmentation (when {@code splitWord} is available),
 * optionally runs number recognition, applies user-defined dictionary recognition and
 * new-word recognition, and finally collects the non-null entries of {@code graph.terms}.
 *
 * @param graph the segmentation graph to process; it is modified by the recognition steps
 * @return the terms remaining in {@code graph.terms} after all steps, in array order
 */
@Override
protected List<Term> getResult(final Graph graph) {
    Merger merger = new Merger() {

        @Override
        public List<Term> merger() {
            // Lazily create the learning tool when none has been supplied.
            if (learn == null) {
                learn = new LearnTool();
            }
            graph.walkPath();
            learn.learn(graph, splitWord, forests);
            // Person-name recognition
            if (graph.hasPerson && isNameRecognition) {
                // Asian person-name recognition
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // Foreign person-name recognition
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }
            if (splitWord != null) {
                // Collects weighted "previous<TAB>next" word pairs derived from the CRF output;
                // the collected map is handed to graph.walkPath(...) below.
                MapCount<String> mc = new MapCount<String>();
                // Segment with the CRF model
                List<String> words = splitWord.cut(graph.chars);
                // Previous accepted term (null at the start or after a rejected word).
                Term tempTerm = null;
                // Character offset of the current word within the input.
                int tempOff = 0;
                // Link the begin marker to the first CRF word, unless that word is a rule word.
                if (words.size() > 0) {
                    String word = words.get(0);
                    if (!isRuleWord(word)) {
                        mc.add("始##始" + TAB + word, CRF_WEIGHT);
                    }
                }
                for (String word : words) {
                    // Try to get the part of speech from the dictionary
                    TermNatures termNatures = new NatureRecognition(forests).getTermNatures(word);
                    Term term = null;
                    if (termNatures != TermNatures.NULL) {
                        term = new Term(word, tempOff, termNatures);
                    } else {
                        // Not found in the dictionary: mark the term as a new word.
                        term = new Term(word, tempOff, TermNatures.NW);
                        term.setNewWord(true);
                    }
                    // Advance the offset
                    tempOff += word.length();
                    if (isRuleWord(word)) {
                        // If the word is not valid, discard it
                        tempTerm = null;
                        continue;
                    }
                    if (term.isNewWord()) {
                        // Try to guess the part of speech
                        termNatures = NatureRecognition.guessNature(word);
                        term.updateTermNaturesAndNature(termNatures);
                    }
                    TermUtil.insertTerm(graph.terms, term, InsertTermType.SCORE_ADD_SORT);
                    // Be conservative about words that are not in the dictionary
                    if (tempTerm != null && !tempTerm.isNewWord() && !term.isNewWord()) {
                        mc.add(tempTerm.getName() + TAB + word, CRF_WEIGHT);
                    }
                    tempTerm = term;
                    if (term.isNewWord()) {
                        learn.addTerm(new NewWord(word, Nature.NW));
                    }
                }
                // Link the last accepted dictionary word to the end marker.
                if (tempTerm != null && !tempTerm.isNewWord()) {
                    mc.add(tempTerm.getName() + TAB + "末##末", CRF_WEIGHT);
                }
                graph.walkPath(mc.get());
            } else {
                LOG.warn("not find any crf model, make sure your config right? ");
            }
            // Number discovery
            if (graph.hasNum && isNumRecognition) {
                new NumRecognition().recognition(graph.terms);
            }
            // Part-of-speech tagging
            // NOTE(review): this list is not read before `result` is reassigned below.
            List<Term> result = getResult();
            // Recognition against user-defined dictionaries
            new UserDefineRecognition(InsertTermType.SCORE_ADD_SORT, forests).recognition(graph.terms);
            graph.rmLittlePath();
            graph.walkPathByScore();
            // New-word discovery
            new NewWordRecognition(learn).recognition(graph.terms);
            graph.walkPathByScore();
            // Re-obtain the optimal path after optimization
            result = getResult();
            // Activate the dictionary
            for (Term term : result) {
                learn.active(term.getName());
            }
            setRealName(graph, result);
            return result;
        }

        /**
         * Collects the non-null entries of graph.terms in index order.
         * The last array slot is excluded (presumably an end sentinel; confirm).
         */
        private List<Term> getResult() {
            List<Term> result = new ArrayList<Term>();
            int length = graph.terms.length - 1;
            for (int i = 0; i < length; i++) {
                if (graph.terms[i] == null) {
                    continue;
                }
                result.add(graph.terms[i]);
            }
            return result;
        }
    };
    return merger.merger();
}
Also used : TermNatures(org.ansj.domain.TermNatures) ArrayList(java.util.ArrayList) MapCount(org.nlpcn.commons.lang.util.MapCount) NewWordRecognition(org.ansj.recognition.arrimpl.NewWordRecognition) Term(org.ansj.domain.Term) AsianPersonRecognition(org.ansj.recognition.arrimpl.AsianPersonRecognition) NumRecognition(org.ansj.recognition.arrimpl.NumRecognition) ForeignPersonRecognition(org.ansj.recognition.arrimpl.ForeignPersonRecognition) UserDefineRecognition(org.ansj.recognition.arrimpl.UserDefineRecognition) NatureRecognition(org.ansj.recognition.impl.NatureRecognition) LearnTool(org.ansj.dic.LearnTool) NewWord(org.ansj.domain.NewWord)

Example 2 with NewWord

use of org.ansj.domain.NewWord in project ansj_seg by NLPchina.

In class LearnTool, method findForeignPerson.

/**
 * Runs foreign person-name recognition over the graph's terms and passes the
 * resulting new words to {@code addListToTerm}.
 *
 * @param graph graph whose {@code terms} array is scanned
 */
private void findForeignPerson(Graph graph) {
    final ForeignPersonRecognition recognizer = new ForeignPersonRecognition();
    addListToTerm(recognizer.getNewWords(graph.terms));
}
Also used : ForeignPersonRecognition(org.ansj.recognition.arrimpl.ForeignPersonRecognition) NewWord(org.ansj.domain.NewWord)

Example 3 with NewWord

use of org.ansj.domain.NewWord in project ansj_seg by NLPchina.

In class LearnTool, method addTerm.

/**
 * Adds a new word to the tree.
 *
 * If the forest already holds a branch for the word's name with a non-null
 * parameter, that stored entry is updated with the incoming word's nature and
 * frequency. Otherwise the counter is incremented, the word receives a score
 * ({@code -1} when no {@code splitWord} is set, else the negated cohesion of
 * its name) and it is inserted into the forest. Only the insertion itself is
 * performed while holding the lock on {@code sf}.
 *
 * @param newWord the word to add or merge into the tree
 */
public void addTerm(NewWord newWord) {
    final SmartForest<NewWord> branch = sf.getBranch(newWord.getName());
    final NewWord existing = (branch == null) ? null : branch.getParam();
    if (existing != null) {
        // Already known: merge the new observation into the stored entry.
        existing.update(newWord.getNature(), newWord.getAllFreq());
        return;
    }

    count++;
    if (splitWord != null) {
        newWord.setScore(-splitWord.cohesion(newWord.getName()));
    } else {
        newWord.setScore(-1);
    }
    synchronized (sf) {
        sf.add(newWord.getName(), newWord);
    }
}
Also used : NewWord(org.ansj.domain.NewWord)

Example 4 with NewWord

use of org.ansj.domain.NewWord in project ansj_seg by NLPchina.

In class LearnTool, method findAsianPerson.

/**
 * Runs Asian person-name recognition over the graph's terms and passes the
 * resulting new words to {@code addListToTerm}.
 *
 * @param graph graph whose {@code terms} array is scanned
 */
private void findAsianPerson(Graph graph) {
    final AsianPersonRecognition recognizer = new AsianPersonRecognition();
    addListToTerm(recognizer.getNewWords(graph.terms));
}
Also used : AsianPersonRecognition(org.ansj.recognition.arrimpl.AsianPersonRecognition) NewWord(org.ansj.domain.NewWord)

Example 5 with NewWord

use of org.ansj.domain.NewWord in project ansj_seg by NLPchina.

In class ForeignPersonRecognition, method getNewWords.

/**
 * Scans the given terms and returns candidate foreign person names as new words.
 *
 * Consecutive terms that look like name parts (nature {@code NR} or {@code NW},
 * or a single-character name) and pass {@code validate} are accumulated in
 * {@code tempList}. When a term that is not a name part is reached, a run of
 * more than one accumulated term is concatenated and emitted as a
 * {@code NewWord} with nature {@code NRF}; a run of exactly one term is dropped.
 * A run still pending when the array ends is not emitted.
 *
 * @param terms the term array to scan; {@code null} entries are skipped
 * @return the list of new words found, possibly empty
 */
public List<NewWord> getNewWords(Term[] terms) {
    this.terms = terms;
    final List<NewWord> found = new ArrayList<NewWord>();
    reset();
    for (Term current : terms) {
        if (current == null) {
            continue;
        }
        final String text = current.getName();
        final TermNatures natures = current.termNatures();

        // A name must not start with a person-name prefix/suffix term or with a
        // single character listed in ISNOTFIRST.
        if (tempList.size() == 0
                && (natures.personAttr.end > 10
                        || (text.length() == 1 && ISNOTFIRST.contains(text.charAt(0))))) {
            continue;
        }

        final boolean candidate = natures == TermNatures.NR || natures == TermNatures.NW || text.length() == 1;
        if (candidate) {
            if (validate(text)) {
                tempList.add(current);
            }
            continue;
        }

        // The current term ends a run: emit it only when it spans several terms.
        final int pending = tempList.size();
        if (pending == 1) {
            reset();
        } else if (pending > 1) {
            final StringBuilder joined = new StringBuilder();
            for (Term part : tempList) {
                joined.append(part.getName());
            }
            found.add(new NewWord(joined.toString(), Nature.NRF));
            reset();
        }
    }
    return found;
}
Also used : ArrayList(java.util.ArrayList) Term(org.ansj.domain.Term) NewWord(org.ansj.domain.NewWord)

Aggregations

NewWord (org.ansj.domain.NewWord) 7, ArrayList (java.util.ArrayList) 3, Term (org.ansj.domain.Term) 3, LearnTool (org.ansj.dic.LearnTool) 2, AsianPersonRecognition (org.ansj.recognition.arrimpl.AsianPersonRecognition) 2, ForeignPersonRecognition (org.ansj.recognition.arrimpl.ForeignPersonRecognition) 2, File (java.io.File) 1, Entry (java.util.Map.Entry) 1, TermNatures (org.ansj.domain.TermNatures) 1, NewWordRecognition (org.ansj.recognition.arrimpl.NewWordRecognition) 1, NumRecognition (org.ansj.recognition.arrimpl.NumRecognition) 1, UserDefineRecognition (org.ansj.recognition.arrimpl.UserDefineRecognition) 1, NatureRecognition (org.ansj.recognition.impl.NatureRecognition) 1, NlpAnalysis (org.ansj.splitWord.analysis.NlpAnalysis) 1, MapCount (org.nlpcn.commons.lang.util.MapCount) 1