Search in sources :

Example 1 with NatureRecognition

use of org.ansj.recognition.impl.NatureRecognition in project ansj_seg by NLPchina.

From the class NatureTagDemo, method main:

/**
 * Demo entry point: tags an already-segmented sentence and prints the resulting terms.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // The sentence is supplied pre-split into tokens; no segmentation happens here.
    List<String> tokens = Arrays.asList("对", "非", "ansj", "的", "分词", "结果", "进行", "词性", "标注");
    NatureRecognition tagger = new NatureRecognition();
    // NOTE(review): the second argument is presumably the starting offset of the first token — confirm.
    List<Term> tagged = tagger.recognition(tokens, 0);
    System.out.println(tagged);
}
Also used : NatureRecognition(org.ansj.recognition.impl.NatureRecognition) Term(org.ansj.domain.Term)

Example 2 with NatureRecognition

use of org.ansj.recognition.impl.NatureRecognition in project ansj_seg by NLPchina.

From the class NameFix, method nameAmbiguity:

/**
 * Disambiguates person names in a term array, modifying the array and the terms in place.
 *
 * <p>Pass 1 (split correction): when a two-character term whose natures are
 * {@code TermNatures.NR} is followed, two slots later, by a term whose
 * {@code personAttr.split} is positive, the first character of that following term is moved
 * onto the name. Example: 邓颖超生前 segmented as 邓颖 | 超生 | 前 is corrected to
 * 邓颖超 | 生 | 前. The remainder of the following term is stored as a new term at slot
 * {@code i + 3}, with natures looked up through a {@code NatureRecognition} built from
 * {@code forests}.
 *
 * <p>Pass 2 (foreign-name correction): a single-character term that
 * {@code WordAlert.CharCover} maps to {@code '·'} is merged with its linked predecessor and
 * successor into one term when both neighbours have a nature string starting with
 * {@code "nr"}.
 *
 * <p>Candidates whose required neighbouring slots or links are missing (out of range or
 * {@code null}) are skipped rather than raising an exception.
 *
 * @param terms   term array to fix up; entries may be {@code null}
 * @param forests dictionaries passed to {@code NatureRecognition} when re-tagging the remainder
 */
public static void nameAmbiguity(Term[] terms, Forest... forests) {
    // Pass 1 reads slot i + 2 and writes slot i + 3, so only iterate while both exist.
    for (int i = 0; i + 3 < terms.length; i++) {
        Term term = terms[i];
        if (term == null || term.termNatures() != TermNatures.NR || term.getName().length() != 2) {
            continue;
        }
        Term next = terms[i + 2];
        if (next == null || next.termNatures().personAttr.split <= 0) {
            continue;
        }
        // Move the first character of the following term onto the name.
        term.setName(term.getName() + next.getName().charAt(0));
        terms[i + 2] = null;
        // Whatever is left of the following term becomes a new term one slot further on.
        String name = next.getName().substring(1);
        terms[i + 3] = new Term(name, next.getOffe() + 1, new NatureRecognition(forests).getTermNatures(name));
        TermUtil.termLink(term, terms[i + 3]);
        TermUtil.termLink(terms[i + 3], next.to());
    }
    // Pass 2: foreign-name correction. Slot 0 is never a candidate, so start at 1.
    for (int i = 1; i < terms.length; i++) {
        Term term = terms[i];
        if (term == null || term.getName().length() != 1) {
            continue;
        }
        if (WordAlert.CharCover(term.getName().charAt(0)) != '·') {
            continue;
        }
        Term from = term.from();
        Term next = term.to();
        if (from == null || next == null) {
            // Nothing to join on one side; leave the separator untouched.
            continue;
        }
        if (from.natrue().natureStr.startsWith("nr") && next.natrue().natureStr.startsWith("nr")) {
            from.setName(from.getName() + term.getName() + next.getName());
            TermUtil.termLink(from, next.to());
            terms[i] = null;
            // The separator is one character long, so the merged successor sits in the next slot.
            if (i + 1 < terms.length) {
                terms[i + 1] = null;
            }
        }
    }
}
Also used : NatureRecognition(org.ansj.recognition.impl.NatureRecognition) Term(org.ansj.domain.Term)

Example 3 with NatureRecognition

use of org.ansj.recognition.impl.NatureRecognition in project ansj_seg by NLPchina.

From the class NlpAnalysis, method getResult:

/**
 * Produces the final term list for {@code graph} by running an anonymous {@code Merger}.
 *
 * <p>The merger, in order: walks the graph and feeds it to the learning tool; optionally runs
 * Asian and foreign person-name recognition; if a CRF splitter ({@code splitWord}) is
 * available, inserts its words into the graph and re-walks the path with the collected
 * word-pair weights; optionally runs number recognition; runs user-dictionary and new-word
 * recognition; then collects the surviving terms, activates each in the learning tool and
 * calls {@code setRealName} before returning them.
 *
 * @param graph the segmentation graph to process; its {@code terms} array is modified
 * @return the non-null terms of {@code graph.terms}, excluding the last slot
 */
@Override
protected List<Term> getResult(final Graph graph) {
    Merger merger = new Merger() {

        @Override
        public List<Term> merger() {
            // Create the learning tool on first use if none was supplied.
            if (learn == null) {
                learn = new LearnTool();
            }
            graph.walkPath();
            learn.learn(graph, splitWord, forests);
            // Person-name recognition
            if (graph.hasPerson && isNameRecognition) {
                // Asian person-name recognition
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // Foreign person-name recognition
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }
            if (splitWord != null) {
                // Collects weighted "previous<TAB>current" word pairs for the final path walk.
                MapCount<String> mc = new MapCount<String>();
                // Segment with the CRF model
                List<String> words = splitWord.cut(graph.chars);
                // Previously inserted term; reset to null when a rule word breaks the chain.
                Term tempTerm = null;
                // Running character offset of the current word.
                int tempOff = 0;
                // Link the sentence-begin marker to the first word unless it is a rule word.
                if (words.size() > 0) {
                    String word = words.get(0);
                    if (!isRuleWord(word)) {
                        mc.add("始##始" + TAB + word, CRF_WEIGHT);
                    }
                }
                for (String word : words) {
                    // Try to get the word's natures (part of speech) from the dictionaries
                    // NOTE(review): a new NatureRecognition is built for every word; it looks
                    // loop-invariant — confirm before hoisting.
                    TermNatures termNatures = new NatureRecognition(forests).getTermNatures(word);
                    Term term = null;
                    if (termNatures != TermNatures.NULL) {
                        term = new Term(word, tempOff, termNatures);
                    } else {
                        // Not found: treat as a new word.
                        term = new Term(word, tempOff, TermNatures.NW);
                        term.setNewWord(true);
                    }
                    // Advance the offset
                    tempOff += word.length();
                    if (isRuleWord(word)) {
                        // The word is not acceptable, so drop it
                        tempTerm = null;
                        continue;
                    }
                    if (term.isNewWord()) {
                        // Try to guess the natures
                        termNatures = NatureRecognition.guessNature(word);
                        term.updateTermNaturesAndNature(termNatures);
                    }
                    TermUtil.insertTerm(graph.terms, term, InsertTermType.SCORE_ADD_SORT);
                    // Be conservative about words that are not in the dictionary:
                    // only record a pair when neither side is a new word.
                    if (tempTerm != null && !tempTerm.isNewWord() && !term.isNewWord()) {
                        mc.add(tempTerm.getName() + TAB + word, CRF_WEIGHT);
                    }
                    tempTerm = term;
                    if (term.isNewWord()) {
                        learn.addTerm(new NewWord(word, Nature.NW));
                    }
                }
                // Link the last kept dictionary word to the sentence-end marker.
                if (tempTerm != null && !tempTerm.isNewWord()) {
                    mc.add(tempTerm.getName() + TAB + "末##末", CRF_WEIGHT);
                }
                graph.walkPath(mc.get());
            } else {
                LOG.warn("not find any crf model, make sure your config right? ");
            }
            // Number recognition
            if (graph.hasNum && isNumRecognition) {
                new NumRecognition().recognition(graph.terms);
            }
            // Part-of-speech tagging (original comment)
            // NOTE(review): this value is reassigned below before it is ever read, so this
            // call looks redundant — confirm.
            List<Term> result = getResult();
            // User-defined dictionary recognition
            new UserDefineRecognition(InsertTermType.SCORE_ADD_SORT, forests).recognition(graph.terms);
            graph.rmLittlePath();
            graph.walkPathByScore();
            // New-word discovery
            new NewWordRecognition(learn).recognition(graph.terms);
            graph.walkPathByScore();
            // Re-collect the best path after the optimisations above
            result = getResult();
            // Activate each resulting term in the learning tool
            for (Term term : result) {
                learn.active(term.getName());
            }
            setRealName(graph, result);
            return result;
        }

        /**
         * Collects the non-null entries of {@code graph.terms} in order, skipping the last
         * slot of the array.
         */
        private List<Term> getResult() {
            List<Term> result = new ArrayList<Term>();
            // The final slot is deliberately excluded from the result.
            int length = graph.terms.length - 1;
            for (int i = 0; i < length; i++) {
                if (graph.terms[i] == null) {
                    continue;
                }
                result.add(graph.terms[i]);
            }
            return result;
        }
    };
    return merger.merger();
}
Also used : TermNatures(org.ansj.domain.TermNatures) ArrayList(java.util.ArrayList) MapCount(org.nlpcn.commons.lang.util.MapCount) NewWordRecognition(org.ansj.recognition.arrimpl.NewWordRecognition) Term(org.ansj.domain.Term) AsianPersonRecognition(org.ansj.recognition.arrimpl.AsianPersonRecognition) NumRecognition(org.ansj.recognition.arrimpl.NumRecognition) ForeignPersonRecognition(org.ansj.recognition.arrimpl.ForeignPersonRecognition) UserDefineRecognition(org.ansj.recognition.arrimpl.UserDefineRecognition) NatureRecognition(org.ansj.recognition.impl.NatureRecognition) LearnTool(org.ansj.dic.LearnTool) NewWord(org.ansj.domain.NewWord)

Example 4 with NatureRecognition

use of org.ansj.recognition.impl.NatureRecognition in project ansj_seg by NLPchina.

From the class NatureDemo, method main:

/**
 * Demo entry point: segments a sample sentence, applies nature recognition to the result
 * and prints it.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared by the original demo signature
 */
public static void main(String[] args) throws IOException {
    final String text = "Ansj中文分词是一个真正的ict的实现.并且加入了自己的一些数据结构和算法的分词.实现了高效率和高准确率的完美结合!";
    Result parsed = ToAnalysis.parse(text);
    // Part-of-speech tagging
    NatureRecognition tagger = new NatureRecognition();
    parsed.recognition(tagger);
    System.out.println(parsed);
}
Also used : NatureRecognition(org.ansj.recognition.impl.NatureRecognition) Result(org.ansj.domain.Result)

Aggregations

NatureRecognition (org.ansj.recognition.impl.NatureRecognition): 4, Term (org.ansj.domain.Term): 3, ArrayList (java.util.ArrayList): 1, LearnTool (org.ansj.dic.LearnTool): 1, NewWord (org.ansj.domain.NewWord): 1, Result (org.ansj.domain.Result): 1, TermNatures (org.ansj.domain.TermNatures): 1, AsianPersonRecognition (org.ansj.recognition.arrimpl.AsianPersonRecognition): 1, ForeignPersonRecognition (org.ansj.recognition.arrimpl.ForeignPersonRecognition): 1, NewWordRecognition (org.ansj.recognition.arrimpl.NewWordRecognition): 1, NumRecognition (org.ansj.recognition.arrimpl.NumRecognition): 1, UserDefineRecognition (org.ansj.recognition.arrimpl.UserDefineRecognition): 1, MapCount (org.nlpcn.commons.lang.util.MapCount): 1