Search in sources :

Example 31 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class Analysis method next.

/**
 * Returns the next segmented term, reading more input when the buffer is empty.
 * <p>
 * Intended to be called in a {@code while} loop; a {@code null} return value
 * means segmentation has finished. Terms already buffered in {@code terms} are
 * drained first. Otherwise lines are read until a non-blank one is found, that
 * line is handed to {@code fullTerms}, and the first buffered term is returned.
 *
 * @return the next term, after {@code updateOffe(offe)} has been applied to it,
 *         or {@code null} when the reader returns {@code null} or the line
 *         produced no terms
 * @throws IOException if reading from the underlying reader fails
 */
public Term next() throws IOException {
    if (!terms.isEmpty()) {
        Term buffered = terms.poll();
        buffered.updateOffe(offe);
        return buffered;
    }
    String line = br.readLine();
    offe = br.getStart();
    while (StringUtil.isBlank(line)) {
        if (line == null) {
            return null;
        }
        line = br.readLine();
        // Refresh the base offset on every read, not only the first one, so that
        // skipped blank lines do not leave a stale offset for the line that is
        // actually segmented.
        // NOTE(review): assumes br.getStart() reports the start offset of the
        // most recently read line - confirm against the reader implementation.
        offe = br.getStart();
    }
    // Segment the line (ambiguity handling); presumably this fills `terms`.
    fullTerms(line);
    if (terms.isEmpty()) {
        return null;
    }
    Term first = terms.poll();
    first.updateOffe(offe);
    return first;
}
Also used : Term(org.ansj.domain.Term)

Example 32 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class DicAnalysis method getResult.

/**
 * Builds the term list for a graph, applying the user-defined dictionaries
 * before the default path walk and the optional number / person-name
 * recognition passes.
 *
 * @param graph the segmentation graph to resolve
 * @return the non-null terms of the graph (the last slot is excluded), after
 *         {@code setRealName} has been applied
 */
@Override
protected List<Term> getResult(final Graph graph) {
    return new Merger() {

        @Override
        public List<Term> merger() {
            // User-defined dictionary recognition runs first in this analyzer.
            applyUserDictionaries(graph, forests);
            graph.walkPath();

            // Number recognition
            if (isNumRecognition && graph.hasNum) {
                new NumRecognition().recognition(graph.terms);
            }

            // Person-name recognition
            if (graph.hasPerson && isNameRecognition) {
                // Asian person names
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // Foreign person names
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }
            return collectTerms();
        }

        /**
         * Inserts every word found by the given dictionaries into the graph
         * (replacing existing entries), then prunes and re-walks the graph.
         * Dictionaries are visited from last to first; null entries are skipped.
         */
        private void applyUserDictionaries(final Graph target, Forest... dictionaries) {
            if (dictionaries == null) {
                return;
            }
            final int baseOffset = target.terms[0].getOffe();
            for (int idx = dictionaries.length - 1; idx >= 0; idx--) {
                Forest dictionary = dictionaries[idx];
                if (dictionary == null) {
                    continue;
                }
                GetWord matcher = dictionary.getWord(target.chars);
                for (String hit = matcher.getAllWords(); hit != null; hit = matcher.getAllWords()) {
                    // Only positions that still hold a term in the graph are considered.
                    if (target.terms[matcher.offe] != null) {
                        int freq = parseIntOrDefault(matcher.getParam()[1], 50);
                        Term candidate = new Term(hit, baseOffset + matcher.offe, matcher.getParam()[0], freq);
                        candidate.selfScore(-Math.pow(Math.log(freq), hit.length()));
                        TermUtil.insertTerm(target.terms, candidate, InsertTermType.REPLACE);
                    }
                }
            }
            target.rmLittlePath();
            target.walkPathByScore();
            target.rmLittlePath();
        }

        /** Parses {@code text} as an int, returning {@code fallback} when it is not a valid number. */
        private int parseIntOrDefault(String text, int fallback) {
            int parsed;
            try {
                parsed = Integer.parseInt(text);
            } catch (NumberFormatException e) {
                parsed = fallback;
            }
            return parsed;
        }

        /** Collects the non-null terms of the graph, excluding its last slot. */
        private List<Term> collectTerms() {
            List<Term> collected = new ArrayList<Term>();
            for (int i = 0, end = graph.terms.length - 1; i < end; i++) {
                Term current = graph.terms[i];
                if (current == null) {
                    continue;
                }
                collected.add(current);
            }
            setRealName(graph, collected);
            return collected;
        }
    }.merger();
}
Also used : ArrayList(java.util.ArrayList) Term(org.ansj.domain.Term) AsianPersonRecognition(org.ansj.recognition.arrimpl.AsianPersonRecognition) NumRecognition(org.ansj.recognition.arrimpl.NumRecognition) Graph(org.ansj.util.Graph) ForeignPersonRecognition(org.ansj.recognition.arrimpl.ForeignPersonRecognition) Forest(org.nlpcn.commons.lang.tire.domain.Forest) GetWord(org.nlpcn.commons.lang.tire.GetWord)

Example 33 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class IndexAnalysis method getResult.

/**
 * Builds the index-oriented term list for a graph: the best-path terms plus
 * every additional word the user dictionaries find in the text, sorted by
 * offset.
 *
 * @param graph the segmentation graph to resolve
 * @return terms ordered by ascending offset; terms sharing an offset are
 *         ordered longest name first
 */
@Override
protected List<Term> getResult(final Graph graph) {
    Merger merger = new Merger() {

        @Override
        public List<Term> merger() {
            graph.walkPath();
            // Number recognition
            if (isNumRecognition && graph.hasNum) {
                new NumRecognition().recognition(graph.terms);
            }
            // Person-name recognition
            if (graph.hasPerson && isNameRecognition) {
                // Asian person names
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // Foreign person names
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }
            // User-defined dictionary recognition
            userDefineRecognition(graph, forests);
            return result();
        }

        private void userDefineRecognition(final Graph graph, Forest... forests) {
            new UserDefineRecognition(InsertTermType.SKIP, forests).recognition(graph.terms);
            graph.rmLittlePath();
            graph.walkPathByScore();
        }

        /**
         * Builds the de-duplication key for a word at a given offset.
         * A separator keeps name and offset apart: plain concatenation would map
         * e.g. ("1", 23) and ("12", 3) to the same key "123".
         */
        private String dedupKey(String name, int offset) {
            return name + '\t' + offset;
        }

        /**
         * Segmentation result for indexing / retrieval.
         *
         * @return the graph terms plus all not-yet-seen dictionary words, sorted
         */
        private List<Term> result() {
            Set<String> seen = new HashSet<String>();
            List<Term> result = new LinkedList<Term>();
            int length = graph.terms.length - 1;
            for (int i = 0; i < length; i++) {
                Term current = graph.terms[i];
                if (current != null) {
                    result.add(current);
                    seen.add(dedupKey(current.getName(), current.getOffe()));
                }
            }
            if (forests != null) {
                char[] chars = graph.chars;
                for (Forest forest : forests) {
                    if (forest == null) {
                        continue;
                    }
                    GetWord word = forest.getWord(chars);
                    String found;
                    while ((found = word.getAllWords()) != null) {
                        // Set.add returns false when the key is already present,
                        // so each (name, offset) pair is emitted at most once.
                        if (seen.add(dedupKey(found, word.offe))) {
                            result.add(new Term(found, word.offe, word.getParam(0), ObjConver.getIntValue(word.getParam(1))));
                        }
                    }
                }
            }
            Collections.sort(result, new Comparator<Term>() {

                @Override
                public int compare(Term o1, Term o2) {
                    // Same offset: longer names first; otherwise ascending offset.
                    if (o1.getOffe() == o2.getOffe()) {
                        return o2.getName().length() - o1.getName().length();
                    } else {
                        return o1.getOffe() - o2.getOffe();
                    }
                }
            });
            setRealName(graph, result);
            return result;
        }
    };
    return merger.merger();
}
Also used : Term(org.ansj.domain.Term) AsianPersonRecognition(org.ansj.recognition.arrimpl.AsianPersonRecognition) LinkedList(java.util.LinkedList) NumRecognition(org.ansj.recognition.arrimpl.NumRecognition) Graph(org.ansj.util.Graph) ForeignPersonRecognition(org.ansj.recognition.arrimpl.ForeignPersonRecognition) UserDefineRecognition(org.ansj.recognition.arrimpl.UserDefineRecognition) Forest(org.nlpcn.commons.lang.tire.domain.Forest) HashSet(java.util.HashSet) GetWord(org.nlpcn.commons.lang.tire.GetWord)

Example 34 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class NlpAnalysis method getResult.

/**
 * Builds the term list for NLP-mode segmentation of a graph: default path
 * walk, learning, person-name recognition, CRF-based segmentation (when a
 * model is available), number recognition, user-dictionary recognition and
 * new-word recognition.
 *
 * @param graph the segmentation graph to resolve
 * @return the non-null terms of the graph (the last slot is excluded), after
 *         {@code setRealName} has been applied
 */
@Override
protected List<Term> getResult(final Graph graph) {
    Merger merger = new Merger() {

        @Override
        public List<Term> merger() {
            if (learn == null) {
                learn = new LearnTool();
            }
            graph.walkPath();
            learn.learn(graph, splitWord, forests);
            // Person-name recognition
            if (graph.hasPerson && isNameRecognition) {
                // Asian person names
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // Foreign person names
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }
            if (splitWord != null) {
                MapCount<String> mc = new MapCount<String>();
                // Segment with the CRF model
                List<String> words = splitWord.cut(graph.chars);
                Term tempTerm = null;
                int tempOff = 0;
                if (words.size() > 0) {
                    String word = words.get(0);
                    if (!isRuleWord(word)) {
                        mc.add("始##始" + TAB + word, CRF_WEIGHT);
                    }
                }
                for (String word : words) {
                    // Try to obtain the part of speech from the dictionaries
                    TermNatures termNatures = new NatureRecognition(forests).getTermNatures(word);
                    Term term = null;
                    if (termNatures != TermNatures.NULL) {
                        term = new Term(word, tempOff, termNatures);
                    } else {
                        term = new Term(word, tempOff, TermNatures.NW);
                        term.setNewWord(true);
                    }
                    // Advance the offset
                    tempOff += word.length();
                    if (isRuleWord(word)) {
                        // Discard words rejected by the rule check and break the pair chain
                        tempTerm = null;
                        continue;
                    }
                    if (term.isNewWord()) {
                        // Try to guess the part of speech
                        termNatures = NatureRecognition.guessNature(word);
                        term.updateTermNaturesAndNature(termNatures);
                    }
                    TermUtil.insertTerm(graph.terms, term, InsertTermType.SCORE_ADD_SORT);
                    // Be conservative with words that are not in the dictionary:
                    // only pairs of known words are recorded
                    if (tempTerm != null && !tempTerm.isNewWord() && !term.isNewWord()) {
                        mc.add(tempTerm.getName() + TAB + word, CRF_WEIGHT);
                    }
                    tempTerm = term;
                    if (term.isNewWord()) {
                        learn.addTerm(new NewWord(word, Nature.NW));
                    }
                }
                if (tempTerm != null && !tempTerm.isNewWord()) {
                    mc.add(tempTerm.getName() + TAB + "末##末", CRF_WEIGHT);
                }
                graph.walkPath(mc.get());
            } else {
                LOG.warn("not find any crf model, make sure your config right? ");
            }
            // Number recognition
            if (graph.hasNum && isNumRecognition) {
                new NumRecognition().recognition(graph.terms);
            }
            // User-defined dictionary recognition
            new UserDefineRecognition(InsertTermType.SCORE_ADD_SORT, forests).recognition(graph.terms);
            graph.rmLittlePath();
            graph.walkPathByScore();
            // New-word recognition
            new NewWordRecognition(learn).recognition(graph.terms);
            graph.walkPathByScore();
            // Collect the terms once, after all optimization passes have run.
            // (An earlier collection before these passes was overwritten unread.)
            List<Term> result = getResult();
            // Activate the collected words in the learning tool
            for (Term term : result) {
                learn.active(term.getName());
            }
            setRealName(graph, result);
            return result;
        }

        /** Collects the non-null terms of the graph, excluding its last slot. */
        private List<Term> getResult() {
            List<Term> result = new ArrayList<Term>();
            int length = graph.terms.length - 1;
            for (int i = 0; i < length; i++) {
                if (graph.terms[i] == null) {
                    continue;
                }
                result.add(graph.terms[i]);
            }
            return result;
        }
    };
    return merger.merger();
}
Also used : TermNatures(org.ansj.domain.TermNatures) ArrayList(java.util.ArrayList) MapCount(org.nlpcn.commons.lang.util.MapCount) NewWordRecognition(org.ansj.recognition.arrimpl.NewWordRecognition) Term(org.ansj.domain.Term) AsianPersonRecognition(org.ansj.recognition.arrimpl.AsianPersonRecognition) NumRecognition(org.ansj.recognition.arrimpl.NumRecognition) ForeignPersonRecognition(org.ansj.recognition.arrimpl.ForeignPersonRecognition) UserDefineRecognition(org.ansj.recognition.arrimpl.UserDefineRecognition) NatureRecognition(org.ansj.recognition.impl.NatureRecognition) LearnTool(org.ansj.dic.LearnTool) NewWord(org.ansj.domain.NewWord)

Example 35 with Term

use of org.ansj.domain.Term in project ansj_seg by NLPchina.

the class ToAnalysis method getResult.

/**
 * Builds the term list for a graph: default path walk, optional number and
 * person-name recognition, then user-defined dictionary recognition.
 *
 * @param graph the segmentation graph to resolve
 * @return the non-null terms of the graph (the last slot is excluded), after
 *         {@code setRealName} has been applied
 */
@Override
protected List<Term> getResult(final Graph graph) {
    return new Merger() {

        @Override
        public List<Term> merger() {
            graph.walkPath();

            // Number recognition
            if (isNumRecognition && graph.hasNum) {
                new NumRecognition().recognition(graph.terms);
            }

            // Person-name recognition
            if (graph.hasPerson && isNameRecognition) {
                // Asian person names
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // Foreign person names
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }

            // User-defined dictionary recognition
            applyUserDictionaries(graph, forests);
            return collectTerms();
        }

        /** Runs user-dictionary recognition in SKIP mode, then prunes and re-walks the graph. */
        private void applyUserDictionaries(final Graph target, Forest... dictionaries) {
            UserDefineRecognition recognizer = new UserDefineRecognition(InsertTermType.SKIP, dictionaries);
            recognizer.recognition(target.terms);
            target.rmLittlePath();
            target.walkPathByScore();
        }

        /** Collects the non-null terms of the graph, excluding its last slot. */
        private List<Term> collectTerms() {
            List<Term> collected = new ArrayList<Term>();
            for (int i = 0, end = graph.terms.length - 1; i < end; i++) {
                Term current = graph.terms[i];
                if (current == null) {
                    continue;
                }
                collected.add(current);
            }
            setRealName(graph, collected);
            return collected;
        }
    }.merger();
}
Also used : NumRecognition(org.ansj.recognition.arrimpl.NumRecognition) Graph(org.ansj.util.Graph) ForeignPersonRecognition(org.ansj.recognition.arrimpl.ForeignPersonRecognition) UserDefineRecognition(org.ansj.recognition.arrimpl.UserDefineRecognition) ArrayList(java.util.ArrayList) Forest(org.nlpcn.commons.lang.tire.domain.Forest) Term(org.ansj.domain.Term) AsianPersonRecognition(org.ansj.recognition.arrimpl.AsianPersonRecognition)

Aggregations

Term (org.ansj.domain.Term): 55, ArrayList (java.util.ArrayList): 10, Result (org.ansj.domain.Result): 8, Test (org.junit.Test): 8, TermNatures (org.ansj.domain.TermNatures): 5, AsianPersonRecognition (org.ansj.recognition.arrimpl.AsianPersonRecognition): 4, ForeignPersonRecognition (org.ansj.recognition.arrimpl.ForeignPersonRecognition): 4, NumRecognition (org.ansj.recognition.arrimpl.NumRecognition): 4, Graph (org.ansj.util.Graph): 4, Forest (org.nlpcn.commons.lang.tire.domain.Forest): 4, LinkedList (java.util.LinkedList): 3, NewWord (org.ansj.domain.NewWord): 3, UserDefineRecognition (org.ansj.recognition.arrimpl.UserDefineRecognition): 3, NatureRecognition (org.ansj.recognition.impl.NatureRecognition): 3, GetWord (org.nlpcn.commons.lang.tire.GetWord): 3, BufferedReader (java.io.BufferedReader): 2, HashMap (java.util.HashMap): 2, TermNature (org.ansj.domain.TermNature): 2, ToAnalysis (org.ansj.splitWord.analysis.ToAnalysis): 2, Analyzer (org.apache.lucene.analysis.Analyzer): 2