Search in sources :

Example 11 with Forest

use of org.nlpcn.commons.lang.tire.domain.Forest in project ansj_seg by NLPchina.

The method recognition of the class UserDefineRecognition.

/**
 * Scans {@code terms} once for every non-null entry of {@code forests} and calls
 * {@code makeNewTerm()} for the matches found while walking that dictionary.
 *
 * <p>The walk is driven by {@code termStatus(branch, term)}; the match window is tracked
 * in the {@code offe}/{@code endOffe} fields, where {@code offe == -1} means no match
 * attempt is currently open.
 *
 * <p>NOTE(review): status codes 1/2/3 presumably mean "prefix only" / "word that may still
 * grow" / "complete word" — confirm against the SmartForest documentation.
 *
 * @param terms the term array to scan; {@code null} slots are skipped and the final slot
 *              is not visited
 */
public void recognition(Term[] terms) {
    this.terms = terms;
    for (Forest dictionary : forests) {
        if (dictionary == null) {
            continue;
        }
        reset();
        this.forest = dictionary;
        branch = dictionary;
        // The final array slot is excluded from the scan.
        final int limit = terms.length - 1;
        for (int pos = 0; pos < limit; pos++) {
            final Term current = terms[pos];
            if (current == null) {
                continue;
            }
            // True when the walk had already left the dictionary root before this term.
            final boolean continuing = branch != dictionary;
            branch = termStatus(branch, current);
            if (branch == null) {
                // No continuation: jump back to where the open attempt began, then start over.
                if (offe != -1) {
                    pos = offe;
                }
                reset();
                continue;
            }
            final int status = branch.getStatus();
            switch (status) {
                case 3:
                    endOffe = pos;
                    tempNature = branch.getParam()[0];
                    tempFreq = getInt(branch.getParam()[1], 50);
                    if (offe != -1 && offe < endOffe) {
                        // Resume scanning right after the start of the emitted match.
                        pos = offe;
                        makeNewTerm();
                    }
                    reset();
                    break;
                case 2:
                    endOffe = pos;
                    if (offe == -1) {
                        offe = pos;
                    } else {
                        tempNature = branch.getParam()[0];
                        tempFreq = getInt(branch.getParam()[1], 50);
                        if (continuing) {
                            makeNewTerm();
                        }
                    }
                    break;
                case 1:
                    if (offe == -1) {
                        offe = pos;
                    }
                    break;
                default:
                    break;
            }
        }
        // Flush a match that is still open when the scan ends.
        if (offe != -1 && offe < endOffe) {
            makeNewTerm();
        }
    }
}
Also used : Forest(org.nlpcn.commons.lang.tire.domain.Forest) SmartForest(org.nlpcn.commons.lang.tire.domain.SmartForest)

Example 12 with Forest

use of org.nlpcn.commons.lang.tire.domain.Forest in project ansj_seg by NLPchina.

The method getResult of the class DicAnalysis.

/**
 * Runs the dictionary-first merge pipeline on {@code graph} and returns the surviving terms.
 *
 * <p>Order of work: user-dictionary words are inserted into the graph first, then the path
 * is walked, then the optional number and person-name recognizers run, and finally every
 * non-null term (except the last array slot) is collected.
 *
 * @param graph the graph to merge
 * @return the collected terms after {@code setRealName(graph, result)} has been applied
 */
@Override
protected List<Term> getResult(final Graph graph) {
    Merger merger = new Merger() {

        @Override
        public List<Term> merger() {
            // User-defined dictionary recognition
            userDefineRecognition(graph, forests);
            graph.walkPath();
            // Number recognition
            if (isNumRecognition && graph.hasNum) {
                new NumRecognition().recognition(graph.terms);
            }
            // Person-name recognition
            if (graph.hasPerson && isNameRecognition) {
                // Asian person names
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // Foreign person names
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }
            return getResult();
        }

        /**
         * Inserts every word the given dictionaries find in {@code graph.chars} into
         * {@code graph.terms} (REPLACE mode), then prunes and re-walks the graph.
         * Dictionaries are visited from the last array element to the first.
         */
        private void userDefineRecognition(final Graph graph, Forest... forests) {
            if (forests == null) {
                return;
            }
            final int baseOffset = graph.terms[0].getOffe();
            for (int idx = forests.length - 1; idx >= 0; idx--) {
                final Forest dictionary = forests[idx];
                if (dictionary == null) {
                    continue;
                }
                final GetWord matcher = dictionary.getWord(graph.chars);
                for (String hit = matcher.getAllWords(); hit != null; hit = matcher.getAllWords()) {
                    // Skip hits whose start position has no term in the graph.
                    if (graph.terms[matcher.offe] == null) {
                        continue;
                    }
                    // Second parameter is parsed as the frequency, defaulting to 50.
                    final int freq = getInt(matcher.getParam()[1], 50);
                    final Term inserted = new Term(hit, baseOffset + matcher.offe, matcher.getParam()[0], freq);
                    final double logFreq = Math.log(freq);
                    inserted.selfScore(-1 * Math.pow(logFreq, hit.length()));
                    TermUtil.insertTerm(graph.terms, inserted, InsertTermType.REPLACE);
                }
            }
            graph.rmLittlePath();
            graph.walkPathByScore();
            graph.rmLittlePath();
        }

        /** Parses {@code str} as an int, returning {@code def} when it is not a valid number. */
        private int getInt(String str, int def) {
            int parsed;
            try {
                parsed = Integer.parseInt(str);
            } catch (NumberFormatException e) {
                parsed = def;
            }
            return parsed;
        }

        /** Collects the non-null terms of the graph, leaving out the final array slot. */
        private List<Term> getResult() {
            final List<Term> collected = new ArrayList<Term>();
            final Term[] all = graph.terms;
            for (int idx = 0, end = all.length - 1; idx < end; idx++) {
                final Term term = all[idx];
                if (term == null) {
                    continue;
                }
                collected.add(term);
            }
            setRealName(graph, collected);
            return collected;
        }
    };
    return merger.merger();
}
Also used : ArrayList(java.util.ArrayList) Term(org.ansj.domain.Term) AsianPersonRecognition(org.ansj.recognition.arrimpl.AsianPersonRecognition) NumRecognition(org.ansj.recognition.arrimpl.NumRecognition) Graph(org.ansj.util.Graph) ForeignPersonRecognition(org.ansj.recognition.arrimpl.ForeignPersonRecognition) Forest(org.nlpcn.commons.lang.tire.domain.Forest) GetWord(org.nlpcn.commons.lang.tire.GetWord)

Example 13 with Forest

use of org.nlpcn.commons.lang.tire.domain.Forest in project ansj_seg by NLPchina.

The method getResult of the class IndexAnalysis.

/**
 * Runs the index-oriented merge pipeline on {@code graph}.
 *
 * <p>The graph is walked, the optional number and person-name recognizers run, user
 * dictionary terms are merged in, and finally every additional dictionary word found in
 * {@code graph.chars} is appended, so the result may contain overlapping terms.
 *
 * @param graph the graph to merge
 * @return the terms sorted by offset (ascending) and, for equal offsets, by name length
 *         (descending), after {@code setRealName(graph, result)} has been applied
 */
@Override
protected List<Term> getResult(final Graph graph) {
    Merger merger = new Merger() {

        @Override
        public List<Term> merger() {
            graph.walkPath();
            // Number recognition
            if (isNumRecognition && graph.hasNum) {
                new NumRecognition().recognition(graph.terms);
            }
            // Person-name recognition
            if (graph.hasPerson && isNameRecognition) {
                // Asian person names
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // Foreign person names
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }
            // User-defined dictionary recognition
            userDefineRecognition(graph, forests);
            return result();
        }

        /** Applies the user dictionaries in SKIP mode, then prunes and re-walks the graph. */
        private void userDefineRecognition(final Graph graph, Forest... forests) {
            new UserDefineRecognition(InsertTermType.SKIP, forests).recognition(graph.terms);
            graph.rmLittlePath();
            graph.walkPathByScore();
        }

        /**
         * Builds the de-duplication key for a term.
         *
         * <p>A non-digit separator keeps the key unambiguous: plain concatenation would
         * map name "2" at offset 12 and name "21" at offset 2 to the same key "212",
         * silently dropping one of them.
         */
        private String termKey(String name, int offe) {
            return name + '#' + offe;
        }

        /**
         * Segmentation result for indexing: the graph's own terms plus every dictionary
         * word that is not already present at the same offset.
         *
         * @return the combined, sorted term list
         */
        private List<Term> result() {
            String temp = null;
            Set<String> set = new HashSet<String>();
            List<Term> result = new LinkedList<Term>();
            int length = graph.terms.length - 1;
            for (int i = 0; i < length; i++) {
                if (graph.terms[i] != null) {
                    result.add(graph.terms[i]);
                    set.add(termKey(graph.terms[i].getName(), graph.terms[i].getOffe()));
                }
            }
            LinkedList<Term> last = new LinkedList<Term>();
            char[] chars = graph.chars;
            if (forests != null) {
                for (Forest forest : forests) {
                    if (forest == null) {
                        continue;
                    }
                    GetWord word = forest.getWord(chars);
                    while ((temp = word.getAllWords()) != null) {
                        // Set.add returns false when the key is already present.
                        // NOTE(review): word.offe is used as the term offset without any
                        // base offset added — confirm this matches Term#getOffe above.
                        if (set.add(termKey(temp, word.offe))) {
                            last.add(new Term(temp, word.offe, word.getParam(0), ObjConver.getIntValue(word.getParam(1))));
                        }
                    }
                }
            }
            result.addAll(last);
            Collections.sort(result, new Comparator<Term>() {

                @Override
                public int compare(Term o1, Term o2) {
                    if (o1.getOffe() == o2.getOffe()) {
                        // Same start: longer names first.
                        return o2.getName().length() - o1.getName().length();
                    } else {
                        return o1.getOffe() - o2.getOffe();
                    }
                }
            });
            setRealName(graph, result);
            return result;
        }
    };
    return merger.merger();
}
Also used : Term(org.ansj.domain.Term) AsianPersonRecognition(org.ansj.recognition.arrimpl.AsianPersonRecognition) LinkedList(java.util.LinkedList) NumRecognition(org.ansj.recognition.arrimpl.NumRecognition) Graph(org.ansj.util.Graph) ForeignPersonRecognition(org.ansj.recognition.arrimpl.ForeignPersonRecognition) UserDefineRecognition(org.ansj.recognition.arrimpl.UserDefineRecognition) Forest(org.nlpcn.commons.lang.tire.domain.Forest) HashSet(java.util.HashSet) GetWord(org.nlpcn.commons.lang.tire.GetWord)

Example 14 with Forest

use of org.nlpcn.commons.lang.tire.domain.Forest in project ansj_seg by NLPchina.

The method getResult of the class ToAnalysis.

/**
 * Runs the standard merge pipeline on {@code graph} and returns the surviving terms.
 *
 * <p>Order of work: walk the graph, run the optional number and person-name recognizers,
 * apply the user dictionaries, then collect every non-null term except the last array slot.
 *
 * @param graph the graph to merge
 * @return the collected terms after {@code setRealName(graph, result)} has been applied
 */
@Override
protected List<Term> getResult(final Graph graph) {
    Merger merger = new Merger() {

        @Override
        public List<Term> merger() {
            graph.walkPath();
            // Number recognition
            if (isNumRecognition && graph.hasNum) {
                new NumRecognition().recognition(graph.terms);
            }
            // Person-name recognition
            if (graph.hasPerson && isNameRecognition) {
                // Asian person names
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // Foreign person names
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }
            // User-defined dictionary recognition
            userDefineRecognition(graph, forests);
            return getResult();
        }

        /** Applies the user dictionaries in SKIP mode, then prunes and re-walks the graph. */
        private void userDefineRecognition(final Graph graph, Forest... forests) {
            final UserDefineRecognition recognizer = new UserDefineRecognition(InsertTermType.SKIP, forests);
            recognizer.recognition(graph.terms);
            graph.rmLittlePath();
            graph.walkPathByScore();
        }

        /** Collects the non-null terms of the graph, leaving out the final array slot. */
        private List<Term> getResult() {
            final List<Term> collected = new ArrayList<Term>();
            final Term[] all = graph.terms;
            for (int idx = 0, end = all.length - 1; idx < end; idx++) {
                final Term term = all[idx];
                if (term == null) {
                    continue;
                }
                collected.add(term);
            }
            setRealName(graph, collected);
            return collected;
        }
    };
    return merger.merger();
}
Also used : NumRecognition(org.ansj.recognition.arrimpl.NumRecognition) Graph(org.ansj.util.Graph) ForeignPersonRecognition(org.ansj.recognition.arrimpl.ForeignPersonRecognition) UserDefineRecognition(org.ansj.recognition.arrimpl.UserDefineRecognition) ArrayList(java.util.ArrayList) Forest(org.nlpcn.commons.lang.tire.domain.Forest) Term(org.ansj.domain.Term) AsianPersonRecognition(org.ansj.recognition.arrimpl.AsianPersonRecognition)

Example 15 with Forest

use of org.nlpcn.commons.lang.tire.domain.Forest in project ansj_seg by NLPchina.

The method init of the class AmbiguityLibrary.

/**
 * Loads the ambiguity dictionary referenced by {@code kv}.
 *
 * <p>If {@code kv} already holds a forest it is returned as is. Otherwise the resource
 * located by {@code kv.getK()} is read line by line as UTF-8. Each non-blank line is
 * trimmed and split on tabs; lines with an odd number of fields are logged and skipped.
 * For the rest, the fields at even indexes are concatenated to form the branch key and
 * the whole field array is stored as its value. The finished forest is stored back into
 * {@code kv}.
 *
 * @param key the entry removed from {@code AMBIGUITY} when loading fails
 * @param kv  holder of the resource path (K) and the loaded forest (V)
 * @return the loaded forest, or {@code null} if any exception occurred while loading
 */
private static synchronized Forest init(String key, KV<String, Forest> kv) {
    final Forest cached = kv.getV();
    if (cached != null) {
        return cached;
    }
    final Forest forest = new Forest();
    try (BufferedReader br = IOUtil.getReader(PathToStream.stream(kv.getK()), "utf-8")) {
        LOG.debug("begin init ambiguity");
        final long start = System.currentTimeMillis();
        for (String line = br.readLine(); line != null; line = br.readLine()) {
            if (!StringUtil.isNotBlank(line)) {
                continue;
            }
            final String entry = StringUtil.trim(line);
            final String[] split = entry.split("\t");
            // Fields are consumed in pairs, so an odd count is a malformed line.
            if (split.length % 2 != 0) {
                LOG.error("init ambiguity  error in line :" + entry + " format err !");
                continue;
            }
            final StringBuilder word = new StringBuilder();
            for (int idx = 0; idx < split.length; idx += 2) {
                word.append(split[idx]);
            }
            forest.addBranch(word.toString(), split);
        }
        LOG.info("load dic use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK());
        kv.setV(forest);
        return forest;
    } catch (Exception e) {
        LOG.error("Init ambiguity library error :" + e.getMessage() + ", path: " + kv.getK());
        AMBIGUITY.remove(key);
        return null;
    }
}
Also used : BufferedReader(java.io.BufferedReader) Forest(org.nlpcn.commons.lang.tire.domain.Forest)

Aggregations

Forest (org.nlpcn.commons.lang.tire.domain.Forest)20 ArrayList (java.util.ArrayList)4 Term (org.ansj.domain.Term)4 Value (org.nlpcn.commons.lang.tire.domain.Value)4 AsianPersonRecognition (org.ansj.recognition.arrimpl.AsianPersonRecognition)3 ForeignPersonRecognition (org.ansj.recognition.arrimpl.ForeignPersonRecognition)3 NumRecognition (org.ansj.recognition.arrimpl.NumRecognition)3 NlpAnalysis (org.ansj.splitWord.analysis.NlpAnalysis)3 Graph (org.ansj.util.Graph)3 SmartForest (org.nlpcn.commons.lang.tire.domain.SmartForest)3 BufferedReader (java.io.BufferedReader)2 List (java.util.List)2 AnsjTokenizer (org.ansj.lucene.util.AnsjTokenizer)2 UserDefineRecognition (org.ansj.recognition.arrimpl.UserDefineRecognition)2 StopRecognition (org.ansj.recognition.impl.StopRecognition)2 SynonymsRecgnition (org.ansj.recognition.impl.SynonymsRecgnition)2 Analysis (org.ansj.splitWord.Analysis)2 BaseAnalysis (org.ansj.splitWord.analysis.BaseAnalysis)2 DicAnalysis (org.ansj.splitWord.analysis.DicAnalysis)2 IndexAnalysis (org.ansj.splitWord.analysis.IndexAnalysis)2