Use of org.ansj.domain.TermNatures in project ansj_seg by NLPchina.
The class NatureRecognition, method getTermNatures.
/**
 * Look up the part-of-speech natures for a single word.
 *
 * @param word the word to look up
 * @return the natures found for the word
 */
public TermNatures getTermNatures(String word) {
    String[] params = null;
    // Look up the natures: first in the system dictionary, then in the user-defined dictionaries.
    AnsjItem ansjItem = DATDictionary.getItem(word);
    TermNatures tn = null;
    if (ansjItem != AnsjItem.NULL) {
        tn = ansjItem.termNatures;
    } else if ((params = getParams(word)) != null) {
        tn = new TermNatures(new TermNature(params[0], 1));
    } else if (WordAlert.isEnglish(word)) {
        tn = TermNatures.EN;
    } else if (WordAlert.isNumber(word)) {
        tn = TermNatures.M;
    } else {
        tn = TermNatures.NULL;
    }
    return tn;
}
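A minimal usage sketch of this lookup, assuming the no-argument NatureRecognition constructor that appears in the next example; the import path and the sample words are illustrative and may differ between ansj_seg versions.

import org.ansj.domain.TermNatures;
// The package of NatureRecognition differs between ansj_seg versions; adjust as needed.
import org.ansj.recognition.impl.NatureRecognition;

public class TermNaturesLookupDemo {

    public static void main(String[] args) {
        NatureRecognition recognition = new NatureRecognition();
        // A dictionary word: natures come from the system dictionary entry.
        TermNatures tn = recognition.getTermNatures("中国");
        System.out.println(tn);
        // English and numeric tokens fall back to TermNatures.EN / TermNatures.M;
        // anything else that is unknown comes back as TermNatures.NULL.
        System.out.println(recognition.getTermNatures("hello"));
        System.out.println(recognition.getTermNatures("2023"));
    }
}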
Use of org.ansj.domain.TermNatures in project ansj_seg by NLPchina.
The class NatureRecognition, method recognition.
/**
 * Take a list of words and tag each one with its part-of-speech natures.
 *
 * @param words the words to tag
 * @param offe  the offset of the first word in the original text
 * @return the tagged terms
 */
public List<Term> recognition(List<String> words, int offe) {
    List<Term> terms = new ArrayList<Term>(words.size());
    int tempOffe = 0;
    for (String word : words) {
        TermNatures tn = getTermNatures(word);
        terms.add(new Term(word, offe + tempOffe, tn));
        tempOffe += word.length();
    }
    new NatureRecognition().recognition(new Result(terms));
    return terms;
}
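A hedged example of tagging a pre-segmented word list with this method; the sample words are invented, and getNatureStr() is assumed to be the Term accessor for the tagged nature (getName() appears elsewhere in this listing).

import java.util.Arrays;
import java.util.List;
import org.ansj.domain.Term;
import org.ansj.recognition.impl.NatureRecognition; // package path may vary by version

public class PreSegmentedTaggingDemo {

    public static void main(String[] args) {
        // Words segmented elsewhere; offset 0 means the first word starts at
        // position 0 of the original sentence.
        List<String> words = Arrays.asList("我", "爱", "北京");
        List<Term> terms = new NatureRecognition().recognition(words, 0);
        for (Term term : terms) {
            // getNatureStr() is assumed here; use your version's accessor if it differs.
            System.out.println(term.getName() + "\t" + term.getNatureStr());
        }
    }
}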
Use of org.ansj.domain.TermNatures in project ansj_seg by NLPchina.
The class Analysis, method analysisStr.
/**
 * Segment a whole sentence, giving priority to the user-defined ambiguity dictionary.
 *
 * @param temp the sentence to segment
 * @return the segmented terms
 */
private List<Term> analysisStr(String temp) {
    Graph gp = new Graph(temp);
    int startOffe = 0;
    if (this.ambiguityForest != null) {
        GetWord gw = new GetWord(this.ambiguityForest, gp.chars);
        String[] params = null;
        while ((gw.getFrontWords()) != null) {
            if (gw.offe > startOffe) {
                analysis(gp, startOffe, gw.offe);
            }
            // An ambiguity entry stores alternating word/nature pairs.
            params = gw.getParams();
            startOffe = gw.offe;
            for (int i = 0; i < params.length; i += 2) {
                gp.addTerm(new Term(params[i], startOffe, new TermNatures(new TermNature(params[i + 1], 1))));
                startOffe += params[i].length();
            }
        }
    }
    if (startOffe < gp.chars.length) {
        analysis(gp, startOffe, gp.chars.length);
    }
    List<Term> result = this.getResult(gp);
    return result;
}
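To make the shape of the ambiguity params concrete, here is a standalone sketch of the inner loop above; the entry itself (splitting "三个" into "三"/m and "个"/q) is invented for illustration.

import org.ansj.domain.Term;
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;

public class AmbiguityParamsDemo {

    public static void main(String[] args) {
        // A hypothetical ambiguity entry: alternating word/nature pairs.
        String[] params = { "三", "m", "个", "q" };
        int startOffe = 0;
        for (int i = 0; i < params.length; i += 2) {
            Term term = new Term(params[i], startOffe, new TermNatures(new TermNature(params[i + 1], 1)));
            System.out.println(term.getName() + "/" + params[i + 1] + " at offset " + startOffe);
            startOffe += params[i].length();
        }
    }
}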
Use of org.ansj.domain.TermNatures in project ansj_seg by NLPchina.
The class Graph, method merger.
/**
 * The concrete traversal-and-scoring step of the path walk.
 *
 * @param fromTerm    the term the path is coming from
 * @param to          the character position being scored
 * @param relationMap extra relation weights used when scoring the path
 */
private void merger(Term fromTerm, int to, Map<String, Double> relationMap) {
    Term term = null;
    if (terms[to] != null) {
        term = terms[to];
        while (term != null) {
            // Score the relation: to.setPathScore(from).
            term.setPathScore(fromTerm, relationMap);
            term = term.next();
        }
    } else {
        // No term starts here: fall back to a single-character term looked up
        // in the core dictionary, defaulting to TermNatures.NULL when unknown.
        char c = chars[to];
        TermNatures tn = DATDictionary.getItem(c).termNatures;
        if (tn == null || tn == TermNatures.NULL) {
            tn = TermNatures.NULL;
        }
        terms[to] = new Term(String.valueOf(c), to, tn);
        terms[to].setPathScore(fromTerm, relationMap);
    }
}
Use of org.ansj.domain.TermNatures in project ansj_seg by NLPchina.
The class NlpAnalysis, method getResult.
@Override
protected List<Term> getResult(final Graph graph) {
    Merger merger = new Merger() {

        @Override
        public List<Term> merger() {
            if (learn == null) {
                learn = new LearnTool();
            }
            graph.walkPath();
            learn.learn(graph, splitWord, forests);
            // Person-name recognition
            if (graph.hasPerson && isNameRecognition) {
                // Asian person names
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // Foreign person names
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }
            if (splitWord != null) {
                MapCount<String> mc = new MapCount<String>();
                // Segment with the CRF model
                List<String> words = splitWord.cut(graph.chars);
                Term tempTerm = null;
                int tempOff = 0;
                if (words.size() > 0) {
                    String word = words.get(0);
                    if (!isRuleWord(word)) {
                        mc.add("始##始" + TAB + word, CRF_WEIGHT);
                    }
                }
                for (String word : words) {
                    // Try to get the natures from the dictionaries
                    TermNatures termNatures = new NatureRecognition(forests).getTermNatures(word);
                    Term term = null;
                    if (termNatures != TermNatures.NULL) {
                        term = new Term(word, tempOff, termNatures);
                    } else {
                        term = new Term(word, tempOff, TermNatures.NW);
                        term.setNewWord(true);
                    }
                    // Advance the offset
                    tempOff += word.length();
                    if (isRuleWord(word)) {
                        // If the word is a rule word, drop it
                        tempTerm = null;
                        continue;
                    }
                    if (term.isNewWord()) {
                        // Try to guess the nature
                        termNatures = NatureRecognition.guessNature(word);
                        term.updateTermNaturesAndNature(termNatures);
                    }
                    TermUtil.insertTerm(graph.terms, term, InsertTermType.SCORE_ADD_SORT);
                    // Be conservative about words that are not in the dictionary
                    if (tempTerm != null && !tempTerm.isNewWord() && !term.isNewWord()) {
                        mc.add(tempTerm.getName() + TAB + word, CRF_WEIGHT);
                    }
                    tempTerm = term;
                    if (term.isNewWord()) {
                        learn.addTerm(new NewWord(word, Nature.NW));
                    }
                }
                if (tempTerm != null && !tempTerm.isNewWord()) {
                    mc.add(tempTerm.getName() + TAB + "末##末", CRF_WEIGHT);
                }
                graph.walkPath(mc.get());
            } else {
                LOG.warn("not find any crf model, make sure your config right? ");
            }
            // Number recognition
            if (graph.hasNum && isNumRecognition) {
                new NumRecognition().recognition(graph.terms);
            }
            // Part-of-speech tagging
            List<Term> result = getResult();
            // Recognition with the user-defined dictionaries
            new UserDefineRecognition(InsertTermType.SCORE_ADD_SORT, forests).recognition(graph.terms);
            graph.rmLittlePath();
            graph.walkPathByScore();
            // New-word discovery
            new NewWordRecognition(learn).recognition(graph.terms);
            graph.walkPathByScore();
            // Re-fetch the optimal path after the adjustments above
            result = getResult();
            // Activate the learned dictionary entries
            for (Term term : result) {
                learn.active(term.getName());
            }
            setRealName(graph, result);
            return result;
        }

        private List<Term> getResult() {
            List<Term> result = new ArrayList<Term>();
            int length = graph.terms.length - 1;
            for (int i = 0; i < length; i++) {
                if (graph.terms[i] == null) {
                    continue;
                }
                result.add(graph.terms[i]);
            }
            return result;
        }
    };
    return merger.merger();
}
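Finally, a hedged end-to-end sketch of running this NLP pipeline. NlpAnalysis.parse(String) and Result.getTerms() are the conventional ansj_seg entry points and are assumptions here, as is the sample sentence; a configured CRF model is needed to avoid the warning branch above.

import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.NlpAnalysis;

public class NlpAnalysisDemo {

    public static void main(String[] args) {
        // The static parse(String) entry point is assumed; the sentence is illustrative only.
        Result result = NlpAnalysis.parse("我爱北京天安门");
        for (Term term : result.getTerms()) {
            System.out.println(term.getName() + "\t" + term.getNatureStr());
        }
    }
}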