Use of org.ansj.recognition.impl.NatureRecognition in project ansj_seg by NLPchina.
Class NatureTagDemo, method main.
/**
 * Demo entry point: runs {@link NatureRecognition} over a fixed list of
 * already-segmented words and prints the resulting terms to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final List<String> words = Arrays.asList("对", "非", "ansj", "的", "分词", "结果", "进行", "词性", "标注");
    final NatureRecognition tagger = new NatureRecognition();
    // The second argument is passed as 0, exactly as in the original demo.
    final List<Term> tagged = tagger.recognition(words, 0);
    System.out.println(tagged);
}
Use of org.ansj.recognition.impl.NatureRecognition in project ansj_seg by NLPchina.
Class NameFix, method nameAmbiguity.
/**
 * Person-name disambiguation over a term lattice.
 *
 * <p>Example: 邓颖超生前 segmented as "邓颖 超生 前" is corrected to "邓颖超 生 前".
 * A second, rule-based pass joins two person-name terms that are connected by
 * a single character which {@code WordAlert.CharCover} maps to '·'.
 *
 * <p>The array is modified in place: merged slots are set to {@code null} and
 * the affected terms are re-linked through {@code TermUtil.termLink}. Positions
 * whose neighbouring slots or linked terms are missing are skipped rather than
 * raising {@code ArrayIndexOutOfBoundsException} / {@code NullPointerException}
 * part-way through a merge.
 *
 * @param terms   term array to fix in place; may contain {@code null} slots;
 *                a {@code null} array is ignored
 * @param forests user dictionaries handed to {@link NatureRecognition} when the
 *                nature of a split-off remainder is looked up
 */
public static void nameAmbiguity(Term[] terms, Forest... forests) {
    if (terms == null) {
        return;
    }

    // Pass 1: a two-character NR term followed by a term whose person attribute
    // marks it as splittable absorbs the first character of that following term.
    for (int i = 0; i < terms.length - 1; i++) {
        Term term = terms[i];
        if (term == null || term.termNatures() != TermNatures.NR || term.getName().length() != 2) {
            continue;
        }
        // Both the following term (i + 2) and the slot for its remainder (i + 3)
        // must exist; check before mutating anything so no half-applied merge is left.
        if (i + 3 >= terms.length) {
            continue;
        }
        Term next = terms[i + 2];
        if (next == null || next.termNatures().personAttr.split <= 0) {
            continue;
        }

        term.setName(term.getName() + next.getName().charAt(0));
        terms[i + 2] = null;

        // What is left of the following term becomes a new term one position later.
        String name = next.getName().substring(1);
        Term remainder = new Term(name, next.getOffe() + 1, new NatureRecognition(forests).getTermNatures(name));
        terms[i + 3] = remainder;
        TermUtil.termLink(term, remainder);
        TermUtil.termLink(remainder, next.to());
    }

    // Pass 2: foreign-name fix - join "nr · nr" into a single term.
    for (int i = 1; i < terms.length; i++) {
        Term term = terms[i];
        if (term == null || term.getName().length() != 1
                || WordAlert.CharCover(term.getName().charAt(0)) != '·') {
            continue;
        }
        Term from = term.from();
        Term next = term.to();
        // Needs a linked term on both sides and a slot at i + 1 to clear.
        if (from == null || next == null || i + 1 >= terms.length) {
            continue;
        }
        if (from.natrue().natureStr.startsWith("nr") && next.natrue().natureStr.startsWith("nr")) {
            from.setName(from.getName() + term.getName() + next.getName());
            TermUtil.termLink(from, next.to());
            terms[i] = null;
            terms[i + 1] = null;
        }
    }
}
Use of org.ansj.recognition.impl.NatureRecognition in project ansj_seg by NLPchina.
Class NlpAnalysis, method getResult.
/**
 * Builds the final term list for {@code graph} by running an anonymous
 * {@code Merger} whose steps are, in order: initial path walk and learning,
 * optional person-name recognition, optional CRF-based segmentation,
 * optional number recognition, user-dictionary recognition, new-word
 * recognition, and collection of the surviving terms.
 *
 * NOTE(review): learn, splitWord, forests, isNameRecognition,
 * isNumRecognition, TAB, CRF_WEIGHT and LOG are declared outside this
 * method - presumably on the enclosing class or its parents; confirm there.
 *
 * @param graph the graph whose {@code terms} array is analysed and modified
 * @return the non-null terms collected from {@code graph.terms} after all steps
 */
@Override
protected List<Term> getResult(final Graph graph) {
Merger merger = new Merger() {
@Override
public List<Term> merger() {
// Create the learning tool on first use if none has been set.
if (learn == null) {
learn = new LearnTool();
}
graph.walkPath();
learn.learn(graph, splitWord, forests);
// Person-name recognition
if (graph.hasPerson && isNameRecognition) {
// Asian person-name recognition
new AsianPersonRecognition().recognition(graph.terms);
graph.walkPathByScore();
NameFix.nameAmbiguity(graph.terms);
// Foreign person-name recognition
new ForeignPersonRecognition().recognition(graph.terms);
graph.walkPathByScore();
}
if (splitWord != null) {
// Collects weighted "previous TAB current" word-pair keys that are handed to graph.walkPath below.
MapCount<String> mc = new MapCount<String>();
// Segment with the CRF model
List<String> words = splitWord.cut(graph.chars);
Term tempTerm = null;
int tempOff = 0;
// Register the pair (begin marker, first word) unless the first word is a rule word.
if (words.size() > 0) {
String word = words.get(0);
if (!isRuleWord(word)) {
mc.add("始##始" + TAB + word, CRF_WEIGHT);
}
}
for (String word : words) {
// Try to look up the part of speech from the dictionary
TermNatures termNatures = new NatureRecognition(forests).getTermNatures(word);
Term term = null;
if (termNatures != TermNatures.NULL) {
term = new Term(word, tempOff, termNatures);
} else {
// No nature found: treat the word as a new word.
term = new Term(word, tempOff, TermNatures.NW);
term.setNewWord(true);
}
// Advance the offset
tempOff += word.length();
if (isRuleWord(word)) {
// Discard the word if it is not acceptable; this also resets the previous-term
// reference, so no pair is recorded across the discarded word.
tempTerm = null;
continue;
}
if (term.isNewWord()) {
// Try to guess the part of speech
termNatures = NatureRecognition.guessNature(word);
term.updateTermNaturesAndNature(termNatures);
}
TermUtil.insertTerm(graph.terms, term, InsertTermType.SCORE_ADD_SORT);
// Be conservative about words that are not in the dictionary:
// only record the pair when neither side is a new word.
if (tempTerm != null && !tempTerm.isNewWord() && !term.isNewWord()) {
mc.add(tempTerm.getName() + TAB + word, CRF_WEIGHT);
}
tempTerm = term;
if (term.isNewWord()) {
learn.addTerm(new NewWord(word, Nature.NW));
}
}
// Register the pair (last kept word, end marker) when the last kept word is not a new word.
if (tempTerm != null && !tempTerm.isNewWord()) {
mc.add(tempTerm.getName() + TAB + "末##末", CRF_WEIGHT);
}
graph.walkPath(mc.get());
} else {
LOG.warn("not find any crf model, make sure your config right? ");
}
// Number recognition
if (graph.hasNum && isNumRecognition) {
new NumRecognition().recognition(graph.terms);
}
// Part-of-speech tagging
// NOTE(review): this value is reassigned further down before it is read, and the
// local getResult() only reads graph.terms - this first call looks redundant; confirm.
List<Term> result = getResult();
// User-defined dictionary recognition
new UserDefineRecognition(InsertTermType.SCORE_ADD_SORT, forests).recognition(graph.terms);
graph.rmLittlePath();
graph.walkPathByScore();
// New-word recognition
new NewWordRecognition(learn).recognition(graph.terms);
graph.walkPathByScore();
// Re-collect the best path after the optimisation steps above
result = getResult();
// Activate each resulting term name in the learning tool
for (Term term : result) {
learn.active(term.getName());
}
setRealName(graph, result);
return result;
}
// Collects the non-null entries of graph.terms in index order, leaving out the final array slot.
private List<Term> getResult() {
List<Term> result = new ArrayList<Term>();
// The last slot of graph.terms is deliberately excluded from the scan.
int length = graph.terms.length - 1;
for (int i = 0; i < length; i++) {
if (graph.terms[i] == null) {
continue;
}
result.add(graph.terms[i]);
}
return result;
}
};
return merger.merger();
}
Use of org.ansj.recognition.impl.NatureRecognition in project ansj_seg by NLPchina.
Class NatureDemo, method main.
/**
 * Demo entry point: parses a fixed sample sentence with {@code ToAnalysis},
 * applies {@link NatureRecognition} to the parsed result and prints it to
 * standard output.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared by the original demo signature
 */
public static void main(String[] args) throws IOException {
    final String text = "Ansj中文分词是一个真正的ict的实现.并且加入了自己的一些数据结构和算法的分词.实现了高效率和高准确率的完美结合!";
    final Result parsed = ToAnalysis.parse(text);
    // Part-of-speech tagging
    parsed.recognition(new NatureRecognition());
    System.out.println(parsed);
}
Aggregations