Usage of org.ansj.domain.Term in the NLPchina project ansj_seg — class Analysis, method next().
/**
 * Stream-style tokenization: call repeatedly in a loop; a {@code null}
 * return signals that the input has been fully consumed.
 *
 * @return the next {@link Term}, or {@code null} when no input remains
 * @throws IOException if reading from the underlying reader fails
 */
public Term next() throws IOException {
Term term = null;
// Drain terms buffered from the previously analyzed line first.
if (!terms.isEmpty()) {
term = terms.poll();
term.updateOffe(offe);
return term;
}
String temp = br.readLine();
offe = br.getStart();
// Skip blank lines; a null line means end of input.
// NOTE(review): assumes StringUtil.isBlank(null) returns true — confirm.
while (StringUtil.isBlank(temp)) {
if (temp == null) {
return null;
}
temp = br.readLine();
// BUGFIX: refresh the line-start offset after every re-read; otherwise terms
// from a non-blank line that follows blank lines carry a stale offset.
offe = br.getStart();
}
// Ambiguity-aware segmentation of the line; buffers its terms into `terms`.
fullTerms(temp);
if (!terms.isEmpty()) {
term = terms.poll();
term.updateOffe(offe);
return term;
}
return null;
}
Usage of org.ansj.domain.Term in the NLPchina project ansj_seg — class DicAnalysis, method getResult().
/**
 * Builds the term list for dictionary-priority segmentation: user-defined
 * words are merged into the graph BEFORE the shortest-path walk, so
 * dictionary entries take precedence over the statistical path.
 *
 * @param graph the tokenization graph for the current sentence
 * @return the ordered list of terms on the chosen path
 */
@Override
protected List<Term> getResult(final Graph graph) {
Merger merger = new Merger() {
@Override
public List<Term> merger() {
// Recognize user-defined dictionary words first (dictionary priority).
userDefineRecognition(graph, forests);
graph.walkPath();
// Numeric token recognition.
if (isNumRecognition && graph.hasNum) {
new NumRecognition().recognition(graph.terms);
}
// Person-name recognition.
if (graph.hasPerson && isNameRecognition) {
// Asian person names.
new AsianPersonRecognition().recognition(graph.terms);
graph.walkPathByScore();
NameFix.nameAmbiguity(graph.terms);
// Foreign (transliterated) person names.
new ForeignPersonRecognition().recognition(graph.terms);
graph.walkPathByScore();
}
return getResult();
}
// Scans graph.chars with every user forest (iterated in reverse order) and
// force-inserts each match into the graph with REPLACE semantics.
private void userDefineRecognition(final Graph graph, Forest... forests) {
if (forests == null) {
return;
}
int beginOff = graph.terms[0].getOffe();
Forest forest = null;
for (int i = forests.length - 1; i >= 0; i--) {
forest = forests[i];
if (forest == null) {
continue;
}
GetWord word = forest.getWord(graph.chars);
String temp = null;
int tempFreq = 50;
while ((temp = word.getAllWords()) != null) {
// Skip matches whose start position no longer holds a term.
if (graph.terms[word.offe] == null) {
continue;
}
// Param[1] is the frequency; malformed values fall back to 50.
tempFreq = getInt(word.getParam()[1], 50);
Term term = new Term(temp, beginOff + word.offe, word.getParam()[0], tempFreq);
// Negative score grows with word length so longer user words are preferred.
term.selfScore(-1 * Math.pow(Math.log(tempFreq), temp.length()));
TermUtil.insertTerm(graph.terms, term, InsertTermType.REPLACE);
}
}
graph.rmLittlePath();
graph.walkPathByScore();
graph.rmLittlePath();
}
// Parses str as an int, returning def on malformed input.
private int getInt(String str, int def) {
try {
return Integer.parseInt(str);
} catch (NumberFormatException e) {
return def;
}
}
// Collects the surviving (non-null) terms; the final slot of graph.terms is excluded.
private List<Term> getResult() {
List<Term> result = new ArrayList<Term>();
int length = graph.terms.length - 1;
for (int i = 0; i < length; i++) {
if (graph.terms[i] != null) {
result.add(graph.terms[i]);
}
}
setRealName(graph, result);
return result;
}
};
return merger.merger();
}
Usage of org.ansj.domain.Term in the NLPchina project ansj_seg — class IndexAnalysis, method getResult().
/**
 * Builds the term list for index-oriented segmentation: the best-path terms
 * are supplemented with every dictionary word found in the sentence (deduped
 * against the path terms), then sorted by offset and descending length.
 *
 * @param graph the tokenization graph for the current sentence
 * @return terms on the best path plus extra dictionary hits, offset-sorted
 */
@Override
protected List<Term> getResult(final Graph graph) {
Merger merger = new Merger() {
@Override
public List<Term> merger() {
graph.walkPath();
// Numeric token recognition.
if (isNumRecognition && graph.hasNum) {
new NumRecognition().recognition(graph.terms);
}
// Person-name recognition.
if (graph.hasPerson && isNameRecognition) {
// Asian person names.
new AsianPersonRecognition().recognition(graph.terms);
graph.walkPathByScore();
NameFix.nameAmbiguity(graph.terms);
// Foreign (transliterated) person names.
new ForeignPersonRecognition().recognition(graph.terms);
graph.walkPathByScore();
}
// User-defined dictionary recognition.
userDefineRecognition(graph, forests);
return result();
}
private void userDefineRecognition(final Graph graph, Forest... forests) {
new UserDefineRecognition(InsertTermType.SKIP, forests).recognition(graph.terms);
graph.rmLittlePath();
graph.walkPathByScore();
}
/**
 * Assembles the index-style result: path terms first, then any dictionary
 * words not already present (keyed by name + offset), finally sorted.
 *
 * @return the combined, sorted term list
 */
private List<Term> result() {
Set<String> seen = new HashSet<String>();
List<Term> merged = new LinkedList<Term>();
int upper = graph.terms.length - 1;
for (int idx = 0; idx < upper; idx++) {
Term cur = graph.terms[idx];
if (cur == null) {
continue;
}
merged.add(cur);
seen.add(cur.getName() + cur.getOffe());
}
LinkedList<Term> extras = new LinkedList<Term>();
char[] sentence = graph.chars;
if (forests != null) {
for (Forest dic : forests) {
if (dic == null) {
continue;
}
GetWord walker = dic.getWord(sentence);
String hit;
while ((hit = walker.getAllWords()) != null) {
String key = hit + walker.offe;
if (seen.contains(key)) {
continue;
}
seen.add(key);
extras.add(new Term(hit, walker.offe, walker.getParam(0), ObjConver.getIntValue(walker.getParam(1))));
}
}
}
merged.addAll(extras);
// Order by offset; ties put the longer word first.
Collections.sort(merged, new Comparator<Term>() {
@Override
public int compare(Term a, Term b) {
return a.getOffe() == b.getOffe()
? b.getName().length() - a.getName().length()
: a.getOffe() - b.getOffe();
}
});
setRealName(graph, merged);
return merged;
}
};
return merger.merger();
}
Usage of org.ansj.domain.Term in the NLPchina project ansj_seg — class NlpAnalysis, method getResult().
/**
 * Builds the term list for NLP-oriented segmentation: the base shortest-path
 * result is blended with a CRF-based split (when a model is available),
 * user dictionaries, and new-word discovery.
 *
 * @param graph the tokenization graph for the current sentence
 * @return the ordered list of terms on the final best path
 */
@Override
protected List<Term> getResult(final Graph graph) {
Merger merger = new Merger() {
@Override
public List<Term> merger() {
// Lazily create the learning tool used for new-word discovery.
if (learn == null) {
learn = new LearnTool();
}
graph.walkPath();
learn.learn(graph, splitWord, forests);
// Person-name recognition.
if (graph.hasPerson && isNameRecognition) {
// Asian person names.
new AsianPersonRecognition().recognition(graph.terms);
graph.walkPathByScore();
NameFix.nameAmbiguity(graph.terms);
// Foreign (transliterated) person names.
new ForeignPersonRecognition().recognition(graph.terms);
graph.walkPathByScore();
}
if (splitWord != null) {
// Bigram weights harvested from the CRF split, fed back into the path walk.
MapCount<String> mc = new MapCount<String>();
// Segment the sentence with the CRF model.
List<String> words = splitWord.cut(graph.chars);
Term tempTerm = null;
int tempOff = 0;
if (words.size() > 0) {
String word = words.get(0);
if (!isRuleWord(word)) {
// Sentence-start bigram ("始##始" is the begin marker).
mc.add("始##始" + TAB + word, CRF_WEIGHT);
}
}
for (String word : words) {
// Try to obtain the part of speech from the dictionaries.
TermNatures termNatures = new NatureRecognition(forests).getTermNatures(word);
Term term = null;
if (termNatures != TermNatures.NULL) {
term = new Term(word, tempOff, termNatures);
} else {
// Unknown to every dictionary: mark as a new word.
term = new Term(word, tempOff, TermNatures.NW);
term.setNewWord(true);
}
// Advance the character offset.
tempOff += word.length();
if (isRuleWord(word)) {
// Word matched a rule — discard it from CRF feedback.
tempTerm = null;
continue;
}
if (term.isNewWord()) {
// Try to guess a part of speech for the new word.
termNatures = NatureRecognition.guessNature(word);
term.updateTermNaturesAndNature(termNatures);
}
TermUtil.insertTerm(graph.terms, term, InsertTermType.SCORE_ADD_SORT);
// Be conservative about out-of-dictionary words: only bigrams where both
// sides are dictionary-backed contribute CRF weight.
if (tempTerm != null && !tempTerm.isNewWord() && !term.isNewWord()) {
mc.add(tempTerm.getName() + TAB + word, CRF_WEIGHT);
}
tempTerm = term;
if (term.isNewWord()) {
learn.addTerm(new NewWord(word, Nature.NW));
}
}
if (tempTerm != null && !tempTerm.isNewWord()) {
// Sentence-end bigram ("末##末" is the end marker).
mc.add(tempTerm.getName() + TAB + "末##末", CRF_WEIGHT);
}
graph.walkPath(mc.get());
} else {
LOG.warn("not find any crf model, make sure your config right? ");
}
// Numeric token recognition.
if (graph.hasNum && isNumRecognition) {
new NumRecognition().recognition(graph.terms);
}
// Snapshot the current best path (part-of-speech tagged terms).
List<Term> result = getResult();
// User-defined dictionary recognition.
new UserDefineRecognition(InsertTermType.SCORE_ADD_SORT, forests).recognition(graph.terms);
graph.rmLittlePath();
graph.walkPathByScore();
// New-word discovery.
new NewWordRecognition(learn).recognition(graph.terms);
graph.walkPathByScore();
// Re-collect the best path after the optimizations above.
result = getResult();
// Activate learned dictionary entries that actually appeared in the result.
for (Term term : result) {
learn.active(term.getName());
}
setRealName(graph, result);
return result;
}
// Collects the surviving (non-null) terms; the final slot of graph.terms is excluded.
private List<Term> getResult() {
List<Term> result = new ArrayList<Term>();
int length = graph.terms.length - 1;
for (int i = 0; i < length; i++) {
if (graph.terms[i] == null) {
continue;
}
result.add(graph.terms[i]);
}
return result;
}
};
return merger.merger();
}
Usage of org.ansj.domain.Term in the NLPchina project ansj_seg — class ToAnalysis, method getResult().
/**
 * Builds the term list for the standard segmentation pipeline: shortest-path
 * walk, then numeric / person-name recognition, then user-dictionary merge.
 *
 * @param graph the tokenization graph for the current sentence
 * @return the ordered list of terms on the chosen path
 */
@Override
protected List<Term> getResult(final Graph graph) {
Merger merger = new Merger() {
@Override
public List<Term> merger() {
graph.walkPath();
// Numeric token recognition.
if (isNumRecognition && graph.hasNum) {
new NumRecognition().recognition(graph.terms);
}
// Person-name recognition.
if (graph.hasPerson && isNameRecognition) {
// Asian person names first.
new AsianPersonRecognition().recognition(graph.terms);
graph.walkPathByScore();
NameFix.nameAmbiguity(graph.terms);
// Then foreign (transliterated) person names.
new ForeignPersonRecognition().recognition(graph.terms);
graph.walkPathByScore();
}
// User-defined dictionary recognition (InsertTermType.SKIP).
userDefineRecognition(graph, forests);
return getResult();
}
private void userDefineRecognition(final Graph graph, Forest... forests) {
new UserDefineRecognition(InsertTermType.SKIP, forests).recognition(graph.terms);
graph.rmLittlePath();
graph.walkPathByScore();
}
// Gathers the surviving (non-null) terms; the final slot of graph.terms is excluded.
private List<Term> getResult() {
int upper = graph.terms.length - 1;
List<Term> picked = new ArrayList<Term>();
for (int idx = 0; idx < upper; idx++) {
Term cur = graph.terms[idx];
if (cur != null) {
picked.add(cur);
}
}
setRealName(graph, picked);
return picked;
}
};
return merger.merger();
}
Aggregations