Use of org.nlpcn.commons.lang.tire.domain.Forest in project ansj_seg by NLPchina.
Class UserDefineRecognition, method recognition.
/**
 * Scans {@code terms} against each configured forest and calls
 * {@code makeNewTerm()} whenever a run of consecutive terms is reported by the
 * forest as a complete entry.
 *
 * <p>The matching state lives in instance fields ({@code branch}, {@code offe},
 * {@code endOffe}, {@code tempNature}, {@code tempFreq}); {@code offe == -1} is
 * used throughout as "no match in progress".
 *
 * @param terms the term array to scan; {@code null} slots are skipped and the
 *              last slot is never visited
 */
public void recognition(Term[] terms) {
this.terms = terms;
for (Forest forest : forests) {
// Null entries in the forest collection are ignored.
if (forest == null) {
continue;
}
// Start every forest from a clean state, positioned at the forest root.
reset();
this.forest = forest;
branch = forest;
// The final slot of the array is excluded from the scan.
// NOTE(review): presumably it holds an end-of-sentence sentinel - confirm.
int length = terms.length - 1;
boolean flag = true;
for (int i = 0; i < length; i++) {
if (terms[i] == null)
continue;
// flag is false when this step starts from the forest root (first term of a
// candidate match) and true when it continues a match already in progress.
if (branch == forest) {
flag = false;
} else {
flag = true;
}
branch = termStatus(branch, terms[i]);
if (branch == null) {
// No transition for this term: if a match had been started, rewind i to its
// start so the loop's i++ resumes right after it, then clear the state.
if (offe != -1) {
i = offe;
}
reset();
} else if (branch.getStatus() == 3) {
// NOTE(review): status 3 presumably marks a complete entry with no longer
// continuation - confirm against the Forest/branch status definitions.
endOffe = i;
tempNature = branch.getParam()[0];
// Second parameter is parsed as the frequency, defaulting to 50.
tempFreq = getInt(branch.getParam()[1], 50);
if (offe != -1 && offe < endOffe) {
// The match spans more than one term: rewind, emit it, and start over.
i = offe;
makeNewTerm();
reset();
} else {
reset();
}
} else if (branch.getStatus() == 2) {
// NOTE(review): status 2 presumably marks a complete entry that may still be
// extended by following terms - confirm.
endOffe = i;
if (offe == -1) {
// First term of a candidate match: only remember where it starts.
offe = i;
} else {
tempNature = branch.getParam()[0];
tempFreq = getInt(branch.getParam()[1], 50);
// Emit only when this step continued an existing match.
if (flag) {
makeNewTerm();
}
}
} else if (branch.getStatus() == 1) {
// NOTE(review): status 1 presumably means "prefix only, keep going" - confirm.
if (offe == -1) {
offe = i;
}
}
}
// Emit a match that was still pending when the scan ended.
if (offe != -1 && offe < endOffe) {
makeNewTerm();
}
}
}
Use of org.nlpcn.commons.lang.tire.domain.Forest in project ansj_seg by NLPchina.
Class DicAnalysis, method getResult.
/**
 * Produces the final term list for {@code graph}.
 *
 * <p>User-dictionary words are inserted first (with {@code InsertTermType.REPLACE}),
 * then the path is walked, then the optional number and person-name recognizers
 * run, and finally the non-null terms are collected.
 *
 * @param graph the sentence graph to resolve
 * @return the non-null terms of {@code graph.terms}, excluding the last slot
 */
@Override
protected List<Term> getResult(final Graph graph) {
    return new Merger() {

        @Override
        public List<Term> merger() {
            // User-defined dictionary recognition runs before the path walk.
            userDefineRecognition(graph, forests);
            graph.walkPath();

            // Number discovery.
            if (isNumRecognition && graph.hasNum) {
                new NumRecognition().recognition(graph.terms);
            }

            // Person-name recognition.
            if (graph.hasPerson && isNameRecognition) {
                // Asian person names.
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // Foreign person names.
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }

            return getResult();
        }

        /**
         * Inserts every word the forests find in {@code graph.chars} into
         * {@code graph.terms}, visiting the forests from last to first, and then
         * prunes and re-scores the graph.
         */
        private void userDefineRecognition(final Graph graph, Forest... forests) {
            if (forests == null) {
                return;
            }

            final int beginOff = graph.terms[0].getOffe();

            for (int idx = forests.length - 1; idx >= 0; idx--) {
                final Forest current = forests[idx];
                if (current == null) {
                    continue;
                }

                final GetWord word = current.getWord(graph.chars);
                for (String found = word.getAllWords(); found != null; found = word.getAllWords()) {
                    // Only positions that still hold a term are candidates.
                    if (graph.terms[word.offe] != null) {
                        final int freq = getInt(word.getParam()[1], 50);
                        final String nature = word.getParam()[0];
                        final Term term = new Term(found, beginOff + word.offe, nature, freq);
                        term.selfScore(-Math.pow(Math.log(freq), found.length()));
                        TermUtil.insertTerm(graph.terms, term, InsertTermType.REPLACE);
                    }
                }
            }

            graph.rmLittlePath();
            graph.walkPathByScore();
            graph.rmLittlePath();
        }

        /** Parses {@code str} as an int, returning {@code def} when it is not a valid number. */
        private int getInt(String str, int def) {
            int parsed;
            try {
                parsed = Integer.parseInt(str);
            } catch (NumberFormatException e) {
                parsed = def;
            }
            return parsed;
        }

        /** Collects the non-null terms (last slot excluded) and applies {@code setRealName}. */
        private List<Term> getResult() {
            final List<Term> collected = new ArrayList<Term>();
            final int end = graph.terms.length - 1;
            for (int pos = 0; pos < end; pos++) {
                final Term candidate = graph.terms[pos];
                if (candidate == null) {
                    continue;
                }
                collected.add(candidate);
            }
            setRealName(graph, collected);
            return collected;
        }
    }.merger();
}
Use of org.nlpcn.commons.lang.tire.domain.Forest in project ansj_seg by NLPchina.
Class IndexAnalysis, method getResult.
/**
 * Produces the index-oriented term list for {@code graph}.
 *
 * <p>After the path walk and the optional number / person-name recognizers, the
 * user dictionaries are applied with {@code InsertTermType.SKIP}. The result
 * contains the non-null graph terms plus every additional word the forests find
 * in {@code graph.chars}, ordered by offset and, for equal offsets, longest name
 * first.
 *
 * @param graph the sentence graph to resolve
 * @return the sorted list of graph terms and extra dictionary words
 */
@Override
protected List<Term> getResult(final Graph graph) {
    Merger merger = new Merger() {

        @Override
        public List<Term> merger() {
            graph.walkPath();

            // Number discovery.
            if (isNumRecognition && graph.hasNum) {
                new NumRecognition().recognition(graph.terms);
            }

            // Person-name recognition.
            if (graph.hasPerson && isNameRecognition) {
                // Asian person names.
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // Foreign person names.
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }

            // User-defined dictionary recognition.
            userDefineRecognition(graph, forests);

            return result();
        }

        /** Runs the user-dictionary recognizer over the graph terms, then prunes and re-scores. */
        private void userDefineRecognition(final Graph graph, Forest... forests) {
            new UserDefineRecognition(InsertTermType.SKIP, forests).recognition(graph.terms);
            graph.rmLittlePath();
            graph.walkPathByScore();
        }

        /**
         * Builds the term list used for indexing.
         *
         * @return graph terms plus forest words not already present, sorted by
         *         offset ascending and name length descending
         */
        private List<Term> result() {
            // Keys are "name + offset"; used to avoid emitting the same word twice.
            Set<String> set = new HashSet<String>();
            List<Term> result = new LinkedList<Term>();

            int length = graph.terms.length - 1;
            for (int i = 0; i < length; i++) {
                if (graph.terms[i] != null) {
                    result.add(graph.terms[i]);
                    set.add(graph.terms[i].getName() + graph.terms[i].getOffe());
                }
            }

            LinkedList<Term> last = new LinkedList<Term>();
            char[] chars = graph.chars;

            if (forests != null) {
                for (Forest forest : forests) {
                    if (forest == null) {
                        continue;
                    }
                    GetWord word = forest.getWord(chars);
                    String temp;
                    while ((temp = word.getAllWords()) != null) {
                        // NOTE(review): the key uses word.offe (index into graph.chars) while the
                        // keys above use Term.getOffe(); this assumes both share the same origin - confirm.
                        // Set.add returns false when the key was already present.
                        if (set.add(temp + word.offe)) {
                            last.add(new Term(temp, word.offe, word.getParam(0), ObjConver.getIntValue(word.getParam(1))));
                        }
                    }
                }
            }

            result.addAll(last);

            Collections.sort(result, new Comparator<Term>() {
                @Override
                public int compare(Term o1, Term o2) {
                    // Integer.compare avoids the overflow that int subtraction can
                    // produce, which would violate the Comparator contract.
                    if (o1.getOffe() == o2.getOffe()) {
                        // Same offset: longer names first.
                        return Integer.compare(o2.getName().length(), o1.getName().length());
                    }
                    return Integer.compare(o1.getOffe(), o2.getOffe());
                }
            });

            setRealName(graph, result);
            return result;
        }
    };
    return merger.merger();
}
Use of org.nlpcn.commons.lang.tire.domain.Forest in project ansj_seg by NLPchina.
Class ToAnalysis, method getResult.
/**
 * Produces the final term list for {@code graph}.
 *
 * <p>The path is walked first, the optional number and person-name recognizers
 * run next, and the user dictionaries are applied last with
 * {@code InsertTermType.SKIP}.
 *
 * @param graph the sentence graph to resolve
 * @return the non-null terms of {@code graph.terms}, excluding the last slot
 */
@Override
protected List<Term> getResult(final Graph graph) {
    return new Merger() {

        @Override
        public List<Term> merger() {
            graph.walkPath();

            // Number discovery.
            if (isNumRecognition && graph.hasNum) {
                new NumRecognition().recognition(graph.terms);
            }

            // Person-name recognition.
            if (graph.hasPerson && isNameRecognition) {
                // Asian person names.
                new AsianPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
                NameFix.nameAmbiguity(graph.terms);
                // Foreign person names.
                new ForeignPersonRecognition().recognition(graph.terms);
                graph.walkPathByScore();
            }

            // User-defined dictionary recognition.
            userDefineRecognition(graph, forests);

            return getResult();
        }

        /** Runs the user-dictionary recognizer over the graph terms, then prunes and re-scores. */
        private void userDefineRecognition(final Graph graph, Forest... forests) {
            final UserDefineRecognition recognizer = new UserDefineRecognition(InsertTermType.SKIP, forests);
            recognizer.recognition(graph.terms);
            graph.rmLittlePath();
            graph.walkPathByScore();
        }

        /** Collects the non-null terms (last slot excluded) and applies {@code setRealName}. */
        private List<Term> getResult() {
            final List<Term> collected = new ArrayList<Term>();
            final int end = graph.terms.length - 1;
            for (int pos = 0; pos < end; pos++) {
                final Term candidate = graph.terms[pos];
                if (candidate == null) {
                    continue;
                }
                collected.add(candidate);
            }
            setRealName(graph, collected);
            return collected;
        }
    }.merger();
}
Use of org.nlpcn.commons.lang.tire.domain.Forest in project ansj_seg by NLPchina.
Class AmbiguityLibrary, method init.
/**
 * Loads the ambiguity dictionary referenced by {@code kv}.
 *
 * <p>If {@code kv} already holds a forest it is returned unchanged. Otherwise the
 * resource at {@code kv.getK()} is read as UTF-8, one entry per non-blank line.
 * Each line is trimmed and split on tabs; lines with an odd number of fields are
 * logged and skipped. The branch key is the concatenation of the fields at even
 * indexes, and the full field array is stored as the branch parameters.
 *
 * @param key the library key, removed from {@code AMBIGUITY} when loading fails
 * @param kv  holder of the resource path (K) and the loaded forest (V)
 * @return the loaded forest, or {@code null} if any exception occurred
 */
private static synchronized Forest init(String key, KV<String, Forest> kv) {
    final Forest cached = kv.getV();
    if (cached != null) {
        return cached;
    }

    final Forest loaded = new Forest();
    try (BufferedReader br = IOUtil.getReader(PathToStream.stream(kv.getK()), "utf-8")) {
        LOG.debug("begin init ambiguity");
        final long start = System.currentTimeMillis();

        for (String raw = br.readLine(); raw != null; raw = br.readLine()) {
            if (!StringUtil.isNotBlank(raw)) {
                continue;
            }

            final String line = StringUtil.trim(raw);
            final String[] split = line.split("\t");

            // Fields are expected in pairs; an odd count is a malformed line.
            if (split.length % 2 != 0) {
                LOG.error("init ambiguity error in line :" + line + " format err !");
                continue;
            }

            // The branch key is built from the fields at even indexes.
            final StringBuilder word = new StringBuilder();
            for (int idx = 0; idx < split.length; idx += 2) {
                word.append(split[idx]);
            }
            loaded.addBranch(word.toString(), split);
        }

        LOG.info("load dic use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK());
        kv.setV(loaded);
        return loaded;
    } catch (Exception e) {
        LOG.error("Init ambiguity library error :" + e.getMessage() + ", path: " + kv.getK());
        AMBIGUITY.remove(key);
        return null;
    }
}
Aggregations