Usage of org.ansj.domain.Term in project ansj_seg by NLPchina: class FileDemo, method main.
/**
 * Throughput benchmark: streams a large corpus file through BaseAnalysis and
 * prints total characters processed and characters per second.
 *
 * @param args unused
 * @throws IOException if the corpus file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    // try-with-resources: the original leaked the reader; now it is always closed,
    // even if tokenization throws mid-stream.
    try (BufferedReader reader = IOUtil.getReader("/home/ansj/temp/360baikeData/360tag_all.txt", "utf-8")) {
        // Warm-up parse so dictionary loading is not counted in the timing below.
        ToAnalysis.parse("test 123 孙");
        Analysis na = new BaseAnalysis(reader);
        long start = System.currentTimeMillis();
        int allCount = 0;
        Term term = null;
        while ((term = na.next()) != null) {
            // Periodic progress line roughly every 10000 offsets.
            if (term.getOffe() % 10000 == 0)
                System.out.println(term.getOffe() + "\t" + term.getName());
            allCount += term.getName().length();
            if (allCount > 30000000) {
                break; // cap the benchmark at ~30M characters
            }
        }
        long end = System.currentTimeMillis();
        System.out.println(end - start);
        // Math.max(1, …) guards against division by zero when the run takes < 1 ms.
        System.out.println("共 " + allCount + " 个字符,每秒处理了:" + (allCount * 1000.0 / Math.max(1, end - start)));
    }
}
Usage of org.ansj.domain.Term in project ansj_seg by NLPchina: class NatureTagDemo, method main.
/**
 * Part-of-speech tagging demo: tags an already-tokenized sentence
 * (no segmentation step) and prints the tagged terms.
 */
public static void main(String[] args) {
    List<String> words = Arrays.asList("对", "非", "ansj", "的", "分词", "结果", "进行", "词性", "标注");
    List<Term> tagged = new NatureRecognition().recognition(words, 0);
    System.out.println(tagged);
}
Usage of org.ansj.domain.Term in project ansj_seg by NLPchina: class NlpDemoTest, method main.
/**
 * NLP tokenization demo: runs NlpAnalysis with the user dictionary over one
 * sentence and prints each term as "realName | name".
 */
public static void main(String[] args) throws IOException {
    NlpAnalysis nlp = (NlpAnalysis) new NlpAnalysis().setForests(new Forest[] { DicLibrary.get() });
    nlp.resetContent(new StringReader("2015年无锡市突发环境事件"));
    // Pull terms one at a time until the analyzer is exhausted.
    for (Term term = nlp.next(); term != null; term = nlp.next()) {
        System.out.println(term.getRealName() + "\t|\t" + term.getName());
    }
}
Usage of org.ansj.domain.Term in project ansj_seg by NLPchina: class AsianPersonRecognition, method recogntion_.
/**
 * Scans the term array for possible person-name heads and tries to assemble
 * candidate names via {@link #nameFind}.
 *
 * @return candidate person-name terms found in {@code terms}
 */
private List<Term> recogntion_() {
    Term term = null;
    Term tempTerm = null;
    List<Term> termList = new ArrayList<Term>();
    int beginFreq = 10; // begin-position frequency carried from the previous term; seeds the first candidate
    for (int i = 0; i < terms.length; i++) {
        term = terms[i];
        // Skip gaps and terms that cannot be part of a person name.
        if (term == null || !term.termNatures().personAttr.flag) {
            continue;
        }
        term.score(0);
        term.selfScore(0);
        int freq = 0;
        // Try the longest name pattern first: j = 2 down to j = 0.
        for (int j = 2; j > -1; j--) {
            freq = term.termNatures().personAttr.getFreq(j, 0);
            // NOTE(review): original test was `freq > 10 || (name.length() == 2 && freq > 10)`;
            // the second clause is tautologically redundant, so this is the equivalent form.
            if (freq > 10) {
                tempTerm = nameFind(i, beginFreq, j);
                if (tempTerm != null) {
                    termList.add(tempTerm);
                    // Unambiguous recognition: zero the covered terms and jump past them.
                    if (skip) {
                        for (int j2 = i; j2 < tempTerm.toValue(); j2++) {
                            if (terms[j2] != null) {
                                terms[j2].score(0);
                                terms[j2].selfScore(0);
                            }
                        }
                        i = tempTerm.toValue() - 1;
                        break;
                    }
                }
            }
        }
        beginFreq = term.termNatures().personAttr.begin + 1;
    }
    return termList;
}
Usage of org.ansj.domain.Term in project ansj_seg by NLPchina: class AsianPersonRecognition, method nameFind.
/**
 * Person-name assembly: starting at {@code offe}, stitches {@code size + 2}
 * consecutive non-null terms into one candidate name, scores it, and either
 * returns the assembled NR term or {@code null} when the candidate is rejected.
 *
 * @param offe index in {@code terms} where the candidate name starts
 * @param beginFreq begin-position frequency carried over from the preceding term
 * @param size name-pattern index (0..2); the candidate spans {@code size + 2} terms
 * @return the assembled name term (nature NR) with its score set, or {@code null}
 */
private Term nameFind(int offe, int beginFreq, int size) {
    StringBuilder sb = new StringBuilder();
    int undefinite = 0; // how many components also occur as ordinary words (allFreq > 0)
    skip = false;
    PersonNatureAttr pna = null;
    int index = 0; // position within the name pattern (0 .. size+1)
    int freq = 0;
    double allFreq = 0;
    Term term = null;
    int i = offe;
    for (; i < terms.length; i++) {
        // Null slots are skipped without advancing the pattern index.
        if (terms[i] == null) {
            continue;
        }
        term = terms[i];
        pna = term.termNatures().personAttr;
        // Frequency of this word at this position for this name length;
        // zero means the combination is impossible — abandon the candidate.
        if ((freq = pna.getFreq(size, index)) == 0) {
            return null;
        }
        if (pna.allFreq > 0) {
            undefinite++; // component doubles as a regular word — potential ambiguity
        }
        sb.append(term.getName());
        // Accumulate log-likelihood: common standalone words raise the score
        // (worse), frequent name components lower it (better).
        allFreq += Math.log(term.termNatures().allFreq + 1);
        allFreq += -Math.log((freq));
        index++;
        if (index == size + 2) {
            break; // all size+2 components collected
        }
    }
    double score = -Math.log(FACTORY[size]);
    score += allFreq;
    double endFreq = 0;
    // Look for the word immediately following the candidate (the end-context word).
    boolean flag = true;
    while (flag) {
        i++;
        if (i >= terms.length) {
            endFreq = 10; // end of input: treated as a favorable name boundary
            flag = false;
        } else if (terms[i] != null) {
            int twoWordFreq = NgramLibrary.getTwoWordFreq(term, terms[i]);
            if (twoWordFreq > 3) {
                // The last component bonds strongly with the next word — reject.
                return null;
            }
            endFreq = terms[i].termNatures().personAttr.end + 1;
            flag = false;
        }
    }
    score -= Math.log(endFreq);
    score -= Math.log(beginFreq);
    // Lower score is better here: anything above -3 is rejected.
    if (score > -3) {
        return null;
    }
    // Reject ambiguous candidates whose components double as regular words
    // while the regular-word likelihood dominates.
    if (allFreq > 0 && undefinite > 0) {
        return null;
    }
    skip = undefinite == 0; // unambiguous: caller may skip the covered terms
    term = new Term(sb.toString(), offe, TermNatures.NR);
    term.selfScore(score);
    return term;
}
Aggregations