Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
The class DemoCRFSegment, method main.
public static void main(String[] args) {
    // Turn off part-of-speech (nature) display
    HanLP.Config.ShowTermNature = false;
    Segment segment = new CRFSegment().enableCustomDictionary(false);
    String[] sentenceArray = new String[] {
        "HanLP是由一系列模型与算法组成的Java工具包,目标是普及自然语言处理在生产环境中的应用。",
        // Traditional Chinese poses no problem
        "鐵桿部隊憤怒情緒集結 馬英九腹背受敵",
        "馬英九回應連勝文“丐幫說”:稱黨內同志談話應謹慎",
        // Some ability to recognize technical terms
        "高锰酸钾,强氧化剂,紫红色晶体,可溶于水,遇乙醇即被还原。常用作消毒剂、水净化剂、氧化剂、漂白剂、毒气吸收剂、二氧化碳精制剂等。",
        // Non-news corpus
        "《夜晚的骰子》通过描述浅草的舞女在暗夜中扔骰子的情景,寄托了作者对庶民生活区的情感",
        // Weibo (microblog) text
        "这个像是真的[委屈]前面那个打扮太江户了,一点不上品...@hankcs",
        "鼎泰丰的小笼一点味道也没有...每样都淡淡的...淡淡的,哪有食堂2A的好次",
        "克里斯蒂娜·克罗尔说:不,我不是虎妈。我全家都热爱音乐,我也鼓励他们这么做。",
        "今日APPS:Sago Mini Toolbox培养孩子动手能力",
        "财政部副部长王保安调任国家统计局党组书记",
        "2.34米男子娶1.53米女粉丝 称夫妻生活没问题",
        "你看过穆赫兰道吗",
        "国办发布网络提速降费十四条指导意见 鼓励流量不清零",
        "乐视超级手机能否承载贾布斯的生态梦"
    };
    for (String sentence : sentenceArray) {
        List<Term> termList = segment.seg(sentence);
        System.out.println(termList);
    }
    /**
     * Memory cookbook:
     * HanLP keeps a smart internal memory pool. For the same CRF model
     * (identified by its model file path), the model is not reloaded as
     * long as it has not been released or memory remains sufficient.
     */
    for (int i = 0; i < 5; ++i) {
        segment = new CRFSegment();
    }
}
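The memory-pool remark above can be observed directly. A minimal sketch, assuming the CRF model file is configured; the class name DemoModelPool is illustrative, and the timings are only indicative, but the second constructor call should return almost immediately because the pooled model is reused rather than reloaded:

import com.hankcs.hanlp.seg.CRF.CRFSegment;
import com.hankcs.hanlp.seg.Segment;

public class DemoModelPool {
    public static void main(String[] args) {
        long t0 = System.currentTimeMillis();
        Segment first = new CRFSegment();   // loads the CRF model from disk
        long t1 = System.currentTimeMillis();
        Segment second = new CRFSegment();  // hits the pool; no reload
        long t2 = System.currentTimeMillis();
        System.out.printf("first load: %d ms, second: %d ms%n", t1 - t0, t2 - t1);
    }
}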
Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
The class DemoJapaneseNameRecognition, method main.
public static void main(String[] args) {
    String[] testCase = new String[] {
        "北川景子参演了林诣彬导演的《速度与激情3》",
        "林志玲亮相网友:确定不是波多野结衣?",
        "龟山千广和近藤公园在龟山公园里喝酒赏花"
    };
    Segment segment = HanLP.newSegment().enableJapaneseNameRecognize(true);
    for (String sentence : testCase) {
        List<Term> termList = segment.seg(sentence);
        System.out.println(termList);
    }
}
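To consume the recognized names programmatically instead of printing the whole term list, one can filter on each Term's nature. A small sketch, assuming nrj is the nature tag HanLP assigns to Japanese personal names (the class name is illustrative):

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;

public class DemoExtractJapaneseNames {
    public static void main(String[] args) {
        Segment segment = HanLP.newSegment().enableJapaneseNameRecognize(true);
        for (Term term : segment.seg("北川景子参演了林诣彬导演的《速度与激情3》")) {
            // nrj is assumed to be the nature for Japanese personal names
            if ("nrj".equals(term.nature.toString())) {
                System.out.println("Japanese name: " + term.word);
            }
        }
    }
}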
Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
The class MinimumSpanningTreeParser, method parse.
@Override
public CoNLLSentence parse(List<Term> termList) {
    if (termList == null || termList.size() == 0)
        return null;
    // Prepend a virtual root node
    termList.add(0, new Term("##核心##", Nature.begin));
    Node[] nodeArray = new Node[termList.size()];
    Iterator<Term> iterator = termList.iterator();
    for (int i = 0; i < nodeArray.length; ++i) {
        nodeArray[i] = new Node(iterator.next(), i);
    }
    // Build the full edge matrix; edges[j][i] holds the edge from i to j
    Edge[][] edges = new Edge[nodeArray.length][nodeArray.length];
    for (int i = 0; i < edges.length; ++i) {
        for (int j = 0; j < edges[i].length; ++j) {
            if (i != j) {
                edges[j][i] = makeEdge(nodeArray, i, j);
            }
        }
    }
    // Prim's algorithm for the minimum spanning tree
    int max_v = nodeArray.length * (nodeArray.length - 1);
    float[] mincost = new float[max_v];
    Arrays.fill(mincost, Float.MAX_VALUE / 3);
    boolean[] used = new boolean[max_v];
    Arrays.fill(used, false);
    used[0] = true;
    PriorityQueue<State> que = new PriorityQueue<State>();
    // Find the sole child of the virtual root
    float minCostToRoot = Float.MAX_VALUE;
    Edge firstEdge = null;
    Edge[] edgeResult = new Edge[termList.size() - 1];
    for (Edge edge : edges[0]) {
        if (edge == null)
            continue;
        if (minCostToRoot > edge.cost) {
            firstEdge = edge;
            minCostToRoot = edge.cost;
        }
    }
    if (firstEdge == null)
        return null;
    que.add(new State(minCostToRoot, firstEdge.from, firstEdge));
    // Grow the tree greedily, always expanding the cheapest frontier edge
    while (!que.isEmpty()) {
        State p = que.poll();
        int v = p.id;
        if (used[v] || p.cost > mincost[v])
            continue;
        used[v] = true;
        if (p.edge != null) {
            // System.out.println(p.edge.from + " " + p.edge.to + p.edge.label);
            edgeResult[p.edge.from - 1] = p.edge;
        }
        for (Edge e : edges[v]) {
            if (e == null)
                continue;
            if (mincost[e.from] > e.cost) {
                mincost[e.from] = e.cost;
                que.add(new State(mincost[e.from], e.from, e));
            }
        }
    }
    // Convert the selected edges into a CoNLL-style dependency tree
    CoNLLWord[] wordArray = new CoNLLWord[termList.size() - 1];
    for (int i = 0; i < wordArray.length; ++i) {
        wordArray[i] = new CoNLLWord(i + 1, nodeArray[i + 1].word, nodeArray[i + 1].label);
        wordArray[i].DEPREL = edgeResult[i].label;
    }
    for (int i = 0; i < edgeResult.length; ++i) {
        int index = edgeResult[i].to - 1;
        if (index < 0) {
            // Edges pointing back to node 0 attach to the root
            wordArray[i].HEAD = CoNLLWord.ROOT;
            continue;
        }
        wordArray[i].HEAD = wordArray[index];
    }
    return new CoNLLSentence(wordArray);
}
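For callers, the CoNLLSentence built at the end of parse is most easily obtained through the top-level API. A usage sketch; whether HanLP.parseDependency routes through a MinimumSpanningTreeParser subclass depends on the HanLP version, but the returned structure is the same:

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence;
import com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord;

public class DemoParseDependency {
    public static void main(String[] args) {
        CoNLLSentence sentence = HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。");
        // Each CoNLLWord links to its HEAD with the DEPREL label set in parse()
        for (CoNLLWord word : sentence) {
            System.out.printf("%s --(%s)--> %s%n", word.LEMMA, word.DEPREL, word.HEAD.LEMMA);
        }
    }
}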