Usage of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs: class Dijkstra, method compute.
/**
 * Runs Dijkstra's shortest-path search over the word graph, starting from the
 * terminal vertex and relaxing backwards along incoming edges, so that the
 * resulting path can be emitted in forward order without reversal.
 *
 * @param graph word graph whose edges are indexed by their target vertex
 * @return the vertices on the cheapest path from vertex 0 to the last vertex
 */
public static List<Vertex> compute(Graph graph) {
    Vertex[] vertexes = graph.getVertexes();
    List<EdgeFrom>[] edgesTo = graph.getEdgesTo();
    int n = vertexes.length;
    // dist[v] = cheapest known cost from v to the terminal vertex.
    double[] dist = new double[n];
    Arrays.fill(dist, Double.MAX_VALUE);
    dist[n - 1] = 0;
    // successor[v] = next vertex after v on the cheapest path (-1 = none yet).
    int[] successor = new int[n];
    Arrays.fill(successor, -1);
    PriorityQueue<State> queue = new PriorityQueue<State>();
    queue.add(new State(0, n - 1));
    while (!queue.isEmpty()) {
        State top = queue.poll();
        if (dist[top.vertex] < top.cost) {
            // Stale queue entry: a cheaper cost was already recorded.
            continue;
        }
        for (EdgeFrom edge : edgesTo[top.vertex]) {
            double relaxed = dist[top.vertex] + edge.weight;
            if (relaxed < dist[edge.from]) {
                dist[edge.from] = relaxed;
                queue.add(new State(relaxed, edge.from));
                successor[edge.from] = top.vertex;
            }
        }
    }
    // Walk the successor chain from the start vertex to emit the path.
    List<Vertex> resultList = new LinkedList<Vertex>();
    for (int v = 0; v != -1; v = successor[v]) {
        resultList.add(vertexes[v]);
    }
    return resultList;
}
Usage of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs: class Viterbi, method compute.
/**
 * Specialized solver for the HMM model over a segmentation path: assigns a
 * nature (part-of-speech tag) to each vertex on the path.
 * <p>
 * Uses a rolling two-row cost table instead of a full trellis, and fixes each
 * previous vertex's nature greedily one step behind the scan. There is no
 * backpointer table, so this is a step-wise approximation of full Viterbi
 * backtracking.
 *
 * @param vertexList path containing the Vertex.B start node
 * @param transformMatrixDictionary transition matrix corresponding to the dictionary's tag set
 */
public static void compute(List<Vertex> vertexList, TransformMatrixDictionary<Nature> transformMatrixDictionary) {
int length = vertexList.size() - 1;
// Rolling array: only the previous and current cost rows are kept.
double[][] cost = new double[2][];
Iterator<Vertex> iterator = vertexList.iterator();
Vertex start = iterator.next();
Nature pre = start.attribute.nature[0];
// The first vertex's nature is already determined (start node).
// start.confirmNature(pre);
// The second vertex's row can be computed directly from the start tag.
Vertex preItem;
Nature[] preTagSet;
{
Vertex item = iterator.next();
cost[0] = new double[item.attribute.nature.length];
int j = 0;
int curIndex = 0;
// Row cost = transition cost from the start tag minus the log emission
// probability of the candidate tag (1e-8 smoothing avoids log(0)).
for (Nature cur : item.attribute.nature) {
cost[0][j] = transformMatrixDictionary.transititon_probability[pre.ordinal()][cur.ordinal()] - Math.log((item.attribute.frequency[curIndex] + 1e-8) / transformMatrixDictionary.getTotalFrequency(cur));
++j;
++curIndex;
}
preTagSet = item.attribute.nature;
preItem = item;
}
// From the third vertex on, minimize over every predecessor tag.
for (int i = 1; i < length; ++i) {
int index_i = i & 1;
int index_i_1 = 1 - index_i;
Vertex item = iterator.next();
cost[index_i] = new double[item.attribute.nature.length];
// Best total cost seen anywhere in this row; the predecessor tag that
// produced it is what gets confirmed on preItem below.
double perfect_cost_line = Double.MAX_VALUE;
int k = 0;
Nature[] curTagSet = item.attribute.nature;
for (Nature cur : curTagSet) {
cost[index_i][k] = Double.MAX_VALUE;
int j = 0;
for (Nature p : preTagSet) {
double now = cost[index_i_1][j] + transformMatrixDictionary.transititon_probability[p.ordinal()][cur.ordinal()] - Math.log((item.attribute.frequency[k] + 1e-8) / transformMatrixDictionary.getTotalFrequency(cur));
if (now < cost[index_i][k]) {
cost[index_i][k] = now;
if (now < perfect_cost_line) {
perfect_cost_line = now;
pre = p;
}
}
++j;
}
++k;
}
// Greedily fix the previous vertex's nature to the best predecessor tag.
preItem.confirmNature(pre);
preTagSet = curTagSet;
preItem = item;
}
// NOTE(review): the last vertex consumed from the iterator never has its
// nature confirmed here — presumably it is the terminal node or is handled
// by the caller; confirm against call sites.
}
Usage of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs: class Segment, method combineByCustomDictionary (with word-net collection).
/**
 * Merges the coarse segmentation result using the user dictionary, and also
 * collects every merged user word into the full word lattice.
 *
 * @param vertexList coarse segmentation result (rewritten in place)
 * @param wordNetAll full word lattice that receives the merged user words
 * @return the merged result (the same list instance as {@code vertexList})
 */
protected static List<Vertex> combineByCustomDictionary(List<Vertex> vertexList, WordNet wordNetAll) {
    Vertex[] wordNet = new Vertex[vertexList.size()];
    vertexList.toArray(wordNet);
    // Character offset of wordNet[i] inside the lattice; lattice rows start
    // at 1 (row 0 presumably holds the begin node — TODO confirm in WordNet).
    int line = 1;
    // Pass 1: longest-match merge driven by the double-array trie.
    DoubleArrayTrie<CoreDictionary.Attribute> dat = CustomDictionary.dat;
    for (int i = 0; i < wordNet.length; ++i) {
        int state = dat.transition(wordNet[i].realWord, 1);
        CoreDictionary.Attribute value = null;
        int end = i + 1;
        if (state > 0) {
            value = dat.output(state);
            for (int to = i + 1; to < wordNet.length; ++to) {
                state = dat.transition(wordNet[to].realWord, state);
                if (state < 0)
                    break;
                CoreDictionary.Attribute output = dat.output(state);
                if (output != null) {
                    // A longer user word ends at `to`: merge it and record
                    // the merged vertex in the lattice at its start offset.
                    value = output;
                    end = to + 1;
                    combineWords(wordNet, i, end, value);
                    wordNetAll.add(line, wordNet[i]);
                }
            }
        }
        // BUG FIX: advance the character offset unconditionally, using the
        // (possibly merged) vertex at the ORIGINAL index i. The previous code
        // skipped this when the trie walk entered a prefix (state > 0) but no
        // complete word was ever output (value == null), which desynchronized
        // `line` for every subsequent wordNetAll.add(line, ...) call.
        line += wordNet[i].realWord.length();
        if (value != null)
            i = end - 1; // skip the vertices that were merged into wordNet[i]
    }
    // Pass 2: same merge driven by the BinTrie (dynamic user words), if any.
    if (CustomDictionary.trie != null) {
        line = 1;
        for (int i = 0; i < wordNet.length; ++i) {
            if (wordNet[i] == null)
                continue; // slot consumed by an earlier merge; its length was already counted
            BaseNode<CoreDictionary.Attribute> state = CustomDictionary.trie.transition(wordNet[i].realWord.toCharArray(), 0);
            CoreDictionary.Attribute value = null;
            int end = i + 1;
            if (state != null) {
                value = state.getValue();
                for (int to = i + 1; to < wordNet.length; ++to) {
                    if (wordNet[to] == null)
                        continue;
                    state = state.transition(wordNet[to].realWord.toCharArray(), 0);
                    if (state == null)
                        break;
                    if (state.getValue() != null) {
                        value = state.getValue();
                        end = to + 1;
                        combineWords(wordNet, i, end, value);
                        wordNetAll.add(line, wordNet[i]);
                    }
                }
            }
            // Same fix as pass 1: always advance the offset.
            line += wordNet[i].realWord.length();
            if (value != null)
                i = end - 1;
        }
    }
    // Rebuild the caller's list from the surviving (non-null) vertices.
    vertexList.clear();
    for (Vertex vertex : wordNet) {
        if (vertex != null)
            vertexList.add(vertex);
    }
    return vertexList;
}
Usage of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs: class Segment, method combineByCustomDictionary.
/**
 * Merges the coarse segmentation result using the user dictionary.
 *
 * @param vertexList coarse segmentation result (rewritten in place)
 * @return the merged result (the same list instance as {@code vertexList})
 */
protected static List<Vertex> combineByCustomDictionary(List<Vertex> vertexList) {
    Vertex[] lattice = vertexList.toArray(new Vertex[vertexList.size()]);
    // Pass 1: longest-match merge driven by the double-array trie.
    DoubleArrayTrie<CoreDictionary.Attribute> dat = CustomDictionary.dat;
    for (int i = 0; i < lattice.length; ++i) {
        int node = dat.transition(lattice[i].realWord, 1);
        if (node <= 0)
            continue; // no dictionary entry starts with this word
        CoreDictionary.Attribute best = dat.output(node);
        int matchEnd = i + 1;
        for (int probe = i + 1; probe < lattice.length; ++probe) {
            node = dat.transition(lattice[probe].realWord, node);
            if (node < 0)
                break;
            CoreDictionary.Attribute hit = dat.output(node);
            if (hit != null) {
                // A longer user word ends here; remember it and keep probing.
                best = hit;
                matchEnd = probe + 1;
            }
        }
        if (best != null) {
            combineWords(lattice, i, matchEnd, best);
            i = matchEnd - 1; // jump past the merged span
        }
    }
    // Pass 2: same merge driven by the BinTrie (dynamic user words), if any.
    if (CustomDictionary.trie != null) {
        for (int i = 0; i < lattice.length; ++i) {
            if (lattice[i] == null)
                continue; // slot consumed by an earlier merge
            BaseNode<CoreDictionary.Attribute> node = CustomDictionary.trie.transition(lattice[i].realWord.toCharArray(), 0);
            if (node == null)
                continue;
            CoreDictionary.Attribute best = node.getValue();
            int matchEnd = i + 1;
            for (int probe = i + 1; probe < lattice.length; ++probe) {
                if (lattice[probe] == null)
                    continue;
                node = node.transition(lattice[probe].realWord.toCharArray(), 0);
                if (node == null)
                    break;
                if (node.getValue() != null) {
                    best = node.getValue();
                    matchEnd = probe + 1;
                }
            }
            if (best != null) {
                combineWords(lattice, i, matchEnd, best);
                i = matchEnd - 1;
            }
        }
    }
    // Rebuild the caller's list from the surviving vertices.
    vertexList.clear();
    for (Vertex v : lattice) {
        if (v != null)
            vertexList.add(v);
    }
    return vertexList;
}
Usage of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs: class Segment, method removeFromWordNet.
/**
 * Completely erases a word from the word lattice: first detaches every vertex
 * in the following row that links back to it, then removes the vertex itself
 * from the row where it starts.
 *
 * @param cur        the word vertex to erase
 * @param wordNetAll the word lattice
 * @param line       row index currently being scanned
 * @param length     length of the current buffer
 */
private static void removeFromWordNet(Vertex cur, WordNet wordNetAll, int line, int length) {
    LinkedList<Vertex>[] rows = wordNetAll.getVertexes();
    // Null out back-references held by vertices in the row just past cur's end.
    for (Vertex successor : rows[line + length]) {
        if (successor.from == cur)
            successor.from = null;
    }
    // Drop cur itself from its starting row (identity comparison, as before).
    Iterator<Vertex> it = rows[line + length - cur.realWord.length()].iterator();
    while (it.hasNext()) {
        if (it.next() == cur)
            it.remove();
    }
}
Aggregations