Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.
The class Segment, method combineByCustomDictionary (overload that also collects user words into the full lattice).
/**
 * Merge the coarse segmentation result using the user dictionary, and collect the user words into the full word lattice
 * @param vertexList the coarse segmentation result
 * @param wordNetAll the full word lattice into which user words are collected
 * @return the merged result
 */
protected static List<Vertex> combineByCustomDictionary(List<Vertex> vertexList, WordNet wordNetAll) {
    Vertex[] wordNet = new Vertex[vertexList.size()];
    vertexList.toArray(wordNet);
    // merge via the double-array trie (DAT)
    int line = 1;
    DoubleArrayTrie<CoreDictionary.Attribute> dat = CustomDictionary.dat;
    for (int i = 0; i < wordNet.length; ++i) {
        int state = 1;
        state = dat.transition(wordNet[i].realWord, state);
        if (state > 0) {
            int to = i + 1;
            int end = to;
            CoreDictionary.Attribute value = dat.output(state);
            for (; to < wordNet.length; ++to) {
                state = dat.transition(wordNet[to].realWord, state);
                if (state < 0)
                    break;
                CoreDictionary.Attribute output = dat.output(state);
                if (output != null) {
                    // a dictionary entry ends here: merge vertices [i, end) and
                    // record the merged word in the full lattice at its offset
                    value = output;
                    end = to + 1;
                    combineWords(wordNet, i, end, value);
                    wordNetAll.add(line, wordNet[i]);
                }
            }
            if (value != null) {
                line += wordNet[i].realWord.length();
                i = end - 1;
            }
        } else {
            line += wordNet[i].realWord.length();
        }
    }
    // merge via the BinTrie, which holds entries added at runtime
    if (CustomDictionary.trie != null) {
        line = 1;
        for (int i = 0; i < wordNet.length; ++i) {
            if (wordNet[i] == null)
                continue;
            BaseNode<CoreDictionary.Attribute> state = CustomDictionary.trie.transition(wordNet[i].realWord.toCharArray(), 0);
            if (state != null) {
                int to = i + 1;
                int end = to;
                CoreDictionary.Attribute value = state.getValue();
                for (; to < wordNet.length; ++to) {
                    if (wordNet[to] == null)
                        continue;
                    state = state.transition(wordNet[to].realWord.toCharArray(), 0);
                    if (state == null)
                        break;
                    if (state.getValue() != null) {
                        value = state.getValue();
                        end = to + 1;
                        combineWords(wordNet, i, end, value);
                        wordNetAll.add(line, wordNet[i]);
                    }
                }
                if (value != null) {
                    line += wordNet[i].realWord.length();
                    i = end - 1;
                }
            } else {
                line += wordNet[i].realWord.length();
            }
        }
    }
    // compact: drop the slots nulled out by combineWords
    vertexList.clear();
    for (Vertex vertex : wordNet) {
        if (vertex != null)
            vertexList.add(vertex);
    }
    return vertexList;
}
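
For orientation, here is a minimal usage sketch (the registered word is just an illustration): enabling the custom dictionary on a segmenter is what makes it pass its coarse segmentation through combineByCustomDictionary.

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import java.util.List;

public class CustomDictionaryDemo {
    public static void main(String[] args) {
        // register a user word at runtime; it lands in CustomDictionary.trie (the BinTrie)
        CustomDictionary.add("攻城狮");
        // with the custom dictionary enabled, the segmenter merges its coarse
        // result through combineByCustomDictionary
        Segment segment = HanLP.newSegment().enableCustomDictionary(true);
        List<Term> termList = segment.seg("攻城狮逆袭");
        System.out.println(termList);
    }
}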
Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.
The class Segment, method combineByCustomDictionary (overload without the full lattice).
/**
 * Merge the coarse segmentation result using the user dictionary
 * @param vertexList the coarse segmentation result
 * @return the merged result
 */
protected static List<Vertex> combineByCustomDictionary(List<Vertex> vertexList) {
    Vertex[] wordNet = new Vertex[vertexList.size()];
    vertexList.toArray(wordNet);
    // merge via the double-array trie (DAT)
    DoubleArrayTrie<CoreDictionary.Attribute> dat = CustomDictionary.dat;
    for (int i = 0; i < wordNet.length; ++i) {
        int state = 1;
        state = dat.transition(wordNet[i].realWord, state);
        if (state > 0) {
            int to = i + 1;
            int end = to;
            CoreDictionary.Attribute value = dat.output(state);
            for (; to < wordNet.length; ++to) {
                state = dat.transition(wordNet[to].realWord, state);
                if (state < 0)
                    break;
                CoreDictionary.Attribute output = dat.output(state);
                if (output != null) {
                    // remember the longest entry seen so far
                    value = output;
                    end = to + 1;
                }
            }
            if (value != null) {
                // unlike the overload above, merge only once, for the longest match
                combineWords(wordNet, i, end, value);
                i = end - 1;
            }
        }
    }
    // merge via the BinTrie, which holds entries added at runtime
    if (CustomDictionary.trie != null) {
        for (int i = 0; i < wordNet.length; ++i) {
            if (wordNet[i] == null)
                continue;
            BaseNode<CoreDictionary.Attribute> state = CustomDictionary.trie.transition(wordNet[i].realWord.toCharArray(), 0);
            if (state != null) {
                int to = i + 1;
                int end = to;
                CoreDictionary.Attribute value = state.getValue();
                for (; to < wordNet.length; ++to) {
                    if (wordNet[to] == null)
                        continue;
                    state = state.transition(wordNet[to].realWord.toCharArray(), 0);
                    if (state == null)
                        break;
                    if (state.getValue() != null) {
                        value = state.getValue();
                        end = to + 1;
                    }
                }
                if (value != null) {
                    combineWords(wordNet, i, end, value);
                    i = end - 1;
                }
            }
        }
    }
    // compact: drop the slots nulled out by combineWords
    vertexList.clear();
    for (Vertex vertex : wordNet) {
        if (vertex != null)
            vertexList.add(vertex);
    }
    return vertexList;
}
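
Both overloads delegate the actual merging to combineWords, which this page does not show. The following is a plausible sketch inferred from how the callers use it, not a verbatim copy of HanLP's implementation: the span [start, end) must collapse into one Vertex carrying the dictionary Attribute, and the absorbed slots must become null, which is exactly why the loops above skip null entries and the final pass compacts the array.

// Hedged sketch of combineWords, reconstructed from its call sites above.
private static void combineWords(Vertex[] wordNet, int start, int end, CoreDictionary.Attribute value) {
    if (start + 1 == end) {
        // single vertex: just attach the custom-dictionary attribute
        wordNet[start].attribute = value;
    } else {
        // concatenate the surface forms and collapse the span into one vertex
        StringBuilder sbTerm = new StringBuilder();
        for (int j = start; j < end; ++j) {
            if (wordNet[j] == null)
                continue;
            sbTerm.append(wordNet[j].realWord);
            wordNet[j] = null;   // absorbed slot; callers and the compaction pass skip nulls
        }
        wordNet[start] = new Vertex(sbTerm.toString(), value);
    }
}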
Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.
The class Segment, method removeFromWordNet.
/**
 * Completely erase a word from the word lattice
 * @param cur the word
 * @param wordNetAll the word lattice
 * @param line the row currently being scanned
 * @param length the length of the current buffer
 */
private static void removeFromWordNet(Vertex cur, WordNet wordNetAll, int line, int length) {
    LinkedList<Vertex>[] vertexes = wordNetAll.getVertexes();
    // detach successors: any vertex starting where cur ends loses its from-pointer
    for (Vertex vertex : vertexes[line + length]) {
        if (vertex.from == cur)
            vertex.from = null;
    }
    // remove cur itself from the row where it begins
    ListIterator<Vertex> iterator = vertexes[line + length - cur.realWord.length()].listIterator();
    while (iterator.hasNext()) {
        Vertex vertex = iterator.next();
        if (vertex == cur)
            iterator.remove();
    }
}
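
The two indices are easy to misread: WordNet rows are keyed by character offset (row k holds the vertices whose realWord starts at offset k-1), so vertexes[line + length] is the row where cur's successors begin, and vertexes[line + length - cur.realWord.length()] is the row where cur itself starts. Below is a standalone sketch of that arithmetic using plain collections and made-up data, not HanLP API.

import java.util.LinkedList;
import java.util.ListIterator;

public class LatticeRowDemo {
    public static void main(String[] args) {
        // mimic the rows of a lattice over "商品和服务" (5 chars, rows 0..6)
        @SuppressWarnings("unchecked")
        LinkedList<String>[] rows = new LinkedList[7];
        for (int i = 0; i < rows.length; ++i)
            rows[i] = new LinkedList<String>();
        rows[3].add("和");
        rows[3].add("和服");   // the word to erase; it spans offsets 2..3
        rows[5].add("服务");   // starts right where "和服" ends
        String cur = "和服";
        int line = 3, length = cur.length();   // line + length == 5 marks cur's end
        // successors live in rows[line + length]; removeFromWordNet clears their
        // from-pointers there (nothing to clear in this string-only mimic)
        System.out.println("successor row: " + rows[line + length]);
        // cur itself sits in rows[line + length - cur.length()] and gets removed
        ListIterator<String> it = rows[line + length - cur.length()].listIterator();
        while (it.hasNext())
            if (it.next().equals(cur))
                it.remove();
        System.out.println("start row after removal: " + rows[3]);
    }
}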
Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.
The class SimpleGraph, method viterbi.
public List<Vertex> viterbi() {
    LinkedList<Vertex> vertexList = new LinkedList<Vertex>();
    // initialize: every vertex in the first row is reached from the begin vertex
    for (Vertex node : nodes[1]) {
        node.updateFrom(nodes[0].getFirst());
    }
    // forward pass: relax every edge leaving each reachable vertex
    for (int i = 1; i < nodes.length - 1; ++i) {
        LinkedList<Vertex> nodeArray = nodes[i];
        if (nodeArray == null)
            continue;
        for (Vertex node : nodeArray) {
            if (node.from == null)
                continue;
            // successors of node start right after its last character
            for (Vertex to : nodes[i + node.realWord.length()]) {
                to.updateFrom(node);
            }
        }
    }
    // backward pass: follow from-pointers from the end vertex to recover the path
    Vertex from = nodes[nodes.length - 1].getFirst();
    while (from != null) {
        vertexList.addFirst(from);
        from = from.from;
    }
    return vertexList;
}
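
Both this method and the ViterbiSegment variant below lean on Vertex.updateFrom for the relaxation step, which this page does not show. A sketch of what it has to do, with transitionWeight as a hypothetical stand-in for HanLP's bigram cost helper (weights behave like negative log probabilities, so smaller is better):

// Hedged sketch of Vertex.updateFrom; transitionWeight is a hypothetical name.
public void updateFrom(Vertex from) {
    double weight = from.weight + transitionWeight(from, this);   // path cost via 'from'
    if (this.from == null || this.weight > weight) {
        // first visit, or a cheaper predecessor: adopt the new path
        this.from = from;
        this.weight = weight;
    }
}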
Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.
The class ViterbiSegment, method viterbi.
private static List<Vertex> viterbi(WordNet wordNet) {
    // reuse the lattice's own rows to avoid allocating objects, for speed
    LinkedList<Vertex>[] nodes = wordNet.getVertexes();
    LinkedList<Vertex> vertexList = new LinkedList<Vertex>();
    // initialize: every vertex in the first row is reached from the begin vertex
    for (Vertex node : nodes[1]) {
        node.updateFrom(nodes[0].getFirst());
    }
    // forward pass: relax every edge leaving each reachable vertex
    for (int i = 1; i < nodes.length - 1; ++i) {
        LinkedList<Vertex> nodeArray = nodes[i];
        if (nodeArray == null)
            continue;
        for (Vertex node : nodeArray) {
            if (node.from == null)
                continue;
            for (Vertex to : nodes[i + node.realWord.length()]) {
                to.updateFrom(node);
            }
        }
    }
    // backward pass: follow from-pointers from the end vertex to recover the path
    Vertex from = nodes[nodes.length - 1].getFirst();
    while (from != null) {
        vertexList.addFirst(from);
        from = from.from;
    }
    return vertexList;
}
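
For completeness, a minimal usage sketch: ViterbiSegment is HanLP's default segmenter, and seg builds the WordNet lattice that the private viterbi above decodes.

import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.Viterbi.ViterbiSegment;
import com.hankcs.hanlp.seg.common.Term;
import java.util.List;

public class ViterbiSegmentDemo {
    public static void main(String[] args) {
        // seg() builds the full WordNet over the sentence, then the private
        // viterbi(WordNet) above extracts the cheapest path through it
        Segment segment = new ViterbiSegment();
        List<Term> termList = segment.seg("商品和服务");
        System.out.println(termList);   // e.g. [商品/n, 和/cc, 服务/vn]
    }
}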