Search in sources:

Example 11 with Vertex

use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

the class Segment method combineByCustomDictionary.

/**
 * Merges the coarse segmentation result using the user (custom) dictionary and
 * collects every merged user word into the full word net.
 *
 * <p>Two passes run over the same working array: the first looks words up in
 * {@code CustomDictionary.dat}; the second, only if {@code CustomDictionary.trie}
 * is non-null, looks them up in that trie. In both passes {@code line} is the
 * offset handed to {@code wordNetAll.add}; it is advanced by the length of the
 * {@code realWord} of every vertex the outer loop visits.
 *
 * @param vertexList coarse segmentation result; it is cleared and refilled in place
 * @param wordNetAll full word net that receives the merged user words
 * @return {@code vertexList}, now holding the merged result
 */
protected static List<Vertex> combineByCustomDictionary(List<Vertex> vertexList, WordNet wordNetAll) {
    Vertex[] wordNet = new Vertex[vertexList.size()];
    vertexList.toArray(wordNet);
    // Pass 1: merge using the double-array trie (DAT).
    // NOTE(review): line starts at 1 while i starts at 0 -- confirm this matches
    // how callers lay out vertexList relative to wordNetAll.
    int line = 1;
    DoubleArrayTrie<CoreDictionary.Attribute> dat = CustomDictionary.dat;
    for (int i = 0; i < wordNet.length; ++i) {
        int next = i;
        int state = dat.transition(wordNet[i].realWord, 1);
        if (state > 0) {
            int to = i + 1;
            int end = to;
            CoreDictionary.Attribute value = dat.output(state);
            for (; to < wordNet.length; ++to) {
                state = dat.transition(wordNet[to].realWord, state);
                if (state < 0)
                    break;
                CoreDictionary.Attribute output = dat.output(state);
                if (output != null) {
                    value = output;
                    end = to + 1;
                    // Every (intermediate) match is merged and recorded in the full word net.
                    combineWords(wordNet, i, end, value);
                    wordNetAll.add(line, wordNet[i]);
                }
            }
            if (value != null) {
                // Resume after the last vertex covered by the match.
                next = end - 1;
            }
        }
        // Always advance the offset, including when the word was only a dictionary
        // prefix with no match; otherwise every later offset would lag behind.
        line += wordNet[i].realWord.length();
        i = next;
    }
    // Pass 2: merge using the BinTrie, if one is loaded.
    if (CustomDictionary.trie != null) {
        line = 1;
        for (int i = 0; i < wordNet.length; ++i) {
            // Null slots are skipped without advancing line
            // (presumably they were consumed by an earlier merge -- combineWords is not shown here).
            if (wordNet[i] == null)
                continue;
            int next = i;
            BaseNode<CoreDictionary.Attribute> state = CustomDictionary.trie.transition(wordNet[i].realWord.toCharArray(), 0);
            if (state != null) {
                int to = i + 1;
                int end = to;
                CoreDictionary.Attribute value = state.getValue();
                for (; to < wordNet.length; ++to) {
                    if (wordNet[to] == null)
                        continue;
                    state = state.transition(wordNet[to].realWord.toCharArray(), 0);
                    if (state == null)
                        break;
                    if (state.getValue() != null) {
                        value = state.getValue();
                        end = to + 1;
                        combineWords(wordNet, i, end, value);
                        wordNetAll.add(line, wordNet[i]);
                    }
                }
                if (value != null) {
                    next = end - 1;
                }
            }
            // Same rule as in pass 1: the offset moves forward for every visited word.
            line += wordNet[i].realWord.length();
            i = next;
        }
    }
    // Write the surviving (non-null) vertices back into the caller's list.
    vertexList.clear();
    for (Vertex vertex : wordNet) {
        if (vertex != null)
            vertexList.add(vertex);
    }
    return vertexList;
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) CoreDictionary(com.hankcs.hanlp.dictionary.CoreDictionary)

Example 12 with Vertex

use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

the class Segment method combineByCustomDictionary.

/**
 * Merges the coarse segmentation result using the user (custom) dictionary.
 *
 * <p>Each pass scans forward from a starting vertex, remembers the longest
 * dictionary match found, and hands that span to {@code combineWords}. The first
 * pass uses {@code CustomDictionary.dat}; the second runs only when
 * {@code CustomDictionary.trie} is non-null.
 *
 * @param vertexList coarse segmentation result; it is cleared and refilled in place
 * @return {@code vertexList}, now holding the merged result
 */
protected static List<Vertex> combineByCustomDictionary(List<Vertex> vertexList) {
    final Vertex[] cells = new Vertex[vertexList.size()];
    vertexList.toArray(cells);
    final int total = cells.length;

    // Pass 1: double-array trie (DAT).
    final DoubleArrayTrie<CoreDictionary.Attribute> dat = CustomDictionary.dat;
    int head = 0;
    while (head < total) {
        int cursor = dat.transition(cells[head].realWord, 1);
        if (cursor <= 0) {
            head++;
            continue;
        }
        CoreDictionary.Attribute best = dat.output(cursor);
        int stop = head + 1;
        for (int probe = head + 1; probe < total; probe++) {
            cursor = dat.transition(cells[probe].realWord, cursor);
            if (cursor < 0) {
                break;
            }
            CoreDictionary.Attribute hit = dat.output(cursor);
            if (hit != null) {
                best = hit;
                stop = probe + 1;
            }
        }
        if (best == null) {
            head++;
            continue;
        }
        combineWords(cells, head, stop, best);
        // Resume right after the merged span.
        head = stop;
    }

    // Pass 2: BinTrie, if one is loaded.
    if (CustomDictionary.trie != null) {
        int start = 0;
        while (start < total) {
            if (cells[start] == null) {
                start++;
                continue;
            }
            BaseNode<CoreDictionary.Attribute> node = CustomDictionary.trie.transition(cells[start].realWord.toCharArray(), 0);
            if (node == null) {
                start++;
                continue;
            }
            CoreDictionary.Attribute best = node.getValue();
            int stop = start + 1;
            for (int probe = start + 1; probe < total; probe++) {
                if (cells[probe] == null) {
                    continue;
                }
                node = node.transition(cells[probe].realWord.toCharArray(), 0);
                if (node == null) {
                    break;
                }
                if (node.getValue() != null) {
                    best = node.getValue();
                    stop = probe + 1;
                }
            }
            if (best == null) {
                start++;
                continue;
            }
            combineWords(cells, start, stop, best);
            start = stop;
        }
    }

    // Write the surviving (non-null) vertices back into the caller's list.
    vertexList.clear();
    for (Vertex cell : cells) {
        if (cell == null) {
            continue;
        }
        vertexList.add(cell);
    }
    return vertexList;
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) CoreDictionary(com.hankcs.hanlp.dictionary.CoreDictionary)

Example 13 with Vertex

use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

the class Segment method removeFromWordNet.

/**
 * Erases a word from the word net completely: clears every {@code from} link that
 * points at it and removes the vertex itself from its row.
 *
 * @param cur        the word (vertex) to erase
 * @param wordNetAll the word net
 * @param line       the line currently being scanned
 * @param length     the current buffer length
 */
private static void removeFromWordNet(Vertex cur, WordNet wordNetAll, int line, int length) {
    final LinkedList<Vertex>[] rows = wordNetAll.getVertexes();
    final int endRow = line + length;
    // Detach every vertex in the end row whose predecessor link is the erased word.
    for (Vertex successor : rows[endRow]) {
        if (successor.from == cur) {
            successor.from = null;
        }
    }
    // Drop the word from the row it starts on (compared by identity, all occurrences).
    final int startRow = endRow - cur.realWord.length();
    for (ListIterator<Vertex> it = rows[startRow].listIterator(); it.hasNext(); ) {
        if (it.next() == cur) {
            it.remove();
        }
    }
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex)

Example 14 with Vertex

use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

the class SimpleGraph method viterbi.

/**
 * Runs a forward pass over {@code nodes}, then returns the chain obtained by
 * following {@code from} links back from the first vertex of the last row.
 *
 * @return the vertices on that chain, ordered from its start to the last row
 */
public List<Vertex> viterbi() {
    // Seed: every vertex in row 1 is offered the first vertex of row 0 as predecessor.
    for (Vertex seed : nodes[1]) {
        seed.updateFrom(nodes[0].getFirst());
    }
    // Forward pass: a vertex that has a predecessor is offered to every vertex in
    // the row that lies realWord.length() rows further on.
    final int lastRow = nodes.length - 1;
    for (int row = 1; row < lastRow; row++) {
        final LinkedList<Vertex> current = nodes[row];
        if (current != null) {
            for (Vertex vertex : current) {
                if (vertex.from != null) {
                    for (Vertex successor : nodes[row + vertex.realWord.length()]) {
                        successor.updateFrom(vertex);
                    }
                }
            }
        }
    }
    // Backtrack along the from links, prepending so the result reads front to back.
    final LinkedList<Vertex> path = new LinkedList<Vertex>();
    for (Vertex step = nodes[lastRow].getFirst(); step != null; step = step.from) {
        path.addFirst(step);
    }
    return path;
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) LinkedList(java.util.LinkedList)

Example 15 with Vertex

use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

the class ViterbiSegment method viterbi.

/**
 * Runs a forward pass over the rows of {@code wordNet}, then returns the chain
 * obtained by following {@code from} links back from the first vertex of the last row.
 *
 * @param wordNet the word net whose rows are traversed
 * @return the vertices on that chain, ordered from its start to the last row
 */
private static List<Vertex> viterbi(WordNet wordNet) {
    // Work directly on the word net's own rows to avoid creating objects (speed optimisation).
    final LinkedList<Vertex>[] rows = wordNet.getVertexes();
    final LinkedList<Vertex> path = new LinkedList<Vertex>();
    // Seed: every vertex in row 1 is offered the first vertex of row 0 as predecessor.
    for (Vertex seed : rows[1]) {
        seed.updateFrom(rows[0].getFirst());
    }
    // Forward pass: a vertex that has a predecessor is offered to every vertex in
    // the row that lies realWord.length() rows further on.
    final int lastRow = rows.length - 1;
    int row = 1;
    while (row < lastRow) {
        final LinkedList<Vertex> current = rows[row];
        if (current != null) {
            for (Vertex vertex : current) {
                if (vertex.from != null) {
                    for (Vertex successor : rows[row + vertex.realWord.length()]) {
                        successor.updateFrom(vertex);
                    }
                }
            }
        }
        row++;
    }
    // Backtrack along the from links, prepending so the result reads front to back.
    for (Vertex step = rows[lastRow].getFirst(); step != null; step = step.from) {
        path.addFirst(step);
    }
    return path;
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) LinkedList(java.util.LinkedList)

Aggregations

Vertex (com.hankcs.hanlp.seg.common.Vertex) 33, EnumItem (com.hankcs.hanlp.corpus.dictionary.item.EnumItem) 6, LinkedList (java.util.LinkedList) 6, CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary) 5, Term (com.hankcs.hanlp.seg.common.Term) 5, AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie) 4, NS (com.hankcs.hanlp.corpus.tag.NS) 4, Nature (com.hankcs.hanlp.corpus.tag.Nature) 4, NR (com.hankcs.hanlp.corpus.tag.NR) 3, NT (com.hankcs.hanlp.corpus.tag.NT) 3, Graph (com.hankcs.hanlp.seg.common.Graph) 2, WordNet (com.hankcs.hanlp.seg.common.WordNet) 2, DoubleArrayTrie (com.hankcs.hanlp.collection.trie.DoubleArrayTrie) 1, CharTable (com.hankcs.hanlp.dictionary.other.CharTable) 1, Table (com.hankcs.hanlp.model.crf.Table) 1, State (com.hankcs.hanlp.seg.Dijkstra.Path.State) 1, EdgeFrom (com.hankcs.hanlp.seg.common.EdgeFrom) 1, List (java.util.List) 1, PriorityQueue (java.util.PriorityQueue) 1