Search in sources:

Example 1 with DoubleArrayTrie

Use of com.hankcs.hanlp.collection.trie.DoubleArrayTrie in project HanLP by hankcs.

From the class CRFDependencyParser, method loadDat:

boolean loadDat(String path) {
    // Read the binary model file into memory.
    ByteArray byteArray = ByteArray.createByteArray(path);
    if (byteArray == null)
        return false;
    // Back the dependency CRF model's feature functions with a DoubleArrayTrie.
    crfModel = new CRFModelForDependency(new DoubleArrayTrie<FeatureFunction>());
    return crfModel.load(byteArray);
}
Also used: ByteArray (com.hankcs.hanlp.corpus.io.ByteArray), DoubleArrayTrie (com.hankcs.hanlp.collection.trie.DoubleArrayTrie)
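For reference, the DoubleArrayTrie handed to CRFModelForDependency above is normally built from a sorted key/value map and then queried by exact key. The snippet below is a minimal, self-contained sketch of that pattern, assuming build(TreeMap) and get(String) behave as in the HanLP sources; the sample keys and values are purely illustrative.

import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;

import java.util.TreeMap;

public class DoubleArrayTrieSketch {
    public static void main(String[] args) {
        // Keys must arrive in sorted order, which TreeMap guarantees.
        TreeMap<String, Integer> dictionary = new TreeMap<String, Integer>();
        dictionary.put("人民", 1);
        dictionary.put("中国", 2);
        dictionary.put("中国人", 3);

        DoubleArrayTrie<Integer> trie = new DoubleArrayTrie<Integer>();
        // build(...) is assumed to return 0 on success, as in the HanLP sources.
        int errorCode = trie.build(dictionary);
        System.out.println("build returned " + errorCode);

        // Exact-match lookup by key.
        System.out.println(trie.get("中国人")); // expected: 3
    }
}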

Example 2 with DoubleArrayTrie

Use of com.hankcs.hanlp.collection.trie.DoubleArrayTrie in project HanLP by hankcs.

From the class WordBasedGenerativeModelSegment, method GenerateWordNet:

/**
     * Generate the unigram word lattice (word net)
     *
     * @param wordNetStorage
     */
protected void GenerateWordNet(final WordNet wordNetStorage) {
    final char[] charArray = wordNetStorage.charArray;
    // Core dictionary lookup
    DoubleArrayTrie<CoreDictionary.Attribute>.Searcher searcher = CoreDictionary.trie.getSearcher(charArray, 0);
    while (searcher.next()) {
        wordNetStorage.add(searcher.begin + 1, new Vertex(new String(charArray, searcher.begin, searcher.length), searcher.value, searcher.index));
    }
    // User (custom) dictionary lookup
    //        if (config.useCustomDictionary)
    //        {
    //            searcher = CustomDictionary.dat.getSearcher(charArray, 0);
    //            while (searcher.next())
    //            {
    //                wordNetStorage.add(searcher.begin + 1, new Vertex(new String(charArray, searcher.begin, searcher.length), searcher.value));
    //            }
    //        }
    // Atomic segmentation, guaranteeing the graph stays connected
    LinkedList<Vertex>[] vertexes = wordNetStorage.getVertexes();
    for (int i = 1; i < vertexes.length; ) {
        if (vertexes[i].isEmpty()) {
            int j = i + 1;
            for (; j < vertexes.length - 1; ++j) {
                if (!vertexes[j].isEmpty())
                    break;
            }
            wordNetStorage.add(i, quickAtomSegment(charArray, i - 1, j - 1));
            i = j;
        } else
            i += vertexes[i].getLast().realWord.length();
    }
}
Also used: Vertex (com.hankcs.hanlp.seg.common.Vertex), AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie), DoubleArrayTrie (com.hankcs.hanlp.collection.trie.DoubleArrayTrie)
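The while (searcher.next()) loop above is the standard scan that emits every dictionary entry found in the character array. Below is a minimal sketch of the same pattern against a small hand-built trie; the dictionary contents and printed output are illustrative, while the Searcher fields begin, length and value are used exactly as in the excerpt above.

import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;

import java.util.TreeMap;

public class SearcherSketch {
    public static void main(String[] args) {
        TreeMap<String, Integer> dictionary = new TreeMap<String, Integer>();
        dictionary.put("人民", 1);
        dictionary.put("中国", 2);
        dictionary.put("中国人", 3);
        DoubleArrayTrie<Integer> trie = new DoubleArrayTrie<Integer>();
        trie.build(dictionary);

        char[] charArray = "中国人民".toCharArray();
        // The searcher enumerates every dictionary word occurring at or after the offset.
        DoubleArrayTrie<Integer>.Searcher searcher = trie.getSearcher(charArray, 0);
        while (searcher.next()) {
            String word = new String(charArray, searcher.begin, searcher.length);
            System.out.println(word + " -> " + searcher.value);
        }
    }
}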

Example 3 with DoubleArrayTrie

Use of com.hankcs.hanlp.collection.trie.DoubleArrayTrie in project HanLP by hankcs.

From the class DoubleArrayTrieSegment, method segSentence:

@Override
protected List<Term> segSentence(char[] sentence) {
    char[] charArray = sentence;
    // wordNet[i] stores the length of the longest dictionary match starting at position i (at least 1).
    final int[] wordNet = new int[charArray.length];
    Arrays.fill(wordNet, 1);
    // natureArray[i] stores the part-of-speech tag of that match when tagging is enabled.
    final Nature[] natureArray = config.speechTagging ? new Nature[charArray.length] : null;
    DoubleArrayTrie<CoreDictionary.Attribute>.Searcher searcher = CoreDictionary.trie.getSearcher(sentence, 0);
    while (searcher.next()) {
        int length = searcher.length;
        if (length > wordNet[searcher.begin]) {
            wordNet[searcher.begin] = length;
            if (config.speechTagging) {
                natureArray[searcher.begin] = searcher.value.nature[0];
            }
        }
    }
    if (config.useCustomDictionary) {
        CustomDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {

            @Override
            public void hit(int begin, int end, CoreDictionary.Attribute value) {
                int length = end - begin;
                if (length > wordNet[begin]) {
                    wordNet[begin] = length;
                    if (config.speechTagging) {
                        natureArray[begin] = value.nature[0];
                    }
                }
            }
        });
    }
    LinkedList<Term> termList = new LinkedList<Term>();
    if (config.speechTagging) {
        for (int i = 0; i < natureArray.length; ) {
            if (natureArray[i] == null) {
                int j = i + 1;
                for (; j < natureArray.length; ++j) {
                    if (natureArray[j] != null)
                        break;
                }
                List<AtomNode> atomNodeList = quickAtomSegment(charArray, i, j);
                for (AtomNode atomNode : atomNodeList) {
                    if (atomNode.sWord.length() >= wordNet[i]) {
                        wordNet[i] = atomNode.sWord.length();
                        natureArray[i] = atomNode.getNature();
                        i += wordNet[i];
                    }
                }
                i = j;
            } else {
                ++i;
            }
        }
    }
    for (int i = 0; i < wordNet.length; ) {
        Term term = new Term(new String(charArray, i, wordNet[i]), config.speechTagging ? (natureArray[i] == null ? Nature.nz : natureArray[i]) : null);
        term.offset = i;
        termList.add(term);
        i += wordNet[i];
    }
    return termList;
}
Also used: Nature (com.hankcs.hanlp.corpus.tag.Nature), Term (com.hankcs.hanlp.seg.common.Term), AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie), DoubleArrayTrie (com.hankcs.hanlp.collection.trie.DoubleArrayTrie), LinkedList (java.util.LinkedList), CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary), AtomNode (com.hankcs.hanlp.seg.NShort.Path.AtomNode)
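segSentence is a protected hook; callers normally go through the segment's public seg API. The sketch below shows that usage under stated assumptions: the default DoubleArrayTrieSegment constructor, the com.hankcs.hanlp.seg.Other package location, and the enablePartOfSpeechTagging/enableCustomDictionary switches inherited from Segment; the sample sentence is illustrative only.

import com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment;
import com.hankcs.hanlp.seg.common.Term;

import java.util.List;

public class DoubleArrayTrieSegmentSketch {
    public static void main(String[] args) {
        DoubleArrayTrieSegment segment = new DoubleArrayTrieSegment();
        // Both switches are assumptions about the Segment base-class configuration.
        segment.enablePartOfSpeechTagging(true);
        segment.enableCustomDictionary(true);

        List<Term> termList = segment.seg("江西鄱阳湖干枯");
        for (Term term : termList) {
            System.out.println(term.word + "/" + term.nature + " @" + term.offset);
        }
    }
}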

Aggregations

DoubleArrayTrie (com.hankcs.hanlp.collection.trie.DoubleArrayTrie): 3 usages
AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie): 2 usages
ByteArray (com.hankcs.hanlp.corpus.io.ByteArray): 1 usage
Nature (com.hankcs.hanlp.corpus.tag.Nature): 1 usage
CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary): 1 usage
AtomNode (com.hankcs.hanlp.seg.NShort.Path.AtomNode): 1 usage
Term (com.hankcs.hanlp.seg.common.Term): 1 usage
Vertex (com.hankcs.hanlp.seg.common.Vertex): 1 usage
LinkedList (java.util.LinkedList): 1 usage