Search in sources :

Example 1 with AtomNode

use of com.hankcs.hanlp.seg.NShort.Path.AtomNode in project HanLP by hankcs.

the class AhoCorasickDoubleArrayTrieSegment method segSentence.

/**
 * Segments a sentence by keeping, for every start offset, the longest match reported by
 * {@code trie}; offsets without a match fall back to a single character.
 *
 * <p>When {@code config.speechTagging} is on, stretches that no dictionary word starts in
 * are split with {@code quickAtomSegment} and tagged with each atom's nature. Anything still
 * untagged at emission time is tagged {@code Nature.nz}.
 *
 * @param sentence the characters to segment
 * @return the terms in order, each with {@code offset} set to its start index in
 *         {@code sentence}; an empty list when no dictionary has been loaded
 */
@Override
protected List<Term> segSentence(char[] sentence) {
    if (trie == null) {
        // The logged message says that no dictionary has been loaded yet.
        logger.warning("还未加载任何词典");
        return Collections.emptyList();
    }
    // wordNet[i] is the length of the word chosen to start at offset i (1 = single character).
    final int[] wordNet = new int[sentence.length];
    Arrays.fill(wordNet, 1);
    // natureArray[i] is the tag of the word starting at i; only allocated when tagging is on.
    final Nature[] natureArray = config.speechTagging ? new Nature[sentence.length] : null;
    trie.parseText(sentence, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {

        @Override
        public void hit(int begin, int end, CoreDictionary.Attribute value) {
            int length = end - begin;
            // Keep only the longest match per start offset.
            if (length > wordNet[begin]) {
                wordNet[begin] = length;
                if (config.speechTagging) {
                    natureArray[begin] = value.nature[0];
                }
            }
        }
    });
    if (config.speechTagging) {
        for (int i = 0; i < natureArray.length; ) {
            if (natureArray[i] != null) {
                // Jump over the whole matched word, exactly as the emission loop below does.
                // Stepping one character at a time would atom-segment the inside of this word
                // and leave the atoms anchored at offsets the emission loop never visits.
                i += wordNet[i];
                continue;
            }
            // Find the end (exclusive) of the untagged stretch starting at i.
            int j = i + 1;
            while (j < natureArray.length && natureArray[j] == null) {
                ++j;
            }
            int pos = i;
            for (AtomNode atomNode : quickAtomSegment(sentence, i, j)) {
                int atomLength = atomNode.sWord.length();
                if (atomLength >= wordNet[pos]) {
                    wordNet[pos] = atomLength;
                    natureArray[pos] = atomNode.getNature();
                    pos += atomLength;
                }
            }
            i = j;
        }
    }
    List<Term> termList = new LinkedList<Term>();
    for (int i = 0; i < wordNet.length; ) {
        Nature nature = null;
        if (config.speechTagging) {
            nature = natureArray[i] == null ? Nature.nz : natureArray[i];
        }
        Term term = new Term(new String(sentence, i, wordNet[i]), nature);
        term.offset = i;
        termList.add(term);
        i += wordNet[i];
    }
    return termList;
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature) Term(com.hankcs.hanlp.seg.common.Term) AhoCorasickDoubleArrayTrie(com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie) CoreDictionary(com.hankcs.hanlp.dictionary.CoreDictionary) AtomNode(com.hankcs.hanlp.seg.NShort.Path.AtomNode)

Example 2 with AtomNode

use of com.hankcs.hanlp.seg.NShort.Path.AtomNode in project HanLP by hankcs.

the class Segment method quickAtomSegment.

/**
 * Fast atom segmentation (the original note hopes it will replace the older, slower method).
 *
 * <p>Splits {@code charArray[start, end)} into runs of characters that share the same
 * {@code CharType}. When a {@code CT_NUM} run is followed by a {@code '.'} that starts a
 * different type, the dot and the {@code CT_NUM} characters after it are kept in the same
 * atom (floating-point recognition). Each atom carries the type of its run.
 *
 * @param charArray the characters to split
 * @param start     index of the first character to consider (inclusive)
 * @param end       index just past the last character to consider (exclusive)
 * @return the atoms in order of appearance
 */
protected static List<AtomNode> quickAtomSegment(char[] charArray, int start, int end) {
    final List<AtomNode> atoms = new LinkedList<AtomNode>();
    int atomBegin = start;
    int cursor = start;
    int runType = CharType.get(charArray[cursor]);
    int charType;
    for (++cursor; cursor < end; ++cursor) {
        charType = CharType.get(charArray[cursor]);
        if (charType != runType) {
            // Floating-point recognition: swallow the '.' and the digits that follow it.
            if (runType == CharType.CT_NUM && charArray[cursor] == '.') {
                for (++cursor; cursor < end; ++cursor) {
                    charType = CharType.get(charArray[cursor]);
                    if (charType != CharType.CT_NUM) {
                        break;
                    }
                }
            }
            atoms.add(new AtomNode(new String(charArray, atomBegin, cursor - atomBegin), runType));
            atomBegin = cursor;
        }
        runType = charType;
    }
    // The cursor stops exactly on end unless the float branch already consumed the tail,
    // in which case it has overshot and the last atom was emitted inside the loop.
    if (cursor == end) {
        atoms.add(new AtomNode(new String(charArray, atomBegin, cursor - atomBegin), runType));
    }
    return atoms;
}
Also used : AtomNode(com.hankcs.hanlp.seg.NShort.Path.AtomNode)

Example 3 with AtomNode

use of com.hankcs.hanlp.seg.NShort.Path.AtomNode in project HanLP by hankcs.

the class WordNet method add.

/**
 * Adds one vertex per atom node, produced by atom segmentation.
 *
 * <p>The first atom is placed at row {@code line}; each following atom is placed further
 * along by the combined length of the atoms before it.
 *
 * @param line        row at which the first atom is added
 * @param atomSegment the atoms to add, in order
 */
public void add(int line, List<AtomNode> atomSegment) {
    int consumed = 0;
    for (AtomNode atom : atomSegment) {
        final String realWord = atom.sWord;
        // Defaults; the switch below overrides them for some atom types.
        String word = realWord;
        Nature nature = Nature.n;
        int wordId = -1;
        switch (atom.nPOS) {
            case Predefine.CT_INDEX:
            case Predefine.CT_NUM:
                nature = Nature.m;
                word = "未##数";
                wordId = CoreDictionary.M_WORD_ID;
                break;
            case Predefine.CT_DELIMITER:
            case Predefine.CT_OTHER:
                nature = Nature.w;
                break;
            case Predefine.CT_SINGLE:
                // The original comment gives "12021-2129-3121" as an example of such an atom.
                nature = Nature.nx;
                word = "未##串";
                wordId = CoreDictionary.X_WORD_ID;
                break;
            case Predefine.CT_CHINESE:
            default:
                break;
        }
        // Original note: these generic tokens are on the order of 100,000 in magnitude
        // (NOTE(review): the value actually passed is 10000 - confirm which is intended).
        CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(nature, 10000);
        add(line + consumed, new Vertex(word, realWord, attribute, wordId));
        consumed += realWord.length();
    }
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature) AtomNode(com.hankcs.hanlp.seg.NShort.Path.AtomNode)

Example 4 with AtomNode

use of com.hankcs.hanlp.seg.NShort.Path.AtomNode in project HanLP by hankcs.

the class DoubleArrayTrieSegment method segSentence.

/**
 * Segments a sentence by keeping, for every start offset, the longest match found in
 * {@code CoreDictionary.trie} and, when {@code config.useCustomDictionary} is on, in
 * {@code CustomDictionary}; offsets without a match fall back to a single character.
 *
 * <p>When {@code config.speechTagging} is on, stretches that no dictionary word starts in
 * are split with {@code quickAtomSegment} and tagged with each atom's nature. Anything still
 * untagged at emission time is tagged {@code Nature.nz}.
 *
 * @param sentence the characters to segment
 * @return the terms in order, each with {@code offset} set to its start index in
 *         {@code sentence}
 */
@Override
protected List<Term> segSentence(char[] sentence) {
    // wordNet[i] is the length of the word chosen to start at offset i (1 = single character).
    final int[] wordNet = new int[sentence.length];
    Arrays.fill(wordNet, 1);
    // natureArray[i] is the tag of the word starting at i; only allocated when tagging is on.
    final Nature[] natureArray = config.speechTagging ? new Nature[sentence.length] : null;
    DoubleArrayTrie<CoreDictionary.Attribute>.Searcher<CoreDictionary.Attribute> searcher = CoreDictionary.trie.getSearcher(sentence, 0);
    while (searcher.next()) {
        int length = searcher.length;
        // Keep only the longest match per start offset.
        if (length > wordNet[searcher.begin]) {
            wordNet[searcher.begin] = length;
            if (config.speechTagging) {
                natureArray[searcher.begin] = searcher.value.nature[0];
            }
        }
    }
    if (config.useCustomDictionary) {
        CustomDictionary.parseText(sentence, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {

            @Override
            public void hit(int begin, int end, CoreDictionary.Attribute value) {
                int length = end - begin;
                // A custom entry wins only when strictly longer than the current choice.
                if (length > wordNet[begin]) {
                    wordNet[begin] = length;
                    if (config.speechTagging) {
                        natureArray[begin] = value.nature[0];
                    }
                }
            }
        });
    }
    if (config.speechTagging) {
        for (int i = 0; i < natureArray.length; ) {
            if (natureArray[i] != null) {
                // Jump over the whole matched word, exactly as the emission loop below does.
                // Stepping one character at a time would atom-segment the inside of this word
                // and leave the atoms anchored at offsets the emission loop never visits.
                i += wordNet[i];
                continue;
            }
            // Find the end (exclusive) of the untagged stretch starting at i.
            int j = i + 1;
            while (j < natureArray.length && natureArray[j] == null) {
                ++j;
            }
            int pos = i;
            for (AtomNode atomNode : quickAtomSegment(sentence, i, j)) {
                int atomLength = atomNode.sWord.length();
                if (atomLength >= wordNet[pos]) {
                    wordNet[pos] = atomLength;
                    natureArray[pos] = atomNode.getNature();
                    pos += atomLength;
                }
            }
            i = j;
        }
    }
    List<Term> termList = new LinkedList<Term>();
    for (int i = 0; i < wordNet.length; ) {
        Nature nature = null;
        if (config.speechTagging) {
            nature = natureArray[i] == null ? Nature.nz : natureArray[i];
        }
        Term term = new Term(new String(sentence, i, wordNet[i]), nature);
        term.offset = i;
        termList.add(term);
        i += wordNet[i];
    }
    return termList;
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature) Term(com.hankcs.hanlp.seg.common.Term) AhoCorasickDoubleArrayTrie(com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie) DoubleArrayTrie(com.hankcs.hanlp.collection.trie.DoubleArrayTrie) LinkedList(java.util.LinkedList) CoreDictionary(com.hankcs.hanlp.dictionary.CoreDictionary) AtomNode(com.hankcs.hanlp.seg.NShort.Path.AtomNode)

Example 5 with AtomNode

use of com.hankcs.hanlp.seg.NShort.Path.AtomNode in project HanLP by hankcs.

the class Segment method simpleAtomSegment.

/**
 * Simple atom segmentation: puts every character of the range into one single atom.
 *
 * @param charArray the characters to wrap
 * @param start     index of the first character (inclusive)
 * @param end       index just past the last character (exclusive)
 * @return a list holding exactly one atom, typed {@code Predefine.CT_LETTER}
 */
protected static List<AtomNode> simpleAtomSegment(char[] charArray, int start, int end) {
    final List<AtomNode> singleAtom = new LinkedList<AtomNode>();
    final String wholeSpan = new String(charArray, start, end - start);
    final AtomNode atom = new AtomNode(wholeSpan, Predefine.CT_LETTER);
    singleAtom.add(atom);
    return singleAtom;
}
Also used : AtomNode(com.hankcs.hanlp.seg.NShort.Path.AtomNode)

Aggregations

AtomNode (com.hankcs.hanlp.seg.NShort.Path.AtomNode): 5; Nature (com.hankcs.hanlp.corpus.tag.Nature): 3; AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie): 2; CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary): 2; Term (com.hankcs.hanlp.seg.common.Term): 2; DoubleArrayTrie (com.hankcs.hanlp.collection.trie.DoubleArrayTrie): 1; LinkedList (java.util.LinkedList): 1