Search in sources :

Example 6 with Nature

use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.

the class AhoCorasickDoubleArrayTrieSegment method segSentence.

/**
 * Segments a sentence by matching it against the loaded trie.
 *
 * <p>For every start offset the longest trie hit is remembered (default length 1, i.e. a
 * single character). Terms are then emitted left to right, each one jumping ahead by the
 * length recorded at its start offset. When {@code config.speechTagging} is on, runs of
 * offsets without a tag are handed to {@code quickAtomSegment} and the resulting atoms
 * supply lengths and tags; any term that is still untagged is emitted with {@code Nature.nz}.
 * When speech tagging is off, terms carry a {@code null} nature.
 *
 * @param sentence the characters to segment
 * @return the terms in order of appearance, or an empty list (after logging a warning)
 *         when no trie has been loaded
 */
@Override
protected List<Term> segSentence(char[] sentence) {
    if (trie == null) {
        logger.warning("还未加载任何词典");
        return Collections.emptyList();
    }
    // spanLength[k] is the length of the longest word known to start at offset k.
    final int[] spanLength = new int[sentence.length];
    Arrays.fill(spanLength, 1);
    // tagAt[k] is the tag of the word starting at offset k; only allocated when tagging.
    final Nature[] tagAt;
    if (config.speechTagging) {
        tagAt = new Nature[sentence.length];
    } else {
        tagAt = null;
    }
    trie.parseText(sentence, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {

        @Override
        public void hit(int begin, int end, CoreDictionary.Attribute value) {
            final int matched = end - begin;
            if (matched <= spanLength[begin]) {
                // A hit at least this long is already recorded for this offset.
                return;
            }
            spanLength[begin] = matched;
            if (config.speechTagging) {
                tagAt[begin] = value.nature[0];
            }
        }
    });
    LinkedList<Term> termList = new LinkedList<Term>();
    if (config.speechTagging) {
        int cursor = 0;
        while (cursor < tagAt.length) {
            if (tagAt[cursor] != null) {
                ++cursor;
                continue;
            }
            // Find the end of the run of untagged offsets starting at cursor.
            int gapEnd = cursor + 1;
            while (gapEnd < tagAt.length && tagAt[gapEnd] == null) {
                ++gapEnd;
            }
            for (AtomNode atom : quickAtomSegment(sentence, cursor, gapEnd)) {
                final int atomLength = atom.sWord.length();
                // An atom is only recorded (and the cursor advanced) when it is at least
                // as long as what is already stored at the current offset.
                if (atomLength >= spanLength[cursor]) {
                    spanLength[cursor] = atomLength;
                    tagAt[cursor] = atom.getNature();
                    cursor += atomLength;
                }
            }
            cursor = gapEnd;
        }
    }
    int offset = 0;
    while (offset < spanLength.length) {
        final int length = spanLength[offset];
        final String word = new String(sentence, offset, length);
        Nature tag = null;
        if (config.speechTagging) {
            tag = tagAt[offset] == null ? Nature.nz : tagAt[offset];
        }
        Term term = new Term(word, tag);
        term.offset = offset;
        termList.add(term);
        offset += length;
    }
    return termList;
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature) Term(com.hankcs.hanlp.seg.common.Term) AhoCorasickDoubleArrayTrie(com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie) CoreDictionary(com.hankcs.hanlp.dictionary.CoreDictionary) AtomNode(com.hankcs.hanlp.seg.NShort.Path.AtomNode)

Example 7 with Nature

use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.

the class DemoCustomNature method main.

/**
 * Demonstrates custom part-of-speech tags (natures): looking one up, creating one,
 * attaching it to words, and using it in segmentation results and in a switch statement.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String sentence = "苹果电脑可以运行开源阿尔法狗代码吗";

    // A nature that already exists in the system can be fetched directly.
    Nature brandNature = Nature.fromString("n");
    System.out.println(brandNature);

    // At this point the system has no nature named "电脑品牌".
    brandNature = Nature.fromString("电脑品牌");
    System.out.println(brandNature);

    // One can be added dynamically.
    brandNature = Nature.create("电脑品牌");
    System.out.println(brandNature);

    // It can be assigned to a word ...
    LexiconUtility.setAttribute("苹果电脑", brandNature);
    // ... or, alternatively, via the string form.
    LexiconUtility.setAttribute("苹果电脑", "电脑品牌 1000");

    // The assignments take effect in segmentation results.
    List<Term> terms = HanLP.segment(sentence);
    System.out.println(terms);
    for (Term term : terms) {
        if (term.nature != brandNature) {
            continue;
        }
        System.out.printf("找到了 [%s] : %s\n", brandNature, term.word);
    }

    // A word with a new nature can also be inserted straight into the user dictionary.
    CustomDictionary.insert("阿尔法狗", "科技名词 1024");

    // HMM part-of-speech tagging is still supported.
    StandardTokenizer.SEGMENT.enablePartOfSpeechTagging(true);
    terms = HanLP.segment(sentence);
    System.out.println(terms);

    // Once dynamic natures are in use, every class that contains a switch(nature)
    // statement must be registered.
    CustomNatureUtility.registerSwitchClass(DemoCustomNature.class);
    for (Term term : terms) {
        switch (term.nature) {
            case n:
                System.out.printf("找到了 [%s] : %s\n", "名词", term.word);
                break;
            default:
                break;
        }
    }
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature) Term(com.hankcs.hanlp.seg.common.Term)

Example 8 with Nature

use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.

the class WordBasedGenerativeModelSegment method SplitMiddleSlashFromDigitalWords.

//====================================================================
// Splits a hyphenated number range that was kept as one word.
// When the current word has nature nx, has the form "<number>-<number>",
// and the following word can be a quantifier (q) or a noun (n), the word is
// replaced by three vertices: first number, "-", second number.
// Example: "3-4 / 月" becomes "3 / - / 4 / 月".
//====================================================================
/**
 * Rewrites {@code linkedArray} in place, splitting "number-number" words as described above.
 *
 * @param linkedArray the vertex sequence to scan and modify; lists with fewer than two
 *                    elements are left untouched
 */
private static void SplitMiddleSlashFromDigitalWords(List<Vertex> linkedArray) {
    if (linkedArray.size() < 2) {
        return;
    }
    ListIterator<Vertex> listIterator = linkedArray.listIterator();
    Vertex next = listIterator.next();
    Vertex current = next;
    while (listIterator.hasNext()) {
        next = listIterator.next();
        Nature currentNature = current.getNature();
        if (currentNature == Nature.nx && (next.hasNature(Nature.q) || next.hasNature(Nature.n))) {
            // Limit 2 splits at the first hyphen only. The previous limit of 1 never
            // split at all (String.split applies the pattern at most limit - 1 times),
            // so the length check below could not succeed and this branch was dead.
            String[] param = current.realWord.split("-", 2);
            if (param.length == 2 && TextUtility.isAllNum(param[0]) && TextUtility.isAllNum(param[1])) {
                current = current.copy();
                current.realWord = param[0];
                current.confirmNature(Nature.m);
                // Step back over `next` and then `current`, so that set() replaces
                // the slot that held the unsplit word.
                listIterator.previous();
                listIterator.previous();
                listIterator.set(current);
                // Move past the replaced vertex and insert "-" and the second number
                // right after it. The cursor ends up just before `next`, so the
                // following iteration reads `next` once more.
                listIterator.next();
                listIterator.add(Vertex.newPunctuationInstance("-"));
                listIterator.add(Vertex.newNumberInstance(param[1]));
            }
        }
        current = next;
    }
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature) Vertex(com.hankcs.hanlp.seg.common.Vertex)

Example 9 with Nature

use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.

the class TestCustomDictionary method testCustomNature.

/**
 * Checks that creating a custom nature twice under the same name yields equal values.
 */
public void testCustomNature() throws Exception {
    final String natureName = "电脑品牌";
    Nature first = Nature.create(natureName);
    Nature second = Nature.create(natureName);
    assertEquals(first, second);
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature)

Example 10 with Nature

use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.

the class WordNet method add.

/**
 * Adds one vertex per atom produced by atomic segmentation.
 *
 * <p>The first atom is added at row {@code line}; each subsequent atom is added at
 * {@code line} plus the combined length of the atoms before it.
 *
 * @param line        row at which the first atom's vertex is added
 * @param atomSegment atoms to convert into vertices, in order
 */
public void add(int line, List<AtomNode> atomSegment) {
    int row = line;
    for (AtomNode atom : atomSegment) {
        final String surface = atom.sWord;
        // Defaults: keep the surface form, tag as noun, no dictionary id.
        String word = surface;
        Nature nature = Nature.n;
        int wordId = -1;
        switch (atom.nPOS) {
            case Predefine.CT_INDEX:
            case Predefine.CT_NUM:
                // Numbers are represented by the numeral placeholder word.
                nature = Nature.m;
                word = "未##数";
                wordId = CoreDictionary.M_WORD_ID;
                break;
            case Predefine.CT_DELIMITER:
            case Predefine.CT_OTHER:
                nature = Nature.w;
                break;
            case Predefine.CT_SINGLE:
                // e.g. 12021-2129-3121; represented by the string placeholder word.
                nature = Nature.nx;
                word = "未##串";
                wordId = CoreDictionary.X_WORD_ID;
                break;
            default:
                // CT_CHINESE and any other type keep the defaults above.
                break;
        }
        // NOTE(review): the original comment put the magnitude of these generic tokens
        // at around 100,000, while the value passed here is 10000 - confirm the intent.
        add(row, new Vertex(word, surface, new CoreDictionary.Attribute(nature, 10000), wordId));
        row += surface.length();
    }
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature) AtomNode(com.hankcs.hanlp.seg.NShort.Path.AtomNode)

Aggregations

Nature (com.hankcs.hanlp.corpus.tag.Nature): 14, Vertex (com.hankcs.hanlp.seg.common.Vertex): 4, CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary): 3, AtomNode (com.hankcs.hanlp.seg.NShort.Path.AtomNode): 3, Term (com.hankcs.hanlp.seg.common.Term): 3, AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie): 2, ByteArray (com.hankcs.hanlp.corpus.io.ByteArray): 2, LinkedList (java.util.LinkedList): 2, DoubleArrayTrie (com.hankcs.hanlp.collection.trie.DoubleArrayTrie): 1, EnumItem (com.hankcs.hanlp.corpus.dictionary.item.EnumItem): 1, NT (com.hankcs.hanlp.corpus.tag.NT): 1