
Example 21 with Vertex

Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

From the class Segment, method combineWords.

/**
     * Merge a run of consecutive words into a single word
     * @param wordNet the word lattice
     * @param start start index (inclusive)
     * @param end end index (exclusive)
     * @param value the new attribute for the merged word
     */
private static void combineWords(Vertex[] wordNet, int start, int end, CoreDictionary.Attribute value) {
    // Minor optimization: if the span contains only one word, no merge is needed; just apply the new attribute
    if (start + 1 == end) {
        wordNet[start].attribute = value;
    } else {
        StringBuilder sbTerm = new StringBuilder();
        for (int j = start; j < end; ++j) {
            if (wordNet[j] == null)
                continue;
            String realWord = wordNet[j].realWord;
            sbTerm.append(realWord);
            wordNet[j] = null;
        }
        wordNet[start] = new Vertex(sbTerm.toString(), value);
    }
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex)
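
As a standalone illustration (not part of the HanLP sources), the sketch below applies the same merge logic to a small Vertex[] span. The words, the natures, and the single-Nature CoreDictionary.Attribute constructor are assumptions made for the example; the Vertex(String, Attribute) constructor and the realWord field are taken from the code above.

import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import com.hankcs.hanlp.seg.common.Vertex;

public class CombineWordsSketch {
    public static void main(String[] args) {
        // Hypothetical lattice slice: one Vertex per word (words and natures invented for the example)
        Vertex[] wordNet = new Vertex[]{
                new Vertex("中国", new CoreDictionary.Attribute(Nature.ns)),
                new Vertex("人民", new CoreDictionary.Attribute(Nature.n)),
                new Vertex("银行", new CoreDictionary.Attribute(Nature.n))
        };
        // Same idea as combineWords(wordNet, 0, 3, value): concatenate the span, blank it out,
        // then put one merged Vertex back at the start index
        StringBuilder sbTerm = new StringBuilder();
        for (int j = 0; j < 3; ++j) {
            sbTerm.append(wordNet[j].realWord);
            wordNet[j] = null;
        }
        wordNet[0] = new Vertex(sbTerm.toString(), new CoreDictionary.Attribute(Nature.nt));
        System.out.println(wordNet[0].realWord); // 中国人民银行
    }
}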

Example 22 with Vertex

Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

From the class ViterbiSegment, method segSentence.

@Override
protected List<Term> segSentence(char[] sentence) {
    //        long start = System.currentTimeMillis();
    WordNet wordNetAll = new WordNet(sentence);
    //////////////// Build the word lattice ////////////////////
    GenerateWordNet(wordNetAll);
    //        System.out.println("构图:" + (System.currentTimeMillis() - start));
    if (HanLP.Config.DEBUG) {
        System.out.printf("粗分词网:\n%s\n", wordNetAll);
    }
    //        start = System.currentTimeMillis();
    List<Vertex> vertexList = viterbi(wordNetAll);
    if (config.useCustomDictionary) {
        if (config.indexMode)
            combineByCustomDictionary(vertexList, wordNetAll);
        else
            combineByCustomDictionary(vertexList);
    }
    if (HanLP.Config.DEBUG) {
        System.out.println("粗分结果" + convert(vertexList, false));
    }
    // Number recognition
    if (config.numberQuantifierRecognize) {
        mergeNumberQuantifier(vertexList, wordNetAll, config);
    }
    // Named entity recognition
    if (config.ner) {
        WordNet wordNetOptimum = new WordNet(sentence, vertexList);
        int preSize = wordNetOptimum.size();
        if (config.nameRecognize) {
            PersonRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
        }
        if (config.translatedNameRecognize) {
            TranslatedPersonRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
        }
        if (config.japaneseNameRecognize) {
            JapanesePersonRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
        }
        if (config.placeRecognize) {
            PlaceRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
        }
        if (config.organizationRecognize) {
            // Cascaded HMM: the output of this level is fed as input to the next-level HMM
            vertexList = viterbi(wordNetOptimum);
            wordNetOptimum.clear();
            wordNetOptimum.addAll(vertexList);
            preSize = wordNetOptimum.size();
            OrganizationRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
        }
        if (wordNetOptimum.size() != preSize) {
            vertexList = viterbi(wordNetOptimum);
            if (HanLP.Config.DEBUG) {
                System.out.printf("细分词网:\n%s\n", wordNetOptimum);
            }
        }
    }
    // In index mode, return the full (fine-grained) segmentation
    if (config.indexMode) {
        return decorateResultForIndexMode(vertexList, wordNetAll);
    }
    // Part-of-speech tagging, if enabled
    if (config.speechTagging) {
        speechTagging(vertexList);
    }
    return convert(vertexList, config.offset);
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) WordNet(com.hankcs.hanlp.seg.common.WordNet)
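
For context, here is a hedged usage sketch (not taken from the quoted sources) that exercises the config flags segSentence reads above. The package path com.hankcs.hanlp.seg.Viterbi.ViterbiSegment and the enable* builder methods follow the HanLP 1.x API as I recall it and should be verified against the version in use.

import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.Viterbi.ViterbiSegment;
import com.hankcs.hanlp.seg.common.Term;

import java.util.List;

public class ViterbiSegmentDemo {
    public static void main(String[] args) {
        Segment segment = new ViterbiSegment()
                .enableCustomDictionary(true)           // config.useCustomDictionary
                .enableNumberQuantifierRecognize(true)  // config.numberQuantifierRecognize
                .enableNameRecognize(true)              // config.nameRecognize (part of config.ner)
                .enablePartOfSpeechTagging(true);       // config.speechTagging
        List<Term> termList = segment.seg("2018年12月北京大学举行了会议");
        System.out.println(termList);
    }
}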

Example 23 with Vertex

Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

From the class WordBasedGenerativeModelSegment, method mergeContinueNumIntoOne.

/**
     * Merge consecutive numeric nodes into one
     *
     * @param linkedArray the linked list of segmentation vertices
     */
private static void mergeContinueNumIntoOne(List<Vertex> linkedArray) {
    if (linkedArray.size() < 2)
        return;
    ListIterator<Vertex> listIterator = linkedArray.listIterator();
    Vertex next = listIterator.next();
    Vertex current = next;
    while (listIterator.hasNext()) {
        next = listIterator.next();
        //            System.out.println("current:" + current + " next:" + next);
        if ((TextUtility.isAllNum(current.realWord) || TextUtility.isAllChineseNum(current.realWord)) && (TextUtility.isAllNum(next.realWord) || TextUtility.isAllChineseNum(next.realWord))) {
            /////////// This block is logically equivalent to current.realWord = current.realWord + next.realWord;
            // but the current vertex may be shared by several paths, so a fresh copy is made instead of mutating it in place
            current = Vertex.newNumberInstance(current.realWord + next.realWord);
            listIterator.previous();
            listIterator.previous();
            listIterator.set(current);
            listIterator.next();
            listIterator.next();
            /////////// end of that block
            //                System.out.println("before:" + linkedArray);
            listIterator.remove();
        //                System.out.println("after:" + linkedArray);
        } else {
            current = next;
        }
    }
//        logger.trace("数字识别后:" + Graph.parseResult(linkedArray));
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex)
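
The previous()/previous()/set()/next()/next()/remove() dance above is the standard way to replace one list element and delete its successor through a single ListIterator. The following self-contained sketch (my own illustration, not HanLP code) shows the same pattern on a LinkedList of strings, with a toy regex standing in for TextUtility.isAllNum/isAllChineseNum.

import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;

public class MergeAdjacentNumbersDemo {
    public static void main(String[] args) {
        List<String> tokens = new LinkedList<>(Arrays.asList("共", "一", "二", "3", "个"));
        ListIterator<String> it = tokens.listIterator();
        String current = it.next();
        while (it.hasNext()) {
            String next = it.next();
            if (isNumeric(current) && isNumeric(next)) {
                current = current + next;   // merged token
                it.previous();              // back to "next"
                it.previous();              // back to "current"
                it.set(current);            // overwrite "current" with the merged token
                it.next();                  // step over the merged token
                it.next();                  // step over the old "next"
                it.remove();                // drop the old "next"
            } else {
                current = next;
            }
        }
        System.out.println(tokens);         // [共, 一二3, 个]
    }

    // Toy stand-in for TextUtility.isAllNum / isAllChineseNum
    private static boolean isNumeric(String s) {
        return s.matches("[0-9一二三四五六七八九十百千万亿零]+");
    }
}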

Example 24 with Vertex

Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

From the class WordBasedGenerativeModelSegment, method CheckDateElements.

//====================================================================
// 1. If the current word is a number and the next word is one of "月, 日, 时, 分, 秒, 月份", merge them and tag the merged word as time.
// 2. If the current word is a number that can serve as a year and the next word is "年", merge them and tag as time; otherwise the word is a number.
// 3. If the last character is "点", treat the current number as time.
// 4. If the last character of the current string is not one of "∶·./" (or the half-width '.' or '/'), it is a number.
// 5. If the last character of the current string is one of "∶·./" (or the half-width '.' or '/') and the string is longer than one character, drop the last character, e.g. "1.".
//====================================================================
private static void CheckDateElements(List<Vertex> linkedArray) {
    if (linkedArray.size() < 2)
        return;
    ListIterator<Vertex> listIterator = linkedArray.listIterator();
    Vertex next = listIterator.next();
    Vertex current = next;
    while (listIterator.hasNext()) {
        next = listIterator.next();
        if (TextUtility.isAllNum(current.realWord) || TextUtility.isAllChineseNum(current.realWord)) {
            //===== 1. The current word is a number and the next word is one of "月, 日, 时, 分, 秒, 月份": merge them and tag the merged word as time
            String nextWord = next.realWord;
            if ((nextWord.length() == 1 && "月日时分秒".contains(nextWord)) || (nextWord.length() == 2 && nextWord.equals("月份"))) {
                current = Vertex.newTimeInstance(current.realWord + next.realWord);
                listIterator.previous();
                listIterator.previous();
                listIterator.set(current);
                listIterator.next();
                listIterator.next();
                listIterator.remove();
            } else if (nextWord.equals("年")) {
                //===== 2. The current word may be a year number and the next word is "年": merge and tag as time, otherwise it is a plain number
                if (TextUtility.isYearTime(current.realWord)) {
                    current = Vertex.newTimeInstance(current.realWord + next.realWord);
                    listIterator.previous();
                    listIterator.previous();
                    listIterator.set(current);
                    listIterator.next();
                    listIterator.next();
                    listIterator.remove();
                } else {
                    //===== Otherwise the current word is just a number =====
                    current.confirmNature(Nature.m);
                }
            } else {
                //===== 3. If the last character is "点", treat the current number as time
                if (current.realWord.endsWith("点")) {
                    current.confirmNature(Nature.t, true);
                } else {
                    char[] tmpCharArray = current.realWord.toCharArray();
                    String lastChar = String.valueOf(tmpCharArray[tmpCharArray.length - 1]);
                    //===== 4. If the last character of the current string is not one of "∶·./" (or half-width '.' '/'), it is a number
                    if (!"∶·././".contains(lastChar)) {
                        current.confirmNature(Nature.m, true);
                    } else if (current.realWord.length() > 1) {
                        //===== 5. The last character is one of "∶·./" (or half-width '.' '/') and the string is longer than one character: drop the last character, e.g. "1."
                        char last = current.realWord.charAt(current.realWord.length() - 1);
                        current = Vertex.newNumberInstance(current.realWord.substring(0, current.realWord.length() - 1));
                        listIterator.previous();
                        listIterator.previous();
                        listIterator.set(current);
                        listIterator.next();
                        listIterator.add(Vertex.newPunctuationInstance(String.valueOf(last)));
                    }
                }
            }
        }
        current = next;
    }
//        logger.trace("日期识别后:" + Graph.parseResult(linkedArray));
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex)
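
To make rules 1 and 2 concrete, here is a self-contained sketch (my own illustration, not HanLP code) that merges a number with a following time unit on a toy token list, using the same ListIterator pattern as above; the regex-based number test and the token list are assumptions made for the example.

import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;

public class DateElementSketch {
    public static void main(String[] args) {
        List<String> tokens = new LinkedList<>(Arrays.asList("2018", "年", "12", "月", "开幕"));
        ListIterator<String> it = tokens.listIterator();
        String current = it.next();
        while (it.hasNext()) {
            String next = it.next();
            boolean currentIsNumber = current.matches("[0-9]+");
            boolean nextIsTimeUnit = "月日时分秒".contains(next) || "月份".equals(next) || "年".equals(next);
            if (currentIsNumber && nextIsTimeUnit) {
                current = current + next;   // e.g. "2018" + "年" -> "2018年"; the real code also tags it as time
                it.previous();
                it.previous();
                it.set(current);            // replace the number with the merged token
                it.next();
                it.next();
                it.remove();                // drop the time-unit token
            } else {
                current = next;
            }
        }
        System.out.println(tokens);         // [2018年, 12月, 开幕]
    }
}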

Example 25 with Vertex

Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

From the class NShortSegment, method segSentence.

@Override
public List<Term> segSentence(char[] sentence) {
    WordNet wordNetOptimum = new WordNet(sentence);
    WordNet wordNetAll = new WordNet(sentence);
    //        char[] charArray = text.toCharArray();
    // Coarse segmentation
    List<List<Vertex>> coarseResult = BiSegment(sentence, 2, wordNetOptimum, wordNetAll);
    boolean NERexists = false;
    for (List<Vertex> vertexList : coarseResult) {
        if (HanLP.Config.DEBUG) {
            System.out.println("粗分结果" + convert(vertexList, false));
        }
        // Named entity recognition
        if (config.ner) {
            wordNetOptimum.addAll(vertexList);
            int preSize = wordNetOptimum.size();
            if (config.nameRecognize) {
                PersonRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
            }
            if (config.translatedNameRecognize) {
                TranslatedPersonRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
            }
            if (config.japaneseNameRecognize) {
                JapanesePersonRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
            }
            if (config.placeRecognize) {
                PlaceRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
            }
            if (config.organizationRecognize) {
                // Cascaded HMM: the output of this level is fed as input to the next-level HMM
                vertexList = Dijkstra.compute(GenerateBiGraph(wordNetOptimum));
                wordNetOptimum.addAll(vertexList);
                OrganizationRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
            }
            if (!NERexists && preSize != wordNetOptimum.size()) {
                NERexists = true;
            }
        }
    }
    List<Vertex> vertexList = coarseResult.get(0);
    if (NERexists) {
        Graph graph = GenerateBiGraph(wordNetOptimum);
        vertexList = Dijkstra.compute(graph);
        if (HanLP.Config.DEBUG) {
            System.out.printf("细分词网:\n%s\n", wordNetOptimum);
            System.out.printf("细分词图:%s\n", graph.printByTo());
        }
    }
    // Number recognition
    if (config.numberQuantifierRecognize) {
        mergeNumberQuantifier(vertexList, wordNetAll, config);
    }
    // In index mode, return the full (fine-grained) segmentation
    if (config.indexMode) {
        return decorateResultForIndexMode(vertexList, wordNetAll);
    }
    // Part-of-speech tagging, if enabled
    if (config.speechTagging) {
        speechTagging(vertexList);
    }
    if (config.useCustomDictionary) {
        if (config.indexMode)
            combineByCustomDictionary(vertexList, wordNetAll);
        else
            combineByCustomDictionary(vertexList);
    }
    return convert(vertexList, config.offset);
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) Graph(com.hankcs.hanlp.seg.common.Graph) WordNet(com.hankcs.hanlp.seg.common.WordNet)
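
As with the Viterbi example, here is a hedged usage sketch (not from the quoted sources) showing how an NShortSegment is typically configured and invoked. The package path com.hankcs.hanlp.seg.NShort.NShortSegment and the enable* methods follow the HanLP 1.x API as I recall it; treat them as assumptions.

import com.hankcs.hanlp.seg.NShort.NShortSegment;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;

import java.util.List;

public class NShortSegmentDemo {
    public static void main(String[] args) {
        Segment nShortSegment = new NShortSegment()
                .enablePlaceRecognize(true)          // config.placeRecognize
                .enableOrganizationRecognize(true);  // config.organizationRecognize
        List<Term> termList = nShortSegment.seg("刘喜杰石国祥会见吴亚琴先进事迹报告团成员");
        System.out.println(termList);
    }
}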

Aggregations

Vertex (com.hankcs.hanlp.seg.common.Vertex) 33
EnumItem (com.hankcs.hanlp.corpus.dictionary.item.EnumItem) 6
LinkedList (java.util.LinkedList) 6
CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary) 5
Term (com.hankcs.hanlp.seg.common.Term) 5
AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie) 4
NS (com.hankcs.hanlp.corpus.tag.NS) 4
Nature (com.hankcs.hanlp.corpus.tag.Nature) 4
NR (com.hankcs.hanlp.corpus.tag.NR) 3
NT (com.hankcs.hanlp.corpus.tag.NT) 3
Graph (com.hankcs.hanlp.seg.common.Graph) 2
WordNet (com.hankcs.hanlp.seg.common.WordNet) 2
DoubleArrayTrie (com.hankcs.hanlp.collection.trie.DoubleArrayTrie) 1
CharTable (com.hankcs.hanlp.dictionary.other.CharTable) 1
Table (com.hankcs.hanlp.model.crf.Table) 1
State (com.hankcs.hanlp.seg.Dijkstra.Path.State) 1
EdgeFrom (com.hankcs.hanlp.seg.common.EdgeFrom) 1
List (java.util.List) 1
PriorityQueue (java.util.PriorityQueue) 1