Search in sources :

Example 16 with Vertex

use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

From the class WordBasedGenerativeModelSegment, method convert.

/**
 * Converts a shortest path (vertex list, including the head and tail
 * sentinel vertices) into the final list of terms.
 *
 * @param vertexList    the path; must contain at least the two sentinels
 * @param offsetEnabled whether to compute each term's character offset
 * @return the resulting term list (sentinels excluded)
 */
protected static List<Term> convert(List<Vertex> vertexList, boolean offsetEnabled) {
    assert vertexList != null;
    assert vertexList.size() >= 2 : "这条路径不应当短于2" + vertexList.toString();
    int termCount = vertexList.size() - 2;
    List<Term> resultList = new ArrayList<Term>(termCount);
    Iterator<Vertex> iterator = vertexList.iterator();
    // skip the head sentinel
    iterator.next();
    int offset = 0;
    for (int i = 0; i < termCount; ++i) {
        Term term = convert(iterator.next());
        if (offsetEnabled) {
            term.offset = offset;
            offset += term.length();
        }
        resultList.add(term);
    }
    return resultList;
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) Term(com.hankcs.hanlp.seg.common.Term)

Example 17 with Vertex

use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

From the class WordBasedGenerativeModelSegment, method SplitMiddleSlashFromDigitalWords.

//====================================================================
// If the current word (nature nx) is followed by a quantifier or noun,
// and the current word consists of two numeric parts joined by "-",
// split the "-" out as its own token.
// e.g. "3-4 / 月" should become "3 / - / 4 / 月"
//====================================================================
private static void SplitMiddleSlashFromDigitalWords(List<Vertex> linkedArray) {
    if (linkedArray.size() < 2)
        return;
    ListIterator<Vertex> listIterator = linkedArray.listIterator();
    Vertex next = listIterator.next();
    Vertex current = next;
    while (listIterator.hasNext()) {
        next = listIterator.next();
        Nature currentNature = current.getNature();
        if (currentNature == Nature.nx && (next.hasNature(Nature.q) || next.hasNature(Nature.n))) {
            // BUG FIX: the original used split("-", 1); a limit of 1 keeps
            // the whole string as a single element, so param.length == 2
            // below could never be true and this branch was dead code.
            // Limit 2 splits at the first '-' into exactly two parts.
            String[] param = current.realWord.split("-", 2);
            if (param.length == 2) {
                if (TextUtility.isAllNum(param[0]) && TextUtility.isAllNum(param[1])) {
                    // Replace `current` with its numeric prefix, then insert
                    // the dash and the numeric suffix right after it.
                    current = current.copy();
                    current.realWord = param[0];
                    current.confirmNature(Nature.m);
                    // step the cursor back over `next` and `current`
                    listIterator.previous();
                    listIterator.previous();
                    // overwrite the old `current` in place
                    listIterator.set(current);
                    // move past `current` again so the adds land between it and `next`
                    listIterator.next();
                    listIterator.add(Vertex.newPunctuationInstance("-"));
                    listIterator.add(Vertex.newNumberInstance(param[1]));
                }
            }
        }
        current = next;
    }
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature) Vertex(com.hankcs.hanlp.seg.common.Vertex)

Example 18 with Vertex

use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

From the class WordBasedGenerativeModelSegment, method decorateResultForIndexMode.

/**
 * Decorates the segmentation result for index mode: besides each word on
 * the best path, also emits the shorter candidate words it covers (taken
 * from the full word net), so long words stay searchable by their parts.
 *
 * @param vertexList the best path, including the head/tail sentinel vertices
 * @param wordNetAll the full word net with every candidate word per position
 * @return the decorated term list (each long word followed by its sub-words)
 */
protected static List<Term> decorateResultForIndexMode(List<Vertex> vertexList, WordNet wordNetAll) {
    List<Term> termList = new LinkedList<Term>();
    // `line` is the 1-based character position in the word net; a term's
    // offset is line - 1
    int line = 1;
    ListIterator<Vertex> listIterator = vertexList.listIterator();
    // skip the head sentinel vertex
    listIterator.next();
    int length = vertexList.size() - 2;
    for (int i = 0; i < length; ++i) {
        Vertex vertex = listIterator.next();
        Term termMain = convert(vertex);
        termList.add(termMain);
        termMain.offset = line - 1;
        if (vertex.realWord.length() > 2) {
            // The word is long enough to contain sub-words: scan every
            // character position it spans in the full word net.
            int currentLine = line;
            while (currentLine < line + vertex.realWord.length()) {
                // candidate words starting at this position
                List<Vertex> vertexListCurrentLine = wordNetAll.get(currentLine);
                // the shorter words on this line
                for (Vertex smallVertex : vertexListCurrentLine) {
                    // Emit sub-words longer than one character — or single
                    // quantifiers when the main term is a number-quantifier
                    // compound (mq) — but never the main word itself.
                    if (((termMain.nature == Nature.mq && smallVertex.hasNature(Nature.q)) || smallVertex.realWord.length() > 1) && smallVertex != vertex) {
                        // splice the sub-word into the path as well
                        listIterator.add(smallVertex);
                        Term termSub = convert(smallVertex);
                        termSub.offset = currentLine - 1;
                        termList.add(termSub);
                    }
                }
                ++currentLine;
            }
        }
        line += vertex.realWord.length();
    }
    return termList;
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) Term(com.hankcs.hanlp.seg.common.Term)

Example 19 with Vertex

use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

From the class WordBasedGenerativeModelSegment, method GenerateWordNet.

/**
 * Builds the unigram word net: adds every core-dictionary hit found in the
 * text, then fills any uncovered stretch with atomic segments so the graph
 * stays connected.
 *
 * @param wordNetStorage the word net to populate
 */
protected void GenerateWordNet(final WordNet wordNetStorage) {
    final char[] charArray = wordNetStorage.charArray;
    // Core-dictionary lookup over the whole text.
    DoubleArrayTrie<CoreDictionary.Attribute>.Searcher<CoreDictionary.Attribute> searcher = CoreDictionary.trie.getSearcher(charArray, 0);
    while (searcher.next()) {
        String hit = new String(charArray, searcher.begin, searcher.length);
        wordNetStorage.add(searcher.begin + 1, new Vertex(hit, searcher.value, searcher.index));
    }
    // NOTE: the custom-dictionary (CustomDictionary) lookup that used to
    // run here is intentionally disabled.
    // Atomic segmentation to guarantee graph connectivity.
    LinkedList<Vertex>[] vertexes = wordNetStorage.getVertexes();
    int i = 1;
    while (i < vertexes.length) {
        if (vertexes[i].isEmpty()) {
            // find the end of this empty stretch (never past the tail slot)
            int j = i + 1;
            while (j < vertexes.length - 1 && vertexes[j].isEmpty()) {
                ++j;
            }
            wordNetStorage.add(i, quickAtomSegment(charArray, i - 1, j - 1));
            i = j;
        } else {
            // advance by the length of the last word added at this position
            i += vertexes[i].getLast().realWord.length();
        }
    }
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) AhoCorasickDoubleArrayTrie(com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie) DoubleArrayTrie(com.hankcs.hanlp.collection.trie.DoubleArrayTrie)

Example 20 with Vertex

use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs.

From the class Segment, method mergeNumberQuantifier.

/**
 * Merges runs of consecutive number tokens (nature m) into a single token,
 * and fuses an immediately following quantifier (q/qv/qt) into a
 * number-quantifier compound (mq).
 *
 * @param termList   the vertex path, including head/tail sentinels
 * @param wordNetAll the full word net, kept in sync via removeFromWordNet
 * @param config     in indexMode the bare merged number is also kept in the word net
 */
protected void mergeNumberQuantifier(List<Vertex> termList, WordNet wordNetAll, Config config) {
    // sentinels + at least a number and a quantifier; shorter lists can't merge
    if (termList.size() < 4)
        return;
    StringBuilder sbQuantifier = new StringBuilder();
    ListIterator<Vertex> iterator = termList.listIterator();
    // skip the head sentinel
    iterator.next();
    // `line` is the 1-based character position of `pre` in the word net
    int line = 1;
    while (iterator.hasNext()) {
        Vertex pre = iterator.next();
        if (pre.hasNature(Nature.m)) {
            sbQuantifier.append(pre.realWord);
            Vertex cur = null;
            // absorb every following number token into `pre`
            while (iterator.hasNext() && (cur = iterator.next()).hasNature(Nature.m)) {
                sbQuantifier.append(cur.realWord);
                iterator.remove();
                removeFromWordNet(cur, wordNetAll, line, sbQuantifier.length());
            }
            if (cur != null) {
                if ((cur.hasNature(Nature.q) || cur.hasNature(Nature.qv) || cur.hasNature(Nature.qt))) {
                    // a quantifier follows the number run — fuse it in
                    if (config.indexMode) {
                        // keep the bare number available for index-mode lookup
                        wordNetAll.add(line, new Vertex(sbQuantifier.toString(), new CoreDictionary.Attribute(Nature.m)));
                    }
                    sbQuantifier.append(cur.realWord);
                    iterator.remove();
                    removeFromWordNet(cur, wordNetAll, line, sbQuantifier.length());
                } else {
                    // the last next() above returned a word without a q nature;
                    // account for its length since it stays in the list
                    line += cur.realWord.length();
                }
            }
            if (sbQuantifier.length() != pre.realWord.length()) {
                // something was merged — rewrite `pre` as the compound token
                pre.realWord = sbQuantifier.toString();
                pre.word = Predefine.TAG_NUMBER;
                pre.attribute = new CoreDictionary.Attribute(Nature.mq);
                pre.wordID = CoreDictionary.M_WORD_ID;
                sbQuantifier.setLength(0);
            }
        }
        sbQuantifier.setLength(0);
        line += pre.realWord.length();
    }
//        System.out.println(wordNetAll);
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) CoreDictionary(com.hankcs.hanlp.dictionary.CoreDictionary)

Aggregations

Vertex (com.hankcs.hanlp.seg.common.Vertex)33 EnumItem (com.hankcs.hanlp.corpus.dictionary.item.EnumItem)6 LinkedList (java.util.LinkedList)6 CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary)5 Term (com.hankcs.hanlp.seg.common.Term)5 AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie)4 NS (com.hankcs.hanlp.corpus.tag.NS)4 Nature (com.hankcs.hanlp.corpus.tag.Nature)4 NR (com.hankcs.hanlp.corpus.tag.NR)3 NT (com.hankcs.hanlp.corpus.tag.NT)3 Graph (com.hankcs.hanlp.seg.common.Graph)2 WordNet (com.hankcs.hanlp.seg.common.WordNet)2 DoubleArrayTrie (com.hankcs.hanlp.collection.trie.DoubleArrayTrie)1 CharTable (com.hankcs.hanlp.dictionary.other.CharTable)1 Table (com.hankcs.hanlp.model.crf.Table)1 State (com.hankcs.hanlp.seg.Dijkstra.Path.State)1 EdgeFrom (com.hankcs.hanlp.seg.common.EdgeFrom)1 List (java.util.List)1 PriorityQueue (java.util.PriorityQueue)1