Search in sources :

Example 1 with NR

use of com.hankcs.hanlp.corpus.tag.NR in project HanLP by hankcs.

the class PersonDictionary method parsePattern.

/**
 * Matches the tagged role sequence against the person-name pattern trie and inserts every
 * accepted name into the optimum word net.
 *
 * <p>Each tag in {@code nrList} corresponds to one vertex in {@code vertexList}. The tags
 * {@code U} and {@code V} are expanded into two pattern characters each ({@code KB} and
 * {@code EL}/{@code DL} respectively), and the matching vertex is split in two so that the
 * pattern string and the vertex sequence stay aligned one-to-one. The caller's list is never
 * modified: it is copied before the first split.
 *
 * @param nrList         the decided role tag sequence
 * @param vertexList     the original vertex sequence, without role tags
 * @param wordNetOptimum the word net being optimized; recognized names are inserted into it
 * @param wordNetAll     the full word net, passed through to the insert call
 */
public static void parsePattern(List<NR> nrList, List<Vertex> vertexList, final WordNet wordNetOptimum, final WordNet wordNetAll) {
    // Split U and V vertices so every pattern character has exactly one vertex.
    ListIterator<Vertex> listIterator = vertexList.listIterator();
    StringBuilder sbPattern = new StringBuilder(nrList.size());
    NR preNR = NR.A;
    boolean backUp = false;
    int index = 0;
    for (NR nr : nrList) {
        ++index;
        Vertex current = listIterator.next();
        switch (nr) {
            case U:
                if (!backUp) {
                    // Work on a private copy from now on; reposition the iterator just past `current`.
                    vertexList = new ArrayList<Vertex>(vertexList);
                    listIterator = vertexList.listIterator(index);
                    backUp = true;
                }
                sbPattern.append(NR.K.toString());
                sbPattern.append(NR.B.toString());
                preNR = NR.B;
                // K takes everything but the last character, B takes the last character.
                listIterator.previous();
                String nowK = current.realWord.substring(0, current.realWord.length() - 1);
                String nowB = current.realWord.substring(current.realWord.length() - 1);
                listIterator.set(new Vertex(nowK));
                listIterator.next();
                listIterator.add(new Vertex(nowB));
                continue;
            case V:
                if (!backUp) {
                    vertexList = new ArrayList<Vertex>(vertexList);
                    listIterator = vertexList.listIterator(index);
                    backUp = true;
                }
                if (preNR == NR.B) {
                    // B followed by V reads as B E
                    sbPattern.append(NR.E.toString());
                } else {
                    // otherwise it reads as C D
                    sbPattern.append(NR.D.toString());
                }
                sbPattern.append(NR.L.toString());
                // Split the vertex in pattern order: the E/D part is the FIRST character of the
                // word and the L part is the remainder. The vertices must stay in text order
                // because offsets below are cumulative word lengths, so the E/D vertex has to be
                // the prefix; it must be exactly one character or the recognized name would
                // swallow part of the L text for words longer than two characters.
                listIterator.previous();
                String nowED = current.realWord.substring(0, 1);
                String nowL = current.realWord.substring(1);
                listIterator.set(new Vertex(nowED));
                listIterator.next();
                listIterator.add(new Vertex(nowL));
                continue;
            default:
                sbPattern.append(nr.toString());
                break;
        }
        preNR = nr;
    }
    String pattern = sbPattern.toString();
    final Vertex[] wordArray = vertexList.toArray(new Vertex[0]);
    // offsetArray[i] is the character offset of wordArray[i] in the concatenated text.
    // Java zero-initialises the array, so entry 0 needs no explicit store; this also keeps an
    // empty vertex list from raising ArrayIndexOutOfBoundsException.
    final int[] offsetArray = new int[wordArray.length];
    for (int i = 1; i < wordArray.length; ++i) {
        offsetArray[i] = offsetArray[i - 1] + wordArray[i - 1].realWord.length();
    }
    trie.parseText(pattern, new AhoCorasickDoubleArrayTrie.IHit<NRPattern>() {

        @Override
        public void hit(int begin, int end, NRPattern value) {
            // Pattern positions index directly into wordArray; concatenate the covered words.
            StringBuilder sbName = new StringBuilder();
            for (int i = begin; i < end; ++i) {
                sbName.append(wordArray[i].realWord);
            }
            String name = sbName.toString();
            // Adjust for some known bad cases.
            switch (value) {
                case BCD:
                    // The surname and the last given-name character cannot be the same.
                    if (name.charAt(0) == name.charAt(2))
                        return;
                    break;
            }
            if (isBadCase(name))
                return;
            // Accept it as a person name.
            if (HanLP.Config.DEBUG) {
                System.out.printf("识别出人名:%s %s\n", name, value);
            }
            int offset = offsetArray[begin];
            wordNetOptimum.insert(offset, new Vertex(Predefine.TAG_PEOPLE, name, ATTRIBUTE, WORD_ID), wordNetAll);
        }
    });
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) NR(com.hankcs.hanlp.corpus.tag.NR) AhoCorasickDoubleArrayTrie(com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie)

Example 2 with NR

use of com.hankcs.hanlp.corpus.tag.NR in project HanLP by hankcs.

the class PersonRecognition method roleObserve.

/**
 * Role observation: looks up the candidate roles of every word in the person dictionary and
 * falls back to a few nature-based rules for words the dictionary does not contain.
 *
 * @param wordSegResult the coarse segmentation result
 * @return one role item per input vertex, in input order
 */
public static List<EnumItem<NR>> roleObserve(List<Vertex> wordSegResult) {
    final List<EnumItem<NR>> observed = new LinkedList<EnumItem<NR>>();
    for (final Vertex word : wordSegResult) {
        EnumItem<NR> roles = PersonDictionary.dictionary.get(word.realWord);
        if (roles == null) {
            // Not in the dictionary: derive the roles from the guessed nature of the word.
            switch (word.guessNature()) {
                case nr:
                    // Some two-character names can actually be part of a longer three-character name.
                    roles = (word.getAttribute().totalFrequency <= 1000 && word.realWord.length() == 2)
                            ? new EnumItem<NR>(NR.X, NR.G)
                            : new EnumItem<NR>(NR.A, PersonDictionary.transformMatrixDictionary.getTotalFrequency(NR.A));
                    break;
                case nnt:
                    // surname + job title
                    roles = new EnumItem<NR>(NR.G, NR.K);
                    break;
                default:
                    roles = new EnumItem<NR>(NR.A, PersonDictionary.transformMatrixDictionary.getTotalFrequency(NR.A));
                    break;
            }
        }
        observed.add(roles);
    }
    return observed;
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) NR(com.hankcs.hanlp.corpus.tag.NR) EnumItem(com.hankcs.hanlp.corpus.dictionary.item.EnumItem) LinkedList(java.util.LinkedList)

Example 3 with NR

use of com.hankcs.hanlp.corpus.tag.NR in project HanLP by hankcs.

the class PersonRecognition method Recognition.

/**
 * Runs person-name recognition over a coarse segmentation: observes the candidate roles of
 * each vertex, decodes a single role sequence from them, and hands that sequence to
 * {@code PersonDictionary.parsePattern}. When {@code HanLP.Config.DEBUG} is set, both the
 * observed roles and the decoded tags are printed to standard output.
 *
 * @param pWordSegResult the coarse segmentation result
 * @param wordNetOptimum the word net being optimized, forwarded to the pattern parser
 * @param wordNetAll     the full word net, forwarded to the pattern parser
 * @return always {@code true}
 */
public static boolean Recognition(List<Vertex> pWordSegResult, WordNet wordNetOptimum, WordNet wordNetAll) {
    final List<EnumItem<NR>> observedRoles = roleObserve(pWordSegResult);
    if (HanLP.Config.DEBUG) {
        final StringBuilder trace = new StringBuilder();
        final Iterator<Vertex> words = pWordSegResult.iterator();
        for (EnumItem<NR> candidates : observedRoles) {
            trace.append('[')
                 .append(words.next().realWord)
                 .append(' ')
                 .append(candidates)
                 .append(']');
        }
        System.out.printf("人名角色观察:%s\n", trace.toString());
    }

    final List<NR> decodedTags = viterbiComputeSimply(observedRoles);
    if (HanLP.Config.DEBUG) {
        final StringBuilder trace = new StringBuilder();
        final Iterator<Vertex> words = pWordSegResult.iterator();
        trace.append('[');
        boolean needSeparator = false;
        for (NR tag : decodedTags) {
            // Emit the separator before every entry except the first, so none trails at the end.
            if (needSeparator) {
                trace.append(" ,");
            }
            needSeparator = true;
            trace.append(words.next().realWord).append('/').append(tag);
        }
        trace.append(']');
        System.out.printf("人名角色标注:%s\n", trace.toString());
    }

    PersonDictionary.parsePattern(decodedTags, pWordSegResult, wordNetOptimum, wordNetAll);
    return true;
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) NR(com.hankcs.hanlp.corpus.tag.NR) EnumItem(com.hankcs.hanlp.corpus.dictionary.item.EnumItem)

Example 4 with NR

use of com.hankcs.hanlp.corpus.tag.NR in project HanLP by hankcs.

the class NRDictionary method loadDat.

/**
 * Decodes an array of role items from a binary file.
 *
 * <p>Layout as read by this method, every field being a 4-byte integer decoded with
 * {@code ByteUtil.bytesHighFirstToInt}: the item count, then for each item its label count
 * followed by that many (index into {@code NR.values()}, frequency) pairs.
 *
 * @param path the file to read
 * @return the decoded items, or {@code null} when the file's bytes could not be read
 */
private EnumItem<NR>[] loadDat(String path) {
    final byte[] data = IOUtil.readBytes(path);
    if (data == null) {
        return null;
    }
    final NR[] tags = NR.values();
    int cursor = 0;
    final int itemCount = ByteUtil.bytesHighFirstToInt(data, cursor);
    cursor += 4;
    EnumItem<NR>[] items = new EnumItem[itemCount];
    int filled = 0;
    while (filled < itemCount) {
        int remaining = ByteUtil.bytesHighFirstToInt(data, cursor);
        cursor += 4;
        EnumItem<NR> entry = new EnumItem<NR>();
        for (; remaining > 0; --remaining) {
            // Resolve the tag first, then read its frequency from the following 4 bytes.
            NR tag = tags[ByteUtil.bytesHighFirstToInt(data, cursor)];
            int frequency = ByteUtil.bytesHighFirstToInt(data, cursor + 4);
            cursor += 8;
            entry.labelMap.put(tag, frequency);
        }
        items[filled] = entry;
        ++filled;
    }
    return items;
}
Also used : NR(com.hankcs.hanlp.corpus.tag.NR) EnumItem(com.hankcs.hanlp.corpus.dictionary.item.EnumItem)

Aggregations

NR (com.hankcs.hanlp.corpus.tag.NR)4 EnumItem (com.hankcs.hanlp.corpus.dictionary.item.EnumItem)3 Vertex (com.hankcs.hanlp.seg.common.Vertex)3 AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie)1 LinkedList (java.util.LinkedList)1