Search in sources :

Example 1 with NS

use of com.hankcs.hanlp.corpus.tag.NS in project HanLP by hankcs.

the class PlaceDictionary method parsePattern.

/**
 * Pattern matching: scans the role-tag sequence for known place-name patterns and
 * inserts each recognized place name into the word net being optimized.
 *
 * <p>The tags in {@code nsList} are concatenated (via {@code toString()}) into a single
 * string that is scanned with the dictionary trie. The begin/end offsets of each hit are
 * used directly as indices into the vertex array, so each tag is presumably rendered as
 * exactly one character and {@code nsList} is expected to line up one-to-one with
 * {@code vertexList} (confirm against the callers).
 *
 * @param nsList         the decided role-tag sequence
 * @param vertexList     the original sequence, without role tags
 * @param wordNetOptimum the word net to optimize; receives the recognized place vertices
 * @param wordNetAll     handed through to {@code wordNetOptimum.insert}
 */
public static void parsePattern(List<NS> nsList, List<Vertex> vertexList, final WordNet wordNetOptimum, final WordNet wordNetAll) {
    // Render the role tags as one string so the trie can scan it.
    final StringBuilder roleSequence = new StringBuilder(nsList.size());
    for (NS role : nsList) {
        roleSequence.append(role.toString());
    }
    final Vertex[] vertices = vertexList.toArray(new Vertex[vertexList.size()]);
    final String roleText = roleSequence.toString();
    trie.parseText(roleText, new AhoCorasickDoubleArrayTrie.IHit<String>() {

        @Override
        public void hit(int begin, int end, String value) {
            // Join the surface forms of the vertices covered by the hit [begin, end).
            final StringBuilder surface = new StringBuilder();
            int cursor = begin;
            while (cursor < end) {
                surface.append(vertices[cursor].realWord);
                cursor++;
            }
            final String name = surface.toString();
            // Known bad cases are dropped; everything else counts as a place name.
            if (!isBadCase(name)) {
                if (HanLP.Config.DEBUG) {
                    System.out.printf("识别出地名:%s %s\n", name, value);
                }
                // Character offset of the match = total length of all words before it.
                int charOffset = 0;
                int preceding = 0;
                while (preceding < begin) {
                    charOffset += vertices[preceding].realWord.length();
                    preceding++;
                }
                final Vertex place = new Vertex(Predefine.TAG_PLACE, name, ATTRIBUTE, WORD_ID);
                wordNetOptimum.insert(charOffset, place, wordNetAll);
            }
        }
    });
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) NS(com.hankcs.hanlp.corpus.tag.NS) AhoCorasickDoubleArrayTrie(com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie)

Example 2 with NS

use of com.hankcs.hanlp.corpus.tag.NS in project HanLP by hankcs.

the class PlaceRecognition method Recognition.

/**
 * Runs place-name recognition over a coarse segmentation result.
 *
 * <p>Steps: collect the candidate role tags for every vertex ({@code roleTag}), pick
 * one tag per vertex ({@code viterbiExCompute}), then hand the tag sequence to
 * {@code PlaceDictionary.parsePattern}, which inserts recognized place names into
 * {@code wordNetOptimum}. When {@code HanLP.Config.DEBUG} is set, both the candidate
 * tags and the decided tags are printed to standard output.
 *
 * @param pWordSegResult the segmented vertex sequence
 * @param wordNetOptimum the word net to optimize
 * @param wordNetAll     the full word net
 * @return always {@code true}
 */
public static boolean Recognition(List<Vertex> pWordSegResult, WordNet wordNetOptimum, WordNet wordNetAll) {
    final List<EnumItem<NS>> candidateRoles = roleTag(pWordSegResult, wordNetAll);
    if (HanLP.Config.DEBUG) {
        // One "[word candidates]" group per vertex.
        final StringBuilder observed = new StringBuilder();
        final Iterator<Vertex> words = pWordSegResult.iterator();
        for (EnumItem<NS> candidates : candidateRoles) {
            observed.append('[')
                    .append(words.next().realWord)
                    .append(' ')
                    .append(candidates)
                    .append(']');
        }
        System.out.printf("地名角色观察:%s\n", observed.toString());
    }

    final List<NS> decidedRoles = viterbiExCompute(candidateRoles);
    if (HanLP.Config.DEBUG) {
        // "[word/tag ,word/tag]" -- the separator goes before every entry but the first.
        final StringBuilder tagged = new StringBuilder();
        final Iterator<Vertex> words = pWordSegResult.iterator();
        tagged.append('[');
        boolean needSeparator = false;
        for (NS role : decidedRoles) {
            if (needSeparator) {
                tagged.append(" ,");
            }
            needSeparator = true;
            tagged.append(words.next().realWord)
                    .append('/')
                    .append(role);
        }
        tagged.append(']');
        System.out.printf("地名角色标注:%s\n", tagged.toString());
    }

    PlaceDictionary.parsePattern(decidedRoles, pWordSegResult, wordNetOptimum, wordNetAll);
    return true;
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) NS(com.hankcs.hanlp.corpus.tag.NS) EnumItem(com.hankcs.hanlp.corpus.dictionary.item.EnumItem)

Example 3 with NS

use of com.hankcs.hanlp.corpus.tag.NS in project HanLP by hankcs.

the class PlaceRecognition method roleTag.

/**
 * Builds the list of candidate role tags, one entry per vertex and in the same order.
 *
 * <p>A vertex whose nature is {@code Nature.ns} and whose total frequency is at most
 * 1000 gets a fixed candidate set; every other vertex is looked up in
 * {@code PlaceDictionary.dictionary}, falling back to {@code NS.Z} when absent.
 *
 * @param vertexList the vertex sequence to tag
 * @param wordNetAll the full word net (not used by this method)
 * @return the candidate tags, index-aligned with {@code vertexList}
 */
public static List<EnumItem<NS>> roleTag(List<Vertex> vertexList, WordNet wordNetAll) {
    final List<EnumItem<NS>> roles = new LinkedList<EnumItem<NS>>();
    for (Vertex current : vertexList) {
        final EnumItem<NS> candidates;
        if (Nature.ns == current.getNature() && current.getAttribute().totalFrequency <= 1000) {
            if (current.realWord.length() < 3) {
                // Place names under three characters may still take a prefix or a suffix.
                candidates = new EnumItem<NS>(NS.H, NS.G);
            } else {
                // Longer ones can only be followed by a suffix.
                candidates = new EnumItem<NS>(NS.G);
            }
        } else {
            // Look up by the equivalent word rather than the surface form; it is more precise.
            final EnumItem<NS> known = PlaceDictionary.dictionary.get(current.word);
            if (known != null) {
                candidates = known;
            } else {
                candidates = new EnumItem<NS>(NS.Z, PlaceDictionary.transformMatrixDictionary.getTotalFrequency(NS.Z));
            }
        }
        roles.add(candidates);
    }
    return roles;
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) NS(com.hankcs.hanlp.corpus.tag.NS) EnumItem(com.hankcs.hanlp.corpus.dictionary.item.EnumItem) LinkedList(java.util.LinkedList)

Example 4 with NS

use of com.hankcs.hanlp.corpus.tag.NS in project HanLP by hankcs.

the class PlaceRecognition method insert.

/**
 * Adds the first vertex found at row {@code line} of the full word net to the vertex
 * sequence (at the iterator's current position) and appends a matching role tag with
 * frequency 1000 to the tag list.
 *
 * @param listIterator iterator over the vertex sequence; receives the vertex
 * @param tagList      role-tag list; receives the new tag at its end
 * @param wordNetAll   the full word net to read the vertex from
 * @param line         row of the word net to read
 * @param ns           role to assign to the inserted vertex
 */
private static void insert(ListIterator<Vertex> listIterator, List<EnumItem<NS>> tagList, WordNet wordNetAll, int line, NS ns) {
    final Vertex head = wordNetAll.getFirst(line);
    // The assertion message reads: "the full word net has a blank row!"
    assert head != null : "全词网居然有空白行!";
    listIterator.add(head);
    final EnumItem<NS> forcedRole = new EnumItem<NS>(ns, 1000);
    tagList.add(forcedRole);
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) NS(com.hankcs.hanlp.corpus.tag.NS)

Example 5 with NS

use of com.hankcs.hanlp.corpus.tag.NS in project HanLP by hankcs.

the class NSDictionary method loadDat.

/**
 * Loads the value array from a binary dictionary file.
 *
 * <p>Layout as read by this method, every field being a 4-byte integer decoded with
 * {@code ByteUtil.bytesHighFirstToInt} (presumably big-endian, going by the helper's
 * name): the number of items; then for each item the number of (label, frequency)
 * pairs followed by the pairs themselves, where the label is an index into
 * {@code NS.values()}.
 *
 * @param path location of the binary file
 * @return the decoded items, or {@code null} if the file's bytes could not be read
 */
private EnumItem<NS>[] loadDat(String path) {
    final byte[] data = IOUtil.readBytes(path);
    if (data == null) {
        return null;
    }
    final NS[] labels = NS.values();
    int cursor = 0;
    final int itemCount = ByteUtil.bytesHighFirstToInt(data, cursor);
    cursor += 4;
    final EnumItem<NS>[] items = new EnumItem[itemCount];
    int filled = 0;
    while (filled < itemCount) {
        int remainingPairs = ByteUtil.bytesHighFirstToInt(data, cursor);
        cursor += 4;
        final EnumItem<NS> item = new EnumItem<NS>();
        for (; remainingPairs > 0; --remainingPairs) {
            // Each pair is two consecutive ints: label index, then frequency.
            final NS label = labels[ByteUtil.bytesHighFirstToInt(data, cursor)];
            final int frequency = ByteUtil.bytesHighFirstToInt(data, cursor + 4);
            cursor += 8;
            item.labelMap.put(label, frequency);
        }
        items[filled] = item;
        filled++;
    }
    return items;
}
Also used : NS(com.hankcs.hanlp.corpus.tag.NS) EnumItem(com.hankcs.hanlp.corpus.dictionary.item.EnumItem)

Aggregations

NS (com.hankcs.hanlp.corpus.tag.NS): 5, Vertex (com.hankcs.hanlp.seg.common.Vertex): 4, EnumItem (com.hankcs.hanlp.corpus.dictionary.item.EnumItem): 3, AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie): 1, LinkedList (java.util.LinkedList): 1