Search in sources :

Example 1 with Nature

use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.

the class Viterbi method compute.

/**
 * Specialised solver for the HMM model.
 * <p>
 * Walks the vertex path once, keeping only two rows of accumulated costs
 * (a rolling array). For every step the cheapest (previous tag, current tag)
 * transition is located and the previous vertex is confirmed with the previous
 * tag of that transition. {@code confirmNature} is only ever invoked on the
 * vertex preceding the one being scored, so neither the first vertex nor the
 * last vertex scored is confirmed by this method.
 *
 * @param vertexList                path that contains the Vertex.B node; at least
 *                                  two elements are read unconditionally
 * @param transformMatrixDictionary transition matrix belonging to the dictionary
 */
public static void compute(List<Vertex> vertexList, TransformMatrixDictionary<Nature> transformMatrixDictionary) {
    final int transitions = vertexList.size() - 1;
    // Rolling array: row (step & 1) is being filled, the other row holds the previous step.
    final double[][] cost = new double[2][];
    final Iterator<Vertex> vertices = vertexList.iterator();

    // The first vertex is deterministic: its first nature seeds the chain.
    Nature best = vertices.next().attribute.nature[0];

    // The second vertex only depends on that single seed tag.
    Vertex previous = vertices.next();
    Nature[] previousTags = previous.attribute.nature;
    cost[0] = new double[previousTags.length];
    for (int t = 0; t < previousTags.length; ++t) {
        Nature tag = previousTags[t];
        cost[0][t] = transformMatrixDictionary.transititon_probability[best.ordinal()][tag.ordinal()]
                - Math.log((previous.attribute.frequency[t] + 1e-8) / transformMatrixDictionary.getTotalFrequency(tag));
    }

    // From the third vertex on, every (previous tag, current tag) pair has to be examined.
    for (int step = 1; step < transitions; ++step) {
        final int filling = step & 1;
        final int filled = 1 - filling;
        Vertex current = vertices.next();
        cost[filling] = new double[current.attribute.nature.length];
        Nature[] currentTags = current.attribute.nature;
        // Lowest cost seen in this step over all tag pairs.
        double stepMinimum = Double.MAX_VALUE;
        for (int c = 0; c < currentTags.length; ++c) {
            Nature tag = currentTags[c];
            double cheapest = Double.MAX_VALUE;
            for (int p = 0; p < previousTags.length; ++p) {
                Nature previousTag = previousTags[p];
                double candidate = cost[filled][p]
                        + transformMatrixDictionary.transititon_probability[previousTag.ordinal()][tag.ordinal()]
                        - Math.log((current.attribute.frequency[c] + 1e-8) / transformMatrixDictionary.getTotalFrequency(tag));
                if (candidate < cheapest) {
                    cheapest = candidate;
                    if (candidate < stepMinimum) {
                        stepMinimum = candidate;
                        best = previousTag;
                    }
                }
            }
            cost[filling][c] = cheapest;
        }
        // If no pair improved on Double.MAX_VALUE, "best" keeps its value from the earlier step.
        previous.confirmNature(best);
        previousTags = currentTags;
        previous = current;
    }
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature) Vertex(com.hankcs.hanlp.seg.common.Vertex)

Example 2 with Nature

use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.

the class CustomDictionary method loadDat.

/**
 * Loads the double-array trie and its attribute values from the binary cache on disk.
 * <p>
 * The cache file is {@code path + Predefine.BIN_EXT}. A negative leading integer is a
 * compatibility marker: it means the header stores that many (negated) user-defined
 * natures, which are registered before the real entry count is read.
 *
 * @param path dictionary path without the binary extension
 * @return {@code true} if the cache was read and loaded into {@code dat};
 *         {@code false} if the file could not be opened, {@code dat.load} refused it,
 *         or any exception occurred (the exception is logged as a warning)
 */
static boolean loadDat(String path) {
    try {
        ByteArray byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT);
        if (byteArray == null)
            return false;

        int size = byteArray.nextInt();
        if (size < 0) {
            // Header holds -size custom natures; register each one, then read the real size.
            for (int remaining = size + 1; remaining <= 0; ++remaining) {
                Nature.create(byteArray.nextString());
            }
            size = byteArray.nextInt();
        }

        CoreDictionary.Attribute[] attributes = new CoreDictionary.Attribute[size];
        // Snapshot taken after the custom natures were created, so their ordinals resolve too.
        final Nature[] natureIndexArray = Nature.values();
        for (int i = 0; i < size; ++i) {
            // Each record starts with the total frequency followed by the number of natures.
            int currentTotalFrequency = byteArray.nextInt();
            int natureCount = byteArray.nextInt();
            CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(natureCount);
            attribute.totalFrequency = currentTotalFrequency;
            for (int j = 0; j < natureCount; ++j) {
                attribute.nature[j] = natureIndexArray[byteArray.nextInt()];
                attribute.frequency[j] = byteArray.nextInt();
            }
            attributes[i] = attribute;
        }
        return dat.load(byteArray, attributes);
    } catch (Exception e) {
        logger.warning("读取失败,问题发生在" + TextUtility.exceptionToString(e));
        return false;
    }
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature) ByteArray(com.hankcs.hanlp.corpus.io.ByteArray)

Example 3 with Nature

use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.

the class CustomDictionary method loadMainDictionary.

/**
 * Loads the custom dictionary, preferring the binary cache and falling back to the
 * text dictionaries listed in {@code path}, after which the cache is rebuilt.
 * <p>
 * Each entry of {@code path} may carry a default nature after a space
 * ("file nature"); entries whose nature cannot be parsed are skipped.
 *
 * @param mainPath path of the main dictionary; the cache is written to
 *                 {@code mainPath + Predefine.BIN_EXT}
 * @return {@code false} when a {@link FileNotFoundException} or {@link IOException}
 *         is raised; {@code true} otherwise, including when writing the cache fails
 *         with another exception (that failure is only logged as a warning)
 */
private static boolean loadMainDictionary(String mainPath) {
    logger.info("自定义词典开始加载:" + mainPath);
    if (loadDat(mainPath))
        return true;
    dat = new DoubleArrayTrie<CoreDictionary.Attribute>();
    TreeMap<String, CoreDictionary.Attribute> map = new TreeMap<String, CoreDictionary.Attribute>();
    LinkedHashSet<Nature> customNatureCollector = new LinkedHashSet<Nature>();
    try {
        for (String p : path) {
            Nature defaultNature = Nature.n;
            int cut = p.indexOf(' ');
            if (cut > 0) {
                // The entry specifies a default nature after the space.
                String nature = p.substring(cut + 1);
                p = p.substring(0, cut);
                try {
                    defaultNature = LexiconUtility.convertStringToNature(nature, customNatureCollector);
                } catch (Exception e) {
                    logger.severe("配置文件【" + p + "】写错了!" + e);
                    continue;
                }
            }
            logger.info("以默认词性[" + defaultNature + "]加载自定义词典" + p + "中……");
            boolean success = load(p, defaultNature, map, customNatureCollector);
            if (!success)
                logger.warning("失败:" + p);
        }
        if (map.size() == 0) {
            logger.warning("没有加载到任何词条");
            // Blank placeholder entry. Its value is null, so saving it below fails and
            // is reported by the generic catch at the end of this method.
            map.put(Predefine.TAG_OTHER, null);
        }
        logger.info("正在构建DoubleArrayTrie……");
        dat.build(map);
        // Cache as a dat file so the next load is much faster.
        logger.info("正在缓存词典为dat文件……");
        DataOutputStream out = new DataOutputStream(IOUtil.newOutputStream(mainPath + Predefine.BIN_EXT));
        boolean written = false;
        try {
            // Custom natures go first, then the attribute values, then the trie itself.
            IOUtil.writeCustomNature(out, customNatureCollector);
            // map.values() iterates in key order, the same order entrySet() would give.
            out.writeInt(map.size());
            for (CoreDictionary.Attribute attribute : map.values()) {
                attribute.save(out);
            }
            dat.save(out);
            written = true;
        } finally {
            if (written) {
                // Success path: a failing close is still reported as an I/O error.
                out.close();
            } else {
                // Failure path: always release the stream, but let the original
                // exception (not a secondary close failure) reach the handlers below.
                try {
                    out.close();
                } catch (IOException ignored) {
                    // intentionally ignored, the primary failure is already propagating
                }
            }
        }
    } catch (FileNotFoundException e) {
        logger.severe("自定义词典" + mainPath + "不存在!" + e);
        return false;
    } catch (IOException e) {
        logger.severe("自定义词典" + mainPath + "读取错误!" + e);
        return false;
    } catch (Exception e) {
        logger.warning("自定义词典" + mainPath + "缓存失败!\n" + TextUtility.exceptionToString(e));
    }
    return true;
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature)

Example 4 with Nature

use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.

the class OrganizationRecognition method roleTag.

/**
 * Assigns an organization-role tag item to every vertex of the path.
 * <p>
 * A few natures are mapped to fixed tags; every other vertex is looked up in
 * {@code OrganizationDictionary.dictionary} and falls back to {@code NT.Z}.
 *
 * @param vertexList the path to tag
 * @param wordNetAll not read by this method
 * @return one {@code EnumItem<NT>} per vertex, in path order
 */
public static List<EnumItem<NT>> roleTag(List<Vertex> vertexList, WordNet wordNetAll) {
    List<EnumItem<NT>> tagList = new LinkedList<EnumItem<NT>>();
    for (Vertex vertex : vertexList) {
        Nature nature = vertex.guessNature();
        switch (nature) {
            case nrf:
                // Only infrequent words get the fixed F tag; frequent ones use the dictionary lookup.
                if (vertex.getAttribute().totalFrequency > 1000) {
                    break;
                }
                tagList.add(new EnumItem<NT>(NT.F, 1000));
                continue;
            case ni:
            case nic:
            case nis:
            case nit: {
                EnumItem<NT> organizationItem = new EnumItem<NT>(NT.K, 1000);
                organizationItem.addLabel(NT.D, 1000);
                tagList.add(organizationItem);
                continue;
            }
            case m:
                tagList.add(new EnumItem<NT>(NT.M, 1000));
                continue;
            default:
                break;
        }
        // Look up by the equivalent word, which is more precise.
        EnumItem<NT> dictionaryItem = OrganizationDictionary.dictionary.get(vertex.word);
        if (dictionaryItem == null) {
            int fallbackFrequency = OrganizationDictionary.transformMatrixDictionary.getTotalFrequency(NT.Z);
            dictionaryItem = new EnumItem<NT>(NT.Z, fallbackFrequency);
        }
        tagList.add(dictionaryItem);
    }
    return tagList;
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature) Vertex(com.hankcs.hanlp.seg.common.Vertex) NT(com.hankcs.hanlp.corpus.tag.NT) EnumItem(com.hankcs.hanlp.corpus.dictionary.item.EnumItem) LinkedList(java.util.LinkedList)

Example 5 with Nature

use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.

the class AtomNode method getNature.

/**
 * Returns the part of speech of this atom, derived from its character type {@code nPOS}.
 * <p>
 * Side effect: for number-like and letter-like atoms {@code sWord} is overwritten with
 * the corresponding placeholder string.
 *
 * @return {@code Nature.m} for numbers, {@code Nature.w} for delimiters,
 *         {@code Nature.nx} for letter strings, {@code Nature.nz} otherwise
 */
public Nature getNature() {
    switch (nPOS) {
        case Predefine.CT_INDEX:
        case Predefine.CT_NUM:
            sWord = "未##数";
            return Nature.m;
        case Predefine.CT_DELIMITER:
            return Nature.w;
        case Predefine.CT_LETTER:
            sWord = "未##串";
            return Nature.nx;
        case Predefine.CT_SINGLE: {
            // e.g. 12021-2129-3121: a number only if it matches the floating-point pattern.
            boolean numeric = Predefine.PATTERN_FLOAT_NUMBER.matcher(sWord).matches();
            if (numeric) {
                sWord = "未##数";
                return Nature.m;
            }
            sWord = "未##串";
            return Nature.nx;
        }
        case Predefine.CT_CHINESE:
        default:
            return Nature.nz;
    }
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature)

Aggregations

Nature (com.hankcs.hanlp.corpus.tag.Nature): 14, Vertex (com.hankcs.hanlp.seg.common.Vertex): 4, CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary): 3, AtomNode (com.hankcs.hanlp.seg.NShort.Path.AtomNode): 3, Term (com.hankcs.hanlp.seg.common.Term): 3, AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie): 2, ByteArray (com.hankcs.hanlp.corpus.io.ByteArray): 2, LinkedList (java.util.LinkedList): 2, DoubleArrayTrie (com.hankcs.hanlp.collection.trie.DoubleArrayTrie): 1, EnumItem (com.hankcs.hanlp.corpus.dictionary.item.EnumItem): 1, NT (com.hankcs.hanlp.corpus.tag.NT): 1