Search in sources :

Example 1 with Item

use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.

the class DictionaryMaker method normalizeFrequency.

/**
 * Normalizes label frequencies by replacing each one with its rank.
 * <p>
 * For every item, the labels are ordered by ascending frequency and each
 * label's frequency is overwritten with its 1-based position in that order.
 * The items are modified in place.
 *
 * @param itemList the items whose label maps are rewritten
 * @return the same list instance that was passed in, after processing
 */
public static List<Item> normalizeFrequency(List<Item> itemList) {
    // One comparator is enough for all items: it orders labels by ascending frequency.
    Comparator<Map.Entry<String, Integer>> byAscendingFrequency = new Comparator<Map.Entry<String, Integer>>() {

        @Override
        public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
            return left.getValue().compareTo(right.getValue());
        }
    };
    for (Item item : itemList) {
        // Sort a copy of the entries so the ordering is fixed before the map is written to.
        List<Map.Entry<String, Integer>> ranked = new ArrayList<Map.Entry<String, Integer>>(item.labelMap.entrySet());
        Collections.sort(ranked, byAscendingFrequency);
        for (int position = 0; position < ranked.size(); position++) {
            // Ranks start at 1, not 0.
            item.labelMap.put(ranked.get(position).getKey(), position + 1);
        }
    }
    return itemList;
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item)

Example 2 with Item

use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.

the class DictionaryMaker method add.

/**
 * Inserts an entry.
 * <p>
 * If the trie has no entry under the item's key, the item itself is stored.
 * Otherwise the given item is merged into the stored one via
 * {@code Item.combine}.
 *
 * @param item the entry to insert or merge
 */
public void add(Item item) {
    Item existing = trie.get(item.key);
    if (existing != null) {
        // Key already present: fold the new item into the stored one.
        existing.combine(item);
        return;
    }
    trie.put(item.key, item);
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item)

Example 3 with Item

use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.

the class DictionaryMaker method addNotCombine.

/**
 * Inserts an entry without merging: if the trie already holds an entry
 * under the item's key, the given item is ignored.
 *
 * @param item the entry to insert
 */
public void addNotCombine(Item item) {
    // Only store the item when nothing is registered under its key yet.
    if (trie.get(item.key) == null) {
        trie.put(item.key, item);
    }
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item)

Example 4 with Item

use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.

the class DictionaryMaker method loadAsItemList.

/**
 * Reads every entry from a dictionary file.
 * <p>
 * The file is read as UTF-8, one entry per line, and each line is parsed
 * with {@code Item.create}. The input is opened through {@code IOAdapter}
 * when one is configured, otherwise directly from the file system. The
 * reader is always closed before this method returns.
 *
 * @param path location of the dictionary file
 * @return the parsed entries in file order, or {@code null} if any line
 *         could not be parsed or an exception occurred while reading
 */
public static List<Item> loadAsItemList(String path) {
    List<Item> itemList = new LinkedList<Item>();
    BufferedReader br = null;
    try {
        br = new BufferedReader(new InputStreamReader(IOAdapter == null ? new FileInputStream(path) : IOAdapter.open(path), "UTF-8"));
        String line;
        while ((line = br.readLine()) != null) {
            Item item = Item.create(line);
            if (item == null) {
                // A single malformed line aborts the whole load rather than being skipped.
                logger.warning("使用【" + line + "】创建Item失败");
                return null;
            }
            itemList.add(item);
        }
    } catch (Exception e) {
        logger.warning("读取词典" + path + "发生异常" + e);
        return null;
    } finally {
        // Release the file handle on every exit path: normal return, bad line, or exception.
        if (br != null) {
            try {
                br.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if close fails; the load result is already decided.
            }
        }
    }
    return itemList;
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item)

Example 5 with Item

use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.

the class TestCustomDictionary method testRemoveJunkWord.

/**
 * Loads the custom dictionary and saves it back to the same path through a
 * filter that rejects every item carrying the label "mq", "m" or "t".
 * NOTE(review): a {@code false} result from {@code onSave} presumably
 * excludes the item from the saved file; confirm against DictionaryMaker.
 */
public void testRemoveJunkWord() throws Exception {
    final String customDictionaryPath = "data/dictionary/custom/CustomDictionary.txt";
    DictionaryMaker maker = DictionaryMaker.load(customDictionaryPath);
    DictionaryMaker.Filter junkFilter = new DictionaryMaker.Filter() {

        @Override
        public boolean onSave(Item item) {
            // Accept the item only when it has none of the rejected labels.
            return !(item.containsLabel("mq") || item.containsLabel("m") || item.containsLabel("t"));
        }
    };
    maker.saveTxtTo(customDictionaryPath, junkFilter);
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)

Aggregations

Item (com.hankcs.hanlp.corpus.dictionary.item.Item): 14, DictionaryMaker (com.hankcs.hanlp.corpus.dictionary.DictionaryMaker): 8, Map (java.util.Map): 3, CoNLLSentence (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence): 1, CoNLLWord (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord): 1, Word (com.hankcs.hanlp.corpus.document.sentence.word.Word): 1, CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary): 1, Segment (com.hankcs.hanlp.seg.Segment): 1, BufferedReader (java.io.BufferedReader): 1, FileInputStream (java.io.FileInputStream): 1, InputStreamReader (java.io.InputStreamReader): 1, TreeMap (java.util.TreeMap): 1, Matcher (java.util.regex.Matcher): 1, Pattern (java.util.regex.Pattern): 1