Search in sources :

Example 6 with Item

use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.

the class TestDictionaryMaker method testLoadItemList.

/**
 * Loads data/2014_dictionary.txt as a list of items, sums the frequency of every label
 * across all items, then prints each distinct label (in the TreeMap's key order) followed
 * by the number of distinct labels.
 *
 * @throws Exception declared by the test signature; propagated from the calls below
 */
public void testLoadItemList() throws Exception {
    List<Item> items = DictionaryMaker.loadAsItemList("data/2014_dictionary.txt");
    Map<String, Integer> totals = new TreeMap<String, Integer>();
    for (Item current : items) {
        for (Map.Entry<String, Integer> labelEntry : current.labelMap.entrySet()) {
            String label = labelEntry.getKey();
            Integer soFar = totals.get(label);
            // A label seen for the first time starts from zero.
            Integer base = (soFar == null) ? Integer.valueOf(0) : soFar;
            totals.put(label, base + labelEntry.getValue());
        }
    }
    Iterator<String> labels = totals.keySet().iterator();
    while (labels.hasNext()) {
        System.out.println(labels.next());
    }
    System.out.println(totals.size());
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item) TreeMap(java.util.TreeMap) Map(java.util.Map)

Example 7 with Item

use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.

the class TestAdjustCoreDictionary method testRemoveNumber.

/**
 * Reloads the core nature dictionary and saves it back to the same path, dropping every
 * entry whose key is a single character found in the numeral list below. Each dropped
 * item is printed to standard output.
 *
 * @throws Exception declared by the test signature; propagated from the load/save calls
 */
public void testRemoveNumber() throws Exception {
    // Some Chinese numeral words are useless to keep, so remove them.
    DictionaryMaker maker = DictionaryMaker.load(DATA_DICTIONARY_CORE_NATURE_DICTIONARY_TXT);
    final String numerals = "0123456789零○〇一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟";
    DictionaryMaker.Filter dropSingleNumerals = new DictionaryMaker.Filter() {

        @Override
        public boolean onSave(Item item) {
            String key = item.key;
            boolean singleNumeral = key.length() == 1 && numerals.indexOf(key.charAt(0)) >= 0;
            if (!singleNumeral) {
                // Not a lone numeral: keep the entry.
                return true;
            }
            System.out.println(item);
            return false;
        }
    };
    maker.saveTxtTo(DATA_DICTIONARY_CORE_NATURE_DICTIONARY_TXT, dropSingleNumerals);
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)

Example 8 with Item

use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.

the class TestAdjustCoreDictionary method testGetCompiledWordFromDictionary.

/**
 * Loads data/test/CoreNatureDictionary.txt and prints every item whose key consists of
 * exactly one character, the literal "##", and one more character (regex ".##.").
 *
 * @throws Exception declared by the test signature; propagated from the load call
 */
public void testGetCompiledWordFromDictionary() throws Exception {
    DictionaryMaker dictionaryMaker = DictionaryMaker.load("data/test/CoreNatureDictionary.txt");
    // Compile the pattern once; String.matches would recompile it for every dictionary entry.
    // Fully qualified so that no additional file-level import is required.
    final java.util.regex.Pattern compiledWordPattern = java.util.regex.Pattern.compile(".##.");
    for (Map.Entry<String, Item> entry : dictionaryMaker.entrySet()) {
        // matcher(...).matches() requires the whole key to match, exactly like String.matches.
        if (compiledWordPattern.matcher(entry.getKey()).matches()) {
            System.out.println(entry.getValue());
        }
    }
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker) Map(java.util.Map)

Example 9 with Item

use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.

the class TestXianDaiHanYu method testMakeNatureDictionary.

/**
 * Builds a supplementary dictionary from the full text of a Chinese dictionary file.
 * <p>
 * The text is lower-cased and scanned for entries of the form
 * {@code 【headword】 pinyin rest-of-line}, for example
 * {@code 【岸标】ànbiāo名设在岸上指示航行的标志...}. Headwords already contained in
 * CoreDictionary or CustomDictionary are skipped. For every other headword, each
 * part-of-speech marker in the local marker map (名, 动, 形, 副, 形容) is counted in the
 * rest of the line and added as a label with that count; an item that ends up with a total
 * frequency of zero gets the label Nature.nz instead. The result is saved to
 * data/dictionary/custom/现代汉语补充词库.txt.
 *
 * @throws Exception declared by the test signature; propagated from the calls below
 */
public void testMakeNatureDictionary() throws Exception {
    // Lower-case with Locale.ROOT so the result does not depend on the JVM default locale
    // (e.g. under a Turkish locale 'I' becomes dotless 'ı', which the ASCII pinyin class
    // in the pattern below would not match). Fully qualified to avoid a new import.
    String text = IOUtil.readTxt("D:\\Doc\\语料库\\现代汉语词典(第五版)全文_更新.txt").toLowerCase(java.util.Locale.ROOT);
    // group(1): headword, 2-10 characters in U+4E00..U+9FA5, enclosed in 【】
    // then up to five arbitrary characters
    // group(2): a run of pinyin letters, tone-marked vowels and a few separator symbols
    // group(3): the remainder of the line
    Pattern pattern = Pattern.compile("【([\\u4E00-\\u9FA5]{2,10})】.{0,5}([abcdefghijklmnopqrstuwxyzāáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ∥•’-]+)(.*)");
    Matcher matcher = pattern.matcher(text);
    DictionaryMaker dictionaryMaker = new DictionaryMaker();
    dictionaryMaker.add("希望 v 7685 vn 616");
    // Maps the dictionary's Chinese part-of-speech markers to nature labels.
    Map<String, String> mapChineseToNature = new TreeMap<String, String>();
    mapChineseToNature.put("名", Nature.n.toString());
    mapChineseToNature.put("动", Nature.v.toString());
    mapChineseToNature.put("形", Nature.a.toString());
    mapChineseToNature.put("副", Nature.d.toString());
    mapChineseToNature.put("形容", Nature.a.toString());
    while (matcher.find()) {
        String word = matcher.group(1);
        // Only words that neither dictionary already knows are added.
        if (CoreDictionary.contains(word) || CustomDictionary.contains(word)) {
            continue;
        }
        String content = matcher.group(3);
        Item item = new Item(word);
        for (Map.Entry<String, String> entry : mapChineseToNature.entrySet()) {
            // NOTE(review): presumably TextUtility.count returns the number of occurrences
            // of the marker in content; if so, "形" is also counted inside "形容" (both map
            // to the same nature). Confirm against TextUtility.
            int frequency = TextUtility.count(entry.getKey(), content);
            if (frequency > 0) {
                item.addLabel(entry.getValue(), frequency);
            }
        }
        // No marker found: fall back to the nz label.
        if (item.getTotalFrequency() == 0) {
            item.addLabel(Nature.nz.toString());
        }
        dictionaryMaker.add(item);
    }
    dictionaryMaker.saveTxtTo("data/dictionary/custom/现代汉语补充词库.txt");
}
Also used : Pattern(java.util.regex.Pattern) Item(com.hankcs.hanlp.corpus.dictionary.item.Item) Matcher(java.util.regex.Matcher) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)

Example 10 with Item

use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.

the class TestCustomDictionary method testRemoveNotNS.

/**
 * data/dictionary/custom/全国地名大全.txt contains many person names; remove them.
 * <p>
 * The dictionary is loaded and saved back to the same path through a filter. An entry whose
 * last character is one of the characters of Predefine.POSTFIX_SINGLE is always kept. Any
 * other entry is segmented with the custom dictionary disabled; if the result is exactly one
 * term whose nature is Nature.nr, the item is printed and dropped. Everything else is kept.
 *
 * @throws Exception declared by the test signature; propagated from the load/save calls
 */
public void testRemoveNotNS() throws Exception {
    String path = "data/dictionary/custom/全国地名大全.txt";
    final Set<Character> keepSuffixes = new TreeSet<Character>();
    char[] suffixChars = Predefine.POSTFIX_SINGLE.toCharArray();
    for (int i = 0; i < suffixChars.length; i++) {
        keepSuffixes.add(suffixChars[i]);
    }
    DictionaryMaker maker = DictionaryMaker.load(path);
    DictionaryMaker.Filter dropPersonNames = new DictionaryMaker.Filter() {

        Segment segmenter = HanLP.newSegment().enableCustomDictionary(false);

        @Override
        public boolean onSave(Item item) {
            char lastChar = item.key.charAt(item.key.length() - 1);
            if (!keepSuffixes.contains(lastChar)) {
                // Only entries without a kept suffix are segmented.
                List<Term> terms = segmenter.seg(item.key);
                boolean singlePersonName = terms.size() == 1 && terms.get(0).nature == Nature.nr;
                if (singlePersonName) {
                    System.out.println(item);
                    return false;
                }
            }
            return true;
        }
    };
    maker.saveTxtTo(path, dropPersonNames);
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker) Segment(com.hankcs.hanlp.seg.Segment)

Aggregations

Item (com.hankcs.hanlp.corpus.dictionary.item.Item)14 DictionaryMaker (com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)8 Map (java.util.Map)3 CoNLLSentence (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence)1 CoNLLWord (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord)1 Word (com.hankcs.hanlp.corpus.document.sentence.word.Word)1 CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary)1 Segment (com.hankcs.hanlp.seg.Segment)1 BufferedReader (java.io.BufferedReader)1 FileInputStream (java.io.FileInputStream)1 InputStreamReader (java.io.InputStreamReader)1 TreeMap (java.util.TreeMap)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1