Search in sources :

Example 11 with Item

use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.

the class TestXianDaiHanYu method testMakeNatureDictionary.

/**
 * Builds a supplementary dictionary from the full text of a Chinese dictionary file and
 * writes it to {@code data/dictionary/custom/现代汉语补充词库.txt}.
 *
 * <p>Each regex match yields a headword (group 1) and the remainder of its line (group 3).
 * Headwords already present in {@code CoreDictionary} or {@code CustomDictionary} are skipped.
 * For every other headword, each part-of-speech marker in the lookup table is counted in the
 * remainder via {@code TextUtility.count} and, when the count is positive, added as a label
 * with that count. Headwords that end up with a total frequency of zero get the
 * {@code Nature.nz} label instead.
 *
 * @throws Exception propagated from the file reading / dictionary helpers
 */
public void testMakeNatureDictionary() throws Exception {
    // The whole source text is lower-cased up front; the pinyin character class below is lower-case only.
    String corpus = IOUtil.readTxt("D:\\Doc\\语料库\\现代汉语词典(第五版)全文_更新.txt").toLowerCase();
    // An entry in the source text looks like:
    //   【岸标】ànbiāo名设在岸上指示航行的标志,可以使船舶避开沙滩、暗礁等。
    // group(1): bracketed headword of 2-10 characters in the U+4E00..U+9FA5 range
    // group(2): the run of pinyin letters (at most 5 arbitrary characters may precede it)
    // group(3): everything after the pinyin up to the end of the line
    Matcher entryMatcher = Pattern.compile("【([\\u4E00-\\u9FA5]{2,10})】.{0,5}([abcdefghijklmnopqrstuwxyzāáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ∥•’-]+)(.*)").matcher(corpus);

    DictionaryMaker maker = new DictionaryMaker();
    maker.add("希望 v 7685 vn 616");

    // Part-of-speech marker as it appears in the definition text -> nature tag name.
    Map<String, String> markerToNature = new TreeMap<String, String>();
    markerToNature.put("名", Nature.n.toString());
    markerToNature.put("动", Nature.v.toString());
    markerToNature.put("形", Nature.a.toString());
    markerToNature.put("副", Nature.d.toString());
    markerToNature.put("形容", Nature.a.toString());

    while (entryMatcher.find()) {
        String headword = entryMatcher.group(1);
        boolean alreadyKnown = CoreDictionary.contains(headword) || CustomDictionary.contains(headword);
        if (!alreadyKnown) {
            String definition = entryMatcher.group(3);
            Item entryItem = new Item(headword);
            for (Map.Entry<String, String> mapping : markerToNature.entrySet()) {
                // NOTE(review): presumably the number of occurrences of the marker in the definition — confirm in TextUtility.
                int occurrences = TextUtility.count(mapping.getKey(), definition);
                if (occurrences > 0) {
                    entryItem.addLabel(mapping.getValue(), occurrences);
                }
            }
            // No marker was found at all: fall back to the nz tag.
            if (entryItem.getTotalFrequency() == 0) {
                entryItem.addLabel(Nature.nz.toString());
            }
            maker.add(entryItem);
        }
    }
    maker.saveTxtTo("data/dictionary/custom/现代汉语补充词库.txt");
}
Also used : Pattern(java.util.regex.Pattern) Item(com.hankcs.hanlp.corpus.dictionary.item.Item) Matcher(java.util.regex.Matcher) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)

Example 12 with Item

use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.

the class TestCustomDictionary method testRemoveNotNS.

/**
 * data/dictionary/custom/全国地名大全.txt contains many person names; this rewrites the
 * file in place with those entries filtered out.
 *
 * <p>An entry is always kept when its last character is one of the characters in
 * {@code Predefine.POSTFIX_SINGLE}. Otherwise the key is segmented with the custom
 * dictionary disabled; if the result is exactly one term whose nature is
 * {@code Nature.nr}, the entry is printed and the filter returns {@code false}
 * (presumably excluding it from the saved file — confirm in DictionaryMaker.Filter).
 *
 * @throws Exception propagated from loading or saving the dictionary
 */
public void testRemoveNotNS() throws Exception {
    final String dictionaryPath = "data/dictionary/custom/全国地名大全.txt";

    final Set<Character> singleCharSuffixes = new TreeSet<Character>();
    char[] suffixChars = Predefine.POSTFIX_SINGLE.toCharArray();
    for (int i = 0; i < suffixChars.length; i++) {
        singleCharSuffixes.add(suffixChars[i]);
    }

    // Load before constructing the filter so the segmenter is created after the dictionary is read.
    DictionaryMaker maker = DictionaryMaker.load(dictionaryPath);
    DictionaryMaker.Filter keepNonPersonNames = new DictionaryMaker.Filter() {

        Segment segmenter = HanLP.newSegment().enableCustomDictionary(false);

        @Override
        public boolean onSave(Item item) {
            String key = item.key;
            char lastChar = key.charAt(key.length() - 1);
            if (singleCharSuffixes.contains(lastChar)) {
                return true;
            }
            List<Term> terms = segmenter.seg(key);
            boolean singlePersonName = terms.size() == 1 && terms.get(0).nature == Nature.nr;
            if (singlePersonName) {
                System.out.println(item);
            }
            return !singlePersonName;
        }
    };
    maker.saveTxtTo(dictionaryPath, keepNonPersonNames);
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker) Segment(com.hankcs.hanlp.seg.Segment)

Example 13 with Item

use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.

the class TestDependencyCorpus method testPosTag.

/**
 * Fine-grained to coarse-grained tag conversion.
 *
 * <p>Loads the CoNLL sentences from the dev corpus file, adds one {@code Item} per word
 * built from the word's {@code POSTAG} and {@code CPOSTAG} fields to a
 * {@code DictionaryMaker}, and prints the maker's entry set.
 *
 * @throws Exception propagated from loading the corpus
 */
public void testPosTag() throws Exception {
    DictionaryMaker tagMapping = new DictionaryMaker();
    LinkedList<CoNLLSentence> sentences = CoNLLLoader.loadSentenceList("D:\\Doc\\语料库\\依存分析训练数据\\THU\\dev.conll.fixed.txt");
    for (CoNLLSentence sentence : sentences) {
        for (CoNLLWord token : sentence.word) {
            Item tagPair = new Item(token.POSTAG, token.CPOSTAG);
            tagMapping.add(tagPair);
        }
    }
    System.out.println(tagMapping.entrySet());
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item) CoNLLWord(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord) CoNLLSentence(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)

Example 14 with Item

use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.

the class TestNTRecognition method testRemoveP.

/**
 * Loads the dictionary at {@code HanLP.Config.OrganizationDictionaryPath}, removes the
 * label {@code "P"} from some entries, and saves the dictionary back to the same path.
 *
 * <p>The label is removed only when {@code LexiconUtility.getAttribute} returns a non-null
 * attribute for the word, the item contains label {@code "P"}, and
 * {@code attribute.hasNatureStartsWith("u")} is true. Each modified item is printed
 * together with its attribute before the label is removed.
 *
 * @throws Exception propagated from loading or saving the dictionary
 */
public void testRemoveP() throws Exception {
    DictionaryMaker organizationDictionary = DictionaryMaker.load(HanLP.Config.OrganizationDictionaryPath);
    for (Map.Entry<String, Item> dictionaryEntry : organizationDictionary.entrySet()) {
        Item item = dictionaryEntry.getValue();
        CoreDictionary.Attribute coreAttribute = LexiconUtility.getAttribute(dictionaryEntry.getKey());
        // Words without an attribute are left untouched.
        if (coreAttribute != null && item.containsLabel("P") && coreAttribute.hasNatureStartsWith("u")) {
            System.out.println(item + "\t" + coreAttribute);
            item.removeLabel("P");
        }
    }
    organizationDictionary.saveTxtTo(HanLP.Config.OrganizationDictionaryPath);
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker) CoreDictionary(com.hankcs.hanlp.dictionary.CoreDictionary) Map(java.util.Map)

Aggregations

Item (com.hankcs.hanlp.corpus.dictionary.item.Item): 14, DictionaryMaker (com.hankcs.hanlp.corpus.dictionary.DictionaryMaker): 8, Map (java.util.Map): 3, CoNLLSentence (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence): 1, CoNLLWord (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord): 1, Word (com.hankcs.hanlp.corpus.document.sentence.word.Word): 1, CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary): 1, Segment (com.hankcs.hanlp.seg.Segment): 1, BufferedReader (java.io.BufferedReader): 1, FileInputStream (java.io.FileInputStream): 1, InputStreamReader (java.io.InputStreamReader): 1, TreeMap (java.util.TreeMap): 1, Matcher (java.util.regex.Matcher): 1, Pattern (java.util.regex.Pattern): 1