Search in sources :

Example 1 with DictionaryMaker

use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.

The method combine of the class NRCorpusLoader.

/**
 * Combines the dictionary at {@code HanLP.Config.CoreDictionaryPath} with
 * {@code "XXXDictionary.txt"} through {@code DictionaryMaker.combine} and writes the
 * result back to {@code HanLP.Config.CoreDictionaryPath} as text.
 */
public static void combine() {
    DictionaryMaker
            .combine(HanLP.Config.CoreDictionaryPath, "XXXDictionary.txt")
            .saveTxtTo(HanLP.Config.CoreDictionaryPath);
}
Also used : DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)

Example 2 with DictionaryMaker

use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.

The method create of the class NameDictionaryMaker.

/**
 * Builds a {@link DictionaryMaker} from a UTF-8 text file that holds one name per line.
 *
 * <p>Lines matching the junk pattern (punctuation, listed symbols, whitespace, ASCII letters
 * or digits anywhere in the line) are skipped. Only lines of exactly two or three characters
 * are used, and only when their first character is accepted by {@code FamilyName.contains}:
 * <ul>
 *   <li>two characters: the first is added with label {@code NR.B}, the second with
 *       {@code NR.E};</li>
 *   <li>three characters: they are added with labels {@code NR.B}, {@code NR.C} and
 *       {@code NR.D} respectively.</li>
 * </ul>
 *
 * @param path path of the name list to read
 * @return the populated dictionary maker, or {@code null} if reading the file fails
 */
public static DictionaryMaker create(String path) {
    DictionaryMaker dictionaryMaker = new DictionaryMaker();
    // Compiled once up front; String.matches would recompile the same regex for every line.
    java.util.regex.Pattern junkPattern = java.util.regex.Pattern.compile(".*[\\p{P}+~$`^=|<>~`$^+=|<>¥×|\\s|a-z0-9A-Z]+.*");
    BufferedReader br = null;
    try {
        br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        String line;
        while ((line = br.readLine()) != null) {
            if (junkPattern.matcher(line).matches()) {
                continue;
            }
            // Only names of two or three characters are loaded; other lengths are dropped.
            int length = line.length();
            switch (length) {
                case 2: {
                    Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                    if (!FamilyName.contains(wordB.value)) {
                        break;
                    }
                    Word wordE = new Word(line.substring(1), NR.E.toString());
                    dictionaryMaker.add(wordB);
                    dictionaryMaker.add(wordE);
                    break;
                }
                case 3: {
                    Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                    if (!FamilyName.contains(wordB.value)) {
                        break;
                    }
                    Word wordC = new Word(line.substring(1, 2), NR.C.toString());
                    Word wordD = new Word(line.substring(2, 3), NR.D.toString());
                    dictionaryMaker.add(wordB);
                    dictionaryMaker.add(wordC);
                    dictionaryMaker.add(wordD);
                    break;
                }
                default:
                    break;
            }
        }
        logger.info(dictionaryMaker.toString());
    } catch (Exception e) {
        // Include the exception so the log shows why the read failed.
        logger.warning("读取" + path + "发生错误" + e);
        return null;
    } finally {
        // Always release the file handle, including when reading failed part-way through.
        if (br != null) {
            try {
                br.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing a reader fails.
            }
        }
    }
    return dictionaryMaker;
}
Also used : Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker) FileInputStream(java.io.FileInputStream)

Example 3 with DictionaryMaker

use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.

The method testMakeOrganizationCustomDictionary of the class TestCorpusLoader.

/**
 * Walks the corpus under {@code D:\JavaProjects\CorpusToolBox\data\2014}, adds every word
 * whose label starts with {@code "nt"} to a fresh {@link DictionaryMaker}, and saves the
 * result as text to {@code data/dictionary/custom/机构名词典.txt}.
 *
 * @throws Exception declared by the method; the visible body contains no explicit throw
 */
public void testMakeOrganizationCustomDictionary() throws Exception {
    final DictionaryMaker collected = new DictionaryMaker();
    // Handler invoked by CorpusLoader.walk with each document it passes in.
    CorpusLoader.Handler labelCollector = new CorpusLoader.Handler() {

        @Override
        public void handle(Document document) {
            for (List<IWord> sentence : document.getComplexSentenceList()) {
                for (IWord candidate : sentence) {
                    if (!candidate.getLabel().startsWith("nt")) {
                        continue;
                    }
                    collected.add(candidate);
                }
            }
        }
    };
    CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", labelCollector);
    collected.saveTxtTo("data/dictionary/custom/机构名词典.txt");
}
Also used : CorpusLoader(com.hankcs.hanlp.corpus.document.CorpusLoader) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker) List(java.util.List) Document(com.hankcs.hanlp.corpus.document.Document) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord)

Example 4 with DictionaryMaker

use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.

The method testRemoveJunkWord of the class TestCustomDictionary.

/**
 * Loads {@code data/dictionary/custom/CustomDictionary.txt} and saves it back to the same
 * path through a filter that rejects every item carrying the label {@code "mq"},
 * {@code "m"} or {@code "t"}.
 *
 * @throws Exception declared by the method; the visible body contains no explicit throw
 */
public void testRemoveJunkWord() throws Exception {
    DictionaryMaker customDictionary = DictionaryMaker.load("data/dictionary/custom/CustomDictionary.txt");
    DictionaryMaker.Filter junkFilter = new DictionaryMaker.Filter() {

        @Override
        public boolean onSave(Item item) {
            // Returns true only when none of the three labels is present.
            return !item.containsLabel("mq")
                    && !item.containsLabel("m")
                    && !item.containsLabel("t");
        }
    };
    customDictionary.saveTxtTo("data/dictionary/custom/CustomDictionary.txt", junkFilter);
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)

Example 5 with DictionaryMaker

use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.

The method testRemoveNumber of the class TestAdjustCoreDictionary.

/**
 * Loads the dictionary at {@code DATA_DICTIONARY_CORE_NATURE_DICTIONARY_TXT} and saves it
 * back to the same location, dropping every single-character item whose key is one of the
 * listed Arabic or Chinese numeral characters. Each dropped item is printed to standard
 * output.
 *
 * @throws Exception declared by the method; the visible body contains no explicit throw
 */
public void testRemoveNumber() throws Exception {
    // Some Chinese numeral entries serve no purpose in the dictionary, so they are removed.
    DictionaryMaker coreDictionary = DictionaryMaker.load(DATA_DICTIONARY_CORE_NATURE_DICTIONARY_TXT);
    DictionaryMaker.Filter numeralFilter = new DictionaryMaker.Filter() {

        @Override
        public boolean onSave(Item item) {
            if (item.key.length() != 1) {
                return true;
            }
            String numerals = "0123456789零○〇一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟";
            if (numerals.indexOf(item.key.charAt(0)) < 0) {
                return true;
            }
            System.out.println(item);
            return false;
        }
    };
    coreDictionary.saveTxtTo(DATA_DICTIONARY_CORE_NATURE_DICTIONARY_TXT, numeralFilter);
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)

Aggregations

DictionaryMaker (com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)17 Item (com.hankcs.hanlp.corpus.dictionary.item.Item)7 Document (com.hankcs.hanlp.corpus.document.Document)5 CorpusLoader (com.hankcs.hanlp.corpus.document.CorpusLoader)4 IWord (com.hankcs.hanlp.corpus.document.sentence.word.IWord)3 List (java.util.List)3 CoNLLSentence (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence)2 CoNLLWord (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord)2 Word (com.hankcs.hanlp.corpus.document.sentence.word.Word)2 BufferedReader (java.io.BufferedReader)2 FileInputStream (java.io.FileInputStream)2 InputStreamReader (java.io.InputStreamReader)2 Map (java.util.Map)2 CompoundWord (com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord)1 CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary)1 File (java.io.File)1 TreeSet (java.util.TreeSet)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1