Search in sources:

Example 6 with DictionaryMaker

use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.

the class NRCorpusLoader method combine.

/**
 * Combines the file at {@code HanLP.Config.CoreDictionaryPath} with
 * {@code "XXXDictionary.txt"} through {@code DictionaryMaker.combine} and saves
 * the resulting maker as text back to the core dictionary path.
 */
public static void combine() {
    DictionaryMaker
        .combine(HanLP.Config.CoreDictionaryPath, "XXXDictionary.txt")
        .saveTxtTo(HanLP.Config.CoreDictionaryPath);
}
Also used : DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)

Example 7 with DictionaryMaker

use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.

the class NameDictionaryMaker method create.

/**
 * Reads a UTF-8 text file line by line, treating each line as one name, and
 * collects per-character role tags into a new {@link DictionaryMaker}.
 *
 * <p>Lines containing punctuation, whitespace, ASCII letters or digits, or one
 * of the listed symbols are skipped. Only lines of exactly two or three
 * characters are used, and only when the first character is accepted by
 * {@code FamilyName.contains}:
 * <ul>
 *   <li>two characters: first is tagged {@code NR.B}, second {@code NR.E};</li>
 *   <li>three characters: tagged {@code NR.B}, {@code NR.C}, {@code NR.D} in order.</li>
 * </ul>
 *
 * @param path path of the name list file to read
 * @return the populated maker, or {@code null} if any exception occurs while reading
 */
public static DictionaryMaker create(String path) {
    DictionaryMaker dictionaryMaker = new DictionaryMaker();
    try {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.matches(".*[\\p{P}+~$`^=|<>~`$^+=|<>¥×|\\s|a-z0-9A-Z]+.*"))
                    continue;
                // Only two- and three-character names are loaded.
                int length = line.length();
                switch(length) {
                    case 2:
                        {
                            Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                            if (!FamilyName.contains(wordB.value))
                                break;
                            Word wordE = new Word(line.substring(1), NR.E.toString());
                            dictionaryMaker.add(wordB);
                            dictionaryMaker.add(wordE);
                            break;
                        }
                    case 3:
                        {
                            Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                            if (!FamilyName.contains(wordB.value))
                                break;
                            Word wordC = new Word(line.substring(1, 2), NR.C.toString());
                            Word wordD = new Word(line.substring(2, 3), NR.D.toString());
                            dictionaryMaker.add(wordB);
                            dictionaryMaker.add(wordC);
                            dictionaryMaker.add(wordD);
                            break;
                        }
                    default:
                        // Lines of any other length are ignored.
                        break;
                }
            }
        } finally {
            // Close the reader even if reading or processing a line throws,
            // so the underlying file handle is never leaked.
            br.close();
        }
        logger.info(dictionaryMaker.toString());
    } catch (Exception e) {
        // Keep the cause visible in the log instead of discarding it.
        logger.warning("读取" + path + "发生错误: " + e);
        return null;
    }
    return dictionaryMaker;
}
Also used : Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker) FileInputStream(java.io.FileInputStream)

Example 8 with DictionaryMaker

use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.

the class WordNatureWeightModelMaker method makeModel.

/**
 * Builds a dictionary of (dependent, head, relation) pairs from the sentences
 * returned by {@code CoNLLLoader.loadSentenceList} and saves it as text.
 *
 * <p>For every word four pairs are passed to {@code addPair}: name/name,
 * name/wrapped head tag, wrapped tag/head name, and wrapped tag/wrapped head
 * tag, each with the word's {@code DEPREL}. The distinct {@code POSTAG} values
 * are additionally written, in sorted order and formatted as
 * {@code case "TAG":} lines, to {@code data/model/dependency/pos-thu.txt}.
 *
 * @param corpusLoadPath path handed to {@code CoNLLLoader.loadSentenceList}
 * @param modelSavePath  path handed to {@code DictionaryMaker.saveTxtTo}
 * @return whatever {@code saveTxtTo(modelSavePath)} returns
 */
public static boolean makeModel(String corpusLoadPath, String modelSavePath) {
    Set<String> posTags = new TreeSet<String>();
    DictionaryMaker maker = new DictionaryMaker();

    // The corpus is loaded and traversed twice, so every pair reaches addPair twice.
    // NOTE(review): confirm the second pass is intentional and not a copy/paste leftover.
    for (int pass = 0; pass < 2; pass++) {
        for (CoNLLSentence sentence : CoNLLLoader.loadSentenceList(corpusLoadPath)) {
            for (CoNLLWord token : sentence.word) {
                addPair(token.NAME, token.HEAD.NAME, token.DEPREL, maker);
                addPair(token.NAME, wrapTag(token.HEAD.POSTAG), token.DEPREL, maker);
                addPair(wrapTag(token.POSTAG), token.HEAD.NAME, token.DEPREL, maker);
                addPair(wrapTag(token.POSTAG), wrapTag(token.HEAD.POSTAG), token.DEPREL, maker);
                posTags.add(token.POSTAG);
            }
        }
    }

    // Emit one `case "TAG":` line per distinct POS tag.
    StringBuilder caseLines = new StringBuilder();
    for (String tag : posTags) {
        caseLines.append("case \"").append(tag).append("\":\n");
    }
    IOUtil.saveTxt("data/model/dependency/pos-thu.txt", caseLines.toString());

    return maker.saveTxtTo(modelSavePath);
}
Also used : TreeSet(java.util.TreeSet) CoNLLWord(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord) CoNLLSentence(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)

Example 9 with DictionaryMaker

use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.

the class NRCorpusLoader method load.

/**
 * Reads a UTF-8 text file line by line, treating each line as one name,
 * collects per-character role tags into a {@link DictionaryMaker}, and saves
 * the maker to {@code data/dictionary/person/name.txt}.
 *
 * <p>Lines containing punctuation, whitespace, ASCII letters or digits, or one
 * of the listed symbols are skipped. Only lines of exactly two or three
 * characters are used:
 * <ul>
 *   <li>two characters: first is tagged {@code NR.B}, second {@code NR.E};</li>
 *   <li>three characters: tagged {@code NR.B}, {@code NR.C}, {@code NR.D} in order.</li>
 * </ul>
 *
 * @param path path of the name list file to read
 * @return {@code true} on success, {@code false} if any exception occurs
 *         while reading or saving
 */
public static boolean load(String path) {
    try {
        DictionaryMaker dictionaryMaker = new DictionaryMaker();
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.matches(".*[\\p{P}+~$`^=|<>~`$^+=|<>¥×|\\s|a-z0-9A-Z]+.*"))
                    continue;
                // Only two- and three-character names are loaded.
                int length = line.length();
                switch(length) {
                    case 2:
                        {
                            Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                            Word wordE = new Word(line.substring(1), NR.E.toString());
                            dictionaryMaker.add(wordB);
                            dictionaryMaker.add(wordE);
                            break;
                        }
                    case 3:
                        {
                            Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                            Word wordC = new Word(line.substring(1, 2), NR.C.toString());
                            Word wordD = new Word(line.substring(2, 3), NR.D.toString());
                            dictionaryMaker.add(wordB);
                            dictionaryMaker.add(wordC);
                            dictionaryMaker.add(wordD);
                            break;
                        }
                    default:
                        // Lines of any other length are ignored.
                        break;
                }
            }
        } finally {
            // Close the reader even if reading or processing a line throws,
            // so the underlying file handle is never leaked.
            br.close();
        }
        logger.info(dictionaryMaker.toString());
        // NOTE(review): this filter answers false for every item; confirm against
        // DictionaryMaker.Filter whether that means "skip", which would save nothing.
        dictionaryMaker.saveTxtTo("data/dictionary/person/name.txt", new DictionaryMaker.Filter() {

            @Override
            public boolean onSave(Item item) {
                return false;
            }
        });
    } catch (Exception e) {
        // Keep the cause visible in the log instead of discarding it.
        logger.warning("读取" + path + "发生错误: " + e);
        return false;
    }
    return true;
}
Also used : Item(com.hankcs.hanlp.corpus.dictionary.item.Item) Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker) FileInputStream(java.io.FileInputStream)

Example 10 with DictionaryMaker

use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.

the class TestXianDaiHanYu method testMakeNatureDictionary.

/**
 * Extracts headwords from a lower-cased dictionary text file and writes those
 * not already present in {@code CoreDictionary} or {@code CustomDictionary}
 * to {@code data/dictionary/custom/现代汉语补充词库.txt}.
 *
 * <p>An entry is recognised by the pattern below: a headword of 2-10 CJK
 * characters inside 【】, up to five arbitrary characters, a run of lowercase
 * letters / toned vowels / a few symbols (presumably the pinyin), and then the
 * rest of the line, e.g.
 * <pre>
 * 【岸标】ànbiāo名设在岸上指示航行的标志,可以使船舶避开沙滩、暗礁等。
 * 【岸然】ànrán〈书〉形严肃的样子:道貌~。
 * </pre>
 * Each part-of-speech marker found in the rest of the line contributes a label
 * with the value returned by {@code TextUtility.count}; entries with no marker
 * get the {@code Nature.nz} label.
 */
public void testMakeNatureDictionary() throws Exception {
    String corpus = IOUtil.readTxt("D:\\Doc\\语料库\\现代汉语词典(第五版)全文_更新.txt").toLowerCase();
    Pattern entryPattern = Pattern.compile("【([\\u4E00-\\u9FA5]{2,10})】.{0,5}([abcdefghijklmnopqrstuwxyzāáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ∥•’-]+)(.*)");
    Matcher entries = entryPattern.matcher(corpus);

    DictionaryMaker maker = new DictionaryMaker();
    maker.add("希望 v 7685 vn 616");

    // Chinese part-of-speech markers mapped to the nature label they stand for.
    Map<String, String> markerToNature = new TreeMap<String, String>();
    markerToNature.put("名", Nature.n.toString());
    markerToNature.put("动", Nature.v.toString());
    markerToNature.put("形", Nature.a.toString());
    markerToNature.put("副", Nature.d.toString());
    markerToNature.put("形容", Nature.a.toString());

    while (entries.find()) {
        String headword = entries.group(1);
        // Only words missing from both existing dictionaries are added.
        if (!CoreDictionary.contains(headword) && !CustomDictionary.contains(headword)) {
            String definition = entries.group(3);
            Item candidate = new Item(headword);
            for (Map.Entry<String, String> marker : markerToNature.entrySet()) {
                int hits = TextUtility.count(marker.getKey(), definition);
                if (hits > 0) {
                    candidate.addLabel(marker.getValue(), hits);
                }
            }
            // No marker matched at all: fall back to the nz label.
            if (candidate.getTotalFrequency() == 0) {
                candidate.addLabel(Nature.nz.toString());
            }
            maker.add(candidate);
        }
    }
    maker.saveTxtTo("data/dictionary/custom/现代汉语补充词库.txt");
}
Also used : Pattern(java.util.regex.Pattern) Item(com.hankcs.hanlp.corpus.dictionary.item.Item) Matcher(java.util.regex.Matcher) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)

Aggregations

DictionaryMaker (com.hankcs.hanlp.corpus.dictionary.DictionaryMaker): 17, Item (com.hankcs.hanlp.corpus.dictionary.item.Item): 7, Document (com.hankcs.hanlp.corpus.document.Document): 5, CorpusLoader (com.hankcs.hanlp.corpus.document.CorpusLoader): 4, IWord (com.hankcs.hanlp.corpus.document.sentence.word.IWord): 3, List (java.util.List): 3, CoNLLSentence (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence): 2, CoNLLWord (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord): 2, Word (com.hankcs.hanlp.corpus.document.sentence.word.Word): 2, BufferedReader (java.io.BufferedReader): 2, FileInputStream (java.io.FileInputStream): 2, InputStreamReader (java.io.InputStreamReader): 2, Map (java.util.Map): 2, CompoundWord (com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord): 1, CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary): 1, File (java.io.File): 1, TreeSet (java.util.TreeSet): 1, Matcher (java.util.regex.Matcher): 1, Pattern (java.util.regex.Pattern): 1