Search in sources :

Example 1 with TFDictionary

use of com.hankcs.hanlp.corpus.dictionary.TFDictionary in project HanLP by hankcs.

In class SimplifyNGramDictionary, method testSimplify:

/**
 * Prunes the n-gram dictionary file at {@code path}: loads every "term frequency"
 * line into a sorted map, drops person-name entries (keys containing the tag
 * "未##人") whose frequency in the reference mini n-gram dictionary is below 10,
 * then rewrites the file in place with the surviving entries.
 *
 * @throws Exception if the dictionary file cannot be read or written
 */
public void testSimplify() throws Exception {
    // Read "term frequency" lines into a sorted map.
    TreeMap<String, Integer> map = new TreeMap<String, Integer>();
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path)));
    try {
        String line;
        while ((line = br.readLine()) != null) {
            String[] param = line.split("\\s");
            // Tolerate blank or malformed lines instead of throwing AIOOBE.
            if (param.length < 2) {
                continue;
            }
            map.put(param[0], Integer.valueOf(param[1]));
        }
    } finally {
        // Previously the reader leaked if readLine/parsing threw.
        br.close();
    }
    // Final tuning for person names: person-name bigrams cause too many false
    // hits in recognition, so drop the low-frequency ones. The threshold is
    // checked against the mini core n-gram dictionary, not this file's counts.
    TFDictionary dictionary = new TFDictionary();
    dictionary.load("D:\\JavaProjects\\HanLP\\data\\dictionary\\CoreNatureDictionary.ngram.mini.txt");
    Iterator<Map.Entry<String, Integer>> iterator = map.descendingMap().entrySet().iterator();
    while (iterator.hasNext()) {
        Map.Entry<String, Integer> current = iterator.next();
        if (current.getKey().contains("未##人") && dictionary.getFrequency(current.getKey()) < 10) {
            System.out.println("删除 " + current.getKey());
            // descendingMap() is a view backed by map, so this removes from map too.
            iterator.remove();
        }
    }
    // Write the surviving entries back to the same file.
    BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path)));
    try {
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            bw.write(entry.getKey());
            bw.write(' ');
            bw.write(String.valueOf(entry.getValue()));
            bw.newLine();
        }
    } finally {
        // Close (and flush) even if a write fails.
        bw.close();
    }
}
Also used : TFDictionary(com.hankcs.hanlp.corpus.dictionary.TFDictionary)

Example 2 with TFDictionary

use of com.hankcs.hanlp.corpus.dictionary.TFDictionary in project HanLP by hankcs.

In class AdjustCorpus, method testPlay:

/**
 * Walks the 2014 corpus and counts every compound word labelled "ns"
 * (presumably place names in the PKU tagset — confirm against the corpus docs)
 * into a term-frequency dictionary, then saves it as a text file.
 *
 * @throws Exception if walking the corpus or saving the result fails
 */
public void testPlay() throws Exception {
    final TFDictionary nsFrequency = new TFDictionary();
    CorpusLoader.Handler counter = new CorpusLoader.Handler() {

        @Override
        public void handle(Document document) {
            for (List<IWord> sentence : document.getComplexSentenceList()) {
                for (IWord token : sentence) {
                    // Only compound words carrying the "ns" label are counted.
                    if (token instanceof CompoundWord && token.getLabel().equals("ns")) {
                        nsFrequency.add(token.toString());
                    }
                }
            }
        }
    };
    CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", counter);
    nsFrequency.saveTxtTo("data/test/complex_ns.txt");
}
Also used : TFDictionary(com.hankcs.hanlp.corpus.dictionary.TFDictionary) CorpusLoader(com.hankcs.hanlp.corpus.document.CorpusLoader) List(java.util.List) Document(com.hankcs.hanlp.corpus.document.Document) CompoundWord(com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord)

Example 3 with TFDictionary

use of com.hankcs.hanlp.corpus.dictionary.TFDictionary in project HanLP by hankcs.

In class TestAdjustCoreDictionary, method testViewNGramDictionary:

/**
 * Loads the core n-gram dictionary and prints the frequency record of every
 * entry whose key contains "##" (the category-placeholder marker).
 *
 * @throws Exception if the dictionary file cannot be loaded
 */
public void testViewNGramDictionary() throws Exception {
    TFDictionary ngrams = new TFDictionary();
    ngrams.load("data/dictionary/CoreNatureDictionary.ngram.txt");
    for (Map.Entry<String, TermFrequency> entry : ngrams.entrySet()) {
        if (!entry.getKey().contains("##")) {
            continue; // only tagged (placeholder) entries are of interest
        }
        System.out.println(entry.getValue());
    }
}
Also used : TFDictionary(com.hankcs.hanlp.corpus.dictionary.TFDictionary) TermFrequency(com.hankcs.hanlp.corpus.occurrence.TermFrequency) Map(java.util.Map)

Aggregations

TFDictionary (com.hankcs.hanlp.corpus.dictionary.TFDictionary)3 CorpusLoader (com.hankcs.hanlp.corpus.document.CorpusLoader)1 Document (com.hankcs.hanlp.corpus.document.Document)1 CompoundWord (com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord)1 IWord (com.hankcs.hanlp.corpus.document.sentence.word.IWord)1 TermFrequency (com.hankcs.hanlp.corpus.occurrence.TermFrequency)1 List (java.util.List)1 Map (java.util.Map)1