Use of com.hankcs.hanlp.corpus.dictionary.TFDictionary in project HanLP by hankcs.
The class SimplifyNGramDictionary, method testSimplify:
public void testSimplify() throws Exception {
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path)));
    TreeMap<String, Integer> map = new TreeMap<String, Integer>();
    String line;
    while ((line = br.readLine()) != null) {
        String[] param = line.split("\\s");
        map.put(param[0], Integer.valueOf(param[1]));
    }
    br.close();
    Set<Map.Entry<String, Integer>> entrySet = map.descendingMap().entrySet();
    Iterator<Map.Entry<String, Integer>> iterator = entrySet.iterator();
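    // Iterating the descendingMap view means a longer key is always visited before
    // any shorter key that is its prefix, which step 1 below relies on; removals
    // through this iterator also remove the entry from the underlying map.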
    // Step 1: remove entries that are contained in a longer entry
    // Map.Entry<String, Integer> pre = new AbstractMap.SimpleEntry<>(" @ ", 1);
    // while (iterator.hasNext())
    // {
    //     Map.Entry<String, Integer> current = iterator.next();
    //     if (current.getKey().length() - current.getKey().indexOf('@') == 2 && pre.getKey().indexOf(current.getKey()) == 0 && current.getValue() <= 2)
    //     {
    //         System.out.println("should delete " + current + ", keep " + pre);
    //         iterator.remove();
    //     }
    //     pre = current;
    // }
    // Step 2: try to remove short co-occurrences such as "学@家"
    // iterator = entrySet.iterator();
    // while (iterator.hasNext())
    // {
    //     Map.Entry<String, Integer> current = iterator.next();
    //     if (current.getKey().length() == 3)
    //     {
    //         System.out.println("should delete " + current);
    //     }
    // }
    // Step 3: also remove entries where the word after the @ is too short
    // iterator = entrySet.iterator();
    // while (iterator.hasNext())
    // {
    //     Map.Entry<String, Integer> current = iterator.next();
    //     String[] termArray = current.getKey().split("@", 2);
    //     if (termArray[0].equals("未##人") && termArray[1].length() < 2)
    //     {
    //         System.out.println("delete " + current.getKey());
    //         iterator.remove();
    //     }
    // }
    // Step 4: person-name transitions cause too many false hits in recognition, so delete them too
    // iterator = entrySet.iterator();
    // while (iterator.hasNext())
    // {
    //     Map.Entry<String, Integer> current = iterator.next();
    //     if (current.getKey().contains("未##人") && current.getValue() < 10)
    //     {
    //         System.out.println("delete " + current.getKey());
    //         iterator.remove();
    //     }
    // }
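    // Steps 1-4 above are earlier pruning heuristics left commented out; the
    // active pass below prunes by frequency in a reference n-gram dictionary instead.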
    // Final tuning for person names
    TFDictionary dictionary = new TFDictionary();
    dictionary.load("D:\\JavaProjects\\HanLP\\data\\dictionary\\CoreNatureDictionary.ngram.mini.txt");
    iterator = entrySet.iterator();
    while (iterator.hasNext()) {
        Map.Entry<String, Integer> current = iterator.next();
        if (current.getKey().contains("未##人") && dictionary.getFrequency(current.getKey()) < 10) {
            System.out.println("delete " + current.getKey());
            iterator.remove();
        }
    }
    // Output
    BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path)));
    for (Map.Entry<String, Integer> entry : map.entrySet()) {
        bw.write(entry.getKey());
        bw.write(' ');
        bw.write(String.valueOf(entry.getValue()));
        bw.newLine();
    }
    bw.close();
}
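In these snippets, TFDictionary bigram keys take the form first@second, with placeholder tags such as 未##人 standing for a recognized person name. A minimal sketch of loading an n-gram dictionary and querying one transition, reusing only the load and getFrequency calls seen above (the queried key is an illustrative example, not a guaranteed entry):

import com.hankcs.hanlp.corpus.dictionary.TFDictionary;

public class NGramQueryDemo {
    public static void main(String[] args) {
        TFDictionary dictionary = new TFDictionary();
        // Same file the test above loads from an absolute path
        dictionary.load("data/dictionary/CoreNatureDictionary.ngram.mini.txt");
        // getFrequency returns the stored count for an exact bigram key
        System.out.println(dictionary.getFrequency("未##人@说"));
    }
}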
Use of com.hankcs.hanlp.corpus.dictionary.TFDictionary in project HanLP by hankcs.
The class AdjustCorpus, method testPlay:
public void testPlay() throws Exception {
    final TFDictionary tfDictionary = new TFDictionary();
    CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", new CorpusLoader.Handler() {

        @Override
        public void handle(Document document) {
            for (List<IWord> wordList : document.getComplexSentenceList()) {
                for (IWord word : wordList) {
                    if (word instanceof CompoundWord && word.getLabel().equals("ns")) {
                        tfDictionary.add(word.toString());
                    }
                }
            }
        }
    });
    tfDictionary.saveTxtTo("data/test/complex_ns.txt");
}
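Here add serves as a counter: each call appears to increment the term's frequency, creating the entry on first sight, and saveTxtTo persists the counts as a plain-text dictionary. A minimal sketch under that assumption (the terms and output path are made up for illustration):

import com.hankcs.hanlp.corpus.dictionary.TFDictionary;

public class TermCountDemo {
    public static void main(String[] args) {
        TFDictionary tfDictionary = new TFDictionary();
        // add() bumps the frequency of a term, inserting it the first time
        for (String term : new String[]{"北京/ns", "上海/ns", "北京/ns"}) {
            tfDictionary.add(term);
        }
        // Write the counted terms out as a text dictionary, as in testPlay above
        tfDictionary.saveTxtTo("data/test/ns_demo.txt");
    }
}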
Use of com.hankcs.hanlp.corpus.dictionary.TFDictionary in project HanLP by hankcs.
The class TestAdjustCoreDictionary, method testViewNGramDictionary:
public void testViewNGramDictionary() throws Exception {
    TFDictionary tfDictionary = new TFDictionary();
    tfDictionary.load("data/dictionary/CoreNatureDictionary.ngram.txt");
    for (Map.Entry<String, TermFrequency> entry : tfDictionary.entrySet()) {
        String word = entry.getKey();
        TermFrequency frequency = entry.getValue();
        if (word.contains("##")) {
            System.out.println(frequency);
        }
    }
}
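entrySet exposes the dictionary as Map.Entry<String, TermFrequency> pairs, so it can be scanned like any map. A small sketch that counts the placeholder-tagged n-grams instead of printing them (the TermFrequency import path is assumed to be HanLP's corpus.occurrence package):

import java.util.Map;
import com.hankcs.hanlp.corpus.dictionary.TFDictionary;
import com.hankcs.hanlp.corpus.occurrence.TermFrequency;

public class NGramScanDemo {
    public static void main(String[] args) {
        TFDictionary tfDictionary = new TFDictionary();
        tfDictionary.load("data/dictionary/CoreNatureDictionary.ngram.txt");
        int tagged = 0;
        // Keys containing "##" carry placeholder tags such as 未##人 (person name)
        for (Map.Entry<String, TermFrequency> entry : tfDictionary.entrySet()) {
            if (entry.getKey().contains("##")) {
                tagged++;
            }
        }
        System.out.println(tagged + " placeholder n-grams");
    }
}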