use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.
the class TestDictionaryMaker method testLoadItemList.
/**
 * Loads the 2014 dictionary as a list of items, sums the frequency of every label
 * across all items, then prints each distinct label (in sorted key order) followed
 * by the number of distinct labels.
 *
 * @throws Exception if loading the dictionary fails
 */
public void testLoadItemList() throws Exception {
    Map<String, Integer> totals = new TreeMap<String, Integer>();
    List<Item> items = DictionaryMaker.loadAsItemList("data/2014_dictionary.txt");
    for (Item current : items) {
        for (Map.Entry<String, Integer> labelEntry : current.labelMap.entrySet()) {
            String label = labelEntry.getKey();
            Integer seen = totals.get(label);
            // A label that has not been seen yet starts from zero
            int previous = (seen == null) ? 0 : seen;
            totals.put(label, previous + labelEntry.getValue());
        }
    }
    for (Map.Entry<String, Integer> total : totals.entrySet()) {
        System.out.println(total.getKey());
    }
    System.out.println(totals.size());
}
use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.
the class TestAdjustCoreDictionary method testRemoveNumber.
/**
 * Reloads the core nature dictionary and saves it back to the same path, leaving out
 * every entry whose key is a single digit or Chinese numeral character. Each entry
 * that is left out is printed.
 *
 * @throws Exception if loading or saving the dictionary fails
 */
public void testRemoveNumber() throws Exception {
    // Some Chinese numeral characters serve no purpose in the dictionary, so strip them out
    DictionaryMaker dictionaryMaker = DictionaryMaker.load(DATA_DICTIONARY_CORE_NATURE_DICTIONARY_TXT);
    final String numerals = "0123456789零○〇一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟";
    DictionaryMaker.Filter dropNumerals = new DictionaryMaker.Filter() {
        @Override
        public boolean onSave(Item item) {
            boolean isSingleNumeral = item.key.length() == 1
                    && numerals.indexOf(item.key.charAt(0)) >= 0;
            if (!isSingleNumeral) {
                return true;
            }
            System.out.println(item);
            return false;
        }
    };
    dictionaryMaker.saveTxtTo(DATA_DICTIONARY_CORE_NATURE_DICTIONARY_TXT, dropNumerals);
}
use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.
the class TestAdjustCoreDictionary method testGetCompiledWordFromDictionary.
/**
 * Loads the test core nature dictionary and prints every item whose word consists of
 * exactly four characters with {@code ##} in the middle (regex {@code .##.}).
 *
 * @throws Exception if loading the dictionary fails
 */
public void testGetCompiledWordFromDictionary() throws Exception {
    DictionaryMaker dictionaryMaker = DictionaryMaker.load("data/test/CoreNatureDictionary.txt");
    // Compile the regex once; String.matches would recompile it for every entry
    java.util.regex.Pattern compiledWordPattern = java.util.regex.Pattern.compile(".##.");
    for (Map.Entry<String, Item> entry : dictionaryMaker.entrySet()) {
        String word = entry.getKey();
        if (compiledWordPattern.matcher(word).matches()) {
            System.out.println(entry.getValue());
        }
    }
}
use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.
the class TestXianDaiHanYu method testMakeNatureDictionary.
/**
 * Builds a supplementary dictionary from the full text of a Chinese dictionary file.
 *
 * <p>The text is scanned for entries made of a headword of 2 to 10 CJK characters inside
 * lenticular brackets, up to five arbitrary characters, a run of pinyin letters, and the
 * remainder of the line. Headwords already present in the core or custom dictionary are
 * skipped. For every other headword, each Chinese part-of-speech marker found in the
 * remainder contributes a label with the value returned by {@code TextUtility.count}
 * (presumably the number of occurrences - confirm against TextUtility); a headword with
 * no marker at all gets the {@code nz} label. The result is written to a text file.
 *
 * @throws Exception if reading the source text or saving the dictionary fails
 */
public void testMakeNatureDictionary() throws Exception {
    // Locale.ROOT keeps the lower-casing independent of the JVM's default locale,
    // so the ASCII pinyin letters in the pattern below match on every machine
    String text = IOUtil.readTxt("D:\\Doc\\语料库\\现代汉语词典(第五版)全文_更新.txt").toLowerCase(java.util.Locale.ROOT);
    // group(1) = headword, group(2) = pinyin, group(3) = rest of the entry line
    Pattern pattern = Pattern.compile("【([\\u4E00-\\u9FA5]{2,10})】.{0,5}([abcdefghijklmnopqrstuwxyzāáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ∥•’-]+)(.*)");
    Matcher matcher = pattern.matcher(text);
    DictionaryMaker dictionaryMaker = new DictionaryMaker();
    dictionaryMaker.add("希望 v 7685 vn 616");
    // Chinese part-of-speech marker -> nature tag name
    Map<String, String> mapChineseToNature = new TreeMap<String, String>();
    mapChineseToNature.put("名", Nature.n.toString());
    mapChineseToNature.put("动", Nature.v.toString());
    mapChineseToNature.put("形", Nature.a.toString());
    mapChineseToNature.put("副", Nature.d.toString());
    mapChineseToNature.put("形容", Nature.a.toString());
    while (matcher.find()) {
        String word = matcher.group(1);
        // Only words missing from both existing dictionaries are of interest
        if (CoreDictionary.contains(word) || CustomDictionary.contains(word)) {
            continue;
        }
        String content = matcher.group(3);
        Item item = new Item(word);
        for (Map.Entry<String, String> entry : mapChineseToNature.entrySet()) {
            int frequency = TextUtility.count(entry.getKey(), content);
            if (frequency > 0) {
                item.addLabel(entry.getValue(), frequency);
            }
        }
        // No part-of-speech marker found: fall back to the nz label
        if (item.getTotalFrequency() == 0) {
            item.addLabel(Nature.nz.toString());
        }
        dictionaryMaker.add(item);
    }
    dictionaryMaker.saveTxtTo("data/dictionary/custom/现代汉语补充词库.txt");
}
use of com.hankcs.hanlp.corpus.dictionary.item.Item in project HanLP by hankcs.
the class TestCustomDictionary method testRemoveNotNS.
/**
 * data/dictionary/custom/全国地名大全.txt contains many person names; remove them.
 *
 * <p>An entry is kept when its last character is one of {@code Predefine.POSTFIX_SINGLE}.
 * Otherwise the key is segmented with the custom dictionary disabled, and the entry is
 * printed and dropped when the result is a single term tagged {@code Nature.nr}.
 *
 * @throws Exception if loading or saving the dictionary fails
 */
public void testRemoveNotNS() throws Exception {
    final Set<Character> suffixSet = new TreeSet<Character>();
    char[] suffixChars = Predefine.POSTFIX_SINGLE.toCharArray();
    for (int i = 0; i < suffixChars.length; i++) {
        suffixSet.add(suffixChars[i]);
    }
    String path = "data/dictionary/custom/全国地名大全.txt";
    DictionaryMaker.Filter dropPersonNames = new DictionaryMaker.Filter() {
        Segment segment = HanLP.newSegment().enableCustomDictionary(false);

        @Override
        public boolean onSave(Item item) {
            char lastChar = item.key.charAt(item.key.length() - 1);
            if (!suffixSet.contains(lastChar)) {
                List<Term> terms = segment.seg(item.key);
                boolean singlePersonName = terms.size() == 1 && terms.get(0).nature == Nature.nr;
                if (singlePersonName) {
                    System.out.println(item);
                    return false;
                }
            }
            return true;
        }
    };
    DictionaryMaker.load(path).saveTxtTo(path, dropPersonNames);
}
Aggregations