Usage of com.hankcs.hanlp.corpus.dictionary.item.Item in the HanLP project by hankcs.
From class TestXianDaiHanYu, method testMakeNatureDictionary:
/**
 * Builds a supplementary custom dictionary from the full text of a Modern Chinese
 * dictionary.
 *
 * <p>Each headword matched by the entry pattern that is not already contained in
 * {@code CoreDictionary} or {@code CustomDictionary} becomes an {@code Item}. Its labels
 * are derived from how often each Chinese part-of-speech marker occurs in the remainder
 * of the entry line; a word with no marker at all is labelled {@code Nature.nz}. The
 * collected items are saved to {@code data/dictionary/custom/现代汉语补充词库.txt}.
 *
 * @throws Exception any failure raised by the helpers called here is propagated
 */
public void testMakeNatureDictionary() throws Exception {
    // Lower-case with a fixed locale. The default-locale overload maps 'I' to the dotless
    // 'ı' under Turkish/Azeri locales, which the ASCII letter class in the pattern below
    // would then fail to match.
    String text = IOUtil.readTxt("D:\\Doc\\语料库\\现代汉语词典(第五版)全文_更新.txt").toLowerCase(java.util.Locale.ROOT);
    // Sample of the input format (can be swapped in to debug without the corpus file):
    // String text = "【岸标】ànbiāo名设在岸上指示航行的标志,可以使船舶避开沙滩、暗礁等。\n" +
    // "\n" +
    // "【岸炮】ànpào名海岸炮的简称。\n" +
    // "\n" +
    // "【岸然】ànrán〈书〉形严肃的样子:道貌~。\n" +
    // "\n" +
    // "【按】1àn①动用手或指头压:~电铃|~图钉。②动压住;搁下:~兵不动|~下此事不说。③动抑制:~不住心头怒火。④介依照:~时|~质论价|~制度办事|~每人两本计算。\n" +
    // "另见237页cuō。\n" +
    // "现用替代字【錣】* 原图片字[钅+叕]\n" +
    // "现用替代字【騣】* 原图片字[马+㚇]\n" +
    // "现用替代字【緅】* 原图片字[纟+取]";

    // group(1): headword of 2-10 CJK characters inside 【】
    // then up to 5 arbitrary characters (e.g. a homograph number)
    // group(2): a run of Latin letters / tone-marked vowels (the romanised reading)
    // group(3): the rest of the line, i.e. the definition text
    Pattern pattern = Pattern.compile("【([\\u4E00-\\u9FA5]{2,10})】.{0,5}([abcdefghijklmnopqrstuwxyzāáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ∥•’-]+)(.*)");
    Matcher matcher = pattern.matcher(text);

    DictionaryMaker dictionaryMaker = new DictionaryMaker();
    dictionaryMaker.add("希望 v 7685 vn 616");

    // Chinese part-of-speech marker -> Nature label.
    // NOTE(review): "形" is a substring of "形容", so if TextUtility.count counts substring
    // occurrences, every "形容" is also counted under "形" (both map to Nature.a) - confirm
    // whether this double counting is intended.
    Map<String, String> mapChineseToNature = new TreeMap<String, String>();
    mapChineseToNature.put("名", Nature.n.toString());
    mapChineseToNature.put("动", Nature.v.toString());
    mapChineseToNature.put("形", Nature.a.toString());
    mapChineseToNature.put("副", Nature.d.toString());
    mapChineseToNature.put("形容", Nature.a.toString());

    while (matcher.find()) {
        String word = matcher.group(1);
        // Only words missing from both existing dictionaries are collected.
        if (CoreDictionary.contains(word) || CustomDictionary.contains(word)) {
            continue;
        }
        String content = matcher.group(3);
        Item item = new Item(word);
        for (Map.Entry<String, String> entry : mapChineseToNature.entrySet()) {
            int frequency = TextUtility.count(entry.getKey(), content);
            if (frequency > 0) {
                item.addLabel(entry.getValue(), frequency);
            }
        }
        // No marker found in the definition: fall back to Nature.nz.
        if (item.getTotalFrequency() == 0) {
            item.addLabel(Nature.nz.toString());
        }
        dictionaryMaker.add(item);
    }
    dictionaryMaker.saveTxtTo("data/dictionary/custom/现代汉语补充词库.txt");
}
Usage of com.hankcs.hanlp.corpus.dictionary.item.Item in the HanLP project by hankcs.
From class TestCustomDictionary, method testRemoveNotNS:
/**
 * data/dictionary/custom/全国地名大全.txt contains many person names; this rewrites the
 * file in place without them.
 *
 * <p>An entry is always kept when its last character is one of the characters in
 * {@code Predefine.POSTFIX_SINGLE}. Otherwise it is segmented (custom dictionary
 * disabled) and dropped - after being printed - when the result is a single term whose
 * nature is {@code Nature.nr}.
 *
 * @throws Exception any failure raised by the helpers called here is propagated
 */
public void testRemoveNotNS() throws Exception {
    String path = "data/dictionary/custom/全国地名大全.txt";

    // Characters that, when they end a key, mark the entry as one to keep unconditionally.
    final Set<Character> keepSuffixes = new TreeSet<Character>();
    char[] suffixChars = Predefine.POSTFIX_SINGLE.toCharArray();
    for (int i = 0; i < suffixChars.length; i++) {
        keepSuffixes.add(suffixChars[i]);
    }

    DictionaryMaker maker = DictionaryMaker.load(path);
    DictionaryMaker.Filter dropPersonNames = new DictionaryMaker.Filter() {
        // Segmenter with the custom dictionary turned off.
        Segment segmenter = HanLP.newSegment().enableCustomDictionary(false);

        @Override
        public boolean onSave(Item item) {
            char lastChar = item.key.charAt(item.key.length() - 1);
            if (!keepSuffixes.contains(lastChar)) {
                List<Term> terms = segmenter.seg(item.key);
                boolean singlePersonName = terms.size() == 1 && terms.get(0).nature == Nature.nr;
                if (singlePersonName) {
                    // Report the entry that is being removed.
                    System.out.println(item);
                    return false;
                }
            }
            return true;
        }
    };
    maker.saveTxtTo(path, dropPersonNames);
}
Usage of com.hankcs.hanlp.corpus.dictionary.item.Item in the HanLP project by hankcs.
From class TestDependencyCorpus, method testPosTag:
/**
 * Fine-grained to coarse-grained tags.
 *
 * <p>Loads the CoNLL sentences from the dev corpus, records one {@code Item} per word
 * keyed by the word's {@code POSTAG} and labelled with its {@code CPOSTAG}, and prints
 * the resulting entry set.
 *
 * @throws Exception any failure raised by the helpers called here is propagated
 */
public void testPosTag() throws Exception {
    DictionaryMaker tagMapping = new DictionaryMaker();
    LinkedList<CoNLLSentence> sentences =
            CoNLLLoader.loadSentenceList("D:\\Doc\\语料库\\依存分析训练数据\\THU\\dev.conll.fixed.txt");
    for (CoNLLSentence sentence : sentences) {
        for (CoNLLWord token : sentence.word) {
            // Key: POSTAG, label: CPOSTAG.
            Item tagItem = new Item(token.POSTAG, token.CPOSTAG);
            tagMapping.add(tagItem);
        }
    }
    System.out.println(tagMapping.entrySet());
}
Usage of com.hankcs.hanlp.corpus.dictionary.item.Item in the HanLP project by hankcs.
From class TestNTRecognition, method testRemoveP:
/**
 * Removes the "P" label from entries of the organization dictionary and saves the
 * dictionary back to the same path.
 *
 * <p>The label is removed only for words that have an attribute according to
 * {@code LexiconUtility.getAttribute}, whose item carries the label "P", and whose
 * attribute reports a nature starting with "u". Each affected item is printed together
 * with its attribute before the label is removed.
 *
 * @throws Exception any failure raised by the helpers called here is propagated
 */
public void testRemoveP() throws Exception {
    DictionaryMaker maker = DictionaryMaker.load(HanLP.Config.OrganizationDictionaryPath);
    for (Map.Entry<String, Item> e : maker.entrySet()) {
        Item orgItem = e.getValue();
        CoreDictionary.Attribute coreAttribute = LexiconUtility.getAttribute(e.getKey());
        if (coreAttribute != null) {
            // Skip unless the item has label "P" AND the attribute has a nature starting with "u".
            if (!orgItem.containsLabel("P") || !coreAttribute.hasNatureStartsWith("u")) {
                continue;
            }
            System.out.println(orgItem + "\t" + coreAttribute);
            orgItem.removeLabel("P");
        }
    }
    maker.saveTxtTo(HanLP.Config.OrganizationDictionaryPath);
}
Aggregations