Use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.
From the class NRCorpusLoader, method combine.
/**
 * Merges the dictionary at {@code HanLP.Config.CoreDictionaryPath} with
 * {@code "XXXDictionary.txt"} through {@link DictionaryMaker#combine} and writes the
 * merged result back to the core dictionary path as text.
 *
 * NOTE(review): "XXXDictionary.txt" looks like a placeholder file name; confirm the intended file.
 */
public static void combine() {
    DictionaryMaker
            .combine(HanLP.Config.CoreDictionaryPath, "XXXDictionary.txt")
            .saveTxtTo(HanLP.Config.CoreDictionaryPath);
}
Use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.
From the class NameDictionaryMaker, method create.
/**
 * Builds a {@link DictionaryMaker} from a UTF-8 text file that is read one entry per line.
 *
 * <p>A line is skipped when it contains punctuation, one of the symbols listed in the
 * filter pattern, whitespace, or an ASCII letter or digit. Of the remaining lines only
 * those of length 2 or 3 are used, and only when their first character is accepted by
 * {@code FamilyName.contains}. Each character of an accepted line is added as its own
 * {@link Word}: length-2 lines are labelled {@code NR.B}, {@code NR.E}; length-3 lines
 * are labelled {@code NR.B}, {@code NR.C}, {@code NR.D}.
 *
 * <p>The reader is always closed, including when reading fails part-way through.
 *
 * @param path path of the text file to read
 * @return the populated maker, or {@code null} if the file could not be read
 */
public static DictionaryMaker create(String path) {
    DictionaryMaker dictionaryMaker = new DictionaryMaker();
    BufferedReader br = null;
    try {
        br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        String line;
        while ((line = br.readLine()) != null) {
            // Reject lines containing punctuation, symbols, whitespace, ASCII letters or digits.
            if (line.matches(".*[\\p{P}+~$`^=|<>~`$^+=|<>¥×|\\s|a-z0-9A-Z]+.*"))
                continue;
            // Only two- and three-character names are loaded; other lengths are dropped.
            switch (line.length()) {
                case 2: {
                    Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                    if (!FamilyName.contains(wordB.value))
                        break;
                    Word wordE = new Word(line.substring(1), NR.E.toString());
                    dictionaryMaker.add(wordB);
                    dictionaryMaker.add(wordE);
                    break;
                }
                case 3: {
                    Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                    if (!FamilyName.contains(wordB.value))
                        break;
                    Word wordC = new Word(line.substring(1, 2), NR.C.toString());
                    Word wordD = new Word(line.substring(2, 3), NR.D.toString());
                    dictionaryMaker.add(wordB);
                    dictionaryMaker.add(wordC);
                    dictionaryMaker.add(wordD);
                    break;
                }
                default:
                    break;
            }
        }
        logger.info(dictionaryMaker.toString());
    } catch (Exception e) {
        // Keep the null-on-failure contract, but surface the cause in the log.
        logger.warning("读取" + path + "发生错误: " + e);
        return null;
    } finally {
        if (br != null) {
            try {
                br.close();
            } catch (Exception ignored) {
                // The stream was only read from; a failed close cannot lose data.
            }
        }
    }
    return dictionaryMaker;
}
Use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.
From the class TestCorpusLoader, method testMakeOrganizationCustomDictionary.
/**
 * Walks the corpus under the hard-coded directory, collects every word whose label
 * starts with {@code "nt"} into a {@link DictionaryMaker}, and saves the result as a
 * text dictionary.
 *
 * @throws Exception if walking the corpus or saving the dictionary fails
 */
public void testMakeOrganizationCustomDictionary() throws Exception {
    final DictionaryMaker maker = new DictionaryMaker();
    // Handler invoked for each document found by the corpus walk.
    CorpusLoader.Handler collector = new CorpusLoader.Handler() {
        @Override
        public void handle(Document document) {
            for (List<IWord> sentence : document.getComplexSentenceList()) {
                for (IWord candidate : sentence) {
                    if (!candidate.getLabel().startsWith("nt")) {
                        continue;
                    }
                    maker.add(candidate);
                }
            }
        }
    };
    CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", collector);
    maker.saveTxtTo("data/dictionary/custom/机构名词典.txt");
}
Use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.
From the class TestCustomDictionary, method testRemoveJunkWord.
/**
 * Loads the custom dictionary and rewrites it in place, dropping every item that
 * carries the label {@code "mq"}, {@code "m"} or {@code "t"}.
 *
 * @throws Exception if loading or saving the dictionary fails
 */
public void testRemoveJunkWord() throws Exception {
    DictionaryMaker.Filter keepNonJunk = new DictionaryMaker.Filter() {
        @Override
        public boolean onSave(Item item) {
            // Keep the item only when none of the unwanted labels is present.
            return !item.containsLabel("mq")
                    && !item.containsLabel("m")
                    && !item.containsLabel("t");
        }
    };
    DictionaryMaker loaded = DictionaryMaker.load("data/dictionary/custom/CustomDictionary.txt");
    loaded.saveTxtTo("data/dictionary/custom/CustomDictionary.txt", keepNonJunk);
}
Use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.
From the class TestAdjustCoreDictionary, method testRemoveNumber.
/**
 * Loads the dictionary at {@code DATA_DICTIONARY_CORE_NATURE_DICTIONARY_TXT} and rewrites
 * it in place without single-character entries that are digits or Chinese numerals.
 * Each removed item is printed to standard output.
 *
 * @throws Exception if loading or saving the dictionary fails
 */
public void testRemoveNumber() throws Exception {
    // Some Chinese numeral characters are not worth keeping, so they are removed.
    DictionaryMaker.Filter dropNumerals = new DictionaryMaker.Filter() {
        @Override
        public boolean onSave(Item item) {
            if (item.key.length() != 1) {
                return true;
            }
            char only = item.key.charAt(0);
            if ("0123456789零○〇一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟".indexOf(only) < 0) {
                return true;
            }
            System.out.println(item);
            return false;
        }
    };
    DictionaryMaker loaded = DictionaryMaker.load(DATA_DICTIONARY_CORE_NATURE_DICTIONARY_TXT);
    loaded.saveTxtTo(DATA_DICTIONARY_CORE_NATURE_DICTIONARY_TXT, dropNumerals);
}
Aggregations