use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.
the class NRCorpusLoader method combine.
/**
 * Merges the core dictionary with a second dictionary file and writes the merged
 * result back over the core dictionary.
 *
 * <p>The inputs are {@code HanLP.Config.CoreDictionaryPath} and the literal
 * {@code "XXXDictionary.txt"}; the output path is {@code HanLP.Config.CoreDictionaryPath}.
 * NOTE(review): "XXXDictionary.txt" looks like a placeholder file name - confirm before running.
 */
public static void combine() {
    DictionaryMaker
            .combine(HanLP.Config.CoreDictionaryPath, "XXXDictionary.txt")
            .saveTxtTo(HanLP.Config.CoreDictionaryPath);
}
use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.
the class NameDictionaryMaker method create.
/**
 * Builds a {@link DictionaryMaker} from a name list stored one name per line in a
 * UTF-8 text file.
 *
 * <p>A line is ignored when it contains punctuation, whitespace, ASCII letters or digits,
 * or one of the listed symbols. Of the remaining lines only those of length 2 or 3 are
 * used, and only when their first character is contained in {@code FamilyName}:
 * <ul>
 *   <li>length 2: first character is added with tag {@code NR.B}, second with {@code NR.E};</li>
 *   <li>length 3: the characters are added with tags {@code NR.B}, {@code NR.C}, {@code NR.D}.</li>
 * </ul>
 *
 * @param path path of the name list file
 * @return the populated dictionary maker, or {@code null} if any exception occurs
 */
public static DictionaryMaker create(String path) {
    DictionaryMaker dictionaryMaker = new DictionaryMaker();
    try {
        // Compiled once instead of once per line (String.matches recompiles on every call).
        java.util.regex.Pattern rejected =
                java.util.regex.Pattern.compile(".*[\\p{P}+~$`^=|<>~`$^+=|<>¥×|\\s|a-z0-9A-Z]+.*");
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (rejected.matcher(line).matches()) {
                    continue;
                }
                // Only two- and three-character names are loaded.
                switch (line.length()) {
                    case 2: {
                        Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                        if (!FamilyName.contains(wordB.value)) {
                            break;
                        }
                        Word wordE = new Word(line.substring(1), NR.E.toString());
                        dictionaryMaker.add(wordB);
                        dictionaryMaker.add(wordE);
                        break;
                    }
                    case 3: {
                        Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                        if (!FamilyName.contains(wordB.value)) {
                            break;
                        }
                        Word wordC = new Word(line.substring(1, 2), NR.C.toString());
                        Word wordD = new Word(line.substring(2, 3), NR.D.toString());
                        dictionaryMaker.add(wordB);
                        dictionaryMaker.add(wordC);
                        dictionaryMaker.add(wordD);
                        break;
                    }
                    default:
                        // Names of any other length are skipped.
                        break;
                }
            }
        } finally {
            // Always release the file handle, even when reading or parsing fails.
            br.close();
        }
        logger.info(dictionaryMaker.toString());
    } catch (Exception e) {
        // Include the cause so the failure can be diagnosed from the log.
        logger.warning("读取" + path + "发生错误: " + e);
        return null;
    }
    return dictionaryMaker;
}
use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.
the class WordNatureWeightModelMaker method makeModel.
/**
 * Builds a dependency pair model from a CoNLL corpus and saves it as text.
 *
 * <p>For every word of every sentence four pairs are registered through {@code addPair},
 * each together with the word's {@code DEPREL}: (word name, head name), (word name, wrapped head tag),
 * (wrapped word tag, head name) and (wrapped word tag, wrapped head tag). Every distinct
 * {@code POSTAG} is also collected, and one {@code case "<tag>":} line per tag (in sorted
 * order) is written to {@code data/model/dependency/pos-thu.txt}.
 *
 * @param corpusLoadPath path of the CoNLL corpus passed to {@code CoNLLLoader.loadSentenceList}
 * @param modelSavePath  path the dictionary is saved to
 * @return the result of {@code DictionaryMaker.saveTxtTo(modelSavePath)}
 */
public static boolean makeModel(String corpusLoadPath, String modelSavePath) {
    DictionaryMaker dictionaryMaker = new DictionaryMaker();
    Set<String> posSet = new TreeSet<String>();

    // The corpus is loaded and traversed twice, so every pair is added twice.
    // NOTE(review): the two passes are identical, which looks like a copy-paste
    // duplication - confirm whether the doubled counts are intended before removing one.
    for (int pass = 0; pass < 2; ++pass) {
        for (CoNLLSentence sentence : CoNLLLoader.loadSentenceList(corpusLoadPath)) {
            for (CoNLLWord word : sentence.word) {
                addPair(word.NAME, word.HEAD.NAME, word.DEPREL, dictionaryMaker);
                addPair(word.NAME, wrapTag(word.HEAD.POSTAG), word.DEPREL, dictionaryMaker);
                addPair(wrapTag(word.POSTAG), word.HEAD.NAME, word.DEPREL, dictionaryMaker);
                addPair(wrapTag(word.POSTAG), wrapTag(word.HEAD.POSTAG), word.DEPREL, dictionaryMaker);
                posSet.add(word.POSTAG);
            }
        }
    }

    // Emit one `case "<tag>":` line per collected tag.
    StringBuilder caseLines = new StringBuilder();
    for (String tag : posSet) {
        caseLines.append("case \"").append(tag).append("\":\n");
    }
    IOUtil.saveTxt("data/model/dependency/pos-thu.txt", caseLines.toString());

    return dictionaryMaker.saveTxtTo(modelSavePath);
}
use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.
the class NRCorpusLoader method load.
/**
 * Reads a name list (one name per line, UTF-8) and saves the resulting per-character
 * dictionary to {@code data/dictionary/person/name.txt}.
 *
 * <p>A line is ignored when it contains punctuation, whitespace, ASCII letters or digits,
 * or one of the listed symbols. Of the remaining lines only those of length 2 or 3 are used:
 * <ul>
 *   <li>length 2: first character is added with tag {@code NR.B}, second with {@code NR.E};</li>
 *   <li>length 3: the characters are added with tags {@code NR.B}, {@code NR.C}, {@code NR.D}.</li>
 * </ul>
 *
 * @param path path of the name list file
 * @return {@code true} when no exception occurred, {@code false} otherwise
 */
public static boolean load(String path) {
    try {
        DictionaryMaker dictionaryMaker = new DictionaryMaker();
        // Compiled once instead of once per line (String.matches recompiles on every call).
        java.util.regex.Pattern rejected =
                java.util.regex.Pattern.compile(".*[\\p{P}+~$`^=|<>~`$^+=|<>¥×|\\s|a-z0-9A-Z]+.*");
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (rejected.matcher(line).matches()) {
                    continue;
                }
                // Only two- and three-character names are loaded.
                switch (line.length()) {
                    case 2: {
                        Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                        Word wordE = new Word(line.substring(1), NR.E.toString());
                        dictionaryMaker.add(wordB);
                        dictionaryMaker.add(wordE);
                        break;
                    }
                    case 3: {
                        Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                        Word wordC = new Word(line.substring(1, 2), NR.C.toString());
                        Word wordD = new Word(line.substring(2, 3), NR.D.toString());
                        dictionaryMaker.add(wordB);
                        dictionaryMaker.add(wordC);
                        dictionaryMaker.add(wordD);
                        break;
                    }
                    default:
                        // Names of any other length are skipped.
                        break;
                }
            }
        } finally {
            // Always release the file handle, even when reading or parsing fails.
            br.close();
        }
        logger.info(dictionaryMaker.toString());
        // NOTE(review): this filter answers false for every item and the boolean result of
        // saveTxtTo is ignored; what a false answer means is defined by DictionaryMaker.Filter -
        // confirm both are intended.
        dictionaryMaker.saveTxtTo("data/dictionary/person/name.txt", new DictionaryMaker.Filter() {
            @Override
            public boolean onSave(Item item) {
                return false;
            }
        });
    } catch (Exception e) {
        // Include the cause so the failure can be diagnosed from the log.
        logger.warning("读取" + path + "发生错误: " + e);
        return false;
    }
    return true;
}
use of com.hankcs.hanlp.corpus.dictionary.DictionaryMaker in project HanLP by hankcs.
the class TestXianDaiHanYu method testMakeNatureDictionary.
/**
 * Extracts headwords from a dictionary text dump and saves those that are in neither
 * {@code CoreDictionary} nor {@code CustomDictionary} as a supplementary word list.
 *
 * <p>The whole text is lower-cased and scanned for entries of the form
 * {@code 【headword】 pinyin rest-of-line}, for example
 * {@code 【岸标】ànbiāo名设在岸上指示航行的标志...}. For each new headword the
 * rest of the line is searched for the part-of-speech markers in the table below; each
 * marker that occurs contributes its mapped nature with the occurrence count as frequency.
 * When nothing is found the headword is labelled {@code Nature.nz}.
 *
 * @throws Exception declared by the test signature
 */
public void testMakeNatureDictionary() throws Exception {
    String text = IOUtil.readTxt("D:\\Doc\\语料库\\现代汉语词典(第五版)全文_更新.txt").toLowerCase();
    // Group 1: headword (2-10 CJK characters); group 2: pinyin; group 3: remainder of the line.
    Matcher matcher = Pattern
            .compile("【([\\u4E00-\\u9FA5]{2,10})】.{0,5}([abcdefghijklmnopqrstuwxyzāáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ∥•’-]+)(.*)")
            .matcher(text);

    DictionaryMaker dictionaryMaker = new DictionaryMaker();
    dictionaryMaker.add("希望 v 7685 vn 616");

    // Part-of-speech marker as it appears in the text -> nature label.
    Map<String, String> mapChineseToNature = new TreeMap<String, String>();
    mapChineseToNature.put("名", Nature.n.toString());
    mapChineseToNature.put("动", Nature.v.toString());
    mapChineseToNature.put("形", Nature.a.toString());
    mapChineseToNature.put("副", Nature.d.toString());
    mapChineseToNature.put("形容", Nature.a.toString());

    while (matcher.find()) {
        String headword = matcher.group(1);
        // Only words that neither dictionary already knows are collected.
        if (!CoreDictionary.contains(headword) && !CustomDictionary.contains(headword)) {
            String definition = matcher.group(3);
            Item entry = new Item(headword);
            for (String marker : mapChineseToNature.keySet()) {
                int occurrences = TextUtility.count(marker, definition);
                if (occurrences <= 0) {
                    continue;
                }
                entry.addLabel(mapChineseToNature.get(marker), occurrences);
            }
            // Fall back to nz when no marker was found.
            if (entry.getTotalFrequency() == 0) {
                entry.addLabel(Nature.nz.toString());
            }
            dictionaryMaker.add(entry);
        }
    }

    dictionaryMaker.saveTxtTo("data/dictionary/custom/现代汉语补充词库.txt");
}
Aggregations