Use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.
Class NTDictionaryMaker, method roleTag.
/**
 * Relabels, in place, every word of every sentence with an {@code NT} role string.
 *
 * <p>For each sentence the method:
 * <ol>
 *   <li>runs {@code Precompiler.compileWithoutNT} on it;</li>
 *   <li>prepends a {@code TAG_BIGIN/"S"} word and appends a {@code TAG_END/"Z"} word;</li>
 *   <li>labels the non-"nt" word directly before an "nt" word with {@code NT.A};</li>
 *   <li>labels the non-"nt" word directly after an "nt" word with {@code NT.B};</li>
 *   <li>labels a non-"nt" word sitting between two "nt" words with {@code NT.X};</li>
 *   <li>walks the list once more: labels that are already upper case are left alone,
 *       remaining non-"nt" words become {@code NT.Z}, simple "nt" words become {@code NT.K},
 *       and compound "nt" words are replaced by their inner words, each given a role based on
 *       its original label; the concatenated inner roles are added to {@code tfDictionary};</li>
 *   <li>labels the first word {@code NT.S}.</li>
 * </ol>
 *
 * <p>Each sentence is cast to {@link LinkedList}, so callers must supply linked lists.
 *
 * @param sentenceList sentences to relabel; every list is modified in place
 */
@Override
protected void roleTag(List<List<IWord>> sentenceList) {
    int sentenceNo = 0;
    for (List<IWord> sentence : sentenceList) {
        Precompiler.compileWithoutNT(sentence);
        if (verbose) {
            System.out.print(++sentenceNo + " / " + sentenceList.size() + " ");
            System.out.println("原始语料 " + sentence);
        }

        // Surround the sentence with begin / end sentinels.
        LinkedList<IWord> words = (LinkedList<IWord>) sentence;
        words.addFirst(new Word(Predefine.TAG_BIGIN, "S"));
        words.addLast(new Word(Predefine.TAG_END, "Z"));
        printStage("添加首尾 ", sentence);

        // Tag the preceding context: forward walk, the word before an "nt" word gets NT.A.
        tagNtNeighbour(words.iterator(), NT.A);
        printStage("标注上文 ", sentence);

        // Tag the following context: backward walk, the word after an "nt" word gets NT.B.
        tagNtNeighbour(words.descendingIterator(), NT.B);
        printStage("标注下文 ", sentence);

        // Tag the middle: a non-"nt" word flanked by "nt" words on both sides gets NT.X.
        Iterator<IWord> triple = words.iterator();
        IWord left = triple.next();
        IWord middle = triple.next();
        while (triple.hasNext()) {
            IWord right = triple.next();
            if (left.getLabel().startsWith("nt")
                    && right.getLabel().startsWith("nt")
                    && !middle.getLabel().startsWith("nt")) {
                middle.setLabel(NT.X.toString());
            }
            left = middle;
            middle = right;
        }
        printStage("标注中间 ", sentence);

        // Process the whole sentence.
        ListIterator<IWord> cursor = words.listIterator();
        while (cursor.hasNext()) {
            IWord word = cursor.next();
            String label = word.getLabel();
            if (label.equals(label.toUpperCase())) {
                // Already carries an upper-case label; leave it untouched.
                continue;
            }
            if (!label.startsWith("nt")) {
                word.setLabel(NT.Z.toString());
                continue;
            }
            if (!(word instanceof CompoundWord)) {
                word.setLabel(NT.K.toString());
                continue;
            }

            // Complex organization: swap the compound for its relabelled inner words.
            cursor.remove();
            StringBuilder pattern = new StringBuilder();
            Word tail = null;
            for (Word inner : ((CompoundWord) word).innerList) {
                tail = inner;
                assignInnerRole(inner);
                cursor.add(inner);
                pattern.append(inner.label);
            }
            if (tail != null) {
                // The final inner word is always forced to NT.D, in the word and in the pattern.
                tail.label = NT.D.toString();
                pattern.deleteCharAt(pattern.length() - 1);
                pattern.append(tail.label);
                tfDictionary.add(pattern.toString());
            }
        }
        printStage("处理整个 ", sentence);
        words.getFirst().setLabel(NT.S.toString());
    }
}

/** Prints {@code stage} followed by the sentence, but only when {@code verbose} is set. */
private void printStage(String stage, List<IWord> words) {
    if (verbose) {
        System.out.println(stage + words);
    }
}

/**
 * Walks {@code walker} pairwise; whenever the current word's label starts with "nt" and the
 * previously visited word's label does not, the previously visited word is labelled {@code role}.
 */
private static void tagNtNeighbour(Iterator<IWord> walker, NT role) {
    IWord neighbour = walker.next();
    while (walker.hasNext()) {
        IWord candidate = walker.next();
        if (candidate.getLabel().startsWith("nt") && !neighbour.getLabel().startsWith("nt")) {
            neighbour.setLabel(role.toString());
        }
        neighbour = candidate;
    }
}

/**
 * Replaces the label (and for some categories the value) of one inner word of a compound
 * "nt" word, choosing the {@code NT} role from the word's original label.
 */
private static void assignInnerRole(Word inner) {
    final String tag = inner.label;
    if (tag.startsWith("ns")) {
        inner.setValue(Predefine.TAG_PLACE);
        inner.setLabel(NT.G.toString());
        return;
    }
    if (tag.startsWith("nt")) {
        inner.value = Predefine.TAG_GROUP;
        inner.label = NT.K.toString();
        return;
    }
    if (tag.equals("b") || tag.equals("ng") || tag.equals("j")) {
        inner.label = NT.J.toString();
        return;
    }
    if ("n".equals(tag) || "an".equals(tag) || "a".equals(tag) || "vn".equals(tag)
            || "vd".equals(tag) || "vl".equals(tag) || "v".equals(tag) || "vi".equals(tag)
            || "nnt".equals(tag) || "nnd".equals(tag) || "nf".equals(tag) || "cc".equals(tag)
            || "t".equals(tag) || "z".equals(tag)) {
        inner.label = NT.C.toString();
        return;
    }
    if ("nz".equals(tag)) {
        inner.label = NT.I.toString();
        return;
    }
    if ("m".equals(tag)) {
        inner.value = Predefine.TAG_NUMBER;
        inner.label = NT.M.toString();
        return;
    }
    if ("w".equals(tag)) {
        inner.label = NT.W.toString();
        return;
    }
    if (tag.startsWith("nr") || "x".equals(tag) || "nx".equals(tag)) {
        inner.value = Predefine.TAG_PEOPLE;
        inner.label = NT.F.toString();
        return;
    }
    if (tag.startsWith("ni")) {
        inner.label = NT.D.toString();
        return;
    }
    if ("f".equals(tag) || "s".equals(tag)) {
        inner.label = NT.L.toString();
        return;
    }
    inner.label = NT.P.toString();
}
Use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.
Class NRCorpusLoader, method load.
/**
 * Reads a list of names from {@code path} and feeds their characters into a
 * {@link DictionaryMaker}, which is then logged and saved to
 * {@code data/dictionary/person/name.txt}.
 *
 * <p>The file is decoded as UTF-8 and read line by line. A line is skipped when it matches
 * the rejection pattern (it contains punctuation, whitespace, an ASCII letter or digit, or
 * one of the symbols listed in the pattern). Of the remaining lines only those whose
 * {@code length()} is 2 or 3 are used:
 * <ul>
 *   <li>length 2: the characters are added as words labelled {@code NR.B} and {@code NR.E};</li>
 *   <li>length 3: the characters are added as words labelled {@code NR.B}, {@code NR.C}
 *       and {@code NR.D}.</li>
 * </ul>
 *
 * <p>The reader is closed even when reading fails part-way through.
 *
 * @param path path of the name list to read
 * @return {@code true} if the whole procedure finished without an exception,
 *         {@code false} otherwise (the failure is logged as a warning)
 */
public static boolean load(String path) {
    // Compiled once up front: String.matches would recompile the expression for every line.
    // Fully qualified so that no additional import is needed.
    final java.util.regex.Pattern rejected =
            java.util.regex.Pattern.compile(".*[\\p{P}+~$`^=|<>~`$^+=|<>¥×|\\s|a-z0-9A-Z]+.*");
    try {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        DictionaryMaker dictionaryMaker;
        try {
            dictionaryMaker = new DictionaryMaker();
            String line;
            while ((line = br.readLine()) != null) {
                if (rejected.matcher(line).matches()) {
                    continue;
                }
                // Only load two-character and three-character names.
                switch (line.length()) {
                    case 2: {
                        dictionaryMaker.add(new Word(line.substring(0, 1), NR.B.toString()));
                        dictionaryMaker.add(new Word(line.substring(1), NR.E.toString()));
                        break;
                    }
                    case 3: {
                        dictionaryMaker.add(new Word(line.substring(0, 1), NR.B.toString()));
                        dictionaryMaker.add(new Word(line.substring(1, 2), NR.C.toString()));
                        dictionaryMaker.add(new Word(line.substring(2, 3), NR.D.toString()));
                        break;
                    }
                    default:
                        // Names of any other length are ignored.
                        break;
                }
            }
        } finally {
            // Release the file handle on both the success and the failure path.
            br.close();
        }
        logger.info(dictionaryMaker.toString());
        // NOTE(review): this filter answers false for every item; depending on how
        // DictionaryMaker.Filter is interpreted that may keep all entries out of the saved
        // file -- confirm against DictionaryMaker.saveTxtTo.
        dictionaryMaker.saveTxtTo("data/dictionary/person/name.txt", new DictionaryMaker.Filter() {
            @Override
            public boolean onSave(Item item) {
                return false;
            }
        });
    } catch (Exception e) {
        // Include the exception so the cause of the failure is not lost.
        logger.warning("读取" + path + "发生错误: " + e);
        return false;
    }
    return true;
}
Aggregations