use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.
the class TestCorpusLoader method testMakeOrganizationCustomDictionary.
/**
 * Walks the 2014 corpus directory and collects every word whose label starts
 * with {@code "nt"} into a {@link DictionaryMaker}, then saves the result as
 * a text dictionary under {@code data/dictionary/custom/}.
 *
 * NOTE(review): judging by the method and output file names, "nt" labels are
 * presumably organization names - confirm against the tag set.
 *
 * @throws Exception propagated from the corpus walk or the save step
 */
public void testMakeOrganizationCustomDictionary() throws Exception {
    final DictionaryMaker maker = new DictionaryMaker();
    CorpusLoader.Handler collector = new CorpusLoader.Handler() {
        @Override
        public void handle(Document document) {
            for (List<IWord> sentence : document.getComplexSentenceList()) {
                for (IWord candidate : sentence) {
                    // Skip everything that is not tagged with an "nt*" label.
                    if (!candidate.getLabel().startsWith("nt")) {
                        continue;
                    }
                    maker.add(candidate);
                }
            }
        }
    };
    CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", collector);
    maker.saveTxtTo("data/dictionary/custom/机构名词典.txt");
}
use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.
the class TestCorpusLoader method testMakePersonCustomDictionary.
/**
 * Walks the 2014 corpus directory and collects every word whose label starts
 * with {@code "nr"} into a {@link DictionaryMaker}, then saves the result as
 * a text dictionary under {@code data/dictionary/custom/}.
 *
 * NOTE(review): judging by the method and output file names, "nr" labels are
 * presumably person names - confirm against the tag set.
 *
 * @throws Exception propagated from the corpus walk or the save step
 */
public void testMakePersonCustomDictionary() throws Exception {
    final DictionaryMaker maker = new DictionaryMaker();
    CorpusLoader.Handler collector = new CorpusLoader.Handler() {
        @Override
        public void handle(Document document) {
            for (List<IWord> sentence : document.getComplexSentenceList()) {
                for (IWord candidate : sentence) {
                    // Skip everything that is not tagged with an "nr*" label.
                    if (!candidate.getLabel().startsWith("nr")) {
                        continue;
                    }
                    maker.add(candidate);
                }
            }
        }
    };
    CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", collector);
    maker.saveTxtTo("data/dictionary/custom/人名词典.txt");
}
use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.
the class NRDictionaryMaker method roleTag.
/**
 * Rewrites the labels (and, for names, the segmentation) of every sentence in
 * place so that each word carries an {@code NR} role label.
 *
 * For each sentence the following passes run in order over the same list:
 * <ol>
 *   <li>every word not labelled {@code Nature.nr} becomes {@code NR.A}; the word
 *       directly before a name becomes {@code NR.K};</li>
 *   <li>walking backwards, the word directly after a name is changed from
 *       "A" to "L" or from "K" to "M";</li>
 *   <li>names of length 2 or 3 are split into single characters labelled
 *       B/E, B/G, F/B or B/C/D; names of any other length are left untouched;</li>
 *   <li>four merge passes join a name character with its neighbour when the
 *       concatenation is found in {@code dictionary} (labels U, X/Y, Z, V);</li>
 *   <li>a begin marker labelled "S" and an end marker labelled "A" are added.</li>
 * </ol>
 * When {@code verbose} is set, the sentence is printed after every pass.
 *
 * @param sentenceList sentences to tag; every inner list is modified in place and
 *                     must support {@code ListIterator.add/remove} and positional
 *                     insertion
 */
@Override
protected void roleTag(List<List<IWord>> sentenceList) {
    logger.info("开始标注角色");
    int i = 0;
    for (List<IWord> wordList : sentenceList) {
        logger.info(++i + " / " + sentenceList.size());
        if (verbose) {
            System.out.println("原始语料 " + wordList);
        }

        // Pass 1: tag A and K first.
        // The sentinel absorbs the K label when the sentence starts with a name.
        IWord pre = new Word("##始##", "begin");
        ListIterator<IWord> listIterator = wordList.listIterator();
        while (listIterator.hasNext()) {
            IWord word = listIterator.next();
            if (!word.getLabel().equals(Nature.nr.toString())) {
                word.setLabel(NR.A.toString());
            } else {
                if (!pre.getLabel().equals(Nature.nr.toString())) {
                    pre.setLabel(NR.K.toString());
                }
            }
            pre = word;
        }
        if (verbose) {
            System.out.println("标注非前 " + wordList);
        }

        // Pass 2: then tag L and M.
        // The iterator is at the end of the list, so it is walked backwards here.
        IWord next = new Word("##末##", "end");
        while (listIterator.hasPrevious()) {
            IWord word = listIterator.previous();
            if (word.getLabel().equals(Nature.nr.toString())) {
                String label = next.getLabel();
                if (label.equals("A")) {
                    next.setLabel("L");
                } else if (label.equals("K")) {
                    next.setLabel("M");
                }
            }
            next = word;
        }
        if (verbose) {
            System.out.println("标注中后 " + wordList);
        }

        // Pass 3: split names into single characters.
        // ListIterator.add inserts after the current word and moves the cursor past
        // the inserted element, so the new pieces are not revisited by this loop.
        listIterator = wordList.listIterator();
        while (listIterator.hasNext()) {
            IWord word = listIterator.next();
            if (word.getLabel().equals(Nature.nr.toString())) {
                switch (word.getValue().length()) {
                    case 2:
                        if (word.getValue().startsWith("大") || word.getValue().startsWith("老") || word.getValue().startsWith("小")) {
                            // Prefix character + name character: F followed by B.
                            listIterator.add(new Word(word.getValue().substring(1, 2), NR.B.toString()));
                            word.setValue(word.getValue().substring(0, 1));
                            word.setLabel(NR.F.toString());
                        } else if (word.getValue().endsWith("哥") || word.getValue().endsWith("公") || word.getValue().endsWith("姐") || word.getValue().endsWith("老") || word.getValue().endsWith("某") || word.getValue().endsWith("嫂") || word.getValue().endsWith("氏") || word.getValue().endsWith("总")) {
                            // Name character + suffix character: B followed by G.
                            listIterator.add(new Word(word.getValue().substring(1, 2), NR.G.toString()));
                            word.setValue(word.getValue().substring(0, 1));
                            word.setLabel(NR.B.toString());
                        } else {
                            // Plain two-character name: B followed by E.
                            listIterator.add(new Word(word.getValue().substring(1, 2), NR.E.toString()));
                            word.setValue(word.getValue().substring(0, 1));
                            word.setLabel(NR.B.toString());
                        }
                        break;
                    case 3:
                        // Three-character name: B, C, D. Both pieces are read from the
                        // original value before it is truncated below.
                        listIterator.add(new Word(word.getValue().substring(1, 2), NR.C.toString()));
                        listIterator.add(new Word(word.getValue().substring(2, 3), NR.D.toString()));
                        word.setValue(word.getValue().substring(0, 1));
                        word.setLabel(NR.B.toString());
                        break;
                }
            }
        }
        if (verbose) {
            System.out.println("姓名拆分 " + wordList);
        }

        // Pass 4: preceding context + B forms a dictionary word -> merge into the
        // preceding word (label "U") and drop the B element.
        listIterator = wordList.listIterator();
        pre = new Word("##始##", "begin");
        while (listIterator.hasNext()) {
            IWord word = listIterator.next();
            if (word.getLabel().equals(NR.B.toString())) {
                String combine = pre.getValue() + word.getValue();
                if (dictionary.contains(combine)) {
                    pre.setValue(combine);
                    pre.setLabel("U");
                    listIterator.remove();
                }
            }
            pre = word;
        }
        if (verbose) {
            System.out.println("上文成词 " + wordList);
        }

        // Pass 5 (backwards): B + following element forms a dictionary word -> merge
        // into the following element (X if it was labelled C, otherwise Y).
        next = new Word("##末##", "end");
        while (listIterator.hasPrevious()) {
            IWord word = listIterator.previous();
            if (word.getLabel().equals(NR.B.toString())) {
                String combine = word.getValue() + next.getValue();
                if (dictionary.contains(combine)) {
                    next.setValue(combine);
                    next.setLabel(next.getLabel().equals(NR.C.toString()) ? NR.X.toString() : NR.Y.toString());
                    listIterator.remove();
                }
            }
            next = word;
        }
        if (verbose) {
            System.out.println("头部成词 " + wordList);
        }

        // Pass 6 (forwards, iterator is back at the start): preceding element + D
        // forms a dictionary word -> merge into the preceding element (label Z).
        pre = new Word("##始##", "begin");
        while (listIterator.hasNext()) {
            IWord word = listIterator.next();
            if (word.getLabel().equals(NR.D.toString())) {
                String combine = pre.getValue() + word.getValue();
                if (dictionary.contains(combine)) {
                    pre.setValue(combine);
                    pre.setLabel(NR.Z.toString());
                    listIterator.remove();
                }
            }
            pre = word;
        }
        if (verbose) {
            System.out.println("尾部成词 " + wordList);
        }

        // Pass 7 (backwards): D + following context forms a dictionary word -> merge
        // into the following element (label V).
        next = new Word("##末##", "end");
        while (listIterator.hasPrevious()) {
            IWord word = listIterator.previous();
            if (word.getLabel().equals(NR.D.toString())) {
                String combine = word.getValue() + next.getValue();
                if (dictionary.contains(combine)) {
                    next.setValue(combine);
                    next.setLabel(NR.V.toString());
                    listIterator.remove();
                }
            }
            next = word;
        }
        if (verbose) {
            // Fixed: this trace used to repeat the pass-5 label ("头部成词").
            System.out.println("下文成词 " + wordList);
        }

        // Add the sentence boundary markers. Uses the List interface instead of a
        // LinkedList downcast so any mutable List implementation is accepted.
        wordList.add(0, new Word(Predefine.TAG_BIGIN, "S"));
        wordList.add(new Word(Predefine.TAG_END, "A"));
        if (verbose) {
            System.out.println("添加首尾 " + wordList);
        }
    }
}
use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.
the class NatureDictionaryMaker method roleTag.
/**
 * Prepares every sentence for nature (tag) statistics: each word is passed
 * through {@code Precompiler.compile}, then a begin marker labelled
 * {@code Nature.begin} and an end marker labelled {@code Nature.end} are added
 * to the sentence. Progress is logged per sentence.
 *
 * @param sentenceList sentences to process; every inner list is modified in place
 *                     and must support positional insertion
 */
@Override
protected void roleTag(List<List<IWord>> sentenceList) {
    logger.info("开始标注");
    int i = 0;
    for (List<IWord> wordList : sentenceList) {
        logger.info(++i + " / " + sentenceList.size());
        for (IWord word : wordList) {
            // Compile into the equivalent string.
            // NOTE(review): the return value is ignored, so compile presumably
            // rewrites the word in place - confirm against Precompiler.
            Precompiler.compile(word);
        }
        // Uses the List interface instead of a LinkedList downcast so any mutable
        // List implementation is accepted without a ClassCastException.
        wordList.add(0, new Word(Predefine.TAG_BIGIN, Nature.begin.toString()));
        wordList.add(new Word(Predefine.TAG_END, Nature.end.toString()));
    }
}
use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.
the class Document method getSimpleWordList.
/**
 * Flattens the document's word list into simple words: a {@code CompoundWord}
 * contributes all elements of its {@code innerList}; every other entry is cast
 * to {@code Word} and added as-is.
 *
 * @return a new list of simple words, in the order produced by {@code getWordList()}
 */
public List<Word> getSimpleWordList() {
    List<Word> flattened = new LinkedList<Word>();
    for (IWord entry : getWordList()) {
        if (!(entry instanceof CompoundWord)) {
            flattened.add((Word) entry);
            continue;
        }
        // Compound word: expand into its inner simple words.
        CompoundWord compound = (CompoundWord) entry;
        flattened.addAll(compound.innerList);
    }
    return flattened;
}
Aggregations