use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.
the class WordNatureWeightModelMaker method addPair.
/**
 * Registers one transition pair in the given dictionary maker.
 *
 * <p>Two entries are added: {@code from@to} carrying {@code label}, and
 * {@code from@} carrying the fixed label {@code "频次"}.
 *
 * @param from            left-hand side of the pair
 * @param to              right-hand side of the pair
 * @param label           label stored with the {@code from@to} entry
 * @param dictionaryMaker maker that receives both entries
 */
private static void addPair(String from, String to, String label, DictionaryMaker dictionaryMaker) {
    // Both keys share the "from@" prefix, so build it once.
    final String prefix = from + "@";
    final Word pairEntry = new Word(prefix + to, label);
    final Word totalEntry = new Word(prefix, "频次");
    dictionaryMaker.add(pairEntry);
    dictionaryMaker.add(totalEntry);
}
use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.
the class TestICWB method testDumpPeople2014ToBEMS.
/**
 * Walks the corpus directory and writes every simple sentence to the output
 * file as one line of word values, each value followed by a single space.
 *
 * <p>The writer is closed in a {@code finally} block so that the file handle
 * is released (and buffered output flushed) even when the corpus walk throws.
 *
 * @throws Exception if the output file cannot be opened or closed, or if the
 *                   corpus walk fails
 */
public void testDumpPeople2014ToBEMS() throws Exception {
    // NOTE(review): no charset is passed, so the platform default encoding is used
    // for the output file — confirm this is what the consumer of 2014.txt expects.
    final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("D:\\Tools\\CRF++-0.58\\example\\seg_cn\\2014.txt")));
    try {
        CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", new CorpusLoader.Handler() {
            @Override
            public void handle(Document document) {
                List<List<Word>> simpleSentenceList = document.getSimpleSentenceList();
                for (List<Word> wordList : simpleSentenceList) {
                    try {
                        for (Word word : wordList) {
                            bw.write(word.value);
                            bw.write(' ');
                        }
                        bw.newLine();
                    } catch (IOException e) {
                        // Best effort: a failed sentence is reported and the dump continues
                        // with the next one, as before.
                        e.printStackTrace();
                    }
                }
            }
        });
    } finally {
        // Always release the file handle, including when walk() throws.
        bw.close();
    }
}
use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.
the class NameDictionaryMaker method create.
/**
 * Builds a {@link DictionaryMaker} from a UTF-8 text file containing one name per line.
 *
 * <p>Lines matching the rejection pattern (punctuation, the listed symbols,
 * whitespace, ASCII letters or digits) are skipped. Only lines of length two or
 * three whose first character is accepted by {@code FamilyName.contains} are
 * loaded:
 * <ul>
 *   <li>length 2: first character tagged {@code NR.B}, second tagged {@code NR.E};</li>
 *   <li>length 3: characters tagged {@code NR.B}, {@code NR.C}, {@code NR.D} in order.</li>
 * </ul>
 *
 * @param path path of the name list to read
 * @return the populated maker, or {@code null} if reading the file fails
 */
public static DictionaryMaker create(String path) {
    DictionaryMaker dictionaryMaker = new DictionaryMaker();
    try {
        // Compile once instead of on every line (String.matches recompiles each call).
        // Fully qualified so the block needs no additional import.
        final java.util.regex.Pattern rejected = java.util.regex.Pattern.compile(".*[\\p{P}+~$`^=|<>~`$^+=|<>¥×|\\s|a-z0-9A-Z]+.*");
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (rejected.matcher(line).matches())
                    continue;
                // Only load two- and three-character names.
                final int length = line.length();
                if (length != 2 && length != 3)
                    continue;
                // The first character must be a known family name in both cases.
                Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                if (!FamilyName.contains(wordB.value))
                    continue;
                dictionaryMaker.add(wordB);
                if (length == 2) {
                    dictionaryMaker.add(new Word(line.substring(1), NR.E.toString()));
                } else {
                    dictionaryMaker.add(new Word(line.substring(1, 2), NR.C.toString()));
                    dictionaryMaker.add(new Word(line.substring(2, 3), NR.D.toString()));
                }
            }
        } finally {
            // Release the file handle even when reading or tagging throws.
            br.close();
        }
        logger.info(dictionaryMaker.toString());
    } catch (Exception e) {
        // Keep the null-on-failure contract, but report what actually went wrong.
        logger.warning("读取" + path + "发生错误: " + e);
        return null;
    }
    return dictionaryMaker;
}
use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.
the class NRDictionaryMaker method roleTag.
/**
 * Rewrites the labels (and, for names, the segmentation) of every sentence in place.
 *
 * <p>Each sentence is processed by a fixed sequence of passes over one
 * {@link ListIterator}; the order of the passes matters because each pass
 * relies on the labels written by the previous ones:
 * <ol>
 *   <li>forward: every word not labelled {@code nr} becomes {@code A}; the word
 *       directly before an {@code nr} word becomes {@code K};</li>
 *   <li>backward: the word directly after an {@code nr} word becomes {@code L}
 *       if it was {@code A}, or {@code M} if it was {@code K};</li>
 *   <li>forward: two- and three-character {@code nr} words are split into
 *       single-character words labelled {@code B/C/D/E/F/G};</li>
 *   <li>forward: a {@code B} word is merged into its predecessor (label {@code U})
 *       when the concatenation is in {@code dictionary};</li>
 *   <li>backward: a {@code B} word is merged into its successor (label {@code X}
 *       if the successor was {@code C}, otherwise {@code Y}) when the concatenation
 *       is in {@code dictionary};</li>
 *   <li>forward: a {@code D} word is merged into its predecessor (label {@code Z})
 *       when the concatenation is in {@code dictionary};</li>
 *   <li>backward: a {@code D} word is merged into its successor (label {@code V})
 *       when the concatenation is in {@code dictionary}.</li>
 * </ol>
 * Finally a begin marker labelled {@code S} and an end marker labelled {@code A}
 * are added to the sentence.
 *
 * @param sentenceList sentences to tag; every inner list is modified in place and
 *                     must support {@code ListIterator.add}/{@code remove}
 */
@Override
protected void roleTag(List<List<IWord>> sentenceList) {
    logger.info("开始标注角色");
    int i = 0;
    for (List<IWord> wordList : sentenceList) {
        logger.info(++i + " / " + sentenceList.size());
        if (verbose)
            System.out.println("原始语料 " + wordList);
        // Pass 1: label A and K first.
        // "pre" starts as a sentinel so the first real word has a predecessor.
        IWord pre = new Word("##始##", "begin");
        ListIterator<IWord> listIterator = wordList.listIterator();
        while (listIterator.hasNext()) {
            IWord word = listIterator.next();
            if (!word.getLabel().equals(Nature.nr.toString())) {
                word.setLabel(NR.A.toString());
            } else {
                // word is a name: mark the preceding non-name word as K.
                if (!pre.getLabel().equals(Nature.nr.toString())) {
                    pre.setLabel(NR.K.toString());
                }
            }
            pre = word;
        }
        if (verbose)
            System.out.println("标注非前 " + wordList);
        // Pass 2: then label L and M.
        // The iterator is intentionally reused: pass 1 left it at the end of the list,
        // so walking with previous() traverses the sentence backwards.
        IWord next = new Word("##末##", "end");
        while (listIterator.hasPrevious()) {
            IWord word = listIterator.previous();
            if (word.getLabel().equals(Nature.nr.toString())) {
                // "next" is the word that follows this name in sentence order.
                String label = next.getLabel();
                if (label.equals("A"))
                    next.setLabel("L");
                else if (label.equals("K"))
                    next.setLabel("M");
            }
            next = word;
        }
        if (verbose)
            System.out.println("标注中后 " + wordList);
        // Pass 3: split names into single characters.
        // listIterator.add inserts after the element just returned, and the inserted
        // elements are not visited again by this loop.
        listIterator = wordList.listIterator();
        while (listIterator.hasNext()) {
            IWord word = listIterator.next();
            if (word.getLabel().equals(Nature.nr.toString())) {
                switch (word.getValue().length()) {
                    case 2:
                        if (word.getValue().startsWith("大") || word.getValue().startsWith("老") || word.getValue().startsWith("小")) {
                            // Prefix character + name character: F followed by B.
                            listIterator.add(new Word(word.getValue().substring(1, 2), NR.B.toString()));
                            word.setValue(word.getValue().substring(0, 1));
                            word.setLabel(NR.F.toString());
                        } else if (word.getValue().endsWith("哥") || word.getValue().endsWith("公") || word.getValue().endsWith("姐") || word.getValue().endsWith("老") || word.getValue().endsWith("某") || word.getValue().endsWith("嫂") || word.getValue().endsWith("氏") || word.getValue().endsWith("总")) {
                            // Name character + suffix character: B followed by G.
                            listIterator.add(new Word(word.getValue().substring(1, 2), NR.G.toString()));
                            word.setValue(word.getValue().substring(0, 1));
                            word.setLabel(NR.B.toString());
                        } else {
                            // Plain two-character name: B followed by E.
                            listIterator.add(new Word(word.getValue().substring(1, 2), NR.E.toString()));
                            word.setValue(word.getValue().substring(0, 1));
                            word.setLabel(NR.B.toString());
                        }
                        break;
                    case 3:
                        // Three-character name: B, C, D.
                        listIterator.add(new Word(word.getValue().substring(1, 2), NR.C.toString()));
                        listIterator.add(new Word(word.getValue().substring(2, 3), NR.D.toString()));
                        word.setValue(word.getValue().substring(0, 1));
                        word.setLabel(NR.B.toString());
                        break;
                }
            }
        }
        if (verbose)
            System.out.println("姓名拆分 " + wordList);
        // Pass 4: preceding context + B forms a dictionary word.
        listIterator = wordList.listIterator();
        pre = new Word("##始##", "begin");
        while (listIterator.hasNext()) {
            IWord word = listIterator.next();
            if (word.getLabel().equals(NR.B.toString())) {
                String combine = pre.getValue() + word.getValue();
                if (dictionary.contains(combine)) {
                    // Fold the B word into its predecessor and drop it from the list.
                    pre.setValue(combine);
                    pre.setLabel("U");
                    listIterator.remove();
                }
            }
            pre = word;
        }
        if (verbose)
            System.out.println("上文成词 " + wordList);
        // Pass 5: B + following word forms a dictionary word.
        // Backward traversal; the iterator is at the end of the list after pass 4.
        next = new Word("##末##", "end");
        while (listIterator.hasPrevious()) {
            IWord word = listIterator.previous();
            if (word.getLabel().equals(NR.B.toString())) {
                String combine = word.getValue() + next.getValue();
                if (dictionary.contains(combine)) {
                    next.setValue(combine);
                    // The label is chosen from the successor's label before it is overwritten.
                    next.setLabel(next.getLabel().equals(NR.C.toString()) ? NR.X.toString() : NR.Y.toString());
                    listIterator.remove();
                }
            }
            next = word;
        }
        if (verbose)
            System.out.println("头部成词 " + wordList);
        // Pass 6: preceding word + D forms a dictionary word.
        // Forward traversal; the iterator is back at the start of the list after pass 5.
        pre = new Word("##始##", "begin");
        while (listIterator.hasNext()) {
            IWord word = listIterator.next();
            if (word.getLabel().equals(NR.D.toString())) {
                String combine = pre.getValue() + word.getValue();
                if (dictionary.contains(combine)) {
                    pre.setValue(combine);
                    pre.setLabel(NR.Z.toString());
                    listIterator.remove();
                }
            }
            pre = word;
        }
        if (verbose)
            System.out.println("尾部成词 " + wordList);
        // Pass 7: D + following context forms a dictionary word.
        // Backward traversal; the iterator is at the end of the list after pass 6.
        next = new Word("##末##", "end");
        while (listIterator.hasPrevious()) {
            IWord word = listIterator.previous();
            if (word.getLabel().equals(NR.D.toString())) {
                String combine = word.getValue() + next.getValue();
                if (dictionary.contains(combine)) {
                    next.setValue(combine);
                    next.setLabel(NR.V.toString());
                    listIterator.remove();
                }
            }
            next = word;
        }
        // This trace previously repeated the pass-5 label; it now names the pass it follows.
        if (verbose)
            System.out.println("下文成词 " + wordList);
        // Add the sentence begin/end markers through the List interface, so the method
        // no longer depends on the sentence being a LinkedList.
        wordList.add(0, new Word(Predefine.TAG_BIGIN, "S"));
        wordList.add(new Word(Predefine.TAG_END, "A"));
        if (verbose)
            System.out.println("添加首尾 " + wordList);
    }
}
use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.
the class NatureDictionaryMaker method roleTag.
/**
 * Prepares every sentence for tagging, modifying the lists in place.
 *
 * <p>Each word is passed to {@code Precompiler.compile}, then a begin marker
 * labelled {@code Nature.begin} is inserted at the front of the sentence and
 * an end marker labelled {@code Nature.end} is appended at the back.
 *
 * @param sentenceList sentences to process; every inner list must be modifiable
 */
@Override
protected void roleTag(List<List<IWord>> sentenceList) {
    logger.info("开始标注");
    int i = 0;
    for (List<IWord> wordList : sentenceList) {
        logger.info(++i + " / " + sentenceList.size());
        for (IWord word : wordList) {
            // Compile into an equivalent string.
            // NOTE(review): presumably mutates the word in place, since the return value is unused — confirm in Precompiler.
            Precompiler.compile(word);
        }
        // Use the List interface instead of downcasting to LinkedList, so any
        // modifiable List works and no ClassCastException can occur.
        wordList.add(0, new Word(Predefine.TAG_BIGIN, Nature.begin.toString()));
        wordList.add(new Word(Predefine.TAG_END, Nature.end.toString()));
    }
}
Aggregations