Usage of com.hankcs.hanlp.corpus.document.sentence.word.IWord in the HanLP project by hankcs.
From the class NSDictionaryMaker, method roleTag:
/**
 * Tags every word of each sentence with a place-name (NS) recognition role so the
 * tagged corpus can later be turned into an NS role dictionary.
 * <p>
 * NOTE(review): role meanings below are inferred from how they are assigned here —
 * confirm against the NS enum's own documentation. A = word immediately before a
 * place name, B = word immediately after, X = single word sandwiched between two
 * place names, G = whole (unsplit) place name, C/D/E… = successive leading
 * characters of a short place name, H = place-name suffix, Z = any other word,
 * S = sentence-begin sentinel.
 *
 * @param sentenceList corpus as a list of sentences; each sentence is expected to
 *                     be a LinkedList of words (see the downcast below)
 */
@Override
protected void roleTag(List<List<IWord>> sentenceList) {
// 1-based progress counter for verbose output
int i = 0;
for (List<IWord> wordList : sentenceList) {
// neutralize labels irrelevant to NS recognition before role tagging
Precompiler.compileWithoutNS(wordList);
if (verbose) {
System.out.print(++i + " / " + sentenceList.size() + " ");
System.out.println("原始语料 " + wordList);
}
// NOTE(review): assumes the corpus loader always produces LinkedLists — verify
LinkedList<IWord> wordLinkedList = (LinkedList<IWord>) wordList;
// add begin/end sentinels so boundary words also receive context roles
wordLinkedList.addFirst(new Word(Predefine.TAG_BIGIN, "S"));
wordLinkedList.addLast(new Word(Predefine.TAG_END, "Z"));
if (verbose)
System.out.println("添加首尾 " + wordList);
// tag the left context: a non-NS word directly before an NS word becomes role A
Iterator<IWord> iterator = wordLinkedList.iterator();
IWord pre = iterator.next();
while (iterator.hasNext()) {
IWord current = iterator.next();
if (current.getLabel().startsWith("ns") && !pre.getLabel().startsWith("ns")) {
pre.setLabel(NS.A.toString());
}
pre = current;
}
if (verbose)
System.out.println("标注上文 " + wordList);
// tag the right context: iterate backwards so "pre" is the word AFTER an NS word;
// it becomes role B
iterator = wordLinkedList.descendingIterator();
pre = iterator.next();
while (iterator.hasNext()) {
IWord current = iterator.next();
if (current.getLabel().startsWith("ns") && !pre.getLabel().startsWith("ns")) {
pre.setLabel(NS.B.toString());
}
pre = current;
}
if (verbose)
System.out.println("标注下文 " + wordList);
// tag the middle: a single non-NS word flanked by two NS words becomes role X
iterator = wordLinkedList.iterator();
IWord first = iterator.next();
IWord second = iterator.next();
while (iterator.hasNext()) {
IWord third = iterator.next();
if (first.getLabel().startsWith("ns") && third.getLabel().startsWith("ns") && !second.getLabel().startsWith("ns")) {
second.setLabel(NS.X.toString());
}
first = second;
second = third;
}
if (verbose)
System.out.println("标注中间 " + wordList);
// split compound place names ("spilt" is the method's actual name in CorpusUtil)
CorpusUtil.spilt(wordList);
if (verbose)
System.out.println("拆分地名 " + wordList);
// tag the place names themselves, splitting off any known place suffix
ListIterator<IWord> listIterator = wordLinkedList.listIterator();
while (listIterator.hasNext()) {
IWord word = listIterator.next();
String label = word.getLabel();
// an all-uppercase label is already an NS role assigned above — skip it
if (label.equals(label.toUpperCase()))
continue;
if (label.startsWith("ns")) {
String value = word.getValue();
int longestSuffixLength = PlaceSuffixDictionary.dictionary.getLongestSuffixLength(value);
// length of the word body once the recognized suffix is removed
int wordLength = value.length() - longestSuffixLength;
// no suffix found, or the word IS a suffix: keep it whole as role G
if (longestSuffixLength == 0 || wordLength == 0) {
word.setLabel(NS.G.toString());
continue;
}
// replace the word in place with (body, suffix) pieces
listIterator.remove();
if (wordLength > 3) {
// long body: keep it whole as G, suffix as H
listIterator.add(new Word(value.substring(0, wordLength), NS.G.toString()));
listIterator.add(new Word(value.substring(wordLength), NS.H.toString()));
continue;
}
// short body (<= 3 chars): one role per character starting at NS.C —
// relies on C, D, E being declared consecutively in the NS enum
for (int l = 1, tag = NS.C.ordinal(); l <= wordLength; ++l, ++tag) {
listIterator.add(new Word(value.substring(l - 1, l), NS.values()[tag].toString()));
}
listIterator.add(new Word(value.substring(wordLength), NS.H.toString()));
} else {
// anything that is not a place name gets the "other" role Z
word.setLabel(NS.Z.toString());
}
}
if (verbose)
System.out.println("处理整个 " + wordList);
}
}
Usage of com.hankcs.hanlp.corpus.document.sentence.word.IWord in the HanLP project by hankcs.
From the class NTDictionaryMaker, method roleTag:
/**
 * Tags every word of each sentence with an organization-name (NT) recognition role
 * so the tagged corpus can later be turned into an NT role dictionary.
 * <p>
 * NOTE(review): role meanings are inferred from how they are assigned here —
 * confirm against the NT enum's own documentation. A = word before an org name,
 * B = word after, X = single word between two org names, and the inner-word roles
 * (G, K, J, C, I, M, W, F, D, L, P) classify the components of a compound
 * organization name; Z = any other word, S = sentence-begin sentinel.
 *
 * @param sentenceList corpus as a list of sentences; each sentence is expected to
 *                     be a LinkedList of words (see the downcast below)
 */
@Override
protected void roleTag(List<List<IWord>> sentenceList) {
// 1-based progress counter for verbose output
int i = 0;
for (List<IWord> wordList : sentenceList) {
// neutralize labels irrelevant to NT recognition before role tagging
Precompiler.compileWithoutNT(wordList);
if (verbose) {
System.out.print(++i + " / " + sentenceList.size() + " ");
System.out.println("原始语料 " + wordList);
}
// NOTE(review): assumes the corpus loader always produces LinkedLists — verify
LinkedList<IWord> wordLinkedList = (LinkedList<IWord>) wordList;
// add begin/end sentinels so boundary words also receive context roles
wordLinkedList.addFirst(new Word(Predefine.TAG_BIGIN, "S"));
wordLinkedList.addLast(new Word(Predefine.TAG_END, "Z"));
if (verbose)
System.out.println("添加首尾 " + wordList);
// tag the left context: a non-NT word directly before an NT word becomes role A
Iterator<IWord> iterator = wordLinkedList.iterator();
IWord pre = iterator.next();
while (iterator.hasNext()) {
IWord current = iterator.next();
if (current.getLabel().startsWith("nt") && !pre.getLabel().startsWith("nt")) {
pre.setLabel(NT.A.toString());
}
pre = current;
}
if (verbose)
System.out.println("标注上文 " + wordList);
// tag the right context: iterate backwards so "pre" is the word AFTER an NT word;
// it becomes role B
iterator = wordLinkedList.descendingIterator();
pre = iterator.next();
while (iterator.hasNext()) {
IWord current = iterator.next();
if (current.getLabel().startsWith("nt") && !pre.getLabel().startsWith("nt")) {
pre.setLabel(NT.B.toString());
}
pre = current;
}
if (verbose)
System.out.println("标注下文 " + wordList);
// tag the middle: a single non-NT word flanked by two NT words becomes role X
{
iterator = wordLinkedList.iterator();
IWord first = iterator.next();
IWord second = iterator.next();
while (iterator.hasNext()) {
IWord third = iterator.next();
if (first.getLabel().startsWith("nt") && third.getLabel().startsWith("nt") && !second.getLabel().startsWith("nt")) {
second.setLabel(NT.X.toString());
}
first = second;
second = third;
}
if (verbose)
System.out.println("标注中间 " + wordList);
}
// tag the organization names themselves
ListIterator<IWord> listIterator = wordLinkedList.listIterator();
while (listIterator.hasNext()) {
IWord word = listIterator.next();
String label = word.getLabel();
// an all-uppercase label is already an NT role assigned above — skip it
if (label.equals(label.toUpperCase()))
continue;
if (label.startsWith("nt")) {
// accumulates the role pattern of this organization's inner words
StringBuilder sbPattern = new StringBuilder();
// a compound organization: expand its inner words in place, assigning each
// a role by its original part-of-speech label
if (word instanceof CompoundWord) {
listIterator.remove();
Word last = null;
for (Word inner : ((CompoundWord) word).innerList) {
last = inner;
String innerLabel = inner.label;
if (innerLabel.startsWith("ns")) {
// place-name component: replace its value with the generic place tag
inner.setValue(Predefine.TAG_PLACE);
inner.setLabel(NT.G.toString());
listIterator.add(inner);
sbPattern.append(inner.label);
} else if (innerLabel.startsWith("nt")) {
// nested organization: replace its value with the generic group tag
inner.value = Predefine.TAG_GROUP;
inner.label = NT.K.toString();
listIterator.add(inner);
sbPattern.append(inner.label);
} else if (innerLabel.equals("b") || innerLabel.equals("ng") || innerLabel.equals("j")) {
inner.label = NT.J.toString();
listIterator.add(inner);
sbPattern.append(inner.label);
} else if ("n".equals(innerLabel) || "an".equals(innerLabel) || "a".equals(innerLabel) || "vn".equals(innerLabel) || "vd".equals(innerLabel) || "vl".equals(innerLabel) || "v".equals(innerLabel) || "vi".equals(innerLabel) || "nnt".equals(innerLabel) || "nnd".equals(innerLabel) || "nf".equals(innerLabel) || "cc".equals(innerLabel) || "t".equals(innerLabel) || "z".equals(innerLabel)) {
// ordinary noun/verb/adjective-like components
inner.label = NT.C.toString();
listIterator.add(inner);
sbPattern.append(inner.label);
} else if ("nz".equals(innerLabel)) {
inner.label = NT.I.toString();
listIterator.add(inner);
sbPattern.append(inner.label);
} else if ("m".equals(innerLabel)) {
// number component: replace its value with the generic number tag
inner.value = Predefine.TAG_NUMBER;
inner.label = NT.M.toString();
listIterator.add(inner);
sbPattern.append(inner.label);
} else if ("w".equals(innerLabel)) {
// punctuation component
inner.label = NT.W.toString();
listIterator.add(inner);
sbPattern.append(inner.label);
} else if (innerLabel.startsWith("nr") || "x".equals(innerLabel) || "nx".equals(innerLabel)) {
// person-name (or letter-string) component: replace with the generic people tag
inner.value = Predefine.TAG_PEOPLE;
inner.label = NT.F.toString();
listIterator.add(inner);
sbPattern.append(inner.label);
} else if (innerLabel.startsWith("ni")) {
inner.label = NT.D.toString();
listIterator.add(inner);
sbPattern.append(inner.label);
} else if ("f".equals(innerLabel) || "s".equals(innerLabel)) {
// locative/direction components
inner.label = NT.L.toString();
listIterator.add(inner);
sbPattern.append(inner.label);
} else {
// anything unrecognized
inner.label = NT.P.toString();
listIterator.add(inner);
sbPattern.append(inner.label);
}
}
if (last != null) {
// the final inner word is treated as the organization suffix (role D);
// rewrite the last character of the pattern accordingly, then record it
last.label = NT.D.toString();
sbPattern.deleteCharAt(sbPattern.length() - 1);
sbPattern.append(last.label);
tfDictionary.add(sbPattern.toString());
sbPattern.setLength(0);
}
} else {
// a simple (non-compound) organization name: whole word is role K
word.setLabel(NT.K.toString());
}
} else {
// anything that is not an organization name gets the "other" role Z
word.setLabel(NT.Z.toString());
}
}
if (verbose)
System.out.println("处理整个 " + wordList);
// relabel the begin sentinel with the sentence-start role
wordLinkedList.getFirst().setLabel(NT.S.toString());
}
}
Usage of com.hankcs.hanlp.corpus.document.sentence.word.IWord in the HanLP project by hankcs.
From the class NatureDictionaryMaker, method addToDictionary:
/**
 * Feeds the whole corpus into the dictionary makers: every word goes into the
 * word/part-of-speech frequency dictionary, and every pair of adjacent words
 * within a sentence goes into the bigram dictionary. Pairs never span a
 * sentence boundary (the previous-word reference is reset per sentence).
 *
 * @param sentenceList corpus as a list of sentences, each a list of words
 */
@Override
protected void addToDictionary(List<List<IWord>> sentenceList) {
    logger.info("开始制作词典");
    for (List<IWord> sentence : sentenceList) {
        IWord previous = null;
        for (IWord current : sentence) {
            // unigram: word with its part-of-speech
            dictionaryMaker.add(current);
            // bigram: adjacent pair within this sentence only
            if (previous != null) {
                nGramDictionaryMaker.addPair(previous, current);
            }
            previous = current;
        }
    }
}
Usage of com.hankcs.hanlp.corpus.document.sentence.word.IWord in the HanLP project by hankcs.
From the class CorpusLoader, method convert2SentenceList:
/**
 * Loads the corpus under the given path and flattens it into a plain list of
 * sentences, discarding the document grouping. Each element is a sentence's
 * word list (the same list object held by the Sentence, not a copy).
 *
 * @param path corpus directory or file to load
 * @return all sentences from all documents, in document order
 */
public static List<List<IWord>> convert2SentenceList(String path) {
    List<List<IWord>> result = new LinkedList<List<IWord>>();
    for (Document document : CorpusLoader.convert2DocumentList(path)) {
        for (Sentence sentence : document.sentenceList) {
            result.add(sentence.wordList);
        }
    }
    return result;
}
Usage of com.hankcs.hanlp.corpus.document.sentence.word.IWord in the HanLP project by hankcs.
From the class CharacterBasedGenerativeModel, method learn:
/**
 * Lets the model observe one sentence: each word is expanded into
 * (character, tag) pairs using the BMES scheme ('s' = single-character word,
 * 'b'/'m'/'e' = begin/middle/end of a multi-character word), then character
 * unigram, bigram and trigram counts are accumulated into the frequency table.
 *
 * @param wordList one segmented sentence
 */
public void learn(List<Word> wordList) {
    // expand every word into (character, BMES tag) pairs
    LinkedList<char[]> sentence = new LinkedList<char[]>();
    for (IWord iWord : wordList) {
        String value = iWord.getValue();
        int length = value.length();
        if (length == 1) {
            sentence.add(new char[] { value.charAt(0), 's' });
            continue;
        }
        sentence.add(new char[] { value.charAt(0), 'b' });
        for (int k = 1; k < length - 1; ++k) {
            sentence.add(new char[] { value.charAt(k), 'm' });
        }
        sentence.add(new char[] { value.charAt(length - 1), 'e' });
    }
    // conversion done — count n-grams with a sliding window of length 3,
    // seeded with two begin-of-sentence markers
    char[][] window = new char[3][];
    window[1] = bos;
    window[2] = bos;
    tf.add(1, bos, bos);
    tf.add(2, bos);
    for (char[] pair : sentence) {
        // shift the window left by one and append the new pair
        window[0] = window[1];
        window[1] = window[2];
        window[2] = pair;
        // unigram
        tf.add(1, pair);
        // bigram
        tf.add(1, window[1], window[2]);
        // trigram
        tf.add(1, window);
    }
}
Aggregations