use of com.hankcs.hanlp.corpus.tag.NR in project HanLP by hankcs.
the class PersonDictionary method parsePattern.
/**
 * Matches person-name role patterns against a role-tagged word sequence and inserts every
 * accepted name into the word net.
 *
 * <p>Words tagged {@code U} or {@code V} are split into two entries first, so that every
 * character of the pattern string corresponds to exactly one entry of the word sequence.
 * The list passed in by the caller is not modified: a private copy is taken before the
 * first split.
 *
 * @param nrList         the decided role tag sequence, one tag per word
 * @param vertexList     the original word sequence, without role tags
 * @param wordNetOptimum the word net to be optimized; recognized names are inserted into it
 * @param wordNetAll     the full word net, passed through to {@code wordNetOptimum.insert}
 */
public static void parsePattern(List<NR> nrList, List<Vertex> vertexList, final WordNet wordNetOptimum, final WordNet wordNetAll) {
    // Split U and V words while building the pattern string.
    ListIterator<Vertex> listIterator = vertexList.listIterator();
    StringBuilder sbPattern = new StringBuilder(nrList.size());
    NR preNR = NR.A;
    boolean backUp = false;
    int index = 0;
    for (NR nr : nrList) {
        ++index;
        Vertex current = listIterator.next();
        if (!backUp && (nr == NR.U || nr == NR.V)) {
            // About to edit the sequence for the first time: switch to a private copy and
            // reposition the iterator just after the current word.
            vertexList = new ArrayList<Vertex>(vertexList);
            listIterator = vertexList.listIterator(index);
            backUp = true;
        }
        switch (nr) {
            case U: {
                // U expands to K followed by B: everything but the last character, then the
                // last character.
                sbPattern.append(NR.K.toString());
                sbPattern.append(NR.B.toString());
                preNR = NR.B;
                String word = current.realWord;
                int cut = word.length() - 1;
                listIterator.previous();
                listIterator.set(new Vertex(word.substring(0, cut)));
                listIterator.next();
                listIterator.add(new Vertex(word.substring(cut)));
                continue;
            }
            case V: {
                // V expands to E (after a B) or D (otherwise), followed by L.
                sbPattern.append(preNR == NR.B ? NR.E.toString() : NR.D.toString());
                sbPattern.append(NR.L.toString());
                // The E/D tag is emitted first, so the single-character piece must be the
                // FIRST character of the word and the L piece the remainder. The pieces are
                // stored in text order (set, step past, add), exactly like the U branch;
                // calling add() before next() would insert the second piece in front of the
                // first and misalign the words with the pattern.
                String word = current.realWord;
                listIterator.previous();
                listIterator.set(new Vertex(word.substring(0, 1)));
                listIterator.next();
                listIterator.add(new Vertex(word.substring(1)));
                // preNR is not updated for V (same as before this change).
                continue;
            }
            default:
                sbPattern.append(nr.toString());
                break;
        }
        preNR = nr;
    }
    String pattern = sbPattern.toString();
    final Vertex[] wordArray = vertexList.toArray(new Vertex[0]);
    if (wordArray.length == 0) {
        // Nothing to match; also avoids indexing into an empty offset table.
        return;
    }
    // offsetArray[i] = total length of the words before position i.
    final int[] offsetArray = new int[wordArray.length];
    for (int i = 1; i < wordArray.length; ++i) {
        offsetArray[i] = offsetArray[i - 1] + wordArray[i - 1].realWord.length();
    }
    trie.parseText(pattern, new AhoCorasickDoubleArrayTrie.IHit<NRPattern>() {
        @Override
        public void hit(int begin, int end, NRPattern value) {
            // Concatenate the words covered by the matched pattern span.
            StringBuilder sbName = new StringBuilder();
            for (int i = begin; i < end; ++i) {
                sbName.append(wordArray[i].realWord);
            }
            String name = sbName.toString();
            // Adjust for some known bad cases.
            switch (value) {
                case BCD:
                    // The surname and the last given-name character cannot be equal.
                    if (name.charAt(0) == name.charAt(2))
                        return;
                    break;
            }
            if (isBadCase(name))
                return;
            // Accept it as a person name.
            if (HanLP.Config.DEBUG) {
                System.out.printf("识别出人名:%s %s\n", name, value);
            }
            int offset = offsetArray[begin];
            wordNetOptimum.insert(offset, new Vertex(Predefine.TAG_PEOPLE, name, ATTRIBUTE, WORD_ID), wordNetAll);
        }
    });
}
use of com.hankcs.hanlp.corpus.tag.NR in project HanLP by hankcs.
the class PersonRecognition method roleObserve.
/**
 * Role observation: collects the candidate roles of every word. Words found in the person
 * dictionary use the stored entry; all other words get a rule-based entry chosen from the
 * word's guessed nature.
 *
 * @param wordSegResult the coarse segmentation result
 * @return one role item per input word, in input order
 */
public static List<EnumItem<NR>> roleObserve(List<Vertex> wordSegResult) {
    final List<EnumItem<NR>> observed = new LinkedList<EnumItem<NR>>();
    for (final Vertex word : wordSegResult) {
        EnumItem<NR> roles = PersonDictionary.dictionary.get(word.realWord);
        if (roles != null) {
            observed.add(roles);
            continue;
        }
        // Not in the dictionary: supplement with rules.
        switch (word.guessNature()) {
            case nnt:
                // Surname + job title.
                roles = new EnumItem<NR>(NR.G, NR.K);
                break;
            case nr:
                // Some two-character names can actually form a longer three-character name.
                if (word.getAttribute().totalFrequency <= 1000 && word.realWord.length() == 2) {
                    roles = new EnumItem<NR>(NR.X, NR.G);
                    break;
                }
                // Otherwise fall through to the generic entry.
            default:
                roles = new EnumItem<NR>(NR.A, PersonDictionary.transformMatrixDictionary.getTotalFrequency(NR.A));
                break;
        }
        observed.add(roles);
    }
    return observed;
}
use of com.hankcs.hanlp.corpus.tag.NR in project HanLP by hankcs.
the class PersonRecognition method Recognition.
/**
 * Runs person-name recognition over a segmented sentence: observes candidate roles, decodes
 * one role per word with {@code viterbiComputeSimply}, then hands the tagged sequence to
 * {@code PersonDictionary.parsePattern}. When {@code HanLP.Config.DEBUG} is set, the
 * observed and the decoded roles are printed to standard output.
 *
 * @param pWordSegResult the segmented word sequence
 * @param wordNetOptimum the word net to be optimized, passed on to {@code parsePattern}
 * @param wordNetAll     the full word net, passed on to {@code parsePattern}
 * @return always {@code true}
 */
public static boolean Recognition(List<Vertex> pWordSegResult, WordNet wordNetOptimum, WordNet wordNetAll) {
    final List<EnumItem<NR>> observedRoles = roleObserve(pWordSegResult);
    if (HanLP.Config.DEBUG) {
        StringBuilder trace = new StringBuilder();
        Iterator<Vertex> words = pWordSegResult.iterator();
        for (EnumItem<NR> candidates : observedRoles) {
            trace.append('[').append(words.next().realWord).append(' ').append(candidates).append(']');
        }
        System.out.printf("人名角色观察:%s\n", trace.toString());
    }

    final List<NR> decodedRoles = viterbiComputeSimply(observedRoles);
    if (HanLP.Config.DEBUG) {
        StringBuilder trace = new StringBuilder("[");
        Iterator<Vertex> words = pWordSegResult.iterator();
        boolean first = true;
        for (NR role : decodedRoles) {
            // Entries are separated by " ," with no separator after the last one.
            if (!first) {
                trace.append(" ,");
            }
            first = false;
            trace.append(words.next().realWord).append('/').append(role);
        }
        trace.append(']');
        System.out.printf("人名角色标注:%s\n", trace.toString());
    }

    PersonDictionary.parsePattern(decodedRoles, pWordSegResult, wordNetOptimum, wordNetAll);
    return true;
}
use of com.hankcs.hanlp.corpus.tag.NR in project HanLP by hankcs.
the class NRDictionary method loadDat.
/**
 * Loads the role items from a binary file.
 *
 * <p>Layout as read by this method, every field being a 4-byte integer decoded with
 * {@code ByteUtil.bytesHighFirstToInt}: the number of items, then for each item the number
 * of its label entries followed by that many (index into {@code NR.values()}, frequency)
 * pairs.
 *
 * @param path the file to read
 * @return the decoded items, or {@code null} if {@code IOUtil.readBytes} returned {@code null}
 */
private EnumItem<NR>[] loadDat(String path) {
    final byte[] raw = IOUtil.readBytes(path);
    if (raw == null) {
        return null;
    }
    final NR[] roles = NR.values();
    int cursor = 0;
    final int itemCount = ByteUtil.bytesHighFirstToInt(raw, cursor);
    cursor += 4;
    EnumItem<NR>[] items = new EnumItem[itemCount];
    int filled = 0;
    while (filled < itemCount) {
        int remaining = ByteUtil.bytesHighFirstToInt(raw, cursor);
        cursor += 4;
        EnumItem<NR> entry = new EnumItem<NR>();
        for (; remaining > 0; --remaining) {
            NR role = roles[ByteUtil.bytesHighFirstToInt(raw, cursor)];
            cursor += 4;
            int frequency = ByteUtil.bytesHighFirstToInt(raw, cursor);
            cursor += 4;
            entry.labelMap.put(role, frequency);
        }
        items[filled] = entry;
        ++filled;
    }
    return items;
}
Aggregations