Use of com.hankcs.hanlp.corpus.tag.NS in project HanLP by hankcs.
Class PlaceDictionary, method parsePattern.
/**
 * Pattern matching: concatenates the role tags into a pattern string, scans it
 * with the pattern trie, and inserts one place-name vertex into
 * {@code wordNetOptimum} for every hit that is not rejected by
 * {@code isBadCase}.
 *
 * <p>Hit positions reported for the pattern string are used directly as indices
 * into the vertex sequence, so this relies on each tag contributing exactly one
 * character to the pattern and on {@code nsList} and {@code vertexList} being
 * aligned element by element (NOTE(review): not checked here - confirm at the
 * call site).
 *
 * @param nsList         the decided role-tag sequence
 * @param vertexList     the original sequence, without role tags
 * @param wordNetOptimum the word net to be optimised; recognised names are inserted into it
 * @param wordNetAll     passed through unchanged to {@code wordNetOptimum.insert}
 */
public static void parsePattern(List<NS> nsList, List<Vertex> vertexList, final WordNet wordNetOptimum, final WordNet wordNetAll) {
    StringBuilder sbPattern = new StringBuilder(nsList.size());
    for (NS ns : nsList) {
        sbPattern.append(ns.toString());
    }
    String pattern = sbPattern.toString();
    final Vertex[] wordArray = vertexList.toArray(new Vertex[0]);

    // startOffsets[i] is the total length of the surface forms of words 0..i-1.
    // Building the table once avoids re-summing the prefix on every hit.
    final int[] startOffsets = new int[wordArray.length + 1];
    for (int i = 0; i < wordArray.length; ++i) {
        startOffsets[i + 1] = startOffsets[i] + wordArray[i].realWord.length();
    }

    trie.parseText(pattern, new AhoCorasickDoubleArrayTrie.IHit<String>() {
        @Override
        public void hit(int begin, int end, String value) {
            // The candidate name is the concatenation of words begin (inclusive) .. end (exclusive).
            StringBuilder sbName = new StringBuilder();
            for (int i = begin; i < end; ++i) {
                sbName.append(wordArray[i].realWord);
            }
            String name = sbName.toString();
            // Filter out known bad cases.
            if (isBadCase(name)) {
                return;
            }
            // From here on the candidate is accepted as a place name.
            if (HanLP.Config.DEBUG) {
                System.out.printf("识别出地名:%s %s\n", name, value);
            }
            wordNetOptimum.insert(startOffsets[begin], new Vertex(Predefine.TAG_PLACE, name, ATTRIBUTE, WORD_ID), wordNetAll);
        }
    });
}
Use of com.hankcs.hanlp.corpus.tag.NS in project HanLP by hankcs.
Class PlaceRecognition, method Recognition.
/**
 * Runs place-name recognition over a segmentation result: assigns candidate
 * roles to every vertex, decodes a single tag sequence with
 * {@code viterbiExCompute}, and hands that sequence to
 * {@code PlaceDictionary.parsePattern}.
 *
 * <p>When {@code HanLP.Config.DEBUG} is set, the candidate roles and the decoded
 * tags are printed to standard output.
 *
 * @param pWordSegResult the segmented vertex sequence
 * @param wordNetOptimum the word net that receives recognised place names
 * @param wordNetAll     the full word net, forwarded to the role tagger and the pattern matcher
 * @return always {@code true}
 */
public static boolean Recognition(List<Vertex> pWordSegResult, WordNet wordNetOptimum, WordNet wordNetAll) {
    final List<EnumItem<NS>> roleTagList = roleTag(pWordSegResult, wordNetAll);
    if (HanLP.Config.DEBUG) {
        StringBuilder observed = new StringBuilder();
        Iterator<Vertex> words = pWordSegResult.iterator();
        Iterator<EnumItem<NS>> roles = roleTagList.iterator();
        while (roles.hasNext()) {
            EnumItem<NS> candidates = roles.next();
            observed.append('[').append(words.next().realWord).append(' ').append(candidates).append(']');
        }
        String observedText = observed.toString();
        System.out.printf("地名角色观察:%s\n", observedText);
    }

    final List<NS> NSList = viterbiExCompute(roleTagList);
    if (HanLP.Config.DEBUG) {
        StringBuilder tagged = new StringBuilder();
        Iterator<Vertex> words = pWordSegResult.iterator();
        tagged.append('[');
        boolean first = true;
        for (NS tag : NSList) {
            // Entries are separated by " ," with no trailing separator.
            if (!first) {
                tagged.append(" ,");
            }
            first = false;
            tagged.append(words.next().realWord).append('/').append(tag);
        }
        tagged.append(']');
        String taggedText = tagged.toString();
        System.out.printf("地名角色标注:%s\n", taggedText);
    }

    PlaceDictionary.parsePattern(NSList, pWordSegResult, wordNetOptimum, wordNetAll);
    return true;
}
Use of com.hankcs.hanlp.corpus.tag.NS in project HanLP by hankcs.
Class PlaceRecognition, method roleTag.
/**
 * Assigns a set of candidate place-name roles to every vertex, in order.
 *
 * <p>A vertex whose nature is {@code Nature.ns} and whose attribute has
 * {@code totalFrequency <= 1000} gets a fixed role set chosen by the length of
 * its surface form. Every other vertex is looked up in
 * {@code PlaceDictionary.dictionary} by its equivalent word, falling back to
 * role {@code NS.Z} when there is no entry.
 *
 * @param vertexList the vertex sequence to tag
 * @param wordNetAll the full word net (not read by this method)
 * @return one role item per vertex, in the same order as {@code vertexList}
 */
public static List<EnumItem<NS>> roleTag(List<Vertex> vertexList, WordNet wordNetAll) {
    List<EnumItem<NS>> roles = new LinkedList<EnumItem<NS>>();
    for (Vertex current : vertexList) {
        EnumItem<NS> role;
        if (Nature.ns == current.getNature() && current.getAttribute().totalFrequency <= 1000) {
            // A short place name (fewer than three characters) may still take a
            // prefix or a suffix; a longer one may only take a suffix.
            role = current.realWord.length() < 3
                    ? new EnumItem<NS>(NS.H, NS.G)
                    : new EnumItem<NS>(NS.G);
        } else {
            // Look up by the equivalent word rather than the surface form, which is more precise.
            role = PlaceDictionary.dictionary.get(current.word);
            if (role == null) {
                role = new EnumItem<NS>(NS.Z, PlaceDictionary.transformMatrixDictionary.getTotalFrequency(NS.Z));
            }
        }
        roles.add(role);
    }
    return roles;
}
Use of com.hankcs.hanlp.corpus.tag.NS in project HanLP by hankcs.
Class PlaceRecognition, method insert.
/**
 * Takes the first vertex on row {@code line} of the full word net, adds it at
 * the iterator's current position, and appends a matching role item for
 * {@code ns} with frequency 1000 to {@code tagList}.
 *
 * @param listIterator iterator over the vertex sequence; receives the vertex
 * @param tagList      role list; receives the new role item
 * @param wordNetAll   the full word net to read the vertex from
 * @param line         row of {@code wordNetAll} to read
 * @param ns           role to record for the added vertex
 */
private static void insert(ListIterator<Vertex> listIterator, List<EnumItem<NS>> tagList, WordNet wordNetAll, int line, NS ns) {
    final Vertex head = wordNetAll.getFirst(line);
    // The assertion message reads "the full word net has an empty row".
    assert head != null : "全词网居然有空白行!";
    listIterator.add(head);
    final EnumItem<NS> role = new EnumItem<NS>(ns, 1000);
    tagList.add(role);
}
Use of com.hankcs.hanlp.corpus.tag.NS in project HanLP by hankcs.
Class NSDictionary, method loadDat.
/**
 * Loads the value array from a binary dictionary file.
 *
 * <p>Layout as read by this method, every field being one 4-byte int decoded
 * with {@code ByteUtil.bytesHighFirstToInt}: the number of items; then, per
 * item, the number of (label, frequency) pairs followed by that many pairs,
 * where the label is an index into {@code NS.values()}.
 *
 * @param path location of the binary file
 * @return the decoded items, or {@code null} when {@code IOUtil.readBytes}
 *         returns {@code null} for {@code path}
 */
private EnumItem<NS>[] loadDat(String path) {
    final byte[] raw = IOUtil.readBytes(path);
    if (raw == null) {
        return null;
    }

    final NS[] labels = NS.values();
    int cursor = 0;

    final int itemCount = ByteUtil.bytesHighFirstToInt(raw, cursor);
    cursor += 4;

    EnumItem<NS>[] items = new EnumItem[itemCount];
    int filled = 0;
    while (filled < itemCount) {
        final int pairCount = ByteUtil.bytesHighFirstToInt(raw, cursor);
        cursor += 4;

        EnumItem<NS> entry = new EnumItem<NS>();
        for (int remaining = pairCount; remaining > 0; --remaining) {
            // Resolve the label before reading the frequency, field by field.
            NS label = labels[ByteUtil.bytesHighFirstToInt(raw, cursor)];
            cursor += 4;
            int frequency = ByteUtil.bytesHighFirstToInt(raw, cursor);
            cursor += 4;
            entry.labelMap.put(label, frequency);
        }

        items[filled] = entry;
        ++filled;
    }
    return items;
}
Aggregations