Use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.
Class Viterbi, method compute.
/**
 * Specialized solver for the HMM part-of-speech model over a vertex path.
 *
 * <p>Walks the path once, keeping only two cost rows (previous and current).
 * Each step's cost is the previous cost plus the transition-matrix entry for
 * (previous tag, current tag), minus the log of
 * {@code (frequency + 1e-8) / totalFrequency(tag)}. After each step the
 * predecessor tag that produced the lowest cost in the row is confirmed on the
 * previous vertex via {@code confirmNature}. The first vertex is only read
 * (its first nature seeds the recursion), and the last vertex visited is not
 * confirmed by this method.
 *
 * @param vertexList                path of vertices; per the original note it includes the Vertex.B node
 * @param transformMatrixDictionary transition matrix matching the dictionary's tag set
 */
public static void compute(List<Vertex> vertexList, TransformMatrixDictionary<Nature> transformMatrixDictionary) {
    final int steps = vertexList.size() - 1;
    // Rolling array: row (i & 1) holds the current step, the other row the previous one.
    final double[][] cost = new double[2][];
    final Iterator<Vertex> vertices = vertexList.iterator();

    // The first vertex is treated as fixed; its first nature is the initial predecessor tag.
    final Vertex head = vertices.next();
    Nature best = head.attribute.nature[0];

    // The second vertex needs no minimisation: there is exactly one predecessor tag.
    Vertex previous = vertices.next();
    cost[0] = new double[previous.attribute.nature.length];
    Nature[] previousTags = previous.attribute.nature;
    for (int t = 0; t < previousTags.length; ++t) {
        final Nature tag = previousTags[t];
        cost[0][t] = transformMatrixDictionary.transititon_probability[best.ordinal()][tag.ordinal()]
                - Math.log((previous.attribute.frequency[t] + 1e-8) / transformMatrixDictionary.getTotalFrequency(tag));
    }

    // From the third vertex on, every (previous tag, current tag) pair is evaluated.
    for (int i = 1; i < steps; ++i) {
        final int row = i & 1;
        final int prevRow = 1 - row;
        final Vertex current = vertices.next();
        cost[row] = new double[current.attribute.nature.length];
        double rowMinimum = Double.MAX_VALUE;
        final Nature[] currentTags = current.attribute.nature;
        for (int k = 0; k < currentTags.length; ++k) {
            final Nature tag = currentTags[k];
            cost[row][k] = Double.MAX_VALUE;
            for (int j = 0; j < previousTags.length; ++j) {
                final Nature prevTag = previousTags[j];
                final double candidate = cost[prevRow][j]
                        + transformMatrixDictionary.transititon_probability[prevTag.ordinal()][tag.ordinal()]
                        - Math.log((current.attribute.frequency[k] + 1e-8) / transformMatrixDictionary.getTotalFrequency(tag));
                if (candidate < cost[row][k]) {
                    cost[row][k] = candidate;
                    // Track the predecessor tag behind the lowest cost of the whole row.
                    if (candidate < rowMinimum) {
                        rowMinimum = candidate;
                        best = prevTag;
                    }
                }
            }
        }
        // If no candidate improved the row minimum, 'best' keeps its value from the previous step.
        previous.confirmNature(best);
        previousTags = currentTags;
        previous = current;
    }
}
Use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.
Class CustomDictionary, method loadDat.
/**
 * Loads the cached double-array trie from disk.
 *
 * <p>Layout as read here: a leading int gives the number of attribute entries.
 * If that int is negative, its absolute value is the number of custom nature
 * names stored in the header; each is registered through
 * {@code Nature.create} and the real entry count follows. Every entry consists
 * of a total frequency, a nature count, and then that many
 * (nature index, frequency) pairs, where the index refers to
 * {@code Nature.values()}. The remaining bytes are handed to {@code dat.load}.
 *
 * @param path dictionary path; {@code Predefine.BIN_EXT} is appended to locate the cache file
 * @return {@code true} if the cache was read and {@code dat.load} succeeded;
 *         {@code false} if the file could not be opened, loading failed, or any exception occurred
 */
static boolean loadDat(String path) {
    try {
        final ByteArray bytes = ByteArray.createByteArray(path + Predefine.BIN_EXT);
        if (bytes == null) {
            return false;
        }

        int entryCount = bytes.nextInt();
        // Compatibility measure: a negative value means the header stores -entryCount custom natures.
        if (entryCount < 0) {
            for (int k = entryCount + 1; k <= 0; ++k) {
                Nature.create(bytes.nextString());
            }
            entryCount = bytes.nextInt();
        }

        final CoreDictionary.Attribute[] attributes = new CoreDictionary.Attribute[entryCount];
        // Snapshot taken after the custom natures above have been created.
        final Nature[] natureByIndex = Nature.values();
        for (int i = 0; i < entryCount; ++i) {
            // First int is the total frequency, second is the number of natures.
            final int totalFrequency = bytes.nextInt();
            final int natureCount = bytes.nextInt();
            final CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(natureCount);
            attributes[i] = attribute;
            attribute.totalFrequency = totalFrequency;
            for (int j = 0; j < natureCount; ++j) {
                attribute.nature[j] = natureByIndex[bytes.nextInt()];
                attribute.frequency[j] = bytes.nextInt();
            }
        }
        return dat.load(bytes, attributes);
    } catch (Exception e) {
        logger.warning("读取失败,问题发生在" + TextUtility.exceptionToString(e));
        return false;
    }
}
Use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.
Class CustomDictionary, method loadMainDictionary.
/**
 * Loads the main custom dictionary, preferring the binary cache.
 *
 * <p>If {@code loadDat(mainPath)} succeeds, nothing else is done. Otherwise every
 * entry of {@code path} is loaded into a sorted map (an entry may carry a default
 * nature after a space), the double-array trie is built from that map, and the
 * result is written to {@code mainPath + Predefine.BIN_EXT} as a cache.
 *
 * <p>The cache output stream is always closed, including when writing fails
 * part-way.
 *
 * @param mainPath path of the main dictionary, also used as the base name of the cache file
 * @return {@code false} if a {@link FileNotFoundException} or {@link IOException} occurred;
 *         {@code true} otherwise, including when caching failed with any other exception
 */
private static boolean loadMainDictionary(String mainPath) {
    logger.info("自定义词典开始加载:" + mainPath);
    if (loadDat(mainPath))
        return true;
    dat = new DoubleArrayTrie<CoreDictionary.Attribute>();
    TreeMap<String, CoreDictionary.Attribute> map = new TreeMap<String, CoreDictionary.Attribute>();
    LinkedHashSet<Nature> customNatureCollector = new LinkedHashSet<Nature>();
    try {
        for (String p : path) {
            Nature defaultNature = Nature.n;
            int cut = p.indexOf(' ');
            if (cut > 0) {
                // The entry has the form "<file> <nature>": split off the default nature.
                String nature = p.substring(cut + 1);
                p = p.substring(0, cut);
                try {
                    defaultNature = LexiconUtility.convertStringToNature(nature, customNatureCollector);
                } catch (Exception e) {
                    logger.severe("配置文件【" + p + "】写错了!" + e);
                    continue;
                }
            }
            logger.info("以默认词性[" + defaultNature + "]加载自定义词典" + p + "中……");
            boolean success = load(p, defaultNature, map, customNatureCollector);
            if (!success)
                logger.warning("失败:" + p);
        }
        if (map.size() == 0) {
            logger.warning("没有加载到任何词条");
            // Blank placeholder so the trie is not built from an empty map.
            // NOTE(review): the value is null, so attribute.save(out) below throws
            // NullPointerException for it and caching ends in the generic catch — confirm this is intended.
            map.put(Predefine.TAG_OTHER, null);
        }
        logger.info("正在构建DoubleArrayTrie……");
        dat.build(map);
        // Cache as a dat file so the next load is much faster.
        logger.info("正在缓存词典为dat文件……");
        DataOutputStream out = new DataOutputStream(IOUtil.newOutputStream(mainPath + Predefine.BIN_EXT));
        try {
            // Header: user-defined natures.
            IOUtil.writeCustomNature(out, customNatureCollector);
            // Body: attribute count, then the attributes in key order, then the trie itself.
            out.writeInt(map.size());
            for (CoreDictionary.Attribute attribute : map.values()) {
                attribute.save(out);
            }
            dat.save(out);
        } finally {
            // Close on every path; previously a failure while writing leaked the stream.
            out.close();
        }
    } catch (FileNotFoundException e) {
        logger.severe("自定义词典" + mainPath + "不存在!" + e);
        return false;
    } catch (IOException e) {
        logger.severe("自定义词典" + mainPath + "读取错误!" + e);
        return false;
    } catch (Exception e) {
        logger.warning("自定义词典" + mainPath + "缓存失败!\n" + TextUtility.exceptionToString(e));
    }
    return true;
}
Use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.
Class OrganizationRecognition, method roleTag.
/**
 * Produces one organization-role tag item per vertex, in path order.
 *
 * <p>Some natures are tagged directly: {@code nrf} with a total frequency of
 * at most 1000 becomes {@code NT.F}; {@code ni}/{@code nic}/{@code nis}/{@code nit}
 * become {@code NT.K} with an additional {@code NT.D} label; {@code m} becomes
 * {@code NT.M}. Every other vertex (including {@code nrf} above the frequency
 * threshold) is looked up by word in {@code OrganizationDictionary.dictionary},
 * falling back to {@code NT.Z} when the word is absent.
 *
 * @param vertexList vertices to tag
 * @param wordNetAll not used by this method
 * @return the tag items, one per vertex
 */
public static List<EnumItem<NT>> roleTag(List<Vertex> vertexList, WordNet wordNetAll) {
    List<EnumItem<NT>> tagList = new LinkedList<EnumItem<NT>>();
    for (Vertex vertex : vertexList) {
        // Stays null when the nature gives no direct tag and the dictionary must decide.
        EnumItem<NT> tag = null;
        switch (vertex.guessNature()) {
            case nrf:
                if (vertex.getAttribute().totalFrequency <= 1000) {
                    tag = new EnumItem<NT>(NT.F, 1000);
                }
                break;
            case ni:
            case nic:
            case nis:
            case nit:
                tag = new EnumItem<NT>(NT.K, 1000);
                tag.addLabel(NT.D, 1000);
                break;
            case m:
                tag = new EnumItem<NT>(NT.M, 1000);
                break;
            default:
                break;
        }
        if (tag == null) {
            // Look up by the equivalent word, which the original comment notes is more precise.
            tag = OrganizationDictionary.dictionary.get(vertex.word);
            if (tag == null) {
                tag = new EnumItem<NT>(NT.Z, OrganizationDictionary.transformMatrixDictionary.getTotalFrequency(NT.Z));
            }
        }
        tagList.add(tag);
    }
    return tagList;
}
Use of com.hankcs.hanlp.corpus.tag.Nature in project HanLP by hankcs.
Class AtomNode, method getNature.
/**
 * Returns the part of speech of this atom, derived from {@code nPOS}.
 *
 * <p>Side effect: for numeric, letter and "single" atoms, {@code sWord} is
 * overwritten with the placeholder "未##数" or "未##串".
 *
 * @return {@code Nature.m} for index/number atoms and for single atoms matching
 *         {@code Predefine.PATTERN_FLOAT_NUMBER}; {@code Nature.w} for delimiters;
 *         {@code Nature.nx} for letters and for other single atoms;
 *         {@code Nature.nz} for Chinese atoms and any other type
 */
public Nature getNature() {
    switch (nPOS) {
        case Predefine.CT_INDEX:
        case Predefine.CT_NUM:
            sWord = "未##数";
            return Nature.m;
        case Predefine.CT_DELIMITER:
            return Nature.w;
        case Predefine.CT_LETTER:
            sWord = "未##串";
            return Nature.nx;
        case Predefine.CT_SINGLE:
            // The original comment cites "12021-2129-3121" here, presumably as an example of such an atom.
            // A float-number match makes it a numeral; anything else is treated as a string.
            if (Predefine.PATTERN_FLOAT_NUMBER.matcher(sWord).matches()) {
                sWord = "未##数";
                return Nature.m;
            }
            sWord = "未##串";
            return Nature.nx;
        case Predefine.CT_CHINESE:
        default:
            return Nature.nz;
    }
}
Aggregations