Usage of com.hankcs.hanlp.seg.common.Vertex in the HanLP project by hankcs: class Segment, method combineWords.
/**
 * Merges a run of consecutive vertices into a single vertex.
 *
 * @param wordNet word lattice as a vertex array; entries may be null
 * @param start   first index of the run (inclusive)
 * @param end     end of the run (exclusive)
 * @param value   attribute to assign to the resulting vertex
 */
private static void combineWords(Vertex[] wordNet, int start, int end, CoreDictionary.Attribute value) {
    if (start + 1 == end) {
        // Fast path: a run of one word needs no merging — just retag it.
        wordNet[start].attribute = value;
        return;
    }
    // Concatenate the surface forms of every non-null vertex in [start, end),
    // clearing each slot as it is consumed.
    StringBuilder merged = new StringBuilder();
    for (int i = start; i < end; ++i) {
        Vertex vertex = wordNet[i];
        if (vertex != null) {
            merged.append(vertex.realWord);
            wordNet[i] = null;
        }
    }
    // The merged word occupies the run's first slot.
    wordNet[start] = new Vertex(merged.toString(), value);
}
Usage of com.hankcs.hanlp.seg.common.Vertex in the HanLP project by hankcs: class ViterbiSegment, method segSentence.
/**
 * Segments one sentence with the shortest-path (Viterbi) model:
 * builds the word lattice, decodes a coarse path, then applies optional
 * post-processing (custom dictionary, number/quantifier merging, NER,
 * index-mode expansion, POS tagging).
 *
 * @param sentence the sentence as a character array
 * @return the segmented term list
 */
@Override
protected List<Term> segSentence(char[] sentence) {
    // long start = System.currentTimeMillis();
    WordNet wordNetAll = new WordNet(sentence);
    //////////////// Build the word lattice ////////////////////
    GenerateWordNet(wordNetAll);
    // System.out.println("构图:" + (System.currentTimeMillis() - start));
    if (HanLP.Config.DEBUG) {
        System.out.printf("粗分词网:\n%s\n", wordNetAll);
    }
    // start = System.currentTimeMillis();
    // Decode the coarse segmentation path over the full lattice.
    List<Vertex> vertexList = viterbi(wordNetAll);
    if (config.useCustomDictionary) {
        // In index mode the lattice is passed too, so merged words are
        // visible to decorateResultForIndexMode below.
        if (config.indexMode)
            combineByCustomDictionary(vertexList, wordNetAll);
        else
            combineByCustomDictionary(vertexList);
    }
    if (HanLP.Config.DEBUG) {
        System.out.println("粗分结果" + convert(vertexList, false));
    }
    // Number-and-quantifier recognition
    if (config.numberQuantifierRecognize) {
        mergeNumberQuantifier(vertexList, wordNetAll, config);
    }
    // Named entity recognition
    if (config.ner) {
        // The "optimum" lattice is seeded from the coarse path; each enabled
        // recognizer adds candidate entity vertices to it.
        WordNet wordNetOptimum = new WordNet(sentence, vertexList);
        int preSize = wordNetOptimum.size();
        if (config.nameRecognize) {
            PersonRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
        }
        if (config.translatedNameRecognize) {
            TranslatedPersonRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
        }
        if (config.japaneseNameRecognize) {
            JapanesePersonRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
        }
        if (config.placeRecognize) {
            PlaceRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
        }
        if (config.organizationRecognize) {
            // Cascaded HMM — this pass's output becomes the next-level HMM's input.
            vertexList = viterbi(wordNetOptimum);
            wordNetOptimum.clear();
            wordNetOptimum.addAll(vertexList);
            preSize = wordNetOptimum.size();
            OrganizationRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
        }
        // Only re-decode when some recognizer actually added vertices.
        if (wordNetOptimum.size() != preSize) {
            vertexList = viterbi(wordNetOptimum);
            if (HanLP.Config.DEBUG) {
                System.out.printf("细分词网:\n%s\n", wordNetOptimum);
            }
        }
    }
    // In index mode, expand the result into all sub-word segments
    if (config.indexMode) {
        return decorateResultForIndexMode(vertexList, wordNetAll);
    }
    // Optional part-of-speech tagging
    if (config.speechTagging) {
        speechTagging(vertexList);
    }
    return convert(vertexList, config.offset);
}
Usage of com.hankcs.hanlp.seg.common.Vertex in the HanLP project by hankcs: class WordBasedGenerativeModelSegment, method mergeContinueNumIntoOne.
/**
 * Merges runs of adjacent numeric vertices (Arabic or Chinese numerals)
 * into single number vertices, modifying the list in place.
 *
 * @param linkedArray vertex list to scan and compact in place
 */
private static void mergeContinueNumIntoOne(List<Vertex> linkedArray) {
    if (linkedArray.size() < 2)
        return;
    ListIterator<Vertex> listIterator = linkedArray.listIterator();
    Vertex next = listIterator.next();
    Vertex current = next;
    while (listIterator.hasNext()) {
        next = listIterator.next();
        // System.out.println("current:" + current + " next:" + next);
        if ((TextUtility.isAllNum(current.realWord) || TextUtility.isAllChineseNum(current.realWord)) && (TextUtility.isAllNum(next.realWord) || TextUtility.isAllChineseNum(next.realWord))) {
            /////////// Logically equivalent to: current.realWord = current.realWord + next.realWord;
            // but `current` may be shared by several lattice paths, so it must
            // be replaced by a fresh copy rather than mutated in place.
            current = Vertex.newNumberInstance(current.realWord + next.realWord);
            // Step back two positions to current's slot, swap in the merged
            // vertex, then walk forward again so the cursor sits just past
            // `next` (which is about to be removed).
            listIterator.previous();
            listIterator.previous();
            listIterator.set(current);
            listIterator.next();
            listIterator.next();
            /////////// end of the copy-and-replace
            // System.out.println("before:" + linkedArray);
            // Remove `next`; its digits were absorbed into `current`.
            listIterator.remove();
            // System.out.println("after:" + linkedArray);
        } else {
            current = next;
        }
    }
    // logger.trace("数字识别后:" + Graph.parseResult(linkedArray));
}
Usage of com.hankcs.hanlp.seg.common.Vertex in the HanLP project by hankcs: class WordBasedGenerativeModelSegment, method CheckDateElements.
//====================================================================
// Date/time element detection over a vertex list (in-place):
// 1. If the current word is a number and the next word is one of
//    月/日/时/分/秒 (month/day/hour/minute/second) or 月份 (month),
//    merge them and tag the result as time.
// 2. If the current word is a number usable as a year and the next word
//    is 年 (year), merge and tag as time; otherwise tag as a number.
// 3. If the number's last character is 点 (o'clock), tag it as time.
// 4. If the last character is none of "∶·./" (full- or half-width),
//    the token is a plain number.
// 5. If the last character IS one of "∶·./" and the token is longer than
//    one character, strip the trailing character (e.g. "1.") and re-insert
//    it as punctuation.
//====================================================================
private static void CheckDateElements(List<Vertex> linkedArray) {
    if (linkedArray.size() < 2)
        return;
    ListIterator<Vertex> listIterator = linkedArray.listIterator();
    Vertex next = listIterator.next();
    Vertex current = next;
    while (listIterator.hasNext()) {
        next = listIterator.next();
        if (TextUtility.isAllNum(current.realWord) || TextUtility.isAllChineseNum(current.realWord)) {
            //===== Rule 1: number followed by 月/日/时/分/秒 or 月份 — merge as time
            String nextWord = next.realWord;
            if ((nextWord.length() == 1 && "月日时分秒".contains(nextWord)) || (nextWord.length() == 2 && nextWord.equals("月份"))) {
                current = Vertex.newTimeInstance(current.realWord + next.realWord);
                // Rewind to current's slot, swap in the merged vertex, advance
                // past it, then delete the absorbed `next` vertex.
                listIterator.previous();
                listIterator.previous();
                listIterator.set(current);
                listIterator.next();
                listIterator.next();
                listIterator.remove();
            } else //===== Rule 2: year-capable number followed by 年 — merge as time, else it is a number
            if (nextWord.equals("年")) {
                if (TextUtility.isYearTime(current.realWord)) {
                    current = Vertex.newTimeInstance(current.realWord + next.realWord);
                    listIterator.previous();
                    listIterator.previous();
                    listIterator.set(current);
                    listIterator.next();
                    listIterator.next();
                    listIterator.remove();
                } else //===== otherwise the current word is just a number =====
                {
                    current.confirmNature(Nature.m);
                }
            } else {
                //===== Rule 3: a trailing 点 marks the number as a time
                if (current.realWord.endsWith("点")) {
                    current.confirmNature(Nature.t, true);
                } else {
                    char[] tmpCharArray = current.realWord.toCharArray();
                    String lastChar = String.valueOf(tmpCharArray[tmpCharArray.length - 1]);
                    //===== Rule 4: no separator-like trailing char — plain number
                    if (!"∶·././".contains(lastChar)) {
                        current.confirmNature(Nature.m, true);
                    } else //===== Rule 5: trailing separator and length > 1 — strip it, e.g. "1."
                    if (current.realWord.length() > 1) {
                        char last = current.realWord.charAt(current.realWord.length() - 1);
                        current = Vertex.newNumberInstance(current.realWord.substring(0, current.realWord.length() - 1));
                        listIterator.previous();
                        listIterator.previous();
                        listIterator.set(current);
                        listIterator.next();
                        // Re-insert the stripped separator as a punctuation vertex.
                        listIterator.add(Vertex.newPunctuationInstance(String.valueOf(last)));
                    }
                }
            }
        }
        current = next;
    }
    // logger.trace("日期识别后:" + Graph.parseResult(linkedArray));
}
Usage of com.hankcs.hanlp.seg.common.Vertex in the HanLP project by hankcs: class NShortSegment, method segSentence.
/**
 * Segments one sentence with the N-shortest-path model: generates the top
 * coarse candidate paths, optionally enriches them with named entities,
 * then decodes and post-processes the final path.
 *
 * @param sentence the sentence as a character array
 * @return the segmented term list
 */
@Override
public List<Term> segSentence(char[] sentence) {
    WordNet wordNetOptimum = new WordNet(sentence);
    WordNet wordNetAll = new WordNet(sentence);
    // char[] charArray = text.toCharArray();
    // Coarse segmentation — the 2 shortest paths through the lattice
    List<List<Vertex>> coarseResult = BiSegment(sentence, 2, wordNetOptimum, wordNetAll);
    boolean NERexists = false;
    for (List<Vertex> vertexList : coarseResult) {
        if (HanLP.Config.DEBUG) {
            System.out.println("粗分结果" + convert(vertexList, false));
        }
        // Named entity recognition
        if (config.ner) {
            // Seed the optimum lattice with this candidate path; recognizers
            // add candidate entity vertices to it.
            wordNetOptimum.addAll(vertexList);
            int preSize = wordNetOptimum.size();
            if (config.nameRecognize) {
                PersonRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
            }
            if (config.translatedNameRecognize) {
                TranslatedPersonRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
            }
            if (config.japaneseNameRecognize) {
                JapanesePersonRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
            }
            if (config.placeRecognize) {
                PlaceRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
            }
            if (config.organizationRecognize) {
                // Cascaded HMM — this pass's output becomes the next-level HMM's input.
                vertexList = Dijkstra.compute(GenerateBiGraph(wordNetOptimum));
                wordNetOptimum.addAll(vertexList);
                OrganizationRecognition.Recognition(vertexList, wordNetOptimum, wordNetAll);
            }
            // Any growth of the optimum lattice means an entity was found.
            if (!NERexists && preSize != wordNetOptimum.size()) {
                NERexists = true;
            }
        }
    }
    List<Vertex> vertexList = coarseResult.get(0);
    if (NERexists) {
        // Re-decode over the entity-enriched lattice.
        Graph graph = GenerateBiGraph(wordNetOptimum);
        vertexList = Dijkstra.compute(graph);
        if (HanLP.Config.DEBUG) {
            System.out.printf("细分词网:\n%s\n", wordNetOptimum);
            System.out.printf("细分词图:%s\n", graph.printByTo());
        }
    }
    // Number-and-quantifier recognition
    if (config.numberQuantifierRecognize) {
        mergeNumberQuantifier(vertexList, wordNetAll, config);
    }
    // In index mode, expand the result into all sub-word segments
    if (config.indexMode) {
        return decorateResultForIndexMode(vertexList, wordNetAll);
    }
    // Optional part-of-speech tagging
    if (config.speechTagging) {
        speechTagging(vertexList);
    }
    if (config.useCustomDictionary) {
        if (config.indexMode)
            combineByCustomDictionary(vertexList, wordNetAll);
        else
            combineByCustomDictionary(vertexList);
    }
    return convert(vertexList, config.offset);
}
End of aggregated usage examples.