Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs:
the class WordBasedGenerativeModelSegment, method convert.
/**
 * Convert a path into the final segmentation result
 *
 * @param vertexList    the vertex path, including the begin/end auxiliary vertices
 * @param offsetEnabled whether to compute each term's character offset
 * @return the resulting term list
 */
protected static List<Term> convert(List<Vertex> vertexList, boolean offsetEnabled) {
    assert vertexList != null;
    assert vertexList.size() >= 2 : "The path should not be shorter than 2: " + vertexList.toString();
    int length = vertexList.size() - 2;  // exclude the begin and end auxiliary vertices
    List<Term> resultList = new ArrayList<Term>(length);
    Iterator<Vertex> iterator = vertexList.iterator();
    iterator.next();  // skip the begin vertex
    if (offsetEnabled) {
        int offset = 0;
        for (int i = 0; i < length; ++i) {
            Vertex vertex = iterator.next();
            Term term = convert(vertex);
            term.offset = offset;
            offset += term.length();
            resultList.add(term);
        }
    } else {
        for (int i = 0; i < length; ++i) {
            Vertex vertex = iterator.next();
            Term term = convert(vertex);
            resultList.add(term);
        }
    }
    return resultList;
}
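The offsetEnabled branch is what backs the segmenter's public offset switch. A minimal usage sketch, assuming the standard HanLP facade and Segment#enableOffset (neither shown in this snippet):

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;

public class OffsetDemo {
    public static void main(String[] args) {
        // with offsets enabled, convert(vertexList, true) fills term.offset
        Segment segment = HanLP.newSegment().enableOffset(true);
        for (Term term : segment.seg("商品和服务")) {
            // offset is the term's starting character position in the input
            System.out.println(term.word + " @ " + term.offset);
        }
    }
}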
Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs:
the class WordBasedGenerativeModelSegment, method SplitMiddleSlashFromDigitalWords.
//====================================================================
// If the previous word is a number and the current word contains a
// "-" (with more than that single character), split the "-" out of
// the current word.
// For example, "3-4 / 月" should be split into "3 / - / 4 / 月"
//====================================================================
private static void SplitMiddleSlashFromDigitalWords(List<Vertex> linkedArray) {
    if (linkedArray.size() < 2)
        return;
    ListIterator<Vertex> listIterator = linkedArray.listIterator();
    Vertex next = listIterator.next();
    Vertex current = next;
    while (listIterator.hasNext()) {
        next = listIterator.next();
        // System.out.println("current:" + current + " next:" + next);
        Nature currentNature = current.getNature();
        if (currentNature == Nature.nx && (next.hasNature(Nature.q) || next.hasNature(Nature.n))) {
            // the limit must be 2 so that "3-4" yields both halves;
            // a limit of 1 would return the whole string as one element
            String[] param = current.realWord.split("-", 2);
            if (param.length == 2) {
                if (TextUtility.isAllNum(param[0]) && TextUtility.isAllNum(param[1])) {
                    // replace the current vertex with the leading number...
                    current = current.copy();
                    current.realWord = param[0];
                    current.confirmNature(Nature.m);
                    listIterator.previous();
                    listIterator.previous();
                    listIterator.set(current);
                    listIterator.next();
                    // ...then insert the hyphen and the trailing number after it
                    listIterator.add(Vertex.newPunctuationInstance("-"));
                    listIterator.add(Vertex.newNumberInstance(param[1]));
                }
            }
        }
        current = next;
    }
    // logger.trace("After hyphen recognition: " + Graph.parseResult(linkedArray));
}
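The limit argument to String.split is the crux here: with a limit of 1 the returned array always has exactly one element, so the param.length == 2 branch would be dead code, which is why the call above uses a limit of 2. A self-contained illustration of the plain Java semantics:

public class SplitLimitDemo {
    public static void main(String[] args) {
        // limit 1: at most one piece, so the whole string comes back untouched
        String[] one = "3-4".split("-", 1);
        System.out.println(one.length);              // 1
        // limit 2: split at the first "-", yielding both halves
        String[] two = "3-4".split("-", 2);
        System.out.println(two[0] + " | " + two[1]); // 3 | 4
    }
}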
Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs:
the class WordBasedGenerativeModelSegment, method decorateResultForIndexMode.
/**
 * Decorate the result for index mode
 *
 * @param vertexList the vertex path of the best segmentation
 * @param wordNetAll the full word net containing every candidate word
 * @return the term list, with sub-words of long words inserted
 */
protected static List<Term> decorateResultForIndexMode(List<Vertex> vertexList, WordNet wordNetAll) {
    List<Term> termList = new LinkedList<Term>();
    int line = 1;
    ListIterator<Vertex> listIterator = vertexList.listIterator();
    listIterator.next();  // skip the begin vertex
    int length = vertexList.size() - 2;
    for (int i = 0; i < length; ++i) {
        Vertex vertex = listIterator.next();
        Term termMain = convert(vertex);
        termList.add(termMain);
        termMain.offset = line - 1;
        if (vertex.realWord.length() > 2) {
            // lines covered by the long word
            int currentLine = line;
            while (currentLine < line + vertex.realWord.length()) {
                // the words starting on this line
                List<Vertex> vertexListCurrentLine = wordNetAll.get(currentLine);
                // the short words on this line
                for (Vertex smallVertex : vertexListCurrentLine) {
                    if (((termMain.nature == Nature.mq && smallVertex.hasNature(Nature.q)) || smallVertex.realWord.length() > 1) && smallVertex != vertex) {
                        listIterator.add(smallVertex);
                        Term termSub = convert(smallVertex);
                        termSub.offset = currentLine - 1;
                        termList.add(termSub);
                    }
                }
                ++currentLine;
            }
        }
        line += vertex.realWord.length();
    }
    return termList;
}
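A sketch of the observable effect, assuming this method is reached through the public index-mode switch (Segment#enableIndexMode): a word longer than two characters is emitted together with the shorter candidates nested inside it, each carrying its own offset.

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;

public class IndexModeDemo {
    public static void main(String[] args) {
        Segment segment = HanLP.newSegment().enableIndexMode(true);
        for (Term term : segment.seg("中华人民共和国")) {
            // the long word and its sub-words all appear, each with its own offset
            System.out.println(term.word + " [" + term.offset + ", " + (term.offset + term.length()) + ")");
        }
    }
}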
Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs:
the class WordBasedGenerativeModelSegment, method GenerateWordNet.
/**
 * Generate the unigram word net
 *
 * @param wordNetStorage the word net to fill
 */
protected void GenerateWordNet(final WordNet wordNetStorage) {
    final char[] charArray = wordNetStorage.charArray;
    // core dictionary lookup
    DoubleArrayTrie<CoreDictionary.Attribute>.Searcher searcher = CoreDictionary.trie.getSearcher(charArray, 0);
    while (searcher.next()) {
        wordNetStorage.add(searcher.begin + 1, new Vertex(new String(charArray, searcher.begin, searcher.length), searcher.value, searcher.index));
    }
    // custom dictionary lookup
    // if (config.useCustomDictionary)
    // {
    //     searcher = CustomDictionary.dat.getSearcher(charArray, 0);
    //     while (searcher.next())
    //     {
    //         wordNetStorage.add(searcher.begin + 1, new Vertex(new String(charArray, searcher.begin, searcher.length), searcher.value));
    //     }
    // }
    // atomic segmentation, to keep the graph connected
    LinkedList<Vertex>[] vertexes = wordNetStorage.getVertexes();
    for (int i = 1; i < vertexes.length; ) {
        if (vertexes[i].isEmpty()) {
            // find the next non-empty line and fill the gap with atom terms
            int j = i + 1;
            for (; j < vertexes.length - 1; ++j) {
                if (!vertexes[j].isEmpty())
                    break;
            }
            wordNetStorage.add(i, quickAtomSegment(charArray, i - 1, j - 1));
            i = j;
        } else
            i += vertexes[i].getLast().realWord.length();
    }
}
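The searcher.begin + 1 offset reflects the word net's line convention: line i holds the words starting at character i - 1, with line 0 and line length + 1 reserved for the begin/end auxiliary vertices. A small sketch; it assumes WordNet's String constructor and toString, which may differ across versions:

import com.hankcs.hanlp.seg.common.WordNet;

public class WordNetDemo {
    public static void main(String[] args) {
        // line i of the net holds the words starting at character i - 1,
        // hence the searcher.begin + 1 in GenerateWordNet above
        WordNet wordNet = new WordNet("商品和服务");
        System.out.println(wordNet);
    }
}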
Use of com.hankcs.hanlp.seg.common.Vertex in project HanLP by hankcs:
the class Segment, method mergeNumberQuantifier.
/**
 * Merge numbers and quantifiers
 *
 * @param termList   the vertex list of the segmentation result
 * @param wordNetAll the full word net
 * @param config     the segmenter configuration
 */
protected void mergeNumberQuantifier(List<Vertex> termList, WordNet wordNetAll, Config config) {
    if (termList.size() < 4)
        return;
    StringBuilder sbQuantifier = new StringBuilder();
    ListIterator<Vertex> iterator = termList.listIterator();
    iterator.next();  // skip the begin vertex
    int line = 1;
    while (iterator.hasNext()) {
        Vertex pre = iterator.next();
        if (pre.hasNature(Nature.m)) {
            sbQuantifier.append(pre.realWord);
            Vertex cur = null;
            // absorb consecutive number vertices into the buffer
            while (iterator.hasNext() && (cur = iterator.next()).hasNature(Nature.m)) {
                sbQuantifier.append(cur.realWord);
                iterator.remove();
                removeFromWordNet(cur, wordNetAll, line, sbQuantifier.length());
            }
            if (cur != null) {
                if ((cur.hasNature(Nature.q) || cur.hasNature(Nature.qv) || cur.hasNature(Nature.qt))) {
                    if (config.indexMode) {
                        // in index mode, keep the bare number as a separate word in the word net
                        wordNetAll.add(line, new Vertex(sbQuantifier.toString(), new CoreDictionary.Attribute(Nature.m)));
                    }
                    sbQuantifier.append(cur.realWord);
                    iterator.remove();
                    removeFromWordNet(cur, wordNetAll, line, sbQuantifier.length());
                } else {
                    // the last next() in the loop above may be a word without the q nature; count its length
                    line += cur.realWord.length();
                }
            }
            if (sbQuantifier.length() != pre.realWord.length()) {
                // something was merged: rewrite pre as a number-quantifier compound
                pre.realWord = sbQuantifier.toString();
                pre.word = Predefine.TAG_NUMBER;
                pre.attribute = new CoreDictionary.Attribute(Nature.mq);
                pre.wordID = CoreDictionary.M_WORD_ID;
                sbQuantifier.setLength(0);
            }
        }
        sbQuantifier.setLength(0);
        line += pre.realWord.length();
    }
    // System.out.println(wordNetAll);
}
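A hedged usage sketch: assuming this merge is wired to the public switch Segment#enableNumberQuantifierRecognize, a run of number words followed by a quantifier comes back as a single term with the mq nature.

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;

public class MergeDemo {
    public static void main(String[] args) {
        // with the switch on, "三" + "个" merges into one mq term "三个"
        Segment segment = HanLP.newSegment().enableNumberQuantifierRecognize(true);
        for (Term term : segment.seg("买了三个苹果")) {
            System.out.println(term.word + "/" + term.nature);
        }
    }
}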