use of com.hankcs.hanlp.seg.NShort.Path.AtomNode in project HanLP by hankcs.
the class AhoCorasickDoubleArrayTrieSegment method segSentence.
/**
 * Segments one sentence by taking, at every start offset, the longest dictionary word
 * reported by {@code trie.parseText}.
 * <p>
 * {@code wordNet[i]} holds the length of the longest hit that begins at {@code i}
 * (1 when nothing matched). When {@code config.speechTagging} is enabled, the first
 * nature of the matching attribute is stored alongside it, and stretches that received
 * no nature are split by {@link #quickAtomSegment(char[], int, int)} and tagged with
 * each atom's nature. Terms are then emitted left to right, each one advancing by its
 * recorded length; a position that still has no nature is tagged {@code Nature.nz}.
 *
 * @param sentence the characters to segment
 * @return the terms in text order, each with {@code offset} set to its start index;
 *         an empty list when no dictionary has been loaded ({@code trie == null})
 */
@Override
protected List<Term> segSentence(char[] sentence) {
    if (trie == null) {
        logger.warning("还未加载任何词典");
        return Collections.emptyList();
    }
    // Length of the longest word starting at each offset; a lone character by default.
    final int[] wordNet = new int[sentence.length];
    Arrays.fill(wordNet, 1);
    final Nature[] natureArray = config.speechTagging ? new Nature[sentence.length] : null;
    trie.parseText(sentence, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {
        @Override
        public void hit(int begin, int end, CoreDictionary.Attribute value) {
            int length = end - begin;
            // Keep only the longest hit per start offset.
            if (length > wordNet[begin]) {
                wordNet[begin] = length;
                if (config.speechTagging) {
                    natureArray[begin] = value.nature[0];
                }
            }
        }
    });
    LinkedList<Term> termList = new LinkedList<Term>();
    if (config.speechTagging) {
        for (int i = 0; i < natureArray.length; ) {
            if (natureArray[i] == null) {
                // Untagged gap: extend it up to the next offset that has a nature.
                int j = i + 1;
                for (; j < natureArray.length; ++j) {
                    if (natureArray[j] != null)
                        break;
                }
                List<AtomNode> atomNodeList = quickAtomSegment(sentence, i, j);
                for (AtomNode atomNode : atomNodeList) {
                    if (atomNode.sWord.length() >= wordNet[i]) {
                        wordNet[i] = atomNode.sWord.length();
                        natureArray[i] = atomNode.getNature();
                        i += wordNet[i];
                    }
                }
                i = j;
            } else {
                // Skip the whole matched word, exactly as the emission loop below does.
                // Stepping a single character would start the next gap scan inside this
                // word, anchoring its atoms at offsets that are never emitted and leaving
                // the untagged text after the word as single characters tagged nz.
                i += wordNet[i];
            }
        }
    }
    for (int i = 0; i < wordNet.length; ) {
        Term term = new Term(new String(sentence, i, wordNet[i]), config.speechTagging ? (natureArray[i] == null ? Nature.nz : natureArray[i]) : null);
        term.offset = i;
        termList.add(term);
        i += wordNet[i];
    }
    return termList;
}
use of com.hankcs.hanlp.seg.NShort.Path.AtomNode in project HanLP by hankcs.
the class Segment method quickAtomSegment.
/**
 * Fast atom segmentation: splits {@code charArray[start, end)} into consecutive runs of
 * characters that share the same {@code CharType}, one {@link AtomNode} per run.
 * <p>
 * A {@code '.'} that follows a numeric run and is itself followed by a digit is treated
 * as a decimal point, so the digits after it stay in the same numeric atom. A {@code '.'}
 * that is not followed by a digit (for example a sentence-final full stop after a number)
 * is left out of the number and forms its own atom.
 *
 * @param charArray the text being segmented
 * @param start     index of the first character to segment (inclusive)
 * @param end       index one past the last character to segment (exclusive)
 * @return the atoms in text order, each carrying the character type of its run;
 *         an empty list when the range is empty
 */
protected static List<AtomNode> quickAtomSegment(char[] charArray, int start, int end) {
    List<AtomNode> atomNodeList = new LinkedList<AtomNode>();
    if (start >= end) {
        // Nothing to segment; also avoids reading charArray[start] past the end of the array.
        return atomNodeList;
    }
    int offsetAtom = start;
    int preType = CharType.get(charArray[offsetAtom]);
    int curType;
    while (++offsetAtom < end) {
        curType = CharType.get(charArray[offsetAtom]);
        if (curType != preType) {
            // Floating-point recognition: only absorb the dot when a digit really follows it.
            if (charArray[offsetAtom] == '.' && preType == CharType.CT_NUM
                    && offsetAtom + 1 < end
                    && CharType.get(charArray[offsetAtom + 1]) == CharType.CT_NUM) {
                // Consume the fractional digits; stop on the first non-digit.
                while (++offsetAtom < end) {
                    curType = CharType.get(charArray[offsetAtom]);
                    if (curType != CharType.CT_NUM)
                        break;
                }
            }
            atomNodeList.add(new AtomNode(new String(charArray, start, offsetAtom - start), preType));
            start = offsetAtom;
        }
        preType = curType;
    }
    // Flush the trailing run. When a decimal number ran up to 'end', the outer loop's
    // increment leaves offsetAtom at end + 1 and that run has already been added above.
    if (offsetAtom == end)
        atomNodeList.add(new AtomNode(new String(charArray, start, offsetAtom - start), preType));
    return atomNodeList;
}
use of com.hankcs.hanlp.seg.NShort.Path.AtomNode in project HanLP by hankcs.
the class WordNet method add.
/**
 * Adds one vertex per atom, starting at row {@code line} and moving forward by the
 * length of each atom's text.
 * <p>
 * The atom's type decides how the vertex is labelled: index and number atoms become
 * the numeral placeholder, single-type atoms become the string placeholder, delimiters
 * and "other" atoms are tagged as punctuation, and everything else keeps its own text
 * with the default noun tag.
 *
 * @param line        row at which the first atom starts
 * @param atomSegment atoms to add, in text order
 */
public void add(int line, List<AtomNode> atomSegment) {
    int consumed = 0;
    for (AtomNode atom : atomSegment) {
        final String realWord = atom.sWord;
        final int type = atom.nPOS;

        // Defaults: keep the atom's own text, tag it as a noun, no dictionary id.
        String word = realWord;
        Nature nature = Nature.n;
        int id = -1;

        if (type == Predefine.CT_INDEX || type == Predefine.CT_NUM) {
            nature = Nature.m;
            word = "未##数";
            id = CoreDictionary.M_WORD_ID;
        } else if (type == Predefine.CT_DELIMITER || type == Predefine.CT_OTHER) {
            nature = Nature.w;
        } else if (type == Predefine.CT_SINGLE) {
            nature = Nature.nx;
            word = "未##串";
            id = CoreDictionary.X_WORD_ID;
        }

        // Every atom vertex gets a fixed frequency of 10000.
        // NOTE(review): the original comment described these generic tokens as being on
        // the order of 100,000 -- confirm which value is intended.
        CoreDictionary.Attribute attribute = new CoreDictionary.Attribute(nature, 10000);
        add(line + consumed, new Vertex(word, realWord, attribute, id));
        consumed += realWord.length();
    }
}
use of com.hankcs.hanlp.seg.NShort.Path.AtomNode in project HanLP by hankcs.
the class DoubleArrayTrieSegment method segSentence.
/**
 * Segments one sentence by taking, at every start offset, the longest word found in the
 * core dictionary and, when {@code config.useCustomDictionary} is enabled, in the custom
 * dictionary as well.
 * <p>
 * {@code wordNet[i]} holds the length of the longest hit that begins at {@code i}
 * (1 when nothing matched). When {@code config.speechTagging} is enabled, the first
 * nature of the matching attribute is stored alongside it, and stretches that received
 * no nature are split by {@link #quickAtomSegment(char[], int, int)} and tagged with
 * each atom's nature. Terms are then emitted left to right, each one advancing by its
 * recorded length; a position that still has no nature is tagged {@code Nature.nz}.
 *
 * @param sentence the characters to segment
 * @return the terms in text order, each with {@code offset} set to its start index
 */
@Override
protected List<Term> segSentence(char[] sentence) {
    // Length of the longest word starting at each offset; a lone character by default.
    final int[] wordNet = new int[sentence.length];
    Arrays.fill(wordNet, 1);
    final Nature[] natureArray = config.speechTagging ? new Nature[sentence.length] : null;
    DoubleArrayTrie<CoreDictionary.Attribute>.Searcher<CoreDictionary.Attribute> searcher = CoreDictionary.trie.getSearcher(sentence, 0);
    while (searcher.next()) {
        int length = searcher.length;
        // Keep only the longest core-dictionary hit per start offset.
        if (length > wordNet[searcher.begin]) {
            wordNet[searcher.begin] = length;
            if (config.speechTagging) {
                natureArray[searcher.begin] = searcher.value.nature[0];
            }
        }
    }
    if (config.useCustomDictionary) {
        CustomDictionary.parseText(sentence, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {
            @Override
            public void hit(int begin, int end, CoreDictionary.Attribute value) {
                int length = end - begin;
                // A custom word wins only when it is strictly longer than what is recorded.
                if (length > wordNet[begin]) {
                    wordNet[begin] = length;
                    if (config.speechTagging) {
                        natureArray[begin] = value.nature[0];
                    }
                }
            }
        });
    }
    LinkedList<Term> termList = new LinkedList<Term>();
    if (config.speechTagging) {
        for (int i = 0; i < natureArray.length; ) {
            if (natureArray[i] == null) {
                // Untagged gap: extend it up to the next offset that has a nature.
                int j = i + 1;
                for (; j < natureArray.length; ++j) {
                    if (natureArray[j] != null)
                        break;
                }
                List<AtomNode> atomNodeList = quickAtomSegment(sentence, i, j);
                for (AtomNode atomNode : atomNodeList) {
                    if (atomNode.sWord.length() >= wordNet[i]) {
                        wordNet[i] = atomNode.sWord.length();
                        natureArray[i] = atomNode.getNature();
                        i += wordNet[i];
                    }
                }
                i = j;
            } else {
                // Skip the whole matched word, exactly as the emission loop below does.
                // Stepping a single character would start the next gap scan inside this
                // word, anchoring its atoms at offsets that are never emitted and leaving
                // the untagged text after the word as single characters tagged nz.
                i += wordNet[i];
            }
        }
    }
    for (int i = 0; i < wordNet.length; ) {
        Term term = new Term(new String(sentence, i, wordNet[i]), config.speechTagging ? (natureArray[i] == null ? Nature.nz : natureArray[i]) : null);
        term.offset = i;
        termList.add(term);
        i += wordNet[i];
    }
    return termList;
}
use of com.hankcs.hanlp.seg.NShort.Path.AtomNode in project HanLP by hankcs.
the class Segment method simpleAtomSegment.
/**
 * Simple atom segmentation: treats the whole range as a single word.
 *
 * @param charArray the text being segmented
 * @param start     index of the first character (inclusive)
 * @param end       index one past the last character (exclusive)
 * @return a one-element list holding {@code charArray[start, end)} as a single atom
 *         typed {@code Predefine.CT_LETTER}
 */
protected static List<AtomNode> simpleAtomSegment(char[] charArray, int start, int end) {
    final List<AtomNode> result = new LinkedList<AtomNode>();
    final String wholeRange = new String(charArray, start, end - start);
    final AtomNode onlyAtom = new AtomNode(wholeRange, Predefine.CT_LETTER);
    result.add(onlyAtom);
    return result;
}
Aggregations