Use of com.hankcs.hanlp.seg.common.Term in the HanLP project by hankcs.
From the class DoubleArrayTrieSegment, method segSentence:
// Segments a sentence by longest-match lookup in the core dictionary's
// double-array trie, optionally augmented by the custom dictionary, then
// fills any remaining single-character gaps via atomic segmentation.
@Override
protected List<Term> segSentence(char[] sentence) {
char[] charArray = sentence;
// wordNet[i] = length of the longest word starting at character i; defaults
// to 1 so every position can always emit at least a single-character term.
final int[] wordNet = new int[charArray.length];
Arrays.fill(wordNet, 1);
// natureArray[i] = part-of-speech of the word starting at i; only allocated
// when speech tagging is enabled.
final Nature[] natureArray = config.speechTagging ? new Nature[charArray.length] : null;
// Longest-match scan over the core dictionary: keep the longest word found
// at each start position.
DoubleArrayTrie<CoreDictionary.Attribute>.Searcher<CoreDictionary.Attribute> searcher = CoreDictionary.trie.getSearcher(sentence, 0);
while (searcher.next()) {
int length = searcher.length;
if (length > wordNet[searcher.begin]) {
wordNet[searcher.begin] = length;
if (config.speechTagging) {
// Take the first (most likely) nature of the dictionary entry.
natureArray[searcher.begin] = searcher.value.nature[0];
}
}
}
if (config.useCustomDictionary) {
// Custom-dictionary hits override core-dictionary words only when longer.
CustomDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {
@Override
public void hit(int begin, int end, CoreDictionary.Attribute value) {
int length = end - begin;
if (length > wordNet[begin]) {
wordNet[begin] = length;
if (config.speechTagging) {
natureArray[begin] = value.nature[0];
}
}
}
});
}
LinkedList<Term> termList = new LinkedList<Term>();
if (config.speechTagging) {
// For each maximal run [i, j) of positions with no assigned nature
// (i.e. not covered by any dictionary word), run atomic segmentation
// (numbers, latin letters, etc.) and record the resulting atom lengths
// and natures into wordNet/natureArray.
for (int i = 0; i < natureArray.length; ) {
if (natureArray[i] == null) {
int j = i + 1;
for (; j < natureArray.length; ++j) {
if (natureArray[j] != null)
break;
}
List<AtomNode> atomNodeList = quickAtomSegment(charArray, i, j);
for (AtomNode atomNode : atomNodeList) {
// Only record atoms that are at least as long as the current
// entry; i advances by the atom length so consecutive atoms
// land at their own start positions.
if (atomNode.sWord.length() >= wordNet[i]) {
wordNet[i] = atomNode.sWord.length();
natureArray[i] = atomNode.getNature();
i += wordNet[i];
}
}
i = j;
} else {
++i;
}
}
}
// Emit terms by hopping through wordNet; untagged positions fall back to nz.
for (int i = 0; i < wordNet.length; ) {
Term term = new Term(new String(charArray, i, wordNet[i]), config.speechTagging ? (natureArray[i] == null ? Nature.nz : natureArray[i]) : null);
term.offset = i;
termList.add(term);
i += wordNet[i];
}
return termList;
}
Use of com.hankcs.hanlp.seg.common.Term in the HanLP project by hankcs.
From the class CoreSynonymDictionaryEx, method convert:
/**
 * Converts a segmentation result into a list of synonym-id arrays.
 *
 * @param sentence          the segmented sentence
 * @param withUndefinedItem whether to keep words that are absent from the
 *                          synonym dictionary (represented by a sentinel id)
 * @return one Long[] of synonym ids per retained term
 */
public static List<Long[]> convert(List<Term> sentence, boolean withUndefinedItem) {
    List<Long[]> synonymItemList = new ArrayList<Long[]>(sentence.size());
    for (Term term : sentence) {
        // Terms without a part-of-speech tag are treated as stop words.
        if (term.nature == null) {
            continue;
        }
        char posHead = term.nature.toString().charAt(0);
        // Punctuation ('w') is always dropped; numerals ('m') are kept only
        // when written entirely in Chinese characters.
        if (posHead == 'w') {
            continue;
        }
        if (posHead == 'm' && !TextUtility.isAllChinese(term.word)) {
            continue;
        }
        // Drop stop words.
        if (CoreStopWordDictionary.contains(term.word)) {
            continue;
        }
        Long[] synonymIds = get(term.word);
        // logger.trace("{} {}", wordResult.word, Arrays.toString(item));
        if (synonymIds != null) {
            synonymItemList.add(synonymIds);
        } else if (withUndefinedItem) {
            // Sentinel id for out-of-dictionary words.
            synonymItemList.add(new Long[] { Long.MAX_VALUE / 3 });
        }
    }
    return synonymItemList;
}
Use of com.hankcs.hanlp.seg.common.Term in the HanLP project by hankcs.
From the class CommonSynonymDictionary, method rewrite:
/**
 * Rewrites the text by replacing each word with a randomly chosen synonym of
 * equal type, when one exists; other words are copied through unchanged.
 *
 * @param text the input text
 * @return the rewritten text
 */
public String rewrite(String text) {
    List<Term> termList = StandardTokenizer.segment(text.toCharArray());
    // Pre-size slightly above the input length since synonyms may be longer.
    StringBuilder output = new StringBuilder((int) (text.length() * 1.2));
    String previousWord = Predefine.TAG_BIGIN;
    for (Term term : termList) {
        SynonymItem synonymItem = get(term.word);
        Synonym replacement = synonymItem == null ? null : synonymItem.randomSynonym(Type.EQUAL, previousWord);
        if (replacement == null) {
            output.append(term.word);
        } else {
            output.append(replacement.realWord);
        }
        // The previous word (word + POS tag) conditions the next synonym pick.
        previousWord = PosTagCompiler.compile(term.nature.toString(), term.word);
    }
    return output.toString();
}
Use of com.hankcs.hanlp.seg.common.Term in the HanLP project by hankcs.
From the class CRFSegment, method toVertexList:
/**
 * Converts a term list into a vertex list suitable for Viterbi tagging.
 * Side effect: terms found in the core dictionary have their nature set to
 * the first candidate nature of the dictionary entry.
 *
 * @param termList    the segmented terms
 * @param appendStart whether to prepend the begin-of-sentence vertex
 * @return vertices mirroring termList (plus the optional start vertex)
 */
private static List<Vertex> toVertexList(List<Term> termList, boolean appendStart) {
    ArrayList<Vertex> vertexList = new ArrayList<Vertex>(termList.size() + 1);
    if (appendStart) {
        vertexList.add(Vertex.B);
    }
    for (Term term : termList) {
        CoreDictionary.Attribute attribute = CoreDictionary.get(term.word);
        if (attribute != null) {
            term.nature = attribute.nature[0];
        } else {
            // Out-of-dictionary: whitespace-only words become x, others nz.
            Nature fallback = term.word.trim().length() == 0 ? Nature.x : Nature.nz;
            attribute = new CoreDictionary.Attribute(fallback);
        }
        vertexList.add(new Vertex(term.word, attribute));
    }
    return vertexList;
}
Use of com.hankcs.hanlp.seg.common.Term in the HanLP project by hankcs.
From the class CRFSegment, method segSentence:
// Segments a sentence with the CRF model (BMES tagging), then optionally
// applies Viterbi part-of-speech tagging and custom-dictionary merging.
@Override
protected List<Term> segSentence(char[] sentence) {
if (sentence.length == 0)
return Collections.emptyList();
// Characters are normalized for the model; the original chars are kept
// for building the output terms.
char[] sentenceConverted = CharTable.convert(sentence);
Table table = new Table();
table.v = atomSegmentToTable(sentenceConverted);
crfModel.tag(table);
List<Term> termList = new LinkedList<Term>();
if (HanLP.Config.DEBUG) {
System.out.println("CRF标注结果");
System.out.println(table);
}
// offset tracks the character position of row i within the sentence; it is
// advanced by the for-update and manually inside the 'B' branch below.
int offset = 0;
OUTER: for (int i = 0; i < table.v.length; offset += table.v[i][1].length(), ++i) {
String[] line = table.v[i];
// line[2] holds the predicted tag; 'B' opens a multi-character word.
switch(line[2].charAt(0)) {
case 'B':
{
int begin = offset;
// Consume rows until the matching 'E' tag, or the end of input.
while (table.v[i][2].charAt(0) != 'E') {
offset += table.v[i][1].length();
++i;
if (i == table.v.length) {
break;
}
}
if (i == table.v.length) {
// Unterminated word at the end of the sentence: emit what we have
// and stop — re-entering the for-update would index out of bounds.
termList.add(new Term(new String(sentence, begin, offset - begin), null));
break OUTER;
} else
termList.add(new Term(new String(sentence, begin, offset - begin + table.v[i][1].length()), null));
}
break;
default:
{
// Any non-'B' tag: emit the row as a single-token term.
termList.add(new Term(new String(sentence, offset, table.v[i][1].length()), null));
}
break;
}
}
if (config.speechTagging) {
// toVertexList (with the start vertex prepended) sets term.nature for
// in-dictionary words as a side effect; only those are then refined by
// Viterbi — out-of-dictionary terms keep a null nature here.
List<Vertex> vertexList = toVertexList(termList, true);
Viterbi.compute(vertexList, CoreDictionaryTransformMatrixDictionary.transformMatrixDictionary);
int i = 0;
for (Term term : termList) {
// i + 1 skips the start vertex prepended by toVertexList.
if (term.nature != null)
term.nature = vertexList.get(i + 1).guessNature();
++i;
}
}
if (config.useCustomDictionary) {
// Merge adjacent terms that form a custom-dictionary word, then rebuild
// the term list (honoring the configured offset option).
List<Vertex> vertexList = toVertexList(termList, false);
combineByCustomDictionary(vertexList);
termList = toTermList(vertexList, config.offset);
}
return termList;
}
Aggregations