use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
the class NeuralNetworkDependencyParser method parse.
/**
 * Parses a segmented sentence into a dependency tree.
 * <p>
 * The terms are turned into a word list and an 863 part-of-speech list, handed to
 * {@code parser_dll}, and the returned heads / relations are assembled into CoNLL words.
 *
 * @param termList the segmented sentence; {@code nature} of every term is dereferenced
 * @return the sentence in CoNLL form with {@code HEAD} and {@code DEPREL} set on every word
 */
@Override
public CoNLLSentence parse(List<Term> termList) {
    final int size = termList.size();
    List<String> tags = PosTagUtil.to863(termList);
    List<String> words = new ArrayList<String>(size);
    for (Term term : termList) {
        words.add(term.word);
    }

    // Passed in empty and read back below, i.e. parser_dll.parse populates them.
    List<Integer> headPositions = new ArrayList<Integer>(size);
    List<String> relations = new ArrayList<String>(size);
    parser_dll.parse(words, tags, headPositions, relations);

    // First pass: create every node (ids are 1-based) so the second pass can link them.
    CoNLLWord[] nodes = new CoNLLWord[size];
    int position = 0;
    for (Term term : termList) {
        CoNLLWord node = new CoNLLWord(position + 1, words.get(position), tags.get(position), term.nature.toString());
        node.DEPREL = relations.get(position);
        nodes[position] = node;
        ++position;
    }

    // Second pass: heads are 1-based positions; anything below 1 denotes the root.
    for (int i = 0; i < size; ++i) {
        int headPosition = headPositions.get(i) - 1;
        nodes[i].HEAD = headPosition < 0 ? CoNLLWord.ROOT : nodes[headPosition];
    }
    return new CoNLLSentence(nodes);
}
use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
the class PosTagUtil method to863.
/**
 * Converts the part-of-speech of every term to the 863 tag set.<br>
 * The tags of the 863 part-of-speech tag set have the following meanings:
 * <pre>
 * Tag  Description          Example        Tag  Description         Example
 * a    adjective            美丽           ni   organization name   保险公司
 * b    other noun-modifier  大型, 西式     nl   location noun       城郊
 * c    conjunction          和, 虽然       ns   geographical name   北京
 * d    adverb               很             nt   temporal noun       近日, 明代
 * e    exclamation          哎             nz   other proper noun   诺贝尔奖
 * g    morpheme             茨, 甥         o    onomatopoeia        哗啦
 * h    prefix               阿, 伪         p    preposition         在, 把
 * i    idiom                百花齐放       q    quantity            个
 * j    abbreviation         公检法         r    pronoun             我们
 * k    suffix               界, 率         u    auxiliary           的, 地
 * m    number               一, 第一       v    verb                跑, 学习
 * n    general noun         苹果           wp   punctuation         ,。!
 * nd   direction noun       右侧           ws   foreign words       CPU
 * nh   person name          杜甫, 汤姆     x    non-lexeme          萄, 翱
 * </pre>
 * A nature without an explicit mapping, as well as a {@code null} nature, is tagged {@code "x"}.
 *
 * @param termList the terms whose natures are converted
 * @return a list holding one 863 tag per term, in the order of {@code termList}
 */
public static List<String> to863(List<Term> termList) {
    List<String> posTagList = new ArrayList<String>(termList.size());
    for (Term term : termList) {
        // "x" (non-lexeme) is the fallback for everything that has no explicit mapping.
        String posTag = "x";
        if (term.nature == null) {
            // Switching on a null enum value throws NullPointerException, and a term may
            // carry no nature at all (e.g. when it was built without POS tagging).
            posTagList.add(posTag);
            continue;
        }
        switch (term.nature) {
            case a:
            case ad:
            case an:
            case ag:
            case al:
                posTag = "a";
                break;
            case b:
            case bg:
            case bl:
                posTag = "b";
                break;
            case c:
            case cc:
                posTag = "c";
                break;
            case d:
            case dg:
            case dl:
                posTag = "d";
                break;
            case e:
            case y:
                posTag = "e";
                break;
            case h:
                posTag = "h";
                break;
            case i:
            case l:
                posTag = "i";
                break;
            case j:
                posTag = "j";
                break;
            case k:
                posTag = "k";
                break;
            case m:
            case mg:
            case mq:
            case Mg:
                posTag = "m";
                break;
            case n:
            case nl:
            case ng:
            case nf:
            case ni:
                posTag = "n";
                break;
            case f:
                posTag = "nd";
                break;
            case nr:
            case nrj:
            case nrf:
            case nr1:
            case nr2:
                posTag = "nh";
                break;
            case nt:
            case ntc:
            case ntcf:
            case ntcb:
            case ntch:
            case nto:
            case ntu:
            case nts:
            case nth:
                posTag = "ni";
                break;
            case s:
                posTag = "nl";
                break;
            case ns:
            case nsf:
                posTag = "ns";
                break;
            case nit:
            case nic:
            case nis:
            case t:
            case tg:
                posTag = "nt";
                break;
            case nh:
            case nhm:
            case nhd:
            case nn:
            case nnt:
            case nnd:
            case nm:
            case nmc:
            case nb:
            case nba:
            case nbc:
            case nbp:
            case nz:
            case g:
            case gm:
            case gp:
            case gc:
            case gb:
            case gbc:
            case gg:
            case gi:
                posTag = "nz";
                break;
            case o:
                posTag = "o";
                break;
            case p:
            case pba:
            case pbei:
                posTag = "p";
                break;
            case q:
            case qg:
            case qv:
            case qt:
                posTag = "q";
                break;
            case r:
            case rr:
            case rz:
            case rzt:
            case rzs:
            case rzv:
            case ry:
            case ryt:
            case rys:
            case ryv:
            case rg:
            case Rg:
                posTag = "r";
                break;
            case u:
            case ud:
            case uj:
            case uz:
            case ug:
            case ul:
            case uv:
            case yg:
            case zg:
            case z:
            case uzhe:
            case ule:
            case uguo:
            case ude1:
            case ude2:
            case ude3:
            case usuo:
            case udeng:
            case uyy:
            case udh:
            case uls:
            case uzhi:
            case ulian:
                posTag = "u";
                break;
            case v:
            case vd:
            case vn:
            case vshi:
            case vyou:
            case vf:
            case vx:
            case vi:
            case vl:
            case vg:
                posTag = "v";
                break;
            case w:
            case wkz:
            case wky:
            case wyz:
            case wyy:
            case wj:
            case ww:
            case wt:
            case wd:
            case wf:
            case wn:
            case wm:
            case ws:
            case wp:
            case wb:
            case wh:
                posTag = "wp";
                break;
            case nx:
                posTag = "ws";
                break;
            case x:
            case xx:
            case xu:
                posTag = "x";
                break;
            default:
                // Unmapped nature: keep the "x" fallback.
                break;
        }
        posTagList.add(posTag);
    }
    return posTagList;
}
use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
the class CoreSynonymDictionary method convert.
/**
 * Converts a segmentation result into a list of synonym items.
 *
 * @param sentence          the segmented sentence
 * @param withUndefinedItem whether words that are missing from the dictionary are kept
 *                          (as "undefined" items) instead of being dropped
 * @return the synonym items, in the order of the terms they were created from
 */
public static List<CommonSynonymDictionary.SynonymItem> convert(List<Term> sentence, boolean withUndefinedItem) {
    List<CommonSynonymDictionary.SynonymItem> items =
            new ArrayList<CommonSynonymDictionary.SynonymItem>(sentence.size());
    for (Term term : sentence) {
        CommonSynonymDictionary.SynonymItem found = get(term.word);
        if (found != null) {
            items.add(found);
            continue;
        }
        // Not in the dictionary: keep a placeholder only when the caller asked for it.
        if (withUndefinedItem) {
            items.add(CommonSynonymDictionary.SynonymItem.createUndefined(term.word));
        }
    }
    return items;
}
use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
the class CRFSegment method toTermList.
/**
 * Converts one path of vertices into the final list of terms.
 *
 * @param vertexList    the vertices of the path, must not be {@code null}
 * @param offsetEnabled whether {@code offset} is computed for each term; when enabled, a term's
 *                      offset is the summed {@code length()} of all terms before it
 * @return the converted terms, one per vertex and in the same order
 */
protected static List<Term> toTermList(List<Vertex> vertexList, boolean offsetEnabled) {
    assert vertexList != null;
    List<Term> terms = new ArrayList<Term>(vertexList.size());
    int runningOffset = 0;
    for (Vertex vertex : vertexList) {
        Term term = convert(vertex);
        if (offsetEnabled) {
            term.offset = runningOffset;
            runningOffset += term.length();
        }
        terms.add(term);
    }
    return terms;
}
use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.
the class HMMSegment method segSentence.
/**
 * Segments a sentence according to the per-character tags produced by {@code model}.
 * <p>
 * A run that starts at a {@code 'b'} tag and extends up to and including the next {@code 'e'}
 * tag becomes one term; if no {@code 'e'} follows, the run extends to the end of the tag array.
 * Every other character becomes a single-character term. All terms are created with a
 * {@code null} nature.
 *
 * @param sentence the characters of the sentence
 * @return the terms in sentence order
 */
@Override
protected List<Term> segSentence(char[] sentence) {
    final char[] labels = model.tag(sentence);
    final List<Term> terms = new LinkedList<Term>();
    int cursor = 0;
    while (cursor < labels.length) {
        if (labels[cursor] != 'b') {
            // Anything that does not open a multi-character word stands on its own.
            terms.add(new Term(new String(sentence, cursor, 1), null));
            ++cursor;
            continue;
        }

        // Scan forward for the closing 'e'; stop at the end of the tags if there is none.
        final int start = cursor;
        int end = start;
        while (end < labels.length && labels[end] != 'e') {
            ++end;
        }
        final int count;
        if (end == labels.length) {
            // Unterminated word: take everything that is left.
            count = end - start;
        } else {
            // Include the character tagged 'e'.
            count = end - start + 1;
        }
        terms.add(new Term(new String(sentence, start, count), null));
        cursor = end + 1;
    }
    return terms;
}
Aggregations