Search in sources:

Example 11 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class NeuralNetworkDependencyParser method parse.

/**
 * Builds a dependency-annotated sentence from a list of segmented terms.
 *
 * <p>The terms are converted to 863 part-of-speech tags via
 * {@code PosTagUtil.to863}, the word forms and tags are handed to
 * {@code parser_dll.parse} together with two empty output lists, and the
 * heads and relation labels read back from those lists are attached to one
 * {@code CoNLLWord} per term.
 *
 * @param termList the segmented terms of one sentence, in order
 * @return a sentence whose words carry {@code HEAD} and {@code DEPREL}
 */
@Override
public CoNLLSentence parse(List<Term> termList) {
    final int size = termList.size();
    List<String> tags = PosTagUtil.to863(termList);
    List<String> forms = new ArrayList<String>(size);
    for (Term each : termList) {
        forms.add(each.word);
    }

    // Output lists; the code below reads one entry per word from each of
    // them after the call returns.
    List<Integer> headIds = new ArrayList<Integer>(size);
    List<String> relations = new ArrayList<String>(size);
    parser_dll.parse(forms, tags, headIds, relations);

    // First pass: create every word, so that the second pass can link a word
    // to a head that appears later in the sentence.
    CoNLLWord[] words = new CoNLLWord[size];
    for (int k = 0; k < size; ++k) {
        String originalTag = termList.get(k).nature.toString();
        CoNLLWord word = new CoNLLWord(k + 1, forms.get(k), tags.get(k), originalTag);
        word.DEPREL = relations.get(k);
        words[k] = word;
    }

    // Second pass: head ids are treated as 1-based; anything below 1 means
    // the word hangs off the artificial root.
    for (int k = 0; k < size; ++k) {
        int headIndex = headIds.get(k) - 1;
        words[k].HEAD = headIndex < 0 ? CoNLLWord.ROOT : words[headIndex];
    }
    return new CoNLLSentence(words);
}
Also used : CoNLLWord(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord) ArrayList(java.util.ArrayList) CoNLLSentence(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence) Term(com.hankcs.hanlp.seg.common.Term)

Example 12 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class PosTagUtil method to863.

/**
 * Converts the part-of-speech tag of every term to the 863 tag set.
 *
 * <p>Meaning of the 863 tags:
 * <pre>
 * Tag  Description          Example     | Tag  Description        Example
 * a    adjective            美丽        | ni   organization name  保险公司
 * b    other noun-modifier  大型, 西式  | nl   location noun      城郊
 * c    conjunction          和, 虽然    | ns   geographical name  北京
 * d    adverb               很          | nt   temporal noun      近日, 明代
 * e    exclamation          哎          | nz   other proper noun  诺贝尔奖
 * g    morpheme             茨, 甥      | o    onomatopoeia       哗啦
 * h    prefix               阿, 伪      | p    preposition        在, 把
 * i    idiom                百花齐放    | q    quantity           个
 * j    abbreviation         公检法      | r    pronoun            我们
 * k    suffix               界, 率      | u    auxiliary          的, 地
 * m    number               一, 第一    | v    verb               跑, 学习
 * n    general noun         苹果        | wp   punctuation        ,。!
 * nd   direction noun       右侧        | ws   foreign words      CPU
 * nh   person name          杜甫, 汤姆  | x    non-lexeme         萄, 翱
 * </pre>
 *
 * <p>A term whose {@code nature} is {@code null}, or whose nature is not
 * listed in the mapping below, is tagged {@code "x"}.
 *
 * @param termList the terms to convert
 * @return the 863 tags, one per term, in the same order as {@code termList}
 */
public static List<String> to863(List<Term> termList) {
    List<String> posTagList = new ArrayList<String>(termList.size());
    for (Term term : termList) {
        // "x" (non-lexeme) is the fallback for anything that is not mapped.
        String posTag = "x";
        if (term.nature == null) {
            // Switching on a null enum reference throws NullPointerException,
            // so untagged terms take the fallback tag instead.
            posTagList.add(posTag);
            continue;
        }
        switch (term.nature) {
            case bg: case b: case bl:
                posTag = "b";
                break;
            case mg: case m: case mq: case Mg:
                posTag = "m";
                break;
            case nl: case n: case ng: case nf: case ni:
                posTag = "n";
                break;
            case nx:
                posTag = "ws";
                break;
            case qg: case q: case qv: case qt:
                posTag = "q";
                break;
            case ud: case uj: case uz: case ug: case ul: case uv:
            case yg: case zg: case z:
            case u: case uzhe: case ule: case uguo:
            case ude1: case ude2: case ude3:
            case usuo: case udeng: case uyy: case udh: case uls: case uzhi: case ulian:
                posTag = "u";
                break;
            case nr: case nrj: case nrf: case nr1: case nr2:
                posTag = "nh";
                break;
            case ns: case nsf:
                posTag = "ns";
                break;
            case nt: case ntc: case ntcf: case ntcb: case ntch:
            case nto: case ntu: case nts: case nth:
                posTag = "ni";
                break;
            case nh: case nhm: case nhd: case nn: case nnt: case nnd:
            case nm: case nmc: case nb: case nba: case nbc: case nbp: case nz:
            case g: case gm: case gp: case gc: case gb: case gbc: case gg: case gi:
                posTag = "nz";
                break;
            case nit: case nic: case nis: case t: case tg:
                posTag = "nt";
                break;
            case j:
                posTag = "j";
                break;
            case i: case l:
                posTag = "i";
                break;
            case s:
                posTag = "nl";
                break;
            case f:
                posTag = "nd";
                break;
            case v: case vd: case vn: case vshi: case vyou:
            case vf: case vx: case vi: case vl: case vg:
                posTag = "v";
                break;
            case a: case ad: case an: case ag: case al:
                posTag = "a";
                break;
            case r: case rr: case rz: case rzt: case rzs: case rzv:
            case ry: case ryt: case rys: case ryv: case rg: case Rg:
                posTag = "r";
                break;
            case d: case dg: case dl:
                posTag = "d";
                break;
            case p: case pba: case pbei:
                posTag = "p";
                break;
            case c: case cc:
                posTag = "c";
                break;
            case e: case y:
                posTag = "e";
                break;
            case o:
                posTag = "o";
                break;
            case h:
                posTag = "h";
                break;
            case k:
                posTag = "k";
                break;
            case x: case xx: case xu:
                posTag = "x";
                break;
            case w: case wkz: case wky: case wyz: case wyy: case wj: case ww: case wt:
            case wd: case wf: case wn: case wm: case ws: case wp: case wb: case wh:
                posTag = "wp";
                break;
            default:
                // Unlisted natures keep the "x" fallback.
                break;
        }
        posTagList.add(posTag);
    }
    return posTagList;
}
Also used : ArrayList(java.util.ArrayList) Term(com.hankcs.hanlp.seg.common.Term)

Example 13 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class CoreSynonymDictionary method convert.

/**
 * Converts a segmentation result into a list of synonym items.
 *
 * <p>Each term's word is looked up with {@code get}. Words that are found
 * contribute their item; words that are not found are either dropped or
 * replaced by an "undefined" placeholder item, depending on
 * {@code withUndefinedItem}.
 *
 * @param sentence          the segmented sentence
 * @param withUndefinedItem whether to keep words that the lookup does not know
 * @return the synonym items, in sentence order
 */
public static List<CommonSynonymDictionary.SynonymItem> convert(List<Term> sentence, boolean withUndefinedItem) {
    List<CommonSynonymDictionary.SynonymItem> result =
            new ArrayList<CommonSynonymDictionary.SynonymItem>(sentence.size());
    for (Term term : sentence) {
        CommonSynonymDictionary.SynonymItem found = get(term.word);
        if (found != null) {
            result.add(found);
            continue;
        }
        // Unknown word: keep a placeholder only when the caller asked for it.
        if (withUndefinedItem) {
            result.add(CommonSynonymDictionary.SynonymItem.createUndefined(term.word));
        }
    }
    return result;
}
Also used : CommonSynonymDictionary(com.hankcs.hanlp.dictionary.common.CommonSynonymDictionary) ArrayList(java.util.ArrayList) Term(com.hankcs.hanlp.seg.common.Term)

Example 14 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class CRFSegment method toTermList.

/**
 * Converts a path of vertices into the final list of terms.
 *
 * <p>When offsets are enabled, each term's {@code offset} is set to the sum
 * of the lengths of all terms before it, starting from 0.
 *
 * @param vertexList    the vertices of the chosen path, in order
 * @param offsetEnabled whether to compute and store each term's offset
 * @return one term per vertex, in path order
 */
protected static List<Term> toTermList(List<Vertex> vertexList, boolean offsetEnabled) {
    assert vertexList != null;
    List<Term> terms = new ArrayList<Term>(vertexList.size());
    int nextOffset = 0;
    for (Vertex vertex : vertexList) {
        Term term = convert(vertex);
        if (offsetEnabled) {
            term.offset = nextOffset;
            nextOffset += term.length();
        }
        terms.add(term);
    }
    return terms;
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) Term(com.hankcs.hanlp.seg.common.Term)

Example 15 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class HMMSegment method segSentence.

/**
 * Splits a sentence into terms using the per-character tags returned by
 * {@code model.tag}.
 *
 * <p>A character tagged {@code 'b'} opens a word that extends up to and
 * including the next character tagged {@code 'e'}; if no {@code 'e'}
 * follows, the word extends to the end of the tag array. Every other
 * character becomes a single-character word. All terms are created with a
 * {@code null} nature.
 *
 * @param sentence the characters of the sentence
 * @return the terms, in sentence order
 */
@Override
protected List<Term> segSentence(char[] sentence) {
    final char[] labels = model.tag(sentence);
    final List<Term> words = new LinkedList<Term>();
    int pos = 0;
    while (pos < labels.length) {
        if (labels[pos] != 'b') {
            // Anything that does not open a multi-character word stands alone.
            words.add(new Term(new String(sentence, pos, 1), null));
            ++pos;
            continue;
        }
        final int start = pos;
        // Scan forward to the closing 'e', or to the end if it never comes.
        int end = start;
        while (end < labels.length && labels[end] != 'e') {
            ++end;
        }
        // With a closing 'e' the word includes that character; without one it
        // simply runs to the end of the input.
        final int span = (end == labels.length) ? end - start : end - start + 1;
        words.add(new Term(new String(sentence, start, span), null));
        pos = end + 1;
    }
    return words;
}
Also used : Term(com.hankcs.hanlp.seg.common.Term) LinkedList(java.util.LinkedList)

Aggregations

Term (com.hankcs.hanlp.seg.common.Term)48 Segment (com.hankcs.hanlp.seg.Segment)12 DijkstraSegment (com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment)8 LinkedList (java.util.LinkedList)7 CRFSegment (com.hankcs.hanlp.seg.CRF.CRFSegment)5 ResultTerm (com.hankcs.hanlp.seg.common.ResultTerm)5 Vertex (com.hankcs.hanlp.seg.common.Vertex)5 CoNLLSentence (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence)4 CoNLLWord (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord)4 DoubleArrayTrieSegment (com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment)4 ViterbiSegment (com.hankcs.hanlp.seg.Viterbi.ViterbiSegment)4 ArrayList (java.util.ArrayList)4 Nature (com.hankcs.hanlp.corpus.tag.Nature)3 CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary)3 AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie)2 Filter (com.hankcs.hanlp.dictionary.stopword.Filter)2 Table (com.hankcs.hanlp.model.crf.Table)2 HMMSegment (com.hankcs.hanlp.seg.HMM.HMMSegment)2 AtomNode (com.hankcs.hanlp.seg.NShort.Path.AtomNode)2 File (java.io.File)2