Search in sources:

Example 26 with Term

Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

The class CRFSegment, method toTermList.

/**
 * Converts a segmentation path (list of vertices) into the final term list.
 *
 * @param vertexList    the segmented vertex path; must not be null
 * @param offsetEnabled whether to compute each term's character offset
 * @return the terms converted from the vertices, in path order
 */
protected static List<Term> toTermList(List<Vertex> vertexList, boolean offsetEnabled) {
    assert vertexList != null;
    List<Term> resultList = new ArrayList<Term>(vertexList.size());
    if (offsetEnabled) {
        // Running character offset, advanced by each term's length.
        int currentOffset = 0;
        for (Vertex vertex : vertexList) {
            Term term = convert(vertex);
            term.offset = currentOffset;
            currentOffset += term.length();
            resultList.add(term);
        }
    } else {
        for (Vertex vertex : vertexList) {
            resultList.add(convert(vertex));
        }
    }
    return resultList;
}
Also used : Vertex(com.hankcs.hanlp.seg.common.Vertex) Term(com.hankcs.hanlp.seg.common.Term)

Example 27 with Term

Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

The class HMMSegment, method segSentence.

// Segments a sentence by tagging every character with the HMM model and
// grouping tagged spans into terms.
// NOTE(review): the tag set appears to follow a begin/end scheme — only 'b'
// and 'e' are handled explicitly; any other tag yields a one-character term.
// All terms are created with a null nature.
@Override
protected List<Term> segSentence(char[] sentence) {
    // One tag per input character.
    char[] tag = model.tag(sentence);
    List<Term> termList = new LinkedList<Term>();
    // offset tracks the current character index; it is advanced in lock-step
    // with i (both in the for-update and inside the inner while loop).
    int offset = 0;
    for (int i = 0; i < tag.length; offset += 1, ++i) {
        switch(tag[i]) {
            case 'b':
                {
                    // Start of a multi-character word: scan forward to the 'e' tag.
                    int begin = offset;
                    while (tag[i] != 'e') {
                        offset += 1;
                        ++i;
                        if (i == tag.length) {
                            // Ran off the end of the sentence without seeing 'e'.
                            break;
                        }
                    }
                    if (i == tag.length) {
                        // Unterminated word: take everything scanned so far.
                        termList.add(new Term(new String(sentence, begin, offset - begin), null));
                    } else
                        // +1 so the character carrying the 'e' tag is included.
                        termList.add(new Term(new String(sentence, begin, offset - begin + 1), null));
                }
                break;
            default:
                {
                    // Any non-'b' tag produces a single-character term.
                    termList.add(new Term(new String(sentence, offset, 1), null));
                }
                break;
        }
    }
    return termList;
}
Also used : Term(com.hankcs.hanlp.seg.common.Term) LinkedList(java.util.LinkedList)

Example 28 with Term

Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

The class MutualInformationEntropyPhraseExtractor, method extractPhrase.

/**
 * Extracts candidate phrases from the given text, ranked by the combined
 * mutual-information / entropy score computed by {@link Occurrence}.
 *
 * @param text the document to mine phrases from
 * @param size the maximum number of phrases to return
 * @return up to {@code size} phrases in descending score order
 */
@Override
public List<String> extractPhrase(String text, int size) {
    List<String> result = new LinkedList<String>();
    Occurrence occ = new Occurrence();
    // Filter chain: drop stop words, then drop terms whose nature is t or nx.
    Filter[] filters = new Filter[] { CoreStopWordDictionary.FILTER, new Filter() {

        @Override
        public boolean shouldInclude(Term term) {
            switch(term.nature) {
                case t:
                case nx:
                    return false;
            }
            return true;
        }
    } };
    // Feed every filtered sentence into the occurrence counter.
    for (List<Term> sentence : NotionalTokenizer.seg2sentence(text, filters)) {
        if (HanLP.Config.DEBUG) {
            System.out.println(sentence);
        }
        occ.addAll(sentence);
    }
    occ.compute();
    if (HanLP.Config.DEBUG) {
        // Dump every ranking (mi, le, re, score) for inspection.
        System.out.println(occ);
        for (PairFrequency pf : occ.getPhraseByMi()) {
            System.out.print(pf.getKey().replace(Occurrence.RIGHT, '→') + "\tmi=" + pf.mi + " , ");
        }
        System.out.println();
        for (PairFrequency pf : occ.getPhraseByLe()) {
            System.out.print(pf.getKey().replace(Occurrence.RIGHT, '→') + "\tle=" + pf.le + " , ");
        }
        System.out.println();
        for (PairFrequency pf : occ.getPhraseByRe()) {
            System.out.print(pf.getKey().replace(Occurrence.RIGHT, '→') + "\tre=" + pf.re + " , ");
        }
        System.out.println();
        for (PairFrequency pf : occ.getPhraseByScore()) {
            System.out.print(pf.getKey().replace(Occurrence.RIGHT, '→') + "\tscore=" + pf.score + " , ");
        }
        System.out.println();
    }
    // Take the top-scored pairs until the requested count is reached.
    for (PairFrequency pf : occ.getPhraseByScore()) {
        if (result.size() == size)
            break;
        result.add(pf.first + pf.second);
    }
    return result;
}
Also used : Filter(com.hankcs.hanlp.dictionary.stopword.Filter) Term(com.hankcs.hanlp.seg.common.Term) PairFrequency(com.hankcs.hanlp.corpus.occurrence.PairFrequency) Occurrence(com.hankcs.hanlp.corpus.occurrence.Occurrence) LinkedList(java.util.LinkedList)

Example 29 with Term

Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

The class MinimumSpanningTreeParser, method parse.

/**
 * Parses a segmented word list into a dependency tree by building a
 * minimum spanning tree (Prim's algorithm) over candidate dependency edges.
 *
 * NOTE(review): this method mutates the caller's list — a virtual-root term
 * is inserted at index 0 and never removed; confirm callers expect this.
 *
 * @param termList the segmented words to parse; null or empty yields null
 * @return the dependency tree, or null when the input is empty or no edge
 *         leaves the virtual root
 */
@Override
public CoNLLSentence parse(List<Term> termList) {
    if (termList == null || termList.size() == 0)
        return null;
    // Prepend the virtual root node that every real word may attach to.
    termList.add(0, new Term("##核心##", Nature.begin));
    Node[] nodeArray = new Node[termList.size()];
    Iterator<Term> iterator = termList.iterator();
    for (int i = 0; i < nodeArray.length; ++i) {
        nodeArray[i] = new Node(iterator.next(), i);
    }
    // Build the full candidate-edge matrix. NOTE(review): the indices are
    // transposed relative to the loop — edges[j][i] stores makeEdge(.., i, j);
    // this appears intentional (row = head candidate), but verify against
    // makeEdge's contract.
    Edge[][] edges = new Edge[nodeArray.length][nodeArray.length];
    for (int i = 0; i < edges.length; ++i) {
        for (int j = 0; j < edges[i].length; ++j) {
            if (i != j) {
                edges[j][i] = makeEdge(nodeArray, i, j);
            }
        }
    }
    // Minimum spanning tree via Prim's algorithm.
    // NOTE(review): mincost/used are sized n*(n-1) although only indices
    // below nodeArray.length seem to be touched below — confirm.
    int max_v = nodeArray.length * (nodeArray.length - 1);
    float[] mincost = new float[max_v];
    Arrays.fill(mincost, Float.MAX_VALUE / 3);
    boolean[] used = new boolean[max_v];
    Arrays.fill(used, false);
    used[0] = true;
    PriorityQueue<State> que = new PriorityQueue<State>();
    // Find the single (cheapest) child of the virtual root.
    float minCostToRoot = Float.MAX_VALUE;
    Edge firstEdge = null;
    Edge[] edgeResult = new Edge[termList.size() - 1];
    for (Edge edge : edges[0]) {
        if (edge == null)
            continue;
        if (minCostToRoot > edge.cost) {
            firstEdge = edge;
            minCostToRoot = edge.cost;
        }
    }
    if (firstEdge == null)
        return null;
    que.add(new State(minCostToRoot, firstEdge.from, firstEdge));
    // Grow the tree: repeatedly take the cheapest frontier edge.
    while (!que.isEmpty()) {
        State p = que.poll();
        int v = p.id;
        // Skip stale queue entries (already attached, or beaten by a cheaper edge).
        if (used[v] || p.cost > mincost[v])
            continue;
        used[v] = true;
        if (p.edge != null) {
            //                System.out.println(p.edge.from + " " + p.edge.to + p.edge.label);
            // edgeResult is indexed by the dependent word (1-based -> 0-based).
            edgeResult[p.edge.from - 1] = p.edge;
        }
        // Relax all edges leaving the newly attached node.
        for (Edge e : edges[v]) {
            if (e == null)
                continue;
            if (mincost[e.from] > e.cost) {
                mincost[e.from] = e.cost;
                que.add(new State(mincost[e.from], e.from, e));
            }
        }
    }
    // Materialize the CoNLL words (skipping the virtual root at index 0).
    CoNLLWord[] wordArray = new CoNLLWord[termList.size() - 1];
    for (int i = 0; i < wordArray.length; ++i) {
        wordArray[i] = new CoNLLWord(i + 1, nodeArray[i + 1].word, nodeArray[i + 1].label);
        wordArray[i].DEPREL = edgeResult[i].label;
    }
    // Wire up each word's head; an edge pointing at index 0 means ROOT.
    for (int i = 0; i < edgeResult.length; ++i) {
        int index = edgeResult[i].to - 1;
        if (index < 0) {
            wordArray[i].HEAD = CoNLLWord.ROOT;
            continue;
        }
        wordArray[i].HEAD = wordArray[index];
    }
    return new CoNLLSentence(wordArray);
}
Also used : Node(com.hankcs.hanlp.dependency.common.Node) Term(com.hankcs.hanlp.seg.common.Term) PriorityQueue(java.util.PriorityQueue) State(com.hankcs.hanlp.dependency.common.State) CoNLLWord(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord) CoNLLSentence(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence) Edge(com.hankcs.hanlp.dependency.common.Edge)

Example 30 with Term

Use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

The class TextRankKeyword, method getRank.

/**
 * Computes TextRank scores for an already-segmented word list.
 *
 * @param termList the document's words, already segmented
 * @return a map from word to its TextRank score
 */
public Map<String, Float> getRank(List<Term> termList) {
    // Keep only the words that pass the inclusion filter.
    List<String> wordList = new ArrayList<String>(termList.size());
    for (Term term : termList) {
        if (shouldInclude(term)) {
            wordList.add(term.word);
        }
    }
    //        System.out.println(wordList);
    // Build an undirected co-occurrence graph over a sliding window of 5 words.
    Map<String, Set<String>> words = new TreeMap<String, Set<String>>();
    Queue<String> window = new LinkedList<String>();
    for (String word : wordList) {
        if (!words.containsKey(word)) {
            words.put(word, new TreeSet<String>());
        }
        window.offer(word);
        if (window.size() > 5) {
            window.poll();
        }
        // Connect every distinct pair currently inside the window, both ways.
        for (String left : window) {
            for (String right : window) {
                if (left.equals(right)) {
                    continue;
                }
                words.get(left).add(right);
                words.get(right).add(left);
            }
        }
    }
    //        System.out.println(words);
    // PageRank-style iteration: repeat until convergence or max_iter rounds.
    Map<String, Float> score = new HashMap<String, Float>();
    for (int iter = 0; iter < max_iter; ++iter) {
        Map<String, Float> next = new HashMap<String, Float>();
        float maxDiff = 0;
        for (Map.Entry<String, Set<String>> entry : words.entrySet()) {
            String key = entry.getKey();
            Set<String> neighbors = entry.getValue();
            next.put(key, 1 - d);
            for (String neighbor : neighbors) {
                int degree = words.get(neighbor).size();
                if (key.equals(neighbor) || degree == 0) {
                    continue;
                }
                // Each neighbor donates a degree-normalized share of its score.
                next.put(key, next.get(key) + d / degree * (score.get(neighbor) == null ? 0 : score.get(neighbor)));
            }
            maxDiff = Math.max(maxDiff, Math.abs(next.get(key) - (score.get(key) == null ? 0 : score.get(key))));
        }
        score = next;
        // Converged once no word's score moved more than min_diff.
        if (maxDiff <= min_diff) {
            break;
        }
    }
    return score;
}
Also used : Term(com.hankcs.hanlp.seg.common.Term)

Aggregations

Term (com.hankcs.hanlp.seg.common.Term)48 Segment (com.hankcs.hanlp.seg.Segment)12 DijkstraSegment (com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment)8 LinkedList (java.util.LinkedList)7 CRFSegment (com.hankcs.hanlp.seg.CRF.CRFSegment)5 ResultTerm (com.hankcs.hanlp.seg.common.ResultTerm)5 Vertex (com.hankcs.hanlp.seg.common.Vertex)5 CoNLLSentence (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence)4 CoNLLWord (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord)4 DoubleArrayTrieSegment (com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment)4 ViterbiSegment (com.hankcs.hanlp.seg.Viterbi.ViterbiSegment)4 ArrayList (java.util.ArrayList)4 Nature (com.hankcs.hanlp.corpus.tag.Nature)3 CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary)3 AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie)2 Filter (com.hankcs.hanlp.dictionary.stopword.Filter)2 Table (com.hankcs.hanlp.model.crf.Table)2 HMMSegment (com.hankcs.hanlp.seg.HMM.HMMSegment)2 AtomNode (com.hankcs.hanlp.seg.NShort.Path.AtomNode)2 File (java.io.File)2