Search in sources :

Example 16 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class MutualInformationEntropyPhraseExtractor method extractPhrase.

/**
 * Extracts phrases from {@code text}, taking them in the order produced by
 * {@code Occurrence#getPhraseByScore()}.
 *
 * @param text the text to extract phrases from
 * @param size the number of phrases at which collection stops
 * @return the collected phrases, each being the concatenation of the two halves of a pair
 */
@Override
public List<String> extractPhrase(String text, int size) {
    // Second link of the filter chain: rejects terms whose nature is t or nx.
    Filter natureFilter = new Filter() {

        @Override
        public boolean shouldInclude(Term term) {
            switch (term.nature) {
                case t:
                case nx:
                    return false;
                default:
                    return true;
            }
        }
    };
    Filter[] filters = new Filter[] { CoreStopWordDictionary.FILTER, natureFilter };

    // Feed every filtered sentence into the co-occurrence statistics.
    Occurrence stats = new Occurrence();
    for (List<Term> termsOfSentence : NotionalTokenizer.seg2sentence(text, filters)) {
        if (HanLP.Config.DEBUG) {
            System.out.println(termsOfSentence);
        }
        stats.addAll(termsOfSentence);
    }
    stats.compute();

    // In debug mode, dump the statistics followed by each of the four rankings on its own line.
    if (HanLP.Config.DEBUG) {
        System.out.println(stats);
        for (PairFrequency pair : stats.getPhraseByMi()) {
            System.out.print(pair.getKey().replace(Occurrence.RIGHT, '→') + "\tmi=" + pair.mi + " , ");
        }
        System.out.println();
        for (PairFrequency pair : stats.getPhraseByLe()) {
            System.out.print(pair.getKey().replace(Occurrence.RIGHT, '→') + "\tle=" + pair.le + " , ");
        }
        System.out.println();
        for (PairFrequency pair : stats.getPhraseByRe()) {
            System.out.print(pair.getKey().replace(Occurrence.RIGHT, '→') + "\tre=" + pair.re + " , ");
        }
        System.out.println();
        for (PairFrequency pair : stats.getPhraseByScore()) {
            System.out.print(pair.getKey().replace(Occurrence.RIGHT, '→') + "\tscore=" + pair.score + " , ");
        }
        System.out.println();
    }

    // Collect phrases in score order until the requested count is reached.
    List<String> result = new LinkedList<String>();
    for (PairFrequency pair : stats.getPhraseByScore()) {
        if (result.size() == size) {
            break;
        }
        result.add(pair.first + pair.second);
    }
    return result;
}
Also used : Filter(com.hankcs.hanlp.dictionary.stopword.Filter) Term(com.hankcs.hanlp.seg.common.Term) PairFrequency(com.hankcs.hanlp.corpus.occurrence.PairFrequency) Occurrence(com.hankcs.hanlp.corpus.occurrence.Occurrence) LinkedList(java.util.LinkedList)

Example 17 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class AhoCorasickDoubleArrayTrieSegment method segSentence.

/**
 * Segments a sentence using matches reported by {@code trie}.
 * For every start position the longest reported match is kept; positions without a match
 * default to a word of length 1. When {@code config.speechTagging} is on, each term also
 * receives a nature.
 *
 * @param sentence the characters of the sentence to segment
 * @return the list of terms with their offsets set, or an empty list if {@code trie} is null
 */
@Override
protected List<Term> segSentence(char[] sentence) {
    // Without a loaded trie nothing can be matched: log a warning and return an empty result.
    if (trie == null) {
        logger.warning("还未加载任何词典");
        return Collections.emptyList();
    }
    // wordNet[i] holds the length of the word starting at position i; every position starts at 1.
    final int[] wordNet = new int[sentence.length];
    Arrays.fill(wordNet, 1);
    // natureArray[i] holds the nature of the word starting at i; only allocated when tagging is on.
    final Nature[] natureArray = config.speechTagging ? new Nature[sentence.length] : null;
    trie.parseText(sentence, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {

        @Override
        public void hit(int begin, int end, CoreDictionary.Attribute value) {
            int length = end - begin;
            // Keep only a match strictly longer than what is already recorded for this start position.
            if (length > wordNet[begin]) {
                wordNet[begin] = length;
                if (config.speechTagging) {
                    // Record the first nature listed in the matched attribute.
                    natureArray[begin] = value.nature[0];
                }
            }
        }
    });
    LinkedList<Term> termList = new LinkedList<Term>();
    if (config.speechTagging) {
        // Fill in lengths and natures for runs of positions that received no nature from the trie.
        for (int i = 0; i < natureArray.length; ) {
            if (natureArray[i] == null) {
                // Find j, the end (exclusive) of the run of positions with a null nature.
                int j = i + 1;
                for (; j < natureArray.length; ++j) {
                    if (natureArray[j] != null)
                        break;
                }
                // Split the run [i, j) with quickAtomSegment and copy each node's length and nature.
                List<AtomNode> atomNodeList = quickAtomSegment(sentence, i, j);
                for (AtomNode atomNode : atomNodeList) {
                    if (atomNode.sWord.length() >= wordNet[i]) {
                        wordNet[i] = atomNode.sWord.length();
                        natureArray[i] = atomNode.getNature();
                        // Advance to the start of the next atom node.
                        i += wordNet[i];
                    }
                }
                // Continue scanning from the end of the run regardless of how far the atom loop advanced i.
                i = j;
            } else {
                ++i;
            }
        }
    }
    // Walk the sentence from the start, jumping by each recorded word length, and emit one term per word.
    for (int i = 0; i < wordNet.length; ) {
        // With tagging on, a still-missing nature falls back to Nature.nz; with tagging off the nature is null.
        Term term = new Term(new String(sentence, i, wordNet[i]), config.speechTagging ? (natureArray[i] == null ? Nature.nz : natureArray[i]) : null);
        term.offset = i;
        termList.add(term);
        i += wordNet[i];
    }
    return termList;
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature) Term(com.hankcs.hanlp.seg.common.Term) AhoCorasickDoubleArrayTrie(com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie) CoreDictionary(com.hankcs.hanlp.dictionary.CoreDictionary) AtomNode(com.hankcs.hanlp.seg.NShort.Path.AtomNode)

Example 18 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class Segment method seg.

/**
 * Segments text into terms.<br>
 * This method is thread-safe.
 * <p>
 * When {@code config.threadNumber > 1} and the text is longer than 10000 characters, the text
 * is split into sentences that are distributed over {@code config.threadNumber} worker threads;
 * otherwise the whole text is passed to {@code segSentence} directly.
 * <p>
 * If the calling thread is interrupted while waiting for the workers, the interrupt status is
 * restored, the error is logged and an empty list is returned.
 *
 * @param text the text to segment
 * @return the list of terms
 */
public List<Term> seg(String text) {
    char[] charArray = text.toCharArray();
    if (HanLP.Config.Normalization) {
        CharTable.normalization(charArray);
    }
    // Multithreading is pointless for small texts and actually makes them slower.
    if (config.threadNumber > 1 && charArray.length > 10000) {
        List<String> sentenceList = SentencesUtil.toSentenceList(charArray);
        String[] sentenceArray = new String[sentenceList.size()];
        sentenceList.toArray(sentenceArray);
        //noinspection unchecked
        List<Term>[] termListArray = new List[sentenceArray.length];
        // Each of the first threadNumber - 1 workers gets `per` sentences; the last one takes the remainder.
        final int per = sentenceArray.length / config.threadNumber;
        WorkThread[] threadArray = new WorkThread[config.threadNumber];
        for (int i = 0; i < config.threadNumber - 1; ++i) {
            int from = i * per;
            threadArray[i] = new WorkThread(sentenceArray, termListArray, from, from + per);
            threadArray[i].start();
        }
        threadArray[config.threadNumber - 1] = new WorkThread(sentenceArray, termListArray, (config.threadNumber - 1) * per, sentenceArray.length);
        threadArray[config.threadNumber - 1].start();
        try {
            for (WorkThread thread : threadArray) {
                thread.join();
            }
        } catch (InterruptedException e) {
            // Restore the interrupt status so callers further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            logger.severe("线程同步异常:" + TextUtility.exceptionToString(e));
            return Collections.emptyList();
        }
        List<Term> termList = new LinkedList<Term>();
        // The text was split into sentences, so per-sentence offsets must be shifted back to text offsets.
        if (config.offset || config.indexMode) {
            int sentenceOffset = 0;
            for (int i = 0; i < sentenceArray.length; ++i) {
                for (Term term : termListArray[i]) {
                    term.offset += sentenceOffset;
                    termList.add(term);
                }
                sentenceOffset += sentenceArray[i].length();
            }
        } else {
            for (List<Term> list : termListArray) {
                termList.addAll(list);
            }
        }
        return termList;
    }
    return segSentence(charArray);
}
Also used : Term(com.hankcs.hanlp.seg.common.Term)

Example 19 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class TextRankSentence method convertSentenceListToDocument.

/**
 * Converts a list of sentences into a document: one word list per sentence, in the same order.
 * Each sentence is tokenized with {@code StandardTokenizer.segment}, and only the terms accepted
 * by {@code CoreStopWordDictionary.shouldInclude} contribute their word.
 *
 * @param sentenceList the sentences to convert
 * @return one list of kept words per input sentence
 */
private static List<List<String>> convertSentenceListToDocument(List<String> sentenceList) {
    List<List<String>> document = new ArrayList<List<String>>(sentenceList.size());
    for (String rawSentence : sentenceList) {
        List<String> keptWords = new LinkedList<String>();
        for (Term token : StandardTokenizer.segment(rawSentence.toCharArray())) {
            // Skip anything the stop-word dictionary rejects.
            if (!CoreStopWordDictionary.shouldInclude(token)) {
                continue;
            }
            keptWords.add(token.word);
        }
        document.add(keptWords);
    }
    return document;
}
Also used : Term(com.hankcs.hanlp.seg.common.Term)

Example 20 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class TraditionalChineseTokenizer method segSentence.

/**
 * Segments {@code text} by first passing it through {@code CharTable.convert}, segmenting the
 * converted string with {@code SEGMENT}, and then rewriting each term's word and offset.
 * <p>
 * A term of length 1, or one for which {@code SimplifiedChineseDictionary.getTraditionalChinese}
 * returns {@code null}, gets its word copied from the matching span of the original text;
 * any other term gets the word returned by the dictionary.
 *
 * @param text the text to segment
 * @return the terms produced by {@code SEGMENT}, with {@code word} and {@code offset} rewritten
 */
private static List<Term> segSentence(String text) {
    final String converted = CharTable.convert(text);
    final List<Term> terms = SEGMENT.seg(converted);
    int cursor = 0;
    for (Term current : terms) {
        current.offset = cursor;
        // Single-character terms never consult the dictionary.
        String mapped = current.length() == 1 ? null : SimplifiedChineseDictionary.getTraditionalChinese(current.word);
        if (mapped != null) {
            // Advance by the term's length before the word is replaced.
            cursor += current.length();
            current.word = mapped;
            continue;
        }
        // No dictionary mapping: take the corresponding span from the original text.
        current.word = text.substring(cursor, cursor + current.length());
        cursor += current.length();
    }
    return terms;
}
Also used : Term(com.hankcs.hanlp.seg.common.Term)

Aggregations

Term (com.hankcs.hanlp.seg.common.Term)48 Segment (com.hankcs.hanlp.seg.Segment)12 DijkstraSegment (com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment)8 LinkedList (java.util.LinkedList)7 CRFSegment (com.hankcs.hanlp.seg.CRF.CRFSegment)5 ResultTerm (com.hankcs.hanlp.seg.common.ResultTerm)5 Vertex (com.hankcs.hanlp.seg.common.Vertex)5 CoNLLSentence (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence)4 CoNLLWord (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord)4 DoubleArrayTrieSegment (com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment)4 ViterbiSegment (com.hankcs.hanlp.seg.Viterbi.ViterbiSegment)4 ArrayList (java.util.ArrayList)4 Nature (com.hankcs.hanlp.corpus.tag.Nature)3 CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary)3 AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie)2 Filter (com.hankcs.hanlp.dictionary.stopword.Filter)2 Table (com.hankcs.hanlp.model.crf.Table)2 HMMSegment (com.hankcs.hanlp.seg.HMM.HMMSegment)2 AtomNode (com.hankcs.hanlp.seg.NShort.Path.AtomNode)2 File (java.io.File)2