Search in sources :

Example 1 with Occurrence

use of com.hankcs.hanlp.corpus.occurrence.Occurrence in project HanLP by hankcs.

In the class MutualInformationEntropyPhraseExtractor, method extractPhrase:

/**
 * Extracts at most {@code size} phrases from {@code text}.
 *
 * <p>The text is segmented into sentences with {@code NotionalTokenizer}, each
 * sentence is fed into an {@link Occurrence} accumulator, and after
 * {@code compute()} the pairs returned by {@code getPhraseByScore()} are
 * concatenated ({@code first + second}) in the order that method yields them.
 * When {@code HanLP.Config.DEBUG} is set, the sentences, the accumulator and
 * the mi / le / re / score listings are printed to standard output.
 *
 * @param text the text to extract phrases from
 * @param size the number of phrases at which collection stops
 * @return the collected phrases, in the order produced by {@code getPhraseByScore()}
 */
@Override
public List<String> extractPhrase(String text, int size) {
    // Second stage of the filter chain: reject terms whose nature is t or nx.
    Filter natureFilter = new Filter() {

        @Override
        public boolean shouldInclude(Term term) {
            switch (term.nature) {
                case t:
                case nx:
                    return false;
                default:
                    return true;
            }
        }
    };
    Filter[] filters = { CoreStopWordDictionary.FILTER, natureFilter };

    Occurrence stats = new Occurrence();
    for (List<Term> terms : NotionalTokenizer.seg2sentence(text, filters)) {
        if (HanLP.Config.DEBUG) {
            System.out.println(terms);
        }
        stats.addAll(terms);
    }
    stats.compute();

    if (HanLP.Config.DEBUG) {
        System.out.println(stats);

        // One line per ranking; each line lists "key<TAB>metric=value , " entries.
        for (PairFrequency pair : stats.getPhraseByMi()) {
            System.out.print(pair.getKey().replace(Occurrence.RIGHT, '→') + "\tmi=" + pair.mi + " , ");
        }
        System.out.println();

        for (PairFrequency pair : stats.getPhraseByLe()) {
            System.out.print(pair.getKey().replace(Occurrence.RIGHT, '→') + "\tle=" + pair.le + " , ");
        }
        System.out.println();

        for (PairFrequency pair : stats.getPhraseByRe()) {
            System.out.print(pair.getKey().replace(Occurrence.RIGHT, '→') + "\tre=" + pair.re + " , ");
        }
        System.out.println();

        for (PairFrequency pair : stats.getPhraseByScore()) {
            System.out.print(pair.getKey().replace(Occurrence.RIGHT, '→') + "\tscore=" + pair.score + " , ");
        }
        System.out.println();
    }

    List<String> phrases = new LinkedList<String>();
    for (PairFrequency candidate : stats.getPhraseByScore()) {
        // Stop as soon as exactly `size` phrases have been collected.
        if (phrases.size() == size) {
            break;
        }
        phrases.add(candidate.first + candidate.second);
    }
    return phrases;
}
Also used : Filter(com.hankcs.hanlp.dictionary.stopword.Filter) Term(com.hankcs.hanlp.seg.common.Term) PairFrequency(com.hankcs.hanlp.corpus.occurrence.PairFrequency) Occurrence(com.hankcs.hanlp.corpus.occurrence.Occurrence) LinkedList(java.util.LinkedList)

Example 2 with Occurrence

use of com.hankcs.hanlp.corpus.occurrence.Occurrence in project HanLP by hankcs.

In the class DemoOccurrence, method main:

/**
 * Demo entry point: feeds one sample sentence into an {@link Occurrence},
 * calls {@code compute()}, then prints every unigram entry followed by the
 * bigram and trigram entries whose {@code isRight()} returns true.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    Occurrence stats = new Occurrence();
    stats.addAll("在计算机音视频和图形图像技术等二维信息算法处理方面目前比较先进的视频处理算法");
    stats.compute();

    // Unigrams are printed unconditionally.
    for (Map.Entry<String, TermFrequency> unigram : stats.getUniGram()) {
        System.out.println(unigram.getValue());
    }

    // Bigrams and trigrams are printed only when isRight() holds.
    for (Map.Entry<String, PairFrequency> bigram : stats.getBiGram()) {
        PairFrequency pair = bigram.getValue();
        if (pair.isRight()) {
            System.out.println(pair);
        }
    }

    for (Map.Entry<String, TriaFrequency> trigram : stats.getTriGram()) {
        TriaFrequency triple = trigram.getValue();
        if (triple.isRight()) {
            System.out.println(triple);
        }
    }
}
Also used : TermFrequency(com.hankcs.hanlp.corpus.occurrence.TermFrequency) TriaFrequency(com.hankcs.hanlp.corpus.occurrence.TriaFrequency) PairFrequency(com.hankcs.hanlp.corpus.occurrence.PairFrequency) Occurrence(com.hankcs.hanlp.corpus.occurrence.Occurrence) Map(java.util.Map)

Aggregations

Occurrence (com.hankcs.hanlp.corpus.occurrence.Occurrence): 2, PairFrequency (com.hankcs.hanlp.corpus.occurrence.PairFrequency): 2, TermFrequency (com.hankcs.hanlp.corpus.occurrence.TermFrequency): 1, TriaFrequency (com.hankcs.hanlp.corpus.occurrence.TriaFrequency): 1, Filter (com.hankcs.hanlp.dictionary.stopword.Filter): 1, Term (com.hankcs.hanlp.seg.common.Term): 1, LinkedList (java.util.LinkedList): 1, Map (java.util.Map): 1