Search in sources :

Example 1 with Filter

Use of com.hankcs.hanlp.dictionary.stopword.Filter in the HanLP project by hankcs.

From the class MutualInformationEntropyPhraseExtractor, method extractPhrase.

/**
 * Extracts phrases from {@code text}.
 * <p>
 * The text is split into sentences by {@code NotionalTokenizer.seg2sentence} using a
 * two-stage filter chain, every sentence is added to an {@code Occurrence}, and after
 * {@code compute()} the pairs returned by {@code getPhraseByScore()} are collected in
 * iteration order.
 *
 * @param text the text to extract phrases from
 * @param size the list size at which collection stops; a negative value never equals
 *             the list size, so every available phrase is returned in that case
 * @return the collected phrases, each one the concatenation of a pair's {@code first}
 *         and {@code second} parts
 */
@Override
public List<String> extractPhrase(String text, int size) {
    // Stage one of the chain: the shared stop-word filter.
    Filter stopWordFilter = CoreStopWordDictionary.FILTER;
    // Stage two: reject terms whose nature is t or nx, accept everything else.
    Filter natureFilter = new Filter() {

        @Override
        public boolean shouldInclude(Term term) {
            switch (term.nature) {
                case t:
                case nx:
                    return false;
                default:
                    return true;
            }
        }
    };
    Filter[] filterChain = new Filter[] { stopWordFilter, natureFilter };

    Occurrence statistics = new Occurrence();
    for (List<Term> terms : NotionalTokenizer.seg2sentence(text, filterChain)) {
        if (HanLP.Config.DEBUG) {
            System.out.println(terms);
        }
        statistics.addAll(terms);
    }
    statistics.compute();
    printDebugInfo(statistics);

    List<String> result = new LinkedList<String>();
    for (PairFrequency candidate : statistics.getPhraseByScore()) {
        // Stop as soon as the requested number of phrases has been gathered.
        if (result.size() == size) {
            break;
        }
        result.add(candidate.first + candidate.second);
    }
    return result;
}

/**
 * Prints the occurrence object followed by its phrase listings (by mi, le, re and score,
 * one line each) to standard output; does nothing unless {@code HanLP.Config.DEBUG} is set.
 */
private void printDebugInfo(Occurrence statistics) {
    if (!HanLP.Config.DEBUG) {
        return;
    }
    System.out.println(statistics);
    for (PairFrequency pair : statistics.getPhraseByMi()) {
        System.out.print(pair.getKey().replace(Occurrence.RIGHT, '→') + "\tmi=" + pair.mi + " , ");
    }
    System.out.println();
    for (PairFrequency pair : statistics.getPhraseByLe()) {
        System.out.print(pair.getKey().replace(Occurrence.RIGHT, '→') + "\tle=" + pair.le + " , ");
    }
    System.out.println();
    for (PairFrequency pair : statistics.getPhraseByRe()) {
        System.out.print(pair.getKey().replace(Occurrence.RIGHT, '→') + "\tre=" + pair.re + " , ");
    }
    System.out.println();
    for (PairFrequency pair : statistics.getPhraseByScore()) {
        System.out.print(pair.getKey().replace(Occurrence.RIGHT, '→') + "\tscore=" + pair.score + " , ");
    }
    System.out.println();
}
Also used : Filter(com.hankcs.hanlp.dictionary.stopword.Filter) Term(com.hankcs.hanlp.seg.common.Term) PairFrequency(com.hankcs.hanlp.corpus.occurrence.PairFrequency) Occurrence(com.hankcs.hanlp.corpus.occurrence.Occurrence) LinkedList(java.util.LinkedList)

Example 2 with Filter

Use of com.hankcs.hanlp.dictionary.stopword.Filter in the HanLP project by hankcs.

From the class DemoStopWord, method main.

/**
 * Demo of stop-word handling: edits the stop-word dictionary at runtime, applies it to
 * the output of another tokenizer, and finally installs a custom filter.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final String sentence = "小区居民有的反对喂养流浪猫,而有的居民却赞成喂养这些小宝贝";

    // The stop-word dictionary can be modified dynamically.
    CoreStopWordDictionary.add("居民");
    System.out.println(NotionalTokenizer.segment(sentence));
    CoreStopWordDictionary.remove("居民");
    System.out.println(NotionalTokenizer.segment(sentence));

    // Filtering can be applied to the result of any tokenizer.
    List<Term> terms = BasicTokenizer.segment(sentence);
    System.out.println(terms);
    CoreStopWordDictionary.apply(terms);
    System.out.println(terms);

    // The filtering logic itself can also be customized: this filter keeps a term only
    // when its nature is nz and its word is not in the stop-word dictionary.
    Filter customFilter = new Filter() {

        @Override
        public boolean shouldInclude(Term term) {
            switch (term.nature) {
                case nz:
                    return !CoreStopWordDictionary.contains(term.word);
                default:
                    return false;
            }
        }
    };
    CoreStopWordDictionary.FILTER = customFilter;
    System.out.println(NotionalTokenizer.segment(sentence));
}
Also used : Filter(com.hankcs.hanlp.dictionary.stopword.Filter) Term(com.hankcs.hanlp.seg.common.Term)

Aggregations

Filter (com.hankcs.hanlp.dictionary.stopword.Filter): 2, Term (com.hankcs.hanlp.seg.common.Term): 2, Occurrence (com.hankcs.hanlp.corpus.occurrence.Occurrence): 1, PairFrequency (com.hankcs.hanlp.corpus.occurrence.PairFrequency): 1, LinkedList (java.util.LinkedList): 1