Usage of com.hankcs.hanlp.dictionary.stopword.Filter in the HanLP project by hankcs:
the extractPhrase method of the MutualInformationEntropyPhraseExtractor class.
@Override
public List<String> extractPhrase(String text, int size) {
    // Extracts up to `size` two-word phrases from `text`, ranked by the
    // combined mutual-information / entropy score computed by Occurrence.
    List<String> phraseList = new LinkedList<String>();
    Occurrence occurrence = new Occurrence();
    // Filter chain applied during segmentation: first the core stop word
    // filter, then a filter dropping terms whose nature is t or nx
    // (presumably time words and alphabetic tokens in HanLP's tagset —
    // confirm against the Nature enum) so they never enter the
    // co-occurrence statistics.
    Filter[] filterChain = new Filter[] { CoreStopWordDictionary.FILTER, new Filter() {
        @Override
        public boolean shouldInclude(Term term) {
            switch (term.nature) {
                case t:
                case nx:
                    return false;
            }
            return true;
        }
    } };
    // Feed every filtered sentence into the co-occurrence table.
    for (List<Term> sentence : NotionalTokenizer.seg2sentence(text, filterChain)) {
        if (HanLP.Config.DEBUG) {
            System.out.println(sentence);
        }
        occurrence.addAll(sentence);
    }
    occurrence.compute();
    if (HanLP.Config.DEBUG) {
        // Dump all four rankings (mutual information, left entropy,
        // right entropy, combined score) for inspection.
        System.out.println(occurrence);
        for (PairFrequency phrase : occurrence.getPhraseByMi()) {
            System.out.print(phrase.getKey().replace(Occurrence.RIGHT, '→') + "\tmi=" + phrase.mi + " , ");
        }
        System.out.println();
        for (PairFrequency phrase : occurrence.getPhraseByLe()) {
            System.out.print(phrase.getKey().replace(Occurrence.RIGHT, '→') + "\tle=" + phrase.le + " , ");
        }
        System.out.println();
        for (PairFrequency phrase : occurrence.getPhraseByRe()) {
            System.out.print(phrase.getKey().replace(Occurrence.RIGHT, '→') + "\tre=" + phrase.re + " , ");
        }
        System.out.println();
        for (PairFrequency phrase : occurrence.getPhraseByScore()) {
            System.out.print(phrase.getKey().replace(Occurrence.RIGHT, '→') + "\tscore=" + phrase.score + " , ");
        }
        System.out.println();
    }
    // Take the top-scored pairs. Use >= rather than == so a non-positive
    // `size` yields an empty list instead of returning every phrase.
    for (PairFrequency phrase : occurrence.getPhraseByScore()) {
        if (phraseList.size() >= size)
            break;
        phraseList.add(phrase.first + phrase.second);
    }
    return phraseList;
}
Usage of com.hankcs.hanlp.dictionary.stopword.Filter in the HanLP project by hankcs:
the main method of the DemoStopWord class.
public static void main(String[] args) {
    String text = "小区居民有的反对喂养流浪猫,而有的居民却赞成喂养这些小宝贝";

    // The core stop word dictionary can be modified at runtime.
    CoreStopWordDictionary.add("居民");
    System.out.println(NotionalTokenizer.segment(text));
    CoreStopWordDictionary.remove("居民");
    System.out.println(NotionalTokenizer.segment(text));

    // Stop word filtering can also be applied to the output of any tokenizer.
    List<Term> terms = BasicTokenizer.segment(text);
    System.out.println(terms);
    CoreStopWordDictionary.apply(terms);
    System.out.println(terms);

    // Custom filter logic can be installed as well: keep only nz terms
    // that are not listed in the stop word dictionary.
    CoreStopWordDictionary.FILTER = new Filter() {
        @Override
        public boolean shouldInclude(Term term) {
            switch (term.nature) {
                default:
                    return false;
                case nz:
                    return !CoreStopWordDictionary.contains(term.word);
            }
        }
    };
    System.out.println(NotionalTokenizer.segment(text));
}
Aggregations