Use of weka.filters.unsupervised.attribute.StringToWordVector in project cia by Hack23.
The method calculateWordCount of the class WordCounterImpl.
/**
 * Counts how often each word occurs in the content of a document.
 *
 * <p>The content is placed in a single-instance Weka data set and run through a
 * {@link StringToWordVector} filter that splits on {@code TOKEN_DELIMITERS} into
 * single-word tokens, lower-cases them, treats every token shorter than five
 * characters as a stop word and outputs per-word counts.</p>
 *
 * @param documentContentData the document whose content is analysed
 * @param maxResult the value handed to {@code StringToWordVector.setWordsToKeep(int)}
 * @return a mutable map from word to its count; empty when the document has no
 *         content or when the filter fails (the failure is logged at warn level)
 */
@Override
public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData, final int maxResult) {
    final Map<String, Integer> result = new HashMap<>();

    final String html = documentContentData.getContent();
    if (html == null) {
        // A string attribute cannot hold a null value; without this guard the
        // setValue call below fails outside the try block and the exception
        // escapes to the caller instead of yielding an empty result.
        return result;
    }

    // Build a data set with one string attribute holding the whole document.
    final Attribute input = new Attribute(HTML, (ArrayList<String>) null);
    final ArrayList<Attribute> inputVec = new ArrayList<>();
    inputVec.add(input);
    final Instances htmlInst = new Instances(HTML, inputVec, 1);
    htmlInst.add(new DenseInstance(1));
    htmlInst.instance(0).setValue(0, html);

    // Tokens shorter than five characters are ignored.
    final StopwordsHandler stopwordsHandler = new StopwordsHandler() {
        @Override
        public boolean isStopword(final String word) {
            return word.length() < 5;
        }
    };

    // Min and max n-gram size of 1 means plain single-word tokens.
    final NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(1);
    tokenizer.setNGramMaxSize(1);
    tokenizer.setDelimiters(TOKEN_DELIMITERS);

    final StringToWordVector filter = new StringToWordVector();
    filter.setTokenizer(tokenizer);
    filter.setStopwordsHandler(stopwordsHandler);
    filter.setLowerCaseTokens(true);
    filter.setOutputWordCounts(true);
    filter.setWordsToKeep(maxResult);

    try {
        filter.setInputFormat(htmlInst);
        final Instances dataFiltered = Filter.useFilter(htmlInst, filter);
        final Instance last = dataFiltered.lastInstance();
        final int numAttributes = last.numAttributes();
        for (int i = 0; i < numAttributes; i++) {
            // Read the numeric value directly instead of formatting it as text
            // and parsing that text back into an integer.
            result.put(last.attribute(i).name(), Integer.valueOf((int) last.value(i)));
        }
    } catch (final Exception e) {
        LOGGER.warn("Problem calculating wordcount for : {} , exception:{}", documentContentData.getId(), e);
    }
    return result;
}
Aggregations