use of org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute in project CavalliumDBEngine by cavallium.
the class MultiMoreLikeThis method addTermFrequencies.
/**
 * Tokenizes the text supplied by {@code r} and accumulates the frequency of every
 * non-noise term into the frequency map kept for {@code fieldName}.
 *
 * <p>The per-field map is created on demand inside {@code perFieldTermFrequencies}.
 * Each token contributes the value reported by its {@link TermFrequencyAttribute}.
 * Tokenization stops once more than {@code maxNumTokensParsed} tokens have been read;
 * noise words count toward that limit even though they are not recorded.
 *
 * @param r a source of text to be tokenized
 * @param perFieldTermFrequencies a Map of terms and their frequencies per field
 * @param fieldName Used by analyzer for any special per-field analysis
 * @throws IOException if the token stream fails while reading from {@code r}
 * @throws UnsupportedOperationException if no analyzer has been configured
 */
private void addTermFrequencies(Reader r, Map<String, Map<String, Long>> perFieldTermFrequencies, String fieldName) throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException("To use MoreLikeThis without term vectors, you must provide an Analyzer");
    }
    Map<String, Long> termFreqMap = perFieldTermFrequencies.computeIfAbsent(fieldName, k -> new HashMap<>());
    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
        long tokenCount = 0;
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        TermFrequencyAttribute tfAtt = ts.addAttribute(TermFrequencyAttribute.class);
        ts.reset();
        // for every token
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            // Long is immutable, so the updated total has to be written back to the map;
            // merge() inserts the frequency for a new term or adds it to the stored count.
            termFreqMap.merge(word, (long) tfAtt.getTermFrequency(), Long::sum);
        }
        ts.end();
    }
}
use of org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute in project OpenSearch by opensearch-project.
the class RankFeatureFieldMapperTests method getFrequency.
/**
 * Reads the term frequency carried by the only token of the given stream.
 *
 * <p>The stream is reset, then asserted to produce one token and no second token.
 * The stream is neither ended nor closed here.
 *
 * @param tk the token stream to consume
 * @return the term frequency reported for the stream's single token
 * @throws IOException if resetting or advancing the stream fails
 */
static int getFrequency(TokenStream tk) throws IOException {
    final TermFrequencyAttribute termFrequency = tk.addAttribute(TermFrequencyAttribute.class);
    tk.reset();

    // The first advance must yield a token.
    final boolean hasFirstToken = tk.incrementToken();
    assertTrue(hasFirstToken);
    final int observedFrequency = termFrequency.getTermFrequency();

    // The second advance must report that the stream is exhausted.
    final boolean hasSecondToken = tk.incrementToken();
    assertFalse(hasSecondToken);
    return observedFrequency;
}
Aggregations