Use of org.apache.tika.eval.tokens.TokenStatistics in project tika by apache.
The class AbstractProfiler, method writeContentData.
/**
 * Computes token statistics for the content extracted from the given metadata
 * and writes them as one row to the content table.
 * <p>
 * Checks to see if metadata is null or content is empty (null or only whitespace).
 * If any of these, then this does no processing, and the fileId is not
 * entered into the content table.
 * <p>
 * If the common-token lookup fails with an {@link IOException}, the failure is
 * logged and the row is still written, but without the
 * {@code COMMON_TOKENS_LANG}, {@code NUM_COMMON_TOKENS} and
 * {@code NUM_ALPHABETIC_TOKENS} columns.
 *
 * @param fileId value stored in the {@code ID} column of the row
 * @param m metadata the content is read from; if null, nothing is written
 * @param fieldName field name under which the content's tokens are counted
 * @param contentsTable table the resulting row is written to
 * @throws IOException may be propagated by the helpers invoked outside of a
 *         try block (content extraction, token counting, language id, unicode blocks)
 * @throws RuntimeException wrapping an {@link IOException} raised while writing the row
 */
protected void writeContentData(String fileId, Metadata m, String fieldName, TableInfo contentsTable) throws IOException {
    if (m == null) {
        return;
    }
    Map<Cols, String> data = new HashMap<>();
    String content = getContent(m, maxContentLength, data);
    if (content == null || content.trim().length() == 0) {
        return;
    }

    // Reset the per-field counts before tokenizing this document's content.
    tokenCounter.clear(fieldName);
    tokenCounter.add(fieldName, content);

    data.put(Cols.ID, fileId);
    data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length()));

    // langid(...) populates language columns in data; an absent language is
    // passed on to the common-token lookup as the empty string.
    langid(m, data);
    String langid = data.get(Cols.LANG_ID_1);
    langid = (langid == null) ? "" : langid;

    writeTokenCounts(data, fieldName, tokenCounter);

    CommonTokenResult commonTokenResult = null;
    try {
        commonTokenResult = commonTokenCountManager.countTokenOverlaps(langid, tokenCounter.getTokens(fieldName));
    } catch (IOException e) {
        LOG.error("{}", e.getMessage(), e);
    }
    // The lookup above is best effort: when it failed there is no result to
    // report, so skip the dependent columns instead of dereferencing null.
    if (commonTokenResult != null) {
        data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
        data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens()));
        data.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getAlphabeticTokens()));
    }

    TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(fieldName);
    data.put(Cols.NUM_UNIQUE_TOKENS, Integer.toString(tokenStatistics.getTotalUniqueTokens()));
    data.put(Cols.NUM_TOKENS, Integer.toString(tokenStatistics.getTotalTokens()));
    data.put(Cols.TOKEN_ENTROPY_RATE, Double.toString(tokenStatistics.getEntropy()));

    SummaryStatistics summStats = tokenStatistics.getSummaryStatistics();
    // The sum is stored as an integer string; the cast truncates the double.
    data.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) summStats.getSum()));
    data.put(Cols.TOKEN_LENGTH_MEAN, Double.toString(summStats.getMean()));
    data.put(Cols.TOKEN_LENGTH_STD_DEV, Double.toString(summStats.getStandardDeviation()));

    unicodeBlocks(m, data);

    try {
        writer.writeRow(contentsTable, data);
    } catch (IOException e) {
        // Kept as an unchecked rethrow so existing callers see the same exception type.
        throw new RuntimeException(e);
    }
}
Use of org.apache.tika.eval.tokens.TokenStatistics in project tika by apache.
The class AbstractProfiler, method writeTokenCounts.
/**
 * Formats the top-N tokens of a field as {@code token: count} entries joined
 * by {@code " | "} and stores the result in the {@code TOP_N_TOKENS} column.
 * When there are no top tokens, the empty string is stored.
 *
 * @param data row map that receives the {@code TOP_N_TOKENS} value
 * @param field field whose token statistics are read from the counter
 * @param tokenCounter counter holding the token statistics for {@code field}
 */
void writeTokenCounts(Map<Cols, String> data, String field, TokenCounter tokenCounter) {
    TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(field);
    StringBuilder sb = new StringBuilder();
    for (TokenIntPair t : tokenStatistics.getTopN()) {
        // Every entry appends at least ": <count>", so a non-empty builder
        // means at least one entry has already been written.
        if (sb.length() > 0) {
            sb.append(" | ");
        }
        sb.append(t.getToken()).append(": ").append(t.getValue());
    }
    data.put(Cols.TOP_N_TOKENS, sb.toString());
}
Aggregations