Search in sources:

Example 1 with TokenStatistics

use of org.apache.tika.eval.tokens.TokenStatistics in project tika by apache.

the class AbstractProfiler method writeContentData.

/**
 * Tokenizes the content obtained from the metadata and writes one row of
 * content statistics to the given table.
 * <p>
 * If the metadata is null, or the content is empty (null or only whitespace),
 * this does no processing and the fileId is not entered into the content table.
 * <p>
 * Counting common tokens is best-effort: if it fails with an
 * {@link IOException}, the error is logged and the row is still written,
 * but without the common-token columns
 * ({@code COMMON_TOKENS_LANG}, {@code NUM_COMMON_TOKENS},
 * {@code NUM_ALPHABETIC_TOKENS}).
 *
 * @param fileId        id stored in the {@code ID} column of the row
 * @param m             metadata the content is read from; may be null
 * @param fieldName     field name under which the content is tokenized and counted
 * @param contentsTable table the resulting row is written to
 * @throws IOException      declared by the signature; presumably raised by one of the
 *                          helper calls that is not wrapped in a try block here -- TODO confirm
 * @throws RuntimeException if writing the row fails; the underlying
 *                          {@link IOException} is preserved as the cause
 */
protected void writeContentData(String fileId, Metadata m, String fieldName, TableInfo contentsTable) throws IOException {
    if (m == null) {
        return;
    }
    Map<Cols, String> data = new HashMap<>();
    String content = getContent(m, maxContentLength, data);
    if (content == null || content.trim().isEmpty()) {
        return;
    }

    // Reset any counts left over for this field before counting the new content.
    tokenCounter.clear(fieldName);
    tokenCounter.add(fieldName, content);

    data.put(Cols.ID, fileId);
    data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length()));

    // langid(...) fills in the language columns; fall back to "" when none was recorded.
    langid(m, data);
    String langid = data.get(Cols.LANG_ID_1);
    langid = (langid == null) ? "" : langid;

    writeTokenCounts(data, fieldName, tokenCounter);

    CommonTokenResult commonTokenResult = null;
    try {
        commonTokenResult = commonTokenCountManager.countTokenOverlaps(langid, tokenCounter.getTokens(fieldName));
    } catch (IOException e) {
        LOG.error("{}", e.getMessage(), e);
    }
    // The lookup above may have failed (and been logged); in that case there is
    // no result to read from, so leave the common-token columns unset instead of
    // failing the whole row with a NullPointerException.
    if (commonTokenResult != null) {
        data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
        data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens()));
        data.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getAlphabeticTokens()));
    }

    TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(fieldName);
    data.put(Cols.NUM_UNIQUE_TOKENS, Integer.toString(tokenStatistics.getTotalUniqueTokens()));
    data.put(Cols.NUM_TOKENS, Integer.toString(tokenStatistics.getTotalTokens()));
    data.put(Cols.TOKEN_ENTROPY_RATE, Double.toString(tokenStatistics.getEntropy()));

    SummaryStatistics summStats = tokenStatistics.getSummaryStatistics();
    // The sum is stored as an int, so any fractional part is truncated.
    data.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) summStats.getSum()));
    data.put(Cols.TOKEN_LENGTH_MEAN, Double.toString(summStats.getMean()));
    data.put(Cols.TOKEN_LENGTH_STD_DEV, Double.toString(summStats.getStandardDeviation()));

    unicodeBlocks(m, data);

    try {
        writer.writeRow(contentsTable, data);
    } catch (IOException e) {
        // Kept as an unchecked rethrow so existing callers see the same exception type.
        throw new RuntimeException(e);
    }
}
Also used : Cols(org.apache.tika.eval.db.Cols) CommonTokenResult(org.apache.tika.eval.tokens.CommonTokenResult) HashMap(java.util.HashMap) SummaryStatistics(org.apache.commons.math3.stat.descriptive.SummaryStatistics) TokenStatistics(org.apache.tika.eval.tokens.TokenStatistics) IOException(java.io.IOException)

Example 2 with TokenStatistics

use of org.apache.tika.eval.tokens.TokenStatistics in project tika by apache.

the class AbstractProfiler method writeTokenCounts.

/**
 * Renders the top-N tokens for a field as a single string and stores it in
 * the {@code TOP_N_TOKENS} column of {@code data}.
 * <p>
 * Each entry is formatted as {@code token: value}; entries are separated by
 * {@code " | "}. If there are no top-N tokens, an empty string is stored.
 *
 * @param data         row data to which the {@code TOP_N_TOKENS} column is added
 * @param field        field name whose token statistics are read
 * @param tokenCounter counter that supplies the token statistics for {@code field}
 */
void writeTokenCounts(Map<Cols, String> data, String field, TokenCounter tokenCounter) {
    StringBuilder sb = new StringBuilder();
    TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(field);
    boolean first = true;
    for (TokenIntPair t : tokenStatistics.getTopN()) {
        // Separator goes between entries only, never before the first one.
        if (!first) {
            sb.append(" | ");
        }
        first = false;
        sb.append(t.getToken()).append(": ").append(t.getValue());
    }
    data.put(Cols.TOP_N_TOKENS, sb.toString());
}
Also used : TokenStatistics(org.apache.tika.eval.tokens.TokenStatistics) TokenIntPair(org.apache.tika.eval.tokens.TokenIntPair)

Aggregations

TokenStatistics (org.apache.tika.eval.tokens.TokenStatistics): 2, IOException (java.io.IOException): 1, HashMap (java.util.HashMap): 1, SummaryStatistics (org.apache.commons.math3.stat.descriptive.SummaryStatistics): 1, Cols (org.apache.tika.eval.db.Cols): 1, CommonTokenResult (org.apache.tika.eval.tokens.CommonTokenResult): 1, TokenIntPair (org.apache.tika.eval.tokens.TokenIntPair): 1