Search in sources :

Example 1 with CommonTokenResult

Use of org.apache.tika.eval.tokens.CommonTokenResult in the Apache Tika project.

From the class AbstractProfiler, method writeContentData.

/**
 * Tokenizes the content obtained from the given metadata and writes one row of
 * content statistics to the contents table.
 * <p>
 * Checks to see if metadata is null or content is empty (null or only whitespace).
 * If any of these, then this does no processing, and the fileId is not
 * entered into the content table.
 * <p>
 * If counting common tokens fails with an {@link IOException}, the failure is logged
 * and the row is still written, but without the common-token columns
 * ({@code COMMON_TOKENS_LANG}, {@code NUM_COMMON_TOKENS}, {@code NUM_ALPHABETIC_TOKENS}).
 *
 * @param fileId value stored in the {@code ID} column of the written row
 * @param m metadata the content is read from; if null, nothing is written
 * @param fieldName key under which tokens are counted in, and read back from, the token counter
 * @param contentsTable table the row is written to
 * @throws IOException declared in the signature; note that an {@code IOException} raised
 *         while writing the row is rethrown wrapped in a {@link RuntimeException}
 */
protected void writeContentData(String fileId, Metadata m, String fieldName, TableInfo contentsTable) throws IOException {
    if (m == null) {
        return;
    }
    Map<Cols, String> data = new HashMap<>();
    // getContent receives the row map as well; presumably it may record
    // content-related columns (e.g. truncation info) -- verify against its definition.
    String content = getContent(m, maxContentLength, data);
    if (content == null || content.trim().length() == 0) {
        return;
    }
    // Reset any counts previously accumulated under this field before counting anew.
    tokenCounter.clear(fieldName);
    tokenCounter.add(fieldName, content);
    data.put(Cols.ID, fileId);
    data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length()));
    langid(m, data);
    String langid = data.get(Cols.LANG_ID_1);
    langid = (langid == null) ? "" : langid;
    writeTokenCounts(data, fieldName, tokenCounter);
    CommonTokenResult commonTokenResult = null;
    try {
        commonTokenResult = commonTokenCountManager.countTokenOverlaps(langid, tokenCounter.getTokens(fieldName));
    } catch (IOException e) {
        LOG.error("{}", e.getMessage(), e);
    }
    // The result stays null when counting failed above; dereferencing it
    // unconditionally would turn a logged, best-effort failure into a
    // NullPointerException, so the common-token columns are simply omitted.
    if (commonTokenResult != null) {
        data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
        data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens()));
        data.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getAlphabeticTokens()));
    }
    TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(fieldName);
    data.put(Cols.NUM_UNIQUE_TOKENS, Integer.toString(tokenStatistics.getTotalUniqueTokens()));
    data.put(Cols.NUM_TOKENS, Integer.toString(tokenStatistics.getTotalTokens()));
    data.put(Cols.TOKEN_ENTROPY_RATE, Double.toString(tokenStatistics.getEntropy()));
    SummaryStatistics summStats = tokenStatistics.getSummaryStatistics();
    data.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) summStats.getSum()));
    data.put(Cols.TOKEN_LENGTH_MEAN, Double.toString(summStats.getMean()));
    data.put(Cols.TOKEN_LENGTH_STD_DEV, Double.toString(summStats.getStandardDeviation()));
    unicodeBlocks(m, data);
    try {
        writer.writeRow(contentsTable, data);
    } catch (IOException e) {
        // Kept as in the original: callers see write failures as unchecked exceptions.
        throw new RuntimeException(e);
    }
}
Also used : Cols(org.apache.tika.eval.db.Cols) CommonTokenResult(org.apache.tika.eval.tokens.CommonTokenResult) HashMap(java.util.HashMap) SummaryStatistics(org.apache.commons.math3.stat.descriptive.SummaryStatistics) TokenStatistics(org.apache.tika.eval.tokens.TokenStatistics) IOException(java.io.IOException)

Aggregations

IOException (java.io.IOException)1 HashMap (java.util.HashMap)1 SummaryStatistics (org.apache.commons.math3.stat.descriptive.SummaryStatistics)1 Cols (org.apache.tika.eval.db.Cols)1 CommonTokenResult (org.apache.tika.eval.tokens.CommonTokenResult)1 TokenStatistics (org.apache.tika.eval.tokens.TokenStatistics)1