Search in sources:

Example 1 with Cols

use of org.apache.tika.eval.db.Cols in project tika by apache.

the class SimpleComparerTest method testGetContent.

/**
 * Calls {@code getContent} with full-length, over-length, empty and null metadata,
 * checking both the returned string length and the value stored under
 * {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} after each call. The same output map is
 * reused across all four calls.
 */
@Test
public void testGetContent() throws Exception {
    final Cols truncatedCol = Cols.CONTENT_TRUNCATED_AT_MAX_LEN;
    Map<Cols, String> row = new HashMap<>();

    Metadata tenChars = new Metadata();
    tenChars.add(RecursiveParserWrapper.TIKA_CONTENT, "0123456789");

    // max length equal to the content length
    String extracted = getContent(tenChars, 10, row);
    assertEquals(10, extracted.length());
    assertEquals("FALSE", row.get(truncatedCol));

    // max length shorter than the content
    extracted = getContent(tenChars, 4, row);
    assertEquals(4, extracted.length());
    assertEquals("TRUE", row.get(truncatedCol));

    // Metadata object that carries no content at all
    extracted = getContent(new Metadata(), 10, row);
    assertEquals(0, extracted.length());
    assertEquals("FALSE", row.get(truncatedCol));

    // null Metadata
    extracted = getContent(null, 10, row);
    assertEquals(0, extracted.length());
    assertEquals("FALSE", row.get(truncatedCol));
}
Also used : Cols(org.apache.tika.eval.db.Cols) HashMap(java.util.HashMap) Metadata(org.apache.tika.metadata.Metadata) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 2 with Cols

use of org.apache.tika.eval.db.Cols in project tika by apache.

the class SimpleComparerTest method testAccessException.

/**
 * Compares the two {@code file6_accessEx.pdf.json} extracts and checks that the first row
 * of each exception table carries the ACCESS_PERMISSION exception id and no stack traces.
 */
@Test
public void testAccessException() throws Exception {
    Path relativePath = Paths.get("file6_accessEx.pdf.json");
    EvalFilePaths fpsA = new EvalFilePaths(relativePath, getResourceAsFile("/test-dirs/extractsA/file6_accessEx.pdf.json").toPath());
    EvalFilePaths fpsB = new EvalFilePaths(relativePath, getResourceAsFile("/test-dirs/extractsB/file6_accessEx.pdf.json").toPath());
    comparer.compareFiles(fpsA, fpsB);

    String expectedId = Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal());
    TableInfo[] exceptionTables = { ExtractComparer.EXCEPTION_TABLE_A, ExtractComparer.EXCEPTION_TABLE_B };
    for (int i = 0; i < exceptionTables.length; i++) {
        List<Map<Cols, String>> rows = writer.getTable(exceptionTables[i]);
        Map<Cols, String> firstRow = rows.get(0);
        assertEquals(expectedId, firstRow.get(Cols.PARSE_EXCEPTION_ID));
        assertNull(firstRow.get(Cols.ORIG_STACK_TRACE));
        assertNull(firstRow.get(Cols.SORT_STACK_TRACE));
    }
}
Also used : Cols(org.apache.tika.eval.db.Cols) TableInfo(org.apache.tika.eval.db.TableInfo) HashMap(java.util.HashMap) Map(java.util.Map) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 3 with Cols

use of org.apache.tika.eval.db.Cols in project tika by apache.

the class EvalConsumerBuilder method populateRefTables.

/**
 * Fills the three reference tables (parse error types, parse exception types and
 * extract exception types) with one row per enum constant, storing the constant's
 * ordinal as the id and its name as the description.
 * <p>
 * Population is skipped entirely when the parse-error-types reference table is already
 * present in the database; only that one table is probed.
 *
 * @throws IOException if writing a row or closing the writer fails
 * @throws SQLException declared for callers; a failure while probing for the existing
 *                      table is deliberately ignored (see inline comment)
 */
public void populateRefTables() throws IOException, SQLException {
    //test for one ref table.  If it exists, don't populate ref tables
    //TODO: test one at a time
    boolean tableExists = false;
    try (Connection connection = dbUtil.getConnection()) {
        Set<String> tables = dbUtil.getTables(connection);
        tableExists = tables.contains(AbstractProfiler.REF_PARSE_ERROR_TYPES.getName().toLowerCase(Locale.US));
    } catch (SQLException ignored) {
        // Best effort: if the existence probe fails we treat the table as missing
        // and fall through to populate the reference tables.
    }
    if (tableExists) {
        return;
    }
    IDBWriter writer = getDBWriter(getRefTableInfos());
    try {
        // A single map is reused for every row; it is cleared before each use.
        Map<Cols, String> m = new HashMap<>();
        for (AbstractProfiler.PARSE_ERROR_TYPE t : AbstractProfiler.PARSE_ERROR_TYPE.values()) {
            m.clear();
            m.put(Cols.PARSE_ERROR_ID, Integer.toString(t.ordinal()));
            m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
            writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
        }
        for (AbstractProfiler.EXCEPTION_TYPE t : AbstractProfiler.EXCEPTION_TYPE.values()) {
            m.clear();
            m.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(t.ordinal()));
            m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
            writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
        }
        for (ExtractReaderException.TYPE t : ExtractReaderException.TYPE.values()) {
            m.clear();
            m.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(t.ordinal()));
            m.put(Cols.EXTRACT_EXCEPTION_DESCRIPTION, t.name());
            writer.writeRow(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES, m);
        }
    } finally {
        // Release the writer even when a writeRow call fails part-way through.
        writer.close();
    }
}
Also used : SQLException(java.sql.SQLException) HashMap(java.util.HashMap) IDBWriter(org.apache.tika.eval.io.IDBWriter) Connection(java.sql.Connection) ExtractReaderException(org.apache.tika.eval.io.ExtractReaderException) Cols(org.apache.tika.eval.db.Cols) AbstractProfiler(org.apache.tika.eval.AbstractProfiler)

Example 4 with Cols

use of org.apache.tika.eval.db.Cols in project tika by apache.

the class ExtractProfiler method processFileResource.

/**
 * Profiles one file resource: writes a container row, then either an extract-exception
 * row (when the extract could not be loaded) or profile, embedded-path, exception and
 * content rows for every metadata object in the extract.
 *
 * @param fileResource resource whose metadata is used to locate the extract file
 * @return always {@code true}
 * @throws RuntimeException wrapping any {@link IOException} raised while writing rows
 */
@Override
public boolean processFileResource(FileResource fileResource) {
    Metadata metadata = fileResource.getMetadata();
    //when the input dir is the extract dir, we are crawling extracts directly
    boolean crawlingExtractDir = inputDir != null && inputDir.equals(extracts);
    EvalFilePaths fps = crawlingExtractDir
            ? getPathsFromExtractCrawl(metadata, extracts)
            : getPathsFromSrcCrawl(metadata, inputDir, extracts);

    String containerIdString = Integer.toString(ID.incrementAndGet());

    List<Metadata> metadataList = null;
    ExtractReaderException.TYPE extractExceptionType = null;
    try {
        metadataList = extractReader.loadExtract(fps.getExtractFile());
    } catch (ExtractReaderException e) {
        extractExceptionType = e.getType();
    }

    Long srcFileLen = getSourceFileLength(fps, metadataList);
    String lengthValue = "";
    if (srcFileLen > NON_EXISTENT_FILE_LENGTH) {
        lengthValue = Long.toString(srcFileLen);
    }
    String relativeSourcePath = fps.getRelativeSourceFilePath().toString();

    Map<Cols, String> containerRow = new HashMap<>();
    containerRow.put(Cols.LENGTH, lengthValue);
    containerRow.put(Cols.CONTAINER_ID, containerIdString);
    containerRow.put(Cols.FILE_PATH, relativeSourcePath);
    long extractFileLength = fps.getExtractFileLength();
    if (extractFileLength > 0) {
        String extractLengthValue;
        if (fps.getExtractFile() == null) {
            extractLengthValue = "";
        } else {
            extractLengthValue = Long.toString(extractFileLength);
        }
        containerRow.put(Cols.EXTRACT_FILE_LENGTH, extractLengthValue);
    }
    try {
        writer.writeRow(CONTAINER_TABLE, containerRow);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    if (extractExceptionType != null) {
        //the extract could not be read: record why and stop here
        try {
            writeExtractException(EXTRACT_EXCEPTION_TABLE, containerIdString, relativeSourcePath, extractExceptionType);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return true;
    }

    List<Integer> numAttachments = countAttachments(metadataList);
    for (int idx = 0; idx < metadataList.size(); idx++) {
        Metadata m = metadataList.get(idx);
        //the first file should have the same id as the container id
        String fileId;
        if (idx == 0) {
            fileId = containerIdString;
        } else {
            fileId = Integer.toString(ID.incrementAndGet());
        }
        writeProfileData(fps, idx, m, fileId, containerIdString, numAttachments, PROFILE_TABLE);
        writeEmbeddedPathData(idx, fileId, m, EMBEDDED_FILE_PATH_TABLE);
        writeExceptionData(fileId, m, EXCEPTION_TABLE);
        try {
            writeContentData(fileId, m, FIELD, CONTENTS_TABLE);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
    return true;
}
Also used : HashMap(java.util.HashMap) Metadata(org.apache.tika.metadata.Metadata) IOException(java.io.IOException) ExtractReaderException(org.apache.tika.eval.io.ExtractReaderException) Cols(org.apache.tika.eval.db.Cols)

Example 5 with Cols

use of org.apache.tika.eval.db.Cols in project tika by apache.

the class AbstractProfiler method writeContentData.

/**
 * Computes content statistics for one metadata object and writes them as a single
 * row to the given contents table.
 * <p>
 * Nothing is processed, and no row is written for {@code fileId}, when {@code m} is
 * null or when the extracted content is null or only whitespace.
 * <p>
 * If counting common tokens fails with an {@link IOException}, the error is logged
 * and the row is still written, but without the {@code COMMON_TOKENS_LANG},
 * {@code NUM_COMMON_TOKENS} and {@code NUM_ALPHABETIC_TOKENS} columns.
 *
 * @param fileId        value stored in the {@code ID} column of the row
 * @param m             metadata object the content is read from; may be null
 * @param fieldName     field name used when adding to and reading from the token counter
 * @param contentsTable table the row is written to
 * @throws IOException declared for callers. NOTE(review): a failure of the final row
 *                     write is rethrown as a {@link RuntimeException}, not as an
 *                     IOException; confirm which helper calls can throw this.
 */
protected void writeContentData(String fileId, Metadata m, String fieldName, TableInfo contentsTable) throws IOException {
    if (m == null) {
        return;
    }
    Map<Cols, String> data = new HashMap<>();
    String content = getContent(m, maxContentLength, data);
    if (content == null || content.trim().length() == 0) {
        return;
    }
    // Reset the counter for this field so counts from a previous file do not carry over.
    tokenCounter.clear(fieldName);
    tokenCounter.add(fieldName, content);
    data.put(Cols.ID, fileId);
    data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length()));
    langid(m, data);
    // langid(...) is expected to have filled LANG_ID_1; fall back to "" if it did not.
    String langid = data.get(Cols.LANG_ID_1);
    langid = (langid == null) ? "" : langid;
    writeTokenCounts(data, fieldName, tokenCounter);
    CommonTokenResult commonTokenResult = null;
    try {
        commonTokenResult = commonTokenCountManager.countTokenOverlaps(langid, tokenCounter.getTokens(fieldName));
    } catch (IOException e) {
        LOG.error("{}", e.getMessage(), e);
    }
    // commonTokenResult stays null when the count above failed; skip the columns that
    // depend on it instead of dereferencing null.
    if (commonTokenResult != null) {
        data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
        data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens()));
        data.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getAlphabeticTokens()));
    }
    TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(fieldName);
    data.put(Cols.NUM_UNIQUE_TOKENS, Integer.toString(tokenStatistics.getTotalUniqueTokens()));
    data.put(Cols.NUM_TOKENS, Integer.toString(tokenStatistics.getTotalTokens()));
    data.put(Cols.TOKEN_ENTROPY_RATE, Double.toString(tokenStatistics.getEntropy()));
    SummaryStatistics summStats = tokenStatistics.getSummaryStatistics();
    data.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) summStats.getSum()));
    data.put(Cols.TOKEN_LENGTH_MEAN, Double.toString(summStats.getMean()));
    data.put(Cols.TOKEN_LENGTH_STD_DEV, Double.toString(summStats.getStandardDeviation()));
    unicodeBlocks(m, data);
    try {
        writer.writeRow(contentsTable, data);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used : Cols(org.apache.tika.eval.db.Cols) CommonTokenResult(org.apache.tika.eval.tokens.CommonTokenResult) HashMap(java.util.HashMap) SummaryStatistics(org.apache.commons.math3.stat.descriptive.SummaryStatistics) TokenStatistics(org.apache.tika.eval.tokens.TokenStatistics) IOException(java.io.IOException)

Aggregations

HashMap (java.util.HashMap): 8, Cols (org.apache.tika.eval.db.Cols): 8, IOException (java.io.IOException): 4, ExtractReaderException (org.apache.tika.eval.io.ExtractReaderException): 3, Metadata (org.apache.tika.metadata.Metadata): 3, Map (java.util.Map): 2, TikaTest (org.apache.tika.TikaTest): 2, Test (org.junit.Test): 2, Connection (java.sql.Connection): 1, SQLException (java.sql.SQLException): 1, HashSet (java.util.HashSet): 1, TreeSet (java.util.TreeSet): 1, AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 1, SummaryStatistics (org.apache.commons.math3.stat.descriptive.SummaryStatistics): 1, AbstractProfiler (org.apache.tika.eval.AbstractProfiler): 1, TableInfo (org.apache.tika.eval.db.TableInfo): 1, IDBWriter (org.apache.tika.eval.io.IDBWriter): 1, CommonTokenResult (org.apache.tika.eval.tokens.CommonTokenResult): 1, ContrastStatistics (org.apache.tika.eval.tokens.ContrastStatistics): 1, TokenStatistics (org.apache.tika.eval.tokens.TokenStatistics): 1