
Example 6 with Cols

Use of org.apache.tika.eval.db.Cols in project tika by apache.

In the class AbstractProfiler, the method writeProfileData:

protected void writeProfileData(EvalFilePaths fps, int i, Metadata m, String fileId, String containerId, List<Integer> numAttachments, TableInfo profileTable) {
    Map<Cols, String> data = new HashMap<>();
    data.put(Cols.ID, fileId);
    data.put(Cols.CONTAINER_ID, containerId);
    data.put(Cols.MD5, m.get(DIGEST_KEY));
    if (i < numAttachments.size()) {
        data.put(Cols.NUM_ATTACHMENTS, Integer.toString(numAttachments.get(i)));
    }
    data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
    data.put(Cols.NUM_METADATA_VALUES, Integer.toString(countMetadataValues(m)));
    Integer nPages = m.getInt(PagedText.N_PAGES);
    if (nPages != null) {
        data.put(Cols.NUM_PAGES, Integer.toString(nPages));
    }
    //if this is the outer wrapper document
    if (i == 0) {
        data.put(Cols.IS_EMBEDDED, FALSE);
        data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString());
    } else {
        data.put(Cols.IS_EMBEDDED, TRUE);
        data.put(Cols.FILE_NAME, getFileName(m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
    }
    String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
    ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
    data.put(Cols.FILE_EXTENSION, ext);
    long srcFileLen = getSourceFileLength(m);
    if (srcFileLen > NON_EXISTENT_FILE_LENGTH) {
        data.put(Cols.LENGTH, Long.toString(srcFileLen));
    } else {
        data.put(Cols.LENGTH, "");
    }
    // note: the metadata-value count and elapsed time are put again here;
    // the keys match the earlier puts, so the map entries are simply overwritten
    int numMetadataValues = countMetadataValues(m);
    data.put(Cols.NUM_METADATA_VALUES, Integer.toString(numMetadataValues));
    data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
    String content = getContent(m);
    if (content == null || content.trim().length() == 0) {
        data.put(Cols.HAS_CONTENT, FALSE);
    } else {
        data.put(Cols.HAS_CONTENT, TRUE);
    }
    getFileTypes(m, data);
    try {
        writer.writeRow(profileTable, data);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Also used : AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Cols(org.apache.tika.eval.db.Cols) HashMap(java.util.HashMap) IOException(java.io.IOException)
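
For reference, here is a minimal sketch of the row-writing pattern the method above relies on, assuming the Cols, TableInfo, and IDBWriter types from tika-eval; ProfileRowSketch and writeMinimalRow are hypothetical names, not part of Tika.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.IDBWriter;

public class ProfileRowSketch {

    // Hypothetical helper (not part of Tika): builds a minimal row keyed by Cols
    // and hands it to the writer, mirroring the pattern in writeProfileData above.
    static void writeMinimalRow(IDBWriter writer, TableInfo table,
                                String fileId, String containerId) {
        Map<Cols, String> data = new HashMap<>();
        data.put(Cols.ID, fileId);
        data.put(Cols.CONTAINER_ID, containerId);
        // use an empty string rather than null for unknown values, as above
        data.put(Cols.LENGTH, "");
        try {
            writer.writeRow(table, data);
        } catch (IOException e) {
            // the profiler above converts checked IO failures to unchecked exceptions
            throw new RuntimeException(e);
        }
    }
}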

Example 7 with Cols

Use of org.apache.tika.eval.db.Cols in project tika by apache.

In the class SimpleComparerTest, the method debugPrintTable:

private void debugPrintTable(TableInfo tableInfo) {
    List<Map<Cols, String>> table = writer.getTable(tableInfo);
    if (table == null) {
        return;
    }
    int i = 0;
    System.out.println("TABLE: " + tableInfo.getName());
    for (Map<Cols, String> row : table) {
        SortedSet<Cols> keys = new TreeSet<Cols>(row.keySet());
        for (Cols key : keys) {
            System.out.println(i + " :: " + key + " : " + row.get(key));
        }
        i++;
    }
    System.out.println("");
}
Also used : Cols(org.apache.tika.eval.db.Cols) TreeSet(java.util.TreeSet) HashMap(java.util.HashMap) Map(java.util.Map)
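
For comparison, a hedged sketch of a more compact debug printer; printCompact is a hypothetical method and only assumes the List<Map<Cols, String>> shape returned by the test writer's getTable call above.

import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.tika.eval.db.Cols;

public class TablePrinterSketch {

    // Hypothetical variant of debugPrintTable (not in Tika): prints each row on a
    // single line, with columns in enum order (TreeMap sorts the Cols keys).
    static void printCompact(String tableName, List<Map<Cols, String>> table) {
        System.out.println("TABLE: " + tableName);
        int i = 0;
        for (Map<Cols, String> row : table) {
            // copy into a TreeMap so the columns print in a stable, sorted order
            System.out.println(i++ + " :: " + new TreeMap<>(row));
        }
    }
}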

Example 8 with Cols

Use of org.apache.tika.eval.db.Cols in project tika by apache.

In the class ExtractComparer, the method compareFiles:

//protected for testing, should find better way so that this can be private!
protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws IOException {
    ExtractReaderException.TYPE extractExceptionA = null;
    ExtractReaderException.TYPE extractExceptionB = null;
    List<Metadata> metadataListA = null;
    if (extractExceptionA == null) {
        try {
            metadataListA = extractReader.loadExtract(fpsA.getExtractFile());
        } catch (ExtractReaderException e) {
            extractExceptionA = e.getType();
        }
    }
    List<Metadata> metadataListB = null;
    try {
        metadataListB = extractReader.loadExtract(fpsB.getExtractFile());
    } catch (ExtractReaderException e) {
        extractExceptionB = e.getType();
    }
    //array indices for those metadata items handled in B
    Set<Integer> handledB = new HashSet<>();
    String containerID = Integer.toString(ID.getAndIncrement());
    //container table
    Map<Cols, String> contData = new HashMap<>();
    contData.put(Cols.CONTAINER_ID, containerID);
    contData.put(Cols.FILE_PATH, fpsA.getRelativeSourceFilePath().toString());
    long srcFileLength = getSourceFileLength(metadataListA, metadataListB);
    contData.put(Cols.LENGTH, srcFileLength > NON_EXISTENT_FILE_LENGTH ? Long.toString(srcFileLength) : "");
    contData.put(Cols.FILE_EXTENSION, FilenameUtils.getExtension(fpsA.getRelativeSourceFilePath().getFileName().toString()));
    long extractFileLengthA = getFileLength(fpsA.getExtractFile());
    contData.put(Cols.EXTRACT_FILE_LENGTH_A, extractFileLengthA > NON_EXISTENT_FILE_LENGTH ? Long.toString(extractFileLengthA) : "");
    long extractFileLengthB = getFileLength(fpsB.getExtractFile());
    contData.put(Cols.EXTRACT_FILE_LENGTH_B, extractFileLengthB > NON_EXISTENT_FILE_LENGTH ? Long.toString(extractFileLengthB) : "");
    writer.writeRow(COMPARISON_CONTAINERS, contData);
    if (extractExceptionA != null) {
        writeExtractException(EXTRACT_EXCEPTION_TABLE_A, containerID, fpsA.getRelativeSourceFilePath().toString(), extractExceptionA);
    }
    if (extractExceptionB != null) {
        writeExtractException(EXTRACT_EXCEPTION_TABLE_B, containerID, fpsB.getRelativeSourceFilePath().toString(), extractExceptionB);
    }
    if (metadataListA == null && metadataListB == null) {
        return;
    }
    List<Integer> numAttachmentsA = countAttachments(metadataListA);
    List<Integer> numAttachmentsB = countAttachments(metadataListB);
    //now get that metadata
    if (metadataListA != null) {
        for (int i = 0; i < metadataListA.size(); i++) {
            //the first file should have the same id as the container id
            String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
            Metadata metadataA = metadataListA.get(i);
            Metadata metadataB = null;
            //TODO: shouldn't be fileA!!!!
            writeProfileData(fpsA, i, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
            writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
            int matchIndex = getMatch(i, metadataListA, metadataListB);
            if (matchIndex > -1 && !handledB.contains(matchIndex)) {
                metadataB = metadataListB.get(matchIndex);
                handledB.add(matchIndex);
            }
            if (metadataB != null) {
                writeProfileData(fpsB, i, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
                writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
            }
            writeEmbeddedFilePathData(i, fileId, metadataA, metadataB);
            //prep the token counting
            tokenCounter.clear(FIELD_A);
            tokenCounter.clear(FIELD_B);
            //write content
            try {
                writeContentData(fileId, metadataA, FIELD_A, CONTENTS_TABLE_A);
                writeContentData(fileId, metadataB, FIELD_B, CONTENTS_TABLE_B);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            //now run comparisons
            if (tokenCounter.getTokenStatistics(FIELD_A).getTotalTokens() > 0 && tokenCounter.getTokenStatistics(FIELD_B).getTotalTokens() > 0) {
                Map<Cols, String> data = new HashMap<>();
                data.put(Cols.ID, fileId);
                ContrastStatistics contrastStatistics = tokenContraster.calculateContrastStatistics(tokenCounter.getTokens(FIELD_A), tokenCounter.getTokenStatistics(FIELD_A), tokenCounter.getTokens(FIELD_B), tokenCounter.getTokenStatistics(FIELD_B));
                writeContrasts(data, contrastStatistics);
                writer.writeRow(CONTENT_COMPARISONS, data);
            }
        }
    }
    //now handle any metadata items in B that haven't yet been handled
    if (metadataListB != null) {
        for (int i = 0; i < metadataListB.size(); i++) {
            if (handledB.contains(i)) {
                continue;
            }
            Metadata metadataB = metadataListB.get(i);
            //the first file should have the same id as the container id
            String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
            writeProfileData(fpsB, i, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
            writeEmbeddedFilePathData(i, fileId, null, metadataB);
            writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
            //prep the token counting
            tokenCounter.clear(FIELD_B);
            //write content
            try {
                writeContentData(fileId, metadataB, FIELD_B, CONTENTS_TABLE_B);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }
}
Also used : HashMap(java.util.HashMap) Metadata(org.apache.tika.metadata.Metadata) IOException(java.io.IOException) ExtractReaderException(org.apache.tika.eval.io.ExtractReaderException) ContrastStatistics(org.apache.tika.eval.tokens.ContrastStatistics) Cols(org.apache.tika.eval.db.Cols) HashSet(java.util.HashSet)
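
The comparer above repeats the same "length or empty string" idiom for several columns. Below is a minimal sketch of how that idiom could be factored out, assuming -1 as the NON_EXISTENT_FILE_LENGTH sentinel; LengthColumnSketch and putLength are hypothetical helpers, not Tika API.

import java.util.HashMap;
import java.util.Map;

import org.apache.tika.eval.db.Cols;

public class LengthColumnSketch {

    // assumed sentinel value for illustration; the comparer above checks file
    // lengths against NON_EXISTENT_FILE_LENGTH before writing them
    private static final long NON_EXISTENT_FILE_LENGTH = -1L;

    // Hypothetical helper for the repeated "length or empty string" pattern:
    // store the length only when the file actually existed.
    static void putLength(Map<Cols, String> data, Cols col, long length) {
        data.put(col, length > NON_EXISTENT_FILE_LENGTH ? Long.toString(length) : "");
    }

    public static void main(String[] args) {
        Map<Cols, String> contData = new HashMap<>();
        putLength(contData, Cols.LENGTH, 1024L);                // stored as "1024"
        putLength(contData, Cols.EXTRACT_FILE_LENGTH_A, -1L);   // stored as ""
        System.out.println(contData);
    }
}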

Aggregations

HashMap (java.util.HashMap) 8
Cols (org.apache.tika.eval.db.Cols) 8
IOException (java.io.IOException) 4
ExtractReaderException (org.apache.tika.eval.io.ExtractReaderException) 3
Metadata (org.apache.tika.metadata.Metadata) 3
Map (java.util.Map) 2
TikaTest (org.apache.tika.TikaTest) 2
Test (org.junit.Test) 2
Connection (java.sql.Connection) 1
SQLException (java.sql.SQLException) 1
HashSet (java.util.HashSet) 1
TreeSet (java.util.TreeSet) 1
AtomicInteger (java.util.concurrent.atomic.AtomicInteger) 1
SummaryStatistics (org.apache.commons.math3.stat.descriptive.SummaryStatistics) 1
AbstractProfiler (org.apache.tika.eval.AbstractProfiler) 1
TableInfo (org.apache.tika.eval.db.TableInfo) 1
IDBWriter (org.apache.tika.eval.io.IDBWriter) 1
CommonTokenResult (org.apache.tika.eval.tokens.CommonTokenResult) 1
ContrastStatistics (org.apache.tika.eval.tokens.ContrastStatistics) 1
TokenStatistics (org.apache.tika.eval.tokens.TokenStatistics) 1