Use of org.apache.tika.eval.db.Cols in project tika by apache.
From the class AbstractProfiler, the method writeProfileData:
/**
 * Writes one row of profile data for a single parsed document (the container
 * document when {@code i == 0}, otherwise an embedded document) to the given
 * profile table.
 *
 * @param fps           source/extract file paths for this container
 * @param i             index of this metadata object within the extract list;
 *                      0 is the outer/container document
 * @param m             metadata for this document
 * @param fileId        unique id for this document's row
 * @param containerId   id of the enclosing container document
 * @param numAttachments per-document attachment counts, indexed by {@code i}
 * @param profileTable  table to write the row to
 * @throws RuntimeException wrapping any IOException from the underlying writer
 */
protected void writeProfileData(EvalFilePaths fps, int i, Metadata m, String fileId, String containerId, List<Integer> numAttachments, TableInfo profileTable) {
    Map<Cols, String> data = new HashMap<>();
    data.put(Cols.ID, fileId);
    data.put(Cols.CONTAINER_ID, containerId);
    data.put(Cols.MD5, m.get(DIGEST_KEY));
    if (i < numAttachments.size()) {
        data.put(Cols.NUM_ATTACHMENTS, Integer.toString(numAttachments.get(i)));
    }
    //compute each of these exactly once; the original filled ELAPSED_TIME_MILLIS
    //and NUM_METADATA_VALUES twice with identical values
    data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
    data.put(Cols.NUM_METADATA_VALUES, Integer.toString(countMetadataValues(m)));
    Integer nPages = m.getInt(PagedText.N_PAGES);
    if (nPages != null) {
        data.put(Cols.NUM_PAGES, Integer.toString(nPages));
    }
    //i == 0 means the outer wrapper document; anything else is an embedded file
    if (i == 0) {
        data.put(Cols.IS_EMBEDDED, FALSE);
        data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString());
    } else {
        data.put(Cols.IS_EMBEDDED, TRUE);
        data.put(Cols.FILE_NAME, getFileName(m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
    }
    String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
    ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
    data.put(Cols.FILE_EXTENSION, ext);
    long srcFileLen = getSourceFileLength(m);
    //empty string signals "length unavailable" to the table writer
    if (srcFileLen > NON_EXISTENT_FILE_LENGTH) {
        data.put(Cols.LENGTH, Long.toString(srcFileLen));
    } else {
        data.put(Cols.LENGTH, "");
    }
    String content = getContent(m);
    if (content == null || content.trim().length() == 0) {
        data.put(Cols.HAS_CONTENT, FALSE);
    } else {
        data.put(Cols.HAS_CONTENT, TRUE);
    }
    getFileTypes(m, data);
    try {
        writer.writeRow(profileTable, data);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Use of org.apache.tika.eval.db.Cols in project tika by apache.
From the class SimpleComparerTest, the method debugPrintTable:
/**
 * Debug helper: dumps every row of the given table to stdout, one line per
 * column, with columns printed in sorted order. Silently returns if the
 * writer has no data for the table.
 *
 * @param tableInfo table whose in-memory contents should be printed
 */
private void debugPrintTable(TableInfo tableInfo) {
    List<Map<Cols, String>> rows = writer.getTable(tableInfo);
    if (rows == null) {
        return;
    }
    System.out.println("TABLE: " + tableInfo.getName());
    int rowIndex = 0;
    for (Map<Cols, String> row : rows) {
        //sort the column keys so output order is deterministic
        SortedSet<Cols> sortedCols = new TreeSet<>(row.keySet());
        for (Cols col : sortedCols) {
            System.out.println(rowIndex + " :: " + col + " : " + row.get(col));
        }
        rowIndex++;
    }
    System.out.println("");
}
Use of org.apache.tika.eval.db.Cols in project tika by apache.
From the class ExtractComparer, the method compareFiles:
/**
 * Compares the extracts for one source file from run A against run B:
 * writes a container-level row, per-document profile/exception rows for both
 * sides, content rows, and (when both sides have tokens) a content-comparison
 * row. Documents in B are matched to documents in A via getMatch(...); any B
 * documents left unmatched are written on their own at the end.
 */
//protected for testing, should find better way so that this can be private!
protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws IOException {
ExtractReaderException.TYPE extractExceptionA = null;
ExtractReaderException.TYPE extractExceptionB = null;
List<Metadata> metadataListA = null;
//NOTE(review): extractExceptionA was initialized to null on the line above,
//so this guard is always true here — looks vestigial from an earlier
//pre-check (side B below has no such guard); confirm before removing.
if (extractExceptionA == null) {
try {
metadataListA = extractReader.loadExtract(fpsA.getExtractFile());
} catch (ExtractReaderException e) {
//record the failure type; metadataListA stays null
extractExceptionA = e.getType();
}
}
List<Metadata> metadataListB = null;
try {
metadataListB = extractReader.loadExtract(fpsB.getExtractFile());
} catch (ExtractReaderException e) {
extractExceptionB = e.getType();
}
//array indices of metadataListB entries already matched to an A entry,
//so the trailing B-only loop can skip them
Set<Integer> handledB = new HashSet<>();
String containerID = Integer.toString(ID.getAndIncrement());
//container table: one row per source file, written even if both extracts failed
Map<Cols, String> contData = new HashMap<>();
contData.put(Cols.CONTAINER_ID, containerID);
contData.put(Cols.FILE_PATH, fpsA.getRelativeSourceFilePath().toString());
long srcFileLength = getSourceFileLength(metadataListA, metadataListB);
//empty string signals "length unavailable"
contData.put(Cols.LENGTH, srcFileLength > NON_EXISTENT_FILE_LENGTH ? Long.toString(srcFileLength) : "");
contData.put(Cols.FILE_EXTENSION, FilenameUtils.getExtension(fpsA.getRelativeSourceFilePath().getFileName().toString()));
long extractFileLengthA = getFileLength(fpsA.getExtractFile());
contData.put(Cols.EXTRACT_FILE_LENGTH_A, extractFileLengthA > NON_EXISTENT_FILE_LENGTH ? Long.toString(extractFileLengthA) : "");
long extractFileLengthB = getFileLength(fpsB.getExtractFile());
contData.put(Cols.EXTRACT_FILE_LENGTH_B, extractFileLengthB > NON_EXISTENT_FILE_LENGTH ? Long.toString(extractFileLengthB) : "");
writer.writeRow(COMPARISON_CONTAINERS, contData);
//record extract-load failures per side, then bail out only if BOTH sides failed
if (extractExceptionA != null) {
writeExtractException(EXTRACT_EXCEPTION_TABLE_A, containerID, fpsA.getRelativeSourceFilePath().toString(), extractExceptionA);
}
if (extractExceptionB != null) {
writeExtractException(EXTRACT_EXCEPTION_TABLE_B, containerID, fpsB.getRelativeSourceFilePath().toString(), extractExceptionB);
}
if (metadataListA == null && metadataListB == null) {
return;
}
List<Integer> numAttachmentsA = countAttachments(metadataListA);
List<Integer> numAttachmentsB = countAttachments(metadataListB);
//now get that metadata: walk A's documents, pairing each with its B match
if (metadataListA != null) {
for (int i = 0; i < metadataListA.size(); i++) {
//the first file should have the same id as the container id
String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
Metadata metadataA = metadataListA.get(i);
Metadata metadataB = null;
//TODO: shouldn't be fileA!!!!
writeProfileData(fpsA, i, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
//find the B-side document corresponding to A's i-th document;
//each B entry is consumed at most once (handledB)
int matchIndex = getMatch(i, metadataListA, metadataListB);
if (matchIndex > -1 && !handledB.contains(matchIndex)) {
metadataB = metadataListB.get(matchIndex);
handledB.add(matchIndex);
}
if (metadataB != null) {
writeProfileData(fpsB, i, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
}
writeEmbeddedFilePathData(i, fileId, metadataA, metadataB);
//prep the token counting: clear per-document token state for both fields
tokenCounter.clear(FIELD_A);
tokenCounter.clear(FIELD_B);
//write content (also populates tokenCounter as a side effect)
try {
writeContentData(fileId, metadataA, FIELD_A, CONTENTS_TABLE_A);
writeContentData(fileId, metadataB, FIELD_B, CONTENTS_TABLE_B);
} catch (IOException e) {
throw new RuntimeException(e);
}
//now run comparisons — only meaningful when both sides produced tokens
if (tokenCounter.getTokenStatistics(FIELD_A).getTotalTokens() > 0 && tokenCounter.getTokenStatistics(FIELD_B).getTotalTokens() > 0) {
Map<Cols, String> data = new HashMap<>();
data.put(Cols.ID, fileId);
ContrastStatistics contrastStatistics = tokenContraster.calculateContrastStatistics(tokenCounter.getTokens(FIELD_A), tokenCounter.getTokenStatistics(FIELD_A), tokenCounter.getTokens(FIELD_B), tokenCounter.getTokenStatistics(FIELD_B));
writeContrasts(data, contrastStatistics);
writer.writeRow(CONTENT_COMPARISONS, data);
}
}
}
//finally, write out B-side documents that were never matched to an A document
if (metadataListB != null) {
for (int i = 0; i < metadataListB.size(); i++) {
if (handledB.contains(i)) {
continue;
}
Metadata metadataB = metadataListB.get(i);
//the first file should have the same id as the container id
String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
writeProfileData(fpsB, i, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
writeEmbeddedFilePathData(i, fileId, null, metadataB);
writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
//prep the token counting (B only; there is no A side to contrast against)
tokenCounter.clear(FIELD_B);
//write content
try {
writeContentData(fileId, metadataB, FIELD_B, CONTENTS_TABLE_B);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
}
Aggregations