Usage of org.apache.tika.eval.db.Cols in the Apache Tika project.
Source: class SimpleComparerTest, method testGetContent.
@Test
public void testGetContent() throws Exception {
    //verify getContent's truncation behavior and the CONTENT_TRUNCATED_AT_MAX_LEN flag
    Metadata metadata = new Metadata();
    metadata.add(RecursiveParserWrapper.TIKA_CONTENT, "0123456789");
    Map<Cols, String> row = new HashMap<>();

    //max length equal to content length -> no truncation
    String extracted = getContent(metadata, 10, row);
    assertEquals(10, extracted.length());
    assertEquals("FALSE", row.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));

    //max length shorter than content -> truncated, flag set
    extracted = getContent(metadata, 4, row);
    assertEquals(4, extracted.length());
    assertEquals("TRUE", row.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));

    //Metadata with no content -> empty string, not truncated
    extracted = getContent(new Metadata(), 10, row);
    assertEquals(0, extracted.length());
    assertEquals("FALSE", row.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));

    //null Metadata -> empty string, not truncated
    extracted = getContent(null, 10, row);
    assertEquals(0, extracted.length());
    assertEquals("FALSE", row.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
}
Usage of org.apache.tika.eval.db.Cols in the Apache Tika project.
Source: class SimpleComparerTest, method testAccessException.
@Test
public void testAccessException() throws Exception {
    //compare the A and B extracts for a file that failed with an access-permission exception
    EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file6_accessEx.pdf.json"),
            getResourceAsFile("/test-dirs/extractsA/file6_accessEx.pdf.json").toPath());
    EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file6_accessEx.pdf.json"),
            getResourceAsFile("/test-dirs/extractsB/file6_accessEx.pdf.json").toPath());

    comparer.compareFiles(fpsA, fpsB);

    //both exception tables (A and B) should record the access-permission exception
    //with no stack traces attached
    TableInfo[] exceptionTables = new TableInfo[]{
            ExtractComparer.EXCEPTION_TABLE_A,
            ExtractComparer.EXCEPTION_TABLE_B};
    for (TableInfo exceptionTable : exceptionTables) {
        List<Map<Cols, String>> rows = writer.getTable(exceptionTable);
        Map<Cols, String> firstRow = rows.get(0);
        //debugPrintRow(firstRow);
        assertEquals(Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()),
                firstRow.get(Cols.PARSE_EXCEPTION_ID));
        assertNull(firstRow.get(Cols.ORIG_STACK_TRACE));
        assertNull(firstRow.get(Cols.SORT_STACK_TRACE));
    }
}
Usage of org.apache.tika.eval.db.Cols in the Apache Tika project.
Source: class EvalConsumerBuilder, method populateRefTables.
/**
 * Populates the reference (lookup) tables that map enum ordinals to names
 * for parse errors, parse exceptions and extract exceptions.
 * <p>
 * If the first ref table already exists, assumes all ref tables were
 * previously populated and returns without writing anything.
 *
 * @throws IOException  if writing a ref table row fails
 * @throws SQLException if obtaining the db writer fails
 */
public void populateRefTables() throws IOException, SQLException {
    //test for one ref table. If it exists, don't populate ref tables
    //TODO: test each ref table individually, not just the first
    boolean tableExists = false;
    try (Connection connection = dbUtil.getConnection()) {
        Set<String> tables = dbUtil.getTables(connection);
        if (tables.contains(AbstractProfiler.REF_PARSE_ERROR_TYPES.getName().toLowerCase(Locale.US))) {
            tableExists = true;
        }
    } catch (SQLException ignored) {
        //swallow deliberately: if we can't inspect the schema,
        //fall through and attempt to populate the ref tables anyway
    }
    if (tableExists) {
        return;
    }
    IDBWriter writer = getDBWriter(getRefTableInfos());
    //close the writer even if a writeRow throws; otherwise the
    //underlying resources would leak on the exception path
    try {
        Map<Cols, String> m = new HashMap<>();
        for (AbstractProfiler.PARSE_ERROR_TYPE t : AbstractProfiler.PARSE_ERROR_TYPE.values()) {
            m.clear();
            m.put(Cols.PARSE_ERROR_ID, Integer.toString(t.ordinal()));
            m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
            writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
        }
        for (AbstractProfiler.EXCEPTION_TYPE t : AbstractProfiler.EXCEPTION_TYPE.values()) {
            m.clear();
            m.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(t.ordinal()));
            m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
            writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
        }
        for (ExtractReaderException.TYPE t : ExtractReaderException.TYPE.values()) {
            m.clear();
            m.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(t.ordinal()));
            m.put(Cols.EXTRACT_EXCEPTION_DESCRIPTION, t.name());
            writer.writeRow(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES, m);
        }
    } finally {
        writer.close();
    }
}
Usage of org.apache.tika.eval.db.Cols in the Apache Tika project.
Source: class ExtractProfiler, method processFileResource.
@Override
public boolean processFileResource(FileResource fileResource) {
    //Profiles a single extract: writes one container row, then one profile/
    //embedded-path/exception/content row per Metadata in the extract.
    Metadata metadata = fileResource.getMetadata();
    EvalFilePaths fps = null;
    if (inputDir != null && inputDir.equals(extracts)) {
        //crawling an extract dir
        fps = getPathsFromExtractCrawl(metadata, extracts);
    } else {
        //crawling a source dir; map the source file to its extract
        fps = getPathsFromSrcCrawl(metadata, inputDir, extracts);
    }
    //each container gets a fresh id from the shared atomic counter
    int containerId = ID.incrementAndGet();
    String containerIdString = Integer.toString(containerId);
    ExtractReaderException.TYPE extractExceptionType = null;
    List<Metadata> metadataList = null;
    try {
        metadataList = extractReader.loadExtract(fps.getExtractFile());
    } catch (ExtractReaderException e) {
        //record the failure type; the exception row is written below
        extractExceptionType = e.getType();
    }
    Map<Cols, String> contOutput = new HashMap<>();
    Long srcFileLen = getSourceFileLength(fps, metadataList);
    //NOTE(review): srcFileLen is auto-unboxed here; assumes getSourceFileLength
    //never returns null — confirm against its implementation
    contOutput.put(Cols.LENGTH, srcFileLen > NON_EXISTENT_FILE_LENGTH ? Long.toString(srcFileLen) : "");
    contOutput.put(Cols.CONTAINER_ID, containerIdString);
    contOutput.put(Cols.FILE_PATH, fps.getRelativeSourceFilePath().toString());
    if (fps.getExtractFileLength() > 0) {
        contOutput.put(Cols.EXTRACT_FILE_LENGTH, (fps.getExtractFile() == null) ? "" : Long.toString(fps.getExtractFileLength()));
    }
    try {
        writer.writeRow(CONTAINER_TABLE, contOutput);
    } catch (IOException e) {
        //a failed db write is fatal for this consumer
        throw new RuntimeException(e);
    }
    if (extractExceptionType != null) {
        //extract could not be loaded: record the exception and stop here
        try {
            writeExtractException(EXTRACT_EXCEPTION_TABLE, containerIdString, fps.getRelativeSourceFilePath().toString(), extractExceptionType);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return true;
    }
    //NOTE(review): if loadExtract returned null without throwing, the loop
    //below would NPE — presumably loadExtract throws or returns non-null; verify
    List<Integer> numAttachments = countAttachments(metadataList);
    int i = 0;
    for (Metadata m : metadataList) {
        //the first file should have the same id as the container id
        String fileId = (i == 0) ? containerIdString : Integer.toString(ID.incrementAndGet());
        writeProfileData(fps, i, m, fileId, containerIdString, numAttachments, PROFILE_TABLE);
        writeEmbeddedPathData(i, fileId, m, EMBEDDED_FILE_PATH_TABLE);
        writeExceptionData(fileId, m, EXCEPTION_TABLE);
        try {
            writeContentData(fileId, m, FIELD, CONTENTS_TABLE);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        i++;
    }
    return true;
}
Usage of org.apache.tika.eval.db.Cols in the Apache Tika project.
Source: class AbstractProfiler, method writeContentData.
/**
 * Tokenizes the content of a single file and writes one row of token
 * statistics (counts, entropy, length stats, language id, unicode blocks)
 * to the contents table.
 * <p>
 * Checks to see if metadata is null or content is empty (null or only whitespace).
 * If any of these, then this does no processing, and the fileId is not
 * entered into the content table.
 *
 * @param fileId        id to use for this row's {@link Cols#ID}
 * @param m             metadata containing the extracted content; may be null
 * @param fieldName     token-counter field under which to count this content
 * @param contentsTable table to write the row to
 * @throws IOException if tokenization fails
 */
protected void writeContentData(String fileId, Metadata m, String fieldName, TableInfo contentsTable) throws IOException {
    if (m == null) {
        return;
    }
    Map<Cols, String> data = new HashMap<>();
    String content = getContent(m, maxContentLength, data);
    if (content == null || content.trim().length() == 0) {
        return;
    }
    tokenCounter.clear(fieldName);
    tokenCounter.add(fieldName, content);
    data.put(Cols.ID, fileId);
    data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length()));
    langid(m, data);
    String langid = data.get(Cols.LANG_ID_1);
    langid = (langid == null) ? "" : langid;
    writeTokenCounts(data, fieldName, tokenCounter);
    CommonTokenResult commonTokenResult = null;
    try {
        commonTokenResult = commonTokenCountManager.countTokenOverlaps(langid, tokenCounter.getTokens(fieldName));
    } catch (IOException e) {
        LOG.error("{}", e.getMessage(), e);
    }
    //guard against the swallowed IOException above: commonTokenResult is null
    //in that case and dereferencing it would throw an NPE; write the row
    //without the common-token columns instead
    if (commonTokenResult != null) {
        data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
        data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens()));
        data.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getAlphabeticTokens()));
    }
    TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(fieldName);
    data.put(Cols.NUM_UNIQUE_TOKENS, Integer.toString(tokenStatistics.getTotalUniqueTokens()));
    data.put(Cols.NUM_TOKENS, Integer.toString(tokenStatistics.getTotalTokens()));
    data.put(Cols.TOKEN_ENTROPY_RATE, Double.toString(tokenStatistics.getEntropy()));
    SummaryStatistics summStats = tokenStatistics.getSummaryStatistics();
    data.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) summStats.getSum()));
    data.put(Cols.TOKEN_LENGTH_MEAN, Double.toString(summStats.getMean()));
    data.put(Cols.TOKEN_LENGTH_STD_DEV, Double.toString(summStats.getStandardDeviation()));
    unicodeBlocks(m, data);
    try {
        writer.writeRow(contentsTable, data);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Aggregations