use of org.apache.tika.metadata.Metadata in project tika by apache.
the class DigestingParserTest method testMulti.
/**
 * Fills {@code tmp} with {@code fileLength} pseudo-random bytes, computes the
 * MD5, SHA1 and SHA512 entries via {@code addTruth}, and then runs
 * {@code checkMulti} against several combinations and orderings of those
 * algorithms.
 *
 * @param tmp                file that is (re)written with the random payload
 * @param fileLength         number of bytes to write to {@code tmp}
 * @param markLimit          forwarded unchanged to {@code checkMulti}
 * @param useTikaInputStream forwarded unchanged to {@code checkMulti}
 * @throws IOException if the payload cannot be written
 */
private void testMulti(Path tmp, int fileLength, int markLimit, boolean useTikaInputStream) throws IOException {
    // TRUNCATE_EXISTING: with CREATE alone an already-existing file keeps any bytes
    // beyond what is written here, so its size could exceed fileLength.
    // try-with-resources: the stream is flushed and closed even if a write fails.
    try (OutputStream os = new BufferedOutputStream(
            Files.newOutputStream(tmp, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
        for (int i = 0; i < fileLength; i++) {
            // OutputStream.write(int) only emits the low-order byte of the value
            os.write(random.nextInt());
        }
    }

    Metadata truth = new Metadata();
    addTruth(tmp, CommonsDigester.DigestAlgorithm.MD5, truth);
    addTruth(tmp, CommonsDigester.DigestAlgorithm.SHA1, truth);
    addTruth(tmp, CommonsDigester.DigestAlgorithm.SHA512, truth);

    // Same truth, different subsets and orderings of the requested algorithms
    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
            CommonsDigester.DigestAlgorithm.SHA512,
            CommonsDigester.DigestAlgorithm.SHA1,
            CommonsDigester.DigestAlgorithm.MD5);
    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
            CommonsDigester.DigestAlgorithm.MD5,
            CommonsDigester.DigestAlgorithm.SHA1);
    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
            CommonsDigester.DigestAlgorithm.SHA1,
            CommonsDigester.DigestAlgorithm.SHA512,
            CommonsDigester.DigestAlgorithm.MD5);
    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
            CommonsDigester.DigestAlgorithm.SHA1);
    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
            CommonsDigester.DigestAlgorithm.MD5);
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class DigestingParserTest method testReset.
/**
 * Parses {@code test_recursive_embedded.docx} through a {@link DigestingParser}
 * whose {@link CommonsDigester} is constructed with a first argument of 100
 * (presumably the mark limit -- confirm against CommonsDigester) and checks
 * that the MD5 stored in the metadata matches the known value.
 */
@Test
public void testReset() throws Exception {
    String expectedMD5 = "59f626e09a8c16ab6dbc2800c685f772";
    Metadata m = new Metadata();
    // Only the metadata populated as a side effect of the parse is inspected;
    // the returned XML result is intentionally not kept.
    getXML("test_recursive_embedded.docx",
            new DigestingParser(p, new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5)), m);
    assertEquals(expectedMD5, m.get(P + "MD5"));
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class DigestingParserTest method testBasic.
/**
 * Checks the digests recorded for {@code test_recursive_embedded.docx}:
 * first with one {@link CommonsDigester} per algorithm, then with a single
 * digester configured from a comma-separated algorithm list.
 */
@Test
public void testBasic() throws Exception {
    Map<CommonsDigester.DigestAlgorithm, String> expected = new HashMap<>();
    expected.put(CommonsDigester.DigestAlgorithm.MD2, "d768c8e27b0b52c6eaabfaa7122d1d4f");
    expected.put(CommonsDigester.DigestAlgorithm.MD5, "59f626e09a8c16ab6dbc2800c685f772");
    expected.put(CommonsDigester.DigestAlgorithm.SHA1, "7a1f001d163ac90d8ea54c050faf5a38079788a6");
    expected.put(CommonsDigester.DigestAlgorithm.SHA256,
            "c4b7fab030a8b6a9d6691f6699ac8e6f" + "82bc53764a0f1430d134ae3b70c32654");
    expected.put(CommonsDigester.DigestAlgorithm.SHA384,
            "ebe368b9326fef44408290724d187553" + "8b8a6923fdf251ddab72c6e4b5d54160"
                    + "9db917ba4260d1767995a844d8d654df");
    expected.put(CommonsDigester.DigestAlgorithm.SHA512,
            "ee46d973ee1852c018580c242955974d" + "da4c21f36b54d7acd06fcf68e974663b"
                    + "fed1d256875be58d22beacf178154cc3" + "a1178cb73443deaa53aa0840324708bb");

    // Each algorithm on its own. The parse result is not kept: only the
    // metadata filled in during the parse is asserted on.
    for (CommonsDigester.DigestAlgorithm algo : CommonsDigester.DigestAlgorithm.values()) {
        Metadata single = new Metadata();
        getXML("test_recursive_embedded.docx",
                new DigestingParser(p, new CommonsDigester(UNLIMITED, algo)), single);
        String algoName = algo.toString();
        assertEquals(algoName, expected.get(algo), single.get(P + algoName));
    }

    // Comma-separated configuration: only the listed algorithms may appear
    CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse("md5,sha256,sha384,sha512");
    Metadata m = new Metadata();
    getXML("test_recursive_embedded.docx",
            new DigestingParser(p, new CommonsDigester(UNLIMITED, algos)), m);

    CommonsDigester.DigestAlgorithm[] requested = {
            CommonsDigester.DigestAlgorithm.MD5,
            CommonsDigester.DigestAlgorithm.SHA256,
            CommonsDigester.DigestAlgorithm.SHA384,
            CommonsDigester.DigestAlgorithm.SHA512
    };
    for (CommonsDigester.DigestAlgorithm algo : requested) {
        String algoName = algo.toString();
        assertEquals(algoName, expected.get(algo), m.get(P + algoName));
    }

    // Algorithms that were not requested must not have been recorded
    assertNull(m.get(P + CommonsDigester.DigestAlgorithm.MD2.toString()));
    assertNull(m.get(P + CommonsDigester.DigestAlgorithm.SHA1.toString()));
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class ParsingReaderTest method testMetadata.
/**
 * Test case for TIKA-203: the title must be readable from the metadata as
 * soon as the {@link ParsingReader} has been constructed, and the reader
 * must still deliver the document text from its first character.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-203">TIKA-203</a>
 */
@Test
public void testMetadata() throws Exception {
    InputStream stream = ParsingReaderTest.class.getResourceAsStream("/test-documents/testEXCEL.xls");
    Metadata metadata = new Metadata();
    try (Reader reader = new ParsingReader(new AutoDetectParser(), stream, metadata, new ParseContext())) {
        // Metadata should already be available
        assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));

        // Check that the internal buffering isn't broken: the first six
        // characters read must be exactly these, in this order
        for (char expectedChar : "Feuil1".toCharArray()) {
            assertEquals(expectedChar, (char) reader.read());
        }
    }
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class TikaEncodingDetectorTest method testEncodingDetectorConfigurability.
/**
 * Loads the {@code TIKA-2273-no-icu4j-encoding-detector.xml} configuration and
 * verifies that parsing {@code english.cp500.txt} fails with a
 * {@link TikaException} whose message contains "Failed to detect", both
 * through {@link AutoDetectParser} and through the {@link Tika} facade.
 */
@Test
public void testEncodingDetectorConfigurability() throws Exception {
    TikaConfig tikaConfig = new TikaConfig(
            getResourceAsStream("/org/apache/tika/config/TIKA-2273-no-icu4j-encoding-detector.xml"));

    // Parser path: the parse itself is expected to throw, so its result is not kept
    AutoDetectParser p = new AutoDetectParser(tikaConfig);
    try {
        getXML("english.cp500.txt", p);
        fail("can't detect w/out ICU");
    } catch (TikaException e) {
        assertContains("Failed to detect", e.getMessage());
    }

    // Facade path: same expectation when going through Tika.parseToString
    Tika tika = new Tika(tikaConfig);
    try {
        tika.parseToString(getResourceAsFile("/test-documents/english.cp500.txt"));
        fail("can't detect w/out ICU");
    } catch (TikaException e) {
        assertContains("Failed to detect", e.getMessage());
    }
}
Aggregations