Search in sources :

Example 21 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class DigestingParserTest method testMulti.

/**
 * Fills {@code tmp} with {@code fileLength} pseudo-random bytes, records reference digests
 * via {@code addTruth}, and then runs {@code checkMulti} for several algorithm combinations.
 *
 * @param tmp scratch file that this method overwrites
 * @param fileLength number of bytes written to {@code tmp}
 * @param markLimit forwarded unchanged to {@code checkMulti}
 * @param useTikaInputStream forwarded unchanged to {@code checkMulti}
 * @throws IOException if the scratch file cannot be written (the helpers called here
 *         presumably may raise it as well; they are not visible in this excerpt)
 */
private void testMulti(Path tmp, int fileLength, int markLimit, boolean useTikaInputStream) throws IOException {
    // TRUNCATE_EXISTING matters when tmp already exists and is longer than fileLength:
    // with CREATE alone the old tail would survive and the file would not be fileLength bytes.
    // try-with-resources flushes and closes the stream even if a write fails.
    try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(tmp, StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING))) {
        for (int i = 0; i < fileLength; i++) {
            // OutputStream.write(int) keeps only the low-order eight bits of the value
            os.write(random.nextInt());
        }
    }
    // Reference digests are computed from the file that was just written
    Metadata truth = new Metadata();
    addTruth(tmp, CommonsDigester.DigestAlgorithm.MD5, truth);
    addTruth(tmp, CommonsDigester.DigestAlgorithm.SHA1, truth);
    addTruth(tmp, CommonsDigester.DigestAlgorithm.SHA512, truth);
    // Exercise different subsets and orderings of the algorithms
    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream, CommonsDigester.DigestAlgorithm.SHA512, CommonsDigester.DigestAlgorithm.SHA1, CommonsDigester.DigestAlgorithm.MD5);
    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream, CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA1);
    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream, CommonsDigester.DigestAlgorithm.SHA1, CommonsDigester.DigestAlgorithm.SHA512, CommonsDigester.DigestAlgorithm.MD5);
    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream, CommonsDigester.DigestAlgorithm.SHA1);
    checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream, CommonsDigester.DigestAlgorithm.MD5);
}
Also used : OutputStream(java.io.OutputStream) BufferedOutputStream(java.io.BufferedOutputStream) Metadata(org.apache.tika.metadata.Metadata)

Example 22 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class DigestingParserTest method testReset.

/**
 * Parses {@code test_recursive_embedded.docx} through a {@code DigestingParser} whose
 * {@code CommonsDigester} is built with a first argument of 100, and checks the MD5 value
 * stored in the metadata.
 */
@Test
public void testReset() throws Exception {
    CommonsDigester md5Digester = new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5);
    DigestingParser digestingParser = new DigestingParser(p, md5Digester);
    Metadata metadata = new Metadata();
    // Only the metadata side effect is inspected; the returned XML result is not needed
    getXML("test_recursive_embedded.docx", digestingParser, metadata);
    String actualMD5 = metadata.get(P + "MD5");
    assertEquals("59f626e09a8c16ab6dbc2800c685f772", actualMD5);
}
Also used : Metadata(org.apache.tika.metadata.Metadata) CommonsDigester(org.apache.tika.parser.utils.CommonsDigester) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 23 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class DigestingParserTest method testBasic.

/**
 * Checks the digests recorded for {@code test_recursive_embedded.docx}: first with each
 * {@code DigestAlgorithm} on its own, then with a selection obtained from
 * {@code CommonsDigester.parse} on a comma-separated string.
 */
@Test
public void testBasic() throws Exception {
    final String docName = "test_recursive_embedded.docx";
    Map<CommonsDigester.DigestAlgorithm, String> digests = new HashMap<>();
    digests.put(CommonsDigester.DigestAlgorithm.MD2, "d768c8e27b0b52c6eaabfaa7122d1d4f");
    digests.put(CommonsDigester.DigestAlgorithm.MD5, "59f626e09a8c16ab6dbc2800c685f772");
    digests.put(CommonsDigester.DigestAlgorithm.SHA1, "7a1f001d163ac90d8ea54c050faf5a38079788a6");
    digests.put(CommonsDigester.DigestAlgorithm.SHA256, "c4b7fab030a8b6a9d6691f6699ac8e6f" + "82bc53764a0f1430d134ae3b70c32654");
    digests.put(CommonsDigester.DigestAlgorithm.SHA384, "ebe368b9326fef44408290724d187553" + "8b8a6923fdf251ddab72c6e4b5d54160" + "9db917ba4260d1767995a844d8d654df");
    digests.put(CommonsDigester.DigestAlgorithm.SHA512, "ee46d973ee1852c018580c242955974d" + "da4c21f36b54d7acd06fcf68e974663b" + "fed1d256875be58d22beacf178154cc3" + "a1178cb73443deaa53aa0840324708bb");

    // Pass 1: one algorithm per parse, each with a fresh Metadata
    for (CommonsDigester.DigestAlgorithm algorithm : CommonsDigester.DigestAlgorithm.values()) {
        Metadata single = new Metadata();
        getXML(docName, new DigestingParser(p, new CommonsDigester(UNLIMITED, algorithm)), single);
        String label = algorithm.toString();
        assertEquals(label, digests.get(algorithm), single.get(P + label));
    }

    // Pass 2: several algorithms requested through one comma-separated string
    CommonsDigester.DigestAlgorithm[] requested = CommonsDigester.parse("md5,sha256,sha384,sha512");
    Metadata combined = new Metadata();
    getXML(docName, new DigestingParser(p, new CommonsDigester(UNLIMITED, requested)), combined);
    CommonsDigester.DigestAlgorithm[] shouldBePresent = {
        CommonsDigester.DigestAlgorithm.MD5,
        CommonsDigester.DigestAlgorithm.SHA256,
        CommonsDigester.DigestAlgorithm.SHA384,
        CommonsDigester.DigestAlgorithm.SHA512
    };
    for (CommonsDigester.DigestAlgorithm algorithm : shouldBePresent) {
        String label = algorithm.toString();
        assertEquals(label, digests.get(algorithm), combined.get(P + label));
    }
    // Algorithms that were not in the comma-separated list must leave no entry behind
    assertNull(combined.get(P + CommonsDigester.DigestAlgorithm.MD2.toString()));
    assertNull(combined.get(P + CommonsDigester.DigestAlgorithm.SHA1.toString()));
}
Also used : HashMap(java.util.HashMap) CommonsDigester(org.apache.tika.parser.utils.CommonsDigester) Metadata(org.apache.tika.metadata.Metadata) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 24 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class ParsingReaderTest method testMetadata.

/**
 * Regression test for TIKA-203: after a {@code ParsingReader} has been constructed over
 * {@code testEXCEL.xls}, the title is expected in the supplied metadata before any
 * character has been read, and the reader must then yield the text {@code Feuil1}.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-203">TIKA-203</a>
 */
@Test
public void testMetadata() throws Exception {
    InputStream stream = ParsingReaderTest.class.getResourceAsStream("/test-documents/testEXCEL.xls");
    Metadata metadata = new Metadata();
    try (Reader reader = new ParsingReader(new AutoDetectParser(), stream, metadata, new ParseContext())) {
        // The title must be readable before anything is pulled from the reader
        String title = metadata.get(TikaCoreProperties.TITLE);
        assertEquals("Simple Excel document", title);
        // Read character by character to verify the reader's buffering still delivers the text
        for (char expected : "Feuil1".toCharArray()) {
            assertEquals(expected, (char) reader.read());
        }
    }
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) Reader(java.io.Reader) Test(org.junit.Test)

Example 25 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class TikaEncodingDetectorTest method testEncodingDetectorConfigurability.

/**
 * Loads the {@code TIKA-2273-no-icu4j-encoding-detector.xml} configuration and expects a
 * {@code TikaException} containing "Failed to detect" when {@code english.cp500.txt} is
 * parsed, both through an {@code AutoDetectParser} and through the {@code Tika} facade.
 */
@Test
public void testEncodingDetectorConfigurability() throws Exception {
    TikaConfig config = new TikaConfig(getResourceAsStream("/org/apache/tika/config/TIKA-2273-no-icu4j-encoding-detector.xml"));

    // Route 1: AutoDetectParser built from the restricted configuration
    AutoDetectParser parser = new AutoDetectParser(config);
    try {
        Metadata unused = getXML("english.cp500.txt", parser).metadata;
        fail("can't detect w/out ICU");
    } catch (TikaException expected) {
        assertContains("Failed to detect", expected.getMessage());
    }

    // Route 2: the Tika facade built from the same configuration
    Tika facade = new Tika(config);
    try {
        String unused = facade.parseToString(getResourceAsFile("/test-documents/english.cp500.txt"));
        fail("can't detect w/out ICU");
    } catch (TikaException expected) {
        assertContains("Failed to detect", expected.getMessage());
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Tika(org.apache.tika.Tika) Test(org.junit.Test)

Aggregations

Metadata (org.apache.tika.metadata.Metadata)643 Test (org.junit.Test)467 InputStream (java.io.InputStream)318 ParseContext (org.apache.tika.parser.ParseContext)281 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)268 TikaTest (org.apache.tika.TikaTest)257 ContentHandler (org.xml.sax.ContentHandler)228 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)151 ByteArrayInputStream (java.io.ByteArrayInputStream)141 Parser (org.apache.tika.parser.Parser)134 TikaInputStream (org.apache.tika.io.TikaInputStream)131 IOException (java.io.IOException)62 DefaultHandler (org.xml.sax.helpers.DefaultHandler)59 TikaException (org.apache.tika.exception.TikaException)46 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)36 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)36 StringWriter (java.io.StringWriter)33 Tika (org.apache.tika.Tika)28 FileInputStream (java.io.FileInputStream)27 MediaType (org.apache.tika.mime.MediaType)27