Search in sources :

Example 46 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class TestContainerAwareDetector method testOpenContainer.

/**
 * Checks the open-container slot of a TikaInputStream around detection of a
 * PowerPoint test document: it is expected to be null before the detector
 * runs and to hold an NPOIFSFileSystem afterwards.
 */
@Test
public void testOpenContainer() throws Exception {
    try (TikaInputStream tis = TikaInputStream.get(
            TestContainerAwareDetector.class.getResource("/test-documents/testPPT.ppt"))) {
        // Freshly created stream: nothing should be attached to it yet.
        final Object containerBefore = tis.getOpenContainer();
        assertNull(containerBefore);

        final MediaType expectedType = MediaType.parse("application/vnd.ms-powerpoint");
        final MediaType detectedType = detector.detect(tis, new Metadata());
        assertEquals(expectedType, detectedType);

        // After detection the stream is expected to carry the POI file system.
        final Object containerAfter = tis.getOpenContainer();
        assertTrue(containerAfter instanceof NPOIFSFileSystem);
    }
}
Also used : NPOIFSFileSystem(org.apache.poi.poifs.filesystem.NPOIFSFileSystem) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) Test(org.junit.Test)

Example 47 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class TestContainerAwareDetector method getTruncatedFile.

/**
 * Loads the first {@code n} bytes of a test document from the classpath and
 * wraps them in a TikaInputStream.
 *
 * @param name file name, resolved against {@code /test-documents/}
 * @param n    number of leading bytes to read
 * @return a TikaInputStream created from exactly the first {@code n} bytes
 * @throws IOException if the resource cannot be found on the classpath, or
 *                     if it ends before {@code n} bytes have been read
 */
private TikaInputStream getTruncatedFile(String name, int n) throws IOException {
    String resourcePath = "/test-documents/" + name;
    try (InputStream input = TestContainerAwareDetector.class.getResourceAsStream(resourcePath)) {
        // getResourceAsStream returns null for a missing resource; fail with a
        // descriptive IOException rather than a NullPointerException on read().
        if (input == null) {
            throw new IOException("Test resource not found: " + resourcePath);
        }
        byte[] bytes = new byte[n];
        int m = 0;
        // A single read() may return fewer bytes than requested, so keep
        // reading until the buffer is full.
        while (m < bytes.length) {
            int i = input.read(bytes, m, bytes.length - m);
            if (i == -1) {
                throw new IOException("Unexpected end of stream");
            }
            m += i;
        }
        return TikaInputStream.get(bytes);
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) IOException(java.io.IOException)

Example 48 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class TestContainerAwareDetector method assertRemovalTempfiles.

/**
 * Runs the detector over the given test document and asserts that the
 * temporary-file count afterwards equals the count taken beforehand.
 *
 * @param fileName file name, resolved against {@code /test-documents/}
 * @throws Exception if opening the resource or detection fails
 */
private void assertRemovalTempfiles(String fileName) throws Exception {
    final int tempFilesBefore = countTemporaryFiles();

    final String resourcePath = "/test-documents/" + fileName;
    try (TikaInputStream tis = TikaInputStream.get(
            TestContainerAwareDetector.class.getResource(resourcePath))) {
        // Only the side effects of detection matter here; the result is ignored.
        detector.detect(tis, new Metadata());
    }

    // Counted after the stream has been closed by try-with-resources.
    final int tempFilesAfter = countTemporaryFiles();
    assertEquals(tempFilesBefore, tempFilesAfter);
}
Also used : Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream)

Example 49 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class RFC822ParserTest method testGetAttachmentsAsEmbeddedResources.

/**
 * TIKA-1222: when requested, the attachments of a mail message must come
 * through as embedded resources. Extracts a multipart RFC 822 test message
 * and checks the recorded file names and media types of its three parts.
 */
@Test
public void testGetAttachmentsAsEmbeddedResources() throws Exception {
    TrackingHandler tracker = new TrackingHandler();
    ContainerExtractor ex = new ParserContainerExtractor();
    try (TikaInputStream tis = TikaInputStream.get(getStream("test-documents/testRFC822-multipart"))) {
        final boolean supported = ex.isSupported(tis);
        assertEquals(true, supported);
        // The extractor is also passed as its own recursion argument.
        ex.extract(tis, ex, tracker);
    }

    // All three parts must have been reported to the handler.
    final int expectedParts = 3;
    assertEquals(expectedParts, tracker.filenames.size());
    assertEquals(expectedParts, tracker.mediaTypes.size());

    // None of the parts carries a file name.
    for (int part = 0; part < expectedParts; part++) {
        assertEquals(null, tracker.filenames.get(part));
    }

    // Media types are reported, in this order.
    final MediaType[] expectedTypes = {
        MediaType.TEXT_PLAIN,
        MediaType.TEXT_HTML,
        MediaType.image("gif")
    };
    for (int part = 0; part < expectedTypes.length; part++) {
        assertEquals(expectedTypes[part], tracker.mediaTypes.get(part));
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) ContainerExtractor(org.apache.tika.extractor.ContainerExtractor) ParserContainerExtractor(org.apache.tika.extractor.ParserContainerExtractor) TesseractOCRParserTest(org.apache.tika.parser.ocr.TesseractOCRParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 50 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class SQLite3ParserTest method testParserContainerExtractor.

/**
 * Extracts the embedded documents of the SQLite test file through a
 * ParserContainerExtractor and checks the leading bytes of each captured
 * document.
 */
@Test
public void testParserContainerExtractor() throws Exception {
    // Original author's note: there should be 6 embedded documents:
    //   2x tables -- UTF-8 csv representations of the tables
    //   2x word files, one doc and one docx
    //   2x png files, the same image embedded in each of the doc and docx
    // NOTE(review): the assertion below expects 4 captured documents, not 6 --
    // confirm which of the two is the intended contract.
    ParserContainerExtractor ex = new ParserContainerExtractor();
    ByteCopyingHandler byteCopier = new ByteCopyingHandler();
    Metadata metadata = new Metadata();
    try (TikaInputStream is = TikaInputStream.get(getResourceAsStream(TEST_FILE1))) {
        // NOTE(review): metadata is populated but never handed to extract() -- confirm intent.
        metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
        ex.extract(is, ex, byteCopier);
    }

    final int expectedDocuments = 4;
    assertEquals(expectedDocuments, byteCopier.bytes.size());

    // Decode up to the first 1000 bytes of documents 1..n as UTF-8 text.
    // Document 0 is skipped here and compared byte-for-byte below instead.
    String[] previews = new String[expectedDocuments];
    for (int doc = 1; doc < byteCopier.bytes.size(); doc++) {
        byte[] content = byteCopier.bytes.get(doc);
        int previewLength = Math.min(content.length, 1000);
        previews[doc] = new String(content, 0, previewLength, UTF_8);
    }

    //test OLE: the first document must start with these ten bytes
    byte[] oleBytes = new byte[] {
        (byte) 0xD0, (byte) 0xCF, (byte) 0x11, (byte) 0xE0,
        (byte) 0xA1, (byte) 0xB1, (byte) 0x1A, (byte) 0xE1,
        (byte) 0x00, (byte) 0x00
    };
    for (int pos = 0; pos < oleBytes.length; pos++) {
        assertEquals(oleBytes[pos], byteCopier.bytes.get(0)[pos]);
    }

    assertContains("PNG", previews[1]);
    assertContains("PK", previews[2]);
    assertContains("PNG", previews[3]);
}
Also used : Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) ParserContainerExtractor(org.apache.tika.extractor.ParserContainerExtractor) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

TikaInputStream (org.apache.tika.io.TikaInputStream): 100, Metadata (org.apache.tika.metadata.Metadata): 40, TemporaryResources (org.apache.tika.io.TemporaryResources): 28, IOException (java.io.IOException): 27, TikaException (org.apache.tika.exception.TikaException): 24, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 23, Test (org.junit.Test): 20, InputStream (java.io.InputStream): 19, File (java.io.File): 15, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 15, ContentHandler (org.xml.sax.ContentHandler): 14, TikaTest (org.apache.tika.TikaTest): 13, MediaType (org.apache.tika.mime.MediaType): 13, SAXException (org.xml.sax.SAXException): 13, ParseContext (org.apache.tika.parser.ParseContext): 12, ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor): 8, CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream): 6, NPOIFSFileSystem (org.apache.poi.poifs.filesystem.NPOIFSFileSystem): 6, EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException): 6, AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 6