Search in sources :

Example 71 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class SXSLFExtractorTest method testUnsupportedPowerPoint.

/**
 * Feeds the {@code testPPT.xps} and {@code testPPT.thmx} sample documents
 * through an {@link AutoDetectParser} and checks that each parse finishes
 * and that the content type recorded in the metadata is the expected one.
 * Nothing is asserted about the extracted text.
 */
@Test
public void testUnsupportedPowerPoint() throws Exception {
    // Parallel arrays: expectedTypes[n] is the content type expected for suffixes[n].
    // NOTE(review): the original source carried an "Is this right?" remark
    // next to these expected values - confirm them before relying on them.
    String[] suffixes = { "xps", "thmx" };
    String[] expectedTypes = {
        "application/vnd.ms-xpsdocument",
        "application/vnd.openxmlformats-officedocument"
    };

    for (int idx = 0; idx < suffixes.length; idx++) {
        String filename = "testPPT." + suffixes[idx];

        // A fresh parser, metadata and handler are used for every document.
        Parser autoParser = new AutoDetectParser();
        Metadata meta = new Metadata();
        meta.set(Metadata.RESOURCE_NAME_KEY, filename);
        ContentHandler sink = new BodyContentHandler();

        try (InputStream doc = getResourceAsStream("/test-documents/" + filename)) {
            autoParser.parse(doc, sink, meta, parseContext);

            // Only the detected type is verified; the body content is ignored.
            String detected = meta.get(Metadata.CONTENT_TYPE);
            assertEquals("Mime-type checking for " + filename, expectedTypes[idx], detected);
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 72 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class OOXMLParserTest method testMasterText.

/**
 * TIKA-712: text that lives on the master slide must show up in the
 * extracted content. Exercised here with a PPTX sample document.
 */
@Test
public void testMasterText() throws Exception {
    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();

    try (InputStream pptx = OOXMLParserTest.class.getResourceAsStream("/test-documents/testPPT_masterText.pptx")) {
        AutoDetectParser autoParser = new AutoDetectParser();
        autoParser.parse(pptx, body, meta, new ParseContext());
    }

    // The handler's string form is the extracted body text.
    assertContains("Text that I added to the master slide", body.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 73 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class OOXMLParserTest method testExcelXLSB.

/**
 * Checks the handling of the {@code testEXCEL.xlsb} sample in three steps:
 * the detector reports the binary macro-enabled sheet type, that type is
 * absent from OfficeParser's supported types but present in OOXMLParser's,
 * and a full parse through AutoDetectParser yields the expected text.
 */
@Test
public void testExcelXLSB() throws Exception {
    Metadata meta = new Metadata();
    meta.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
    Detector typeDetector = new DefaultDetector();
    AutoDetectParser autoParser = new AutoDetectParser();

    // Step 1: detection must identify the file correctly.
    MediaType detected;
    try (InputStream in = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
        detected = typeDetector.detect(in, meta);
        assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", detected.toString());
    }

    // Step 2: the type is not claimed by OfficeParser ...
    boolean officeClaimsIt = new OfficeParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, officeClaimsIt);
    // ... but it is claimed by OOXMLParser.
    boolean ooxmlClaimsIt = new OOXMLParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertTrue(ooxmlClaimsIt);

    // Step 3: a full parse via AutoDetectParser must succeed and produce text.
    try (InputStream in = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
        ParseContext ctx = new ParseContext();
        ctx.set(Locale.class, Locale.US);
        ContentHandler body = new BodyContentHandler(-1);
        autoParser.parse(in, body, meta, ctx);
        assertContains("This is an example spreadsheet", body.toString());
    }
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 74 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class OOXMLParserTest method testNoFormat.

/**
 * TIKA-1044: a Word document in which some of the text carries no
 * formatting or style information must still be parsed, and its text
 * must be present in the output.
 */
@Test
public void testNoFormat() throws Exception {
    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();

    try (InputStream docx = WordParserTest.class.getResourceAsStream("/test-documents/testWORD_no_format.docx")) {
        // OOXMLParser is invoked directly here rather than through auto-detection.
        OOXMLParser ooxml = new OOXMLParser();
        ooxml.parse(docx, body, meta, new ParseContext());
    }

    assertContains("This is a piece of text that causes an exception", body.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 75 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class OOXMLParserTest method testProtectedExcelSheets.

/**
 * TIKA-364: a workbook in which some, but not all, sheets are protected.
 * Verifies the reported content type and that the metadata carries the
 * "protected" flag after parsing.
 */
@Test
public void testProtectedExcelSheets() throws Exception {
    ParseContext ctx = new ParseContext();
    ContentHandler body = new BodyContentHandler();
    Metadata meta = new Metadata();
    Parser autoParser = new AutoDetectParser();

    try (InputStream xlsx = OOXMLParserTest.class.getResourceAsStream("/test-documents/protectedSheets.xlsx")) {
        autoParser.parse(xlsx, body, meta, ctx);

        String detectedType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", detectedType);

        String protectedFlag = meta.get(TikaMetadataKeys.PROTECTED);
        assertEquals("true", protectedFlag);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Aggregations

ContentHandler (org.xml.sax.ContentHandler): 354 · Metadata (org.apache.tika.metadata.Metadata): 229 · BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 229 · InputStream (java.io.InputStream): 210 · Test (org.junit.Test): 208 · ParseContext (org.apache.tika.parser.ParseContext): 164 · Parser (org.apache.tika.parser.Parser): 106 · TikaTest (org.apache.tika.TikaTest): 103 · AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 102 · TikaInputStream (org.apache.tika.io.TikaInputStream): 75 · ByteArrayInputStream (java.io.ByteArrayInputStream): 64 · SAXException (org.xml.sax.SAXException): 40 · IOException (java.io.IOException): 34 · TeeContentHandler (org.apache.tika.sax.TeeContentHandler): 28 · TikaException (org.apache.tika.exception.TikaException): 24 · ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 24 · WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 24 · XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 21 · AttributesImpl (org.xml.sax.helpers.AttributesImpl): 21 · InputSource (org.xml.sax.InputSource): 20