Search in sources :

Example 96 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class OOXMLParserTest method testNoFormat.

/**
 * Regression test for TIKA-1044: a Word document in which parts of the
 * text have no formatting or styles applied must still be parsed, and
 * the expected sentence must show up in the extracted body text.
 */
@Test
public void testNoFormat() throws Exception {
    final OOXMLParser ooxmlParser = new OOXMLParser();
    final ContentHandler bodyHandler = new BodyContentHandler();
    final Metadata docMetadata = new Metadata();
    final ParseContext parseContext = new ParseContext();

    try (InputStream docStream = WordParserTest.class.getResourceAsStream("/test-documents/testWORD_no_format.docx")) {
        ooxmlParser.parse(docStream, bodyHandler, docMetadata, parseContext);
    }

    // The handler's string form is the text collected during the parse.
    assertContains("This is a piece of text that causes an exception", bodyHandler.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 97 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class OOXMLParserTest method testProtectedExcelSheets.

/**
 * Checks a workbook in which some, but not all, sheets are protected
 * (see TIKA-364): the reported content type must be the OOXML
 * spreadsheet type and the {@code PROTECTED} metadata entry must be
 * {@code "true"}.
 */
@Test
public void testProtectedExcelSheets() throws Exception {
    final String expectedType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    final Parser autoParser = new AutoDetectParser();
    final Metadata docMetadata = new Metadata();

    try (InputStream workbook = OOXMLParserTest.class.getResourceAsStream("/test-documents/protectedSheets.xlsx")) {
        autoParser.parse(workbook, new BodyContentHandler(), docMetadata, new ParseContext());

        // Assertions are made while the stream is still open, as before.
        final String detectedType = docMetadata.get(Metadata.CONTENT_TYPE);
        assertEquals(expectedType, detectedType);

        final String protectedFlag = docMetadata.get(TikaMetadataKeys.PROTECTED);
        assertEquals("true", protectedFlag);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 98 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class OOXMLParserTest method testPowerPointMetadataEarly.

/**
 * Test that the metadata is already extracted when the body is processed.
 * See TIKA-1109.
 *
 * <p>Every PowerPoint test file is parsed with a handler whose
 * {@code startDocument()} callback asserts the expected content type,
 * title, creator and author on the same {@link Metadata} instance that is
 * handed to the parser.
 */
@Test
public void testPowerPointMetadataEarly() throws Exception {
    // extensions[n] is paired with mimeTypes[n].
    final String[] extensions = { "pptx", "pptm", "ppsm", "ppsx", "potm" };
    final String[] mimeTypes = {
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.ms-powerpoint.presentation.macroenabled.12",
        "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
        "application/vnd.ms-powerpoint.template.macroenabled.12"
    };
    for (int i = 0; i < extensions.length; i++) {
        final String filename = "testPPT." + extensions[i];
        // Captured directly by the anonymous class below, so no final copy
        // of the loop index is needed.
        final String expectedMimeType = mimeTypes[i];
        final Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler() {

            // @Override makes the compiler reject this method if it ever stops
            // matching the superclass callback; without it the assertions would
            // silently never run. As in the original, super is not called.
            @Override
            public void startDocument() {
                assertEquals("Mime-type checking for " + filename, expectedMimeType, metadata.get(Metadata.CONTENT_TYPE));
                assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
                assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
                assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
            }
        };
        ParseContext context = new ParseContext();
        try (InputStream input = getTestDocument(filename)) {
            parser.parse(input, handler, metadata, context);
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 99 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class OOXMLParserTest method testUnsupportedPowerPoint.

/**
 * For the PowerPoint formats we don't currently support, make sure that
 * parsing does not break. Only the detected content type is checked.
 */
@Test
public void testUnsupportedPowerPoint() throws Exception {
    // Each row is { file extension, expected content type }.
    final String[][] cases = {
        { "xps", "application/vnd.ms-xpsdocument" },
        // Is this right? (open question carried over from the original;
        // it is unclear which entry it refers to)
        { "thmx", "application/vnd.openxmlformats-officedocument" }
    };
    for (String[] testCase : cases) {
        final String filename = "testPPT." + testCase[0];
        final String expectedMimeType = testCase[1];

        final Parser autoParser = new AutoDetectParser();
        final Metadata docMetadata = new Metadata();
        docMetadata.set(Metadata.RESOURCE_NAME_KEY, filename);

        try (InputStream document = getTestDocument(filename)) {
            autoParser.parse(document, new BodyContentHandler(), docMetadata, new ParseContext());
            // Should get the metadata, but that's about it
            final String detectedType = docMetadata.get(Metadata.CONTENT_TYPE);
            assertEquals("Mime-type checking for " + filename, expectedMimeType, detectedType);
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 100 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class OOXMLParserTest method testEmbeddedPDF.

/**
 * TIKA-989: parses a Word document with embedded PDFs into XML and checks
 * that the surrounding text and the two embedded-object placeholders are
 * all present and appear in the expected order.
 */
@Test
public void testEmbeddedPDF() throws Exception {
    // Fragments expected in the serialized output, in document order.
    final String[] markers = {
        "Here is the pdf file:",
        "<div class=\"embedded\" id=\"rId5\"/>",
        "Bye Bye",
        "<div class=\"embedded\" id=\"rId6\"/>",
        "Bye for real."
    };

    final StringWriter xmlOut = new StringWriter();
    final Metadata docMetadata = new Metadata();

    // SAX events from the parser are serialized as non-indented XML.
    final SAXTransformerFactory saxFactory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    final TransformerHandler xmlHandler = saxFactory.newTransformerHandler();
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
    xmlHandler.setResult(new StreamResult(xmlOut));

    try (InputStream docStream = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD_embedded_pdf.docx")) {
        new OOXMLParser().parse(docStream, xmlHandler, docMetadata, new ParseContext());
    }

    final String xml = xmlOut.toString();
    final int[] positions = new int[markers.length];
    for (int n = 0; n < markers.length; n++) {
        positions[n] = xml.indexOf(markers[n]);
    }

    // First every fragment must be present ...
    for (int position : positions) {
        assertTrue(position != -1);
    }
    // ... then each one must come strictly after the previous one.
    for (int n = 1; n < positions.length; n++) {
        assertTrue(positions[n - 1] < positions[n]);
    }
}
Also used : TransformerHandler(javax.xml.transform.sax.TransformerHandler) StringWriter(java.io.StringWriter) StreamResult(javax.xml.transform.stream.StreamResult) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) SAXTransformerFactory(javax.xml.transform.sax.SAXTransformerFactory) ParseContext(org.apache.tika.parser.ParseContext) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Aggregations

ParseContext (org.apache.tika.parser.ParseContext) 336, Metadata (org.apache.tika.metadata.Metadata) 281, Test (org.junit.Test) 260, InputStream (java.io.InputStream) 195, BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 195, TikaTest (org.apache.tika.TikaTest) 186, ContentHandler (org.xml.sax.ContentHandler) 163, AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 117, Parser (org.apache.tika.parser.Parser) 107, ByteArrayInputStream (java.io.ByteArrayInputStream) 91, TikaInputStream (org.apache.tika.io.TikaInputStream) 77, DefaultHandler (org.xml.sax.helpers.DefaultHandler) 52, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest) 31, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest) 31, TikaException (org.apache.tika.exception.TikaException) 29, StringWriter (java.io.StringWriter) 26, IOException (java.io.IOException) 24, SAXException (org.xml.sax.SAXException) 24, CompositeParser (org.apache.tika.parser.CompositeParser) 22, FileInputStream (java.io.FileInputStream) 19