Search in sources :

Example 56 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class OutlookParserTest method testOutlookForwarded.

/**
 * Parses the forwarded-message sample through an XML-serializing handler and
 * checks that the serialized output splits into exactly two pieces around
 * both the opening and the closing body tag.
 */
@Test
public void testOutlookForwarded() throws Exception {
    // Capture the SAX events emitted by the parser as indented XML text
    StringWriter xmlOutput = new StringWriter();
    SAXTransformerFactory transformerFactory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    TransformerHandler xmlHandler = transformerFactory.newTransformerHandler();
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    xmlHandler.setResult(new StreamResult(xmlOutput));

    Parser autoDetect = new AutoDetectParser();
    Metadata parsedMetadata = new Metadata();
    try (InputStream msgStream = OutlookParserTest.class.getResourceAsStream("/test-documents/testMSG_forwarded.msg")) {
        autoDetect.parse(msgStream, xmlHandler, parsedMetadata, new ParseContext());
    }

    // Make sure we don't have nested docs: each body tag must split the
    // output into exactly two pieces
    String serialized = xmlOutput.toString();
    int piecesAroundOpenTag = serialized.split("<body>").length;
    int piecesAroundCloseTag = serialized.split("<\\/body>").length;
    assertEquals(2, piecesAroundOpenTag);
    assertEquals(2, piecesAroundCloseTag);
}
Also used : TransformerHandler(javax.xml.transform.sax.TransformerHandler) StringWriter(java.io.StringWriter) StreamResult(javax.xml.transform.stream.StreamResult) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) SAXTransformerFactory(javax.xml.transform.sax.SAXTransformerFactory) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 57 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class OutlookParserTest method testOutlookNew.

/**
 * Test case for TIKA-395, to ensure the parser works for new Outlook formats.
 * Checks the detected content type, the title, a few phrases of the extracted
 * body text, and the recipient metadata fields.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
 */
@Test
public void testOutlookNew() throws Exception {
    Metadata parsedMetadata = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    Parser autoDetect = new AutoDetectParser();
    try (InputStream msgStream = OutlookParserTest.class.getResourceAsStream("/test-documents/test-outlook2003.msg")) {
        autoDetect.parse(msgStream, bodyHandler, parsedMetadata, new ParseContext());
    }

    // Detected type and title
    String detectedType = parsedMetadata.get(Metadata.CONTENT_TYPE);
    assertEquals("application/vnd.ms-outlook", detectedType);
    String title = parsedMetadata.get(TikaCoreProperties.TITLE);
    assertEquals("Welcome to Microsoft Office Outlook 2003", title);

    // Extracted body text
    String bodyText = bodyHandler.toString();
    assertContains("Outlook 2003", bodyText);
    assertContains("Streamlined Mail Experience", bodyText);
    assertContains("Navigation Pane", bodyText);

    // Make sure the recipient fields are parallel: the e-mail value is an
    // empty string while both name fields carry the recipient name
    String recipientEmail = parsedMetadata.get(Message.MESSAGE_TO_EMAIL);
    String recipientName = parsedMetadata.get(Message.MESSAGE_TO_NAME);
    String recipientDisplayName = parsedMetadata.get(Message.MESSAGE_TO_DISPLAY_NAME);
    assertEquals("", recipientEmail);
    assertEquals("New Outlook User", recipientName);
    assertEquals("New Outlook User", recipientDisplayName);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 58 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class SXWPFExtractorTest method testMacrosInDocm.

/**
 * Exercises macro extraction for {@code testWORD_macros.docm} in three
 * configurations: the shared {@code parseContext} (macros must not appear),
 * a programmatic {@link OfficeParserConfig} with macro extraction enabled,
 * and a parser built from {@code tika-config-sax-macros.xml}.
 * Every run must also report the expected {@code X-Parsed-By} value.
 */
@Test
public void testMacrosInDocm() throws Exception {
    Metadata parsedBy = new Metadata();
    parsedBy.add("X-Parsed-By", "org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor");

    //test default is "don't extract macros"
    List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm", parseContext);
    for (Metadata metadata : metadataList) {
        // Constant-first comparison: an entry without a content type must
        // not abort the test with a NullPointerException
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    assertContainsAtLeast(parsedBy, metadataList);

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    officeParserConfig.setUseSAXDocxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    metadataList = getRecursiveMetadata("testWORD_macros.docm", context);

    //check that content came out of the .docm file
    assertContains("quick", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
    assertContainsAtLeast(parsedBy, metadataList);

    // Minimum metadata expected on the extracted macro entry
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, metadataList);

    //test configuring via config file
    TikaConfig tikaConfig;
    // Close the config stream once the configuration has been built;
    // fully qualified so no additional import is required
    try (java.io.InputStream configStream = this.getClass().getResourceAsStream("tika-config-sax-macros.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    metadataList = getRecursiveMetadata("testWORD_macros.docm", parser);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 59 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class SXWPFExtractorTest method setUp.

/**
 * Rebuilds the shared {@code parseContext} before each test with an
 * {@link OfficeParserConfig} that has the SAX docx extractor switched on.
 */
@Before
public void setUp() {
    OfficeParserConfig saxDocxConfig = new OfficeParserConfig();
    saxDocxConfig.setUseSAXDocxExtractor(true);

    parseContext = new ParseContext();
    parseContext.set(OfficeParserConfig.class, saxDocxConfig);
}
Also used : ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) Before(org.junit.Before)

Example 60 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class Word2006MLParserTest method testSkipDeletedAndMoveFrom.

/**
 * Parses {@code testWORD_2006ml.xml} with deleted content and move-from
 * content both included, then checks that "frog" is present and that
 * "Second paragraph" occurs twice in the resulting XML.
 */
@Test
public void testSkipDeletedAndMoveFrom() throws Exception {
    // NOTE(review): the method name says "skip", yet both include-flags are
    // enabled below — confirm the name matches the intended scenario
    OfficeParserConfig includeRevisionsConfig = new OfficeParserConfig();
    includeRevisionsConfig.setIncludeDeletedContent(true);
    includeRevisionsConfig.setIncludeMoveFromContent(true);

    ParseContext context = new ParseContext();
    context.set(OfficeParserConfig.class, includeRevisionsConfig);

    XMLResult parsed = getXML("testWORD_2006ml.xml", context);
    assertContains("frog", parsed.xml);
    assertContainsCount("Second paragraph", parsed.xml, 2);
}
Also used : ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

ParseContext (org.apache.tika.parser.ParseContext)338 Metadata (org.apache.tika.metadata.Metadata)283 Test (org.junit.Test)260 InputStream (java.io.InputStream)195 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)195 TikaTest (org.apache.tika.TikaTest)186 ContentHandler (org.xml.sax.ContentHandler)164 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)118 Parser (org.apache.tika.parser.Parser)109 ByteArrayInputStream (java.io.ByteArrayInputStream)92 TikaInputStream (org.apache.tika.io.TikaInputStream)77 DefaultHandler (org.xml.sax.helpers.DefaultHandler)52 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)31 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)31 TikaException (org.apache.tika.exception.TikaException)30 StringWriter (java.io.StringWriter)26 IOException (java.io.IOException)25 SAXException (org.xml.sax.SAXException)25 CompositeParser (org.apache.tika.parser.CompositeParser)22 TeeContentHandler (org.apache.tika.sax.TeeContentHandler)20