use of org.apache.tika.parser.ParseContext in project tika by apache.
the class OutlookParserTest method testOutlookForwarded.
/**
 * Parses a forwarded Outlook message, serializes the resulting SAX events as
 * indented XML, and checks that the output splits into exactly two pieces
 * around both the opening and the closing body tag.
 */
@Test
public void testOutlookForwarded() throws Exception {
    // Check the HTML version: route the SAX events into an in-memory XML serializer
    StringWriter xmlBuffer = new StringWriter();
    SAXTransformerFactory saxFactory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    TransformerHandler xmlHandler = saxFactory.newTransformerHandler();
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    xmlHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    xmlHandler.setResult(new StreamResult(xmlBuffer));

    Parser autoParser = new AutoDetectParser();
    Metadata msgMetadata = new Metadata();
    try (InputStream msg = OutlookParserTest.class.getResourceAsStream("/test-documents/testMSG_forwarded.msg")) {
        autoParser.parse(msg, xmlHandler, msgMetadata, new ParseContext());
    }

    // Make sure we don't have nested docs
    String xml = xmlBuffer.toString();
    String[] aroundOpenTag = xml.split("<body>");
    assertEquals(2, aroundOpenTag.length);
    String[] aroundCloseTag = xml.split("<\\/body>");
    assertEquals(2, aroundCloseTag.length);
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class OutlookParserTest method testOutlookNew.
/**
 * Test case for TIKA-395, to ensure parser works for new Outlook formats.
 * Checks the detected content type, the title, a few phrases of the body
 * text, and the recipient metadata fields.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
 */
@Test
public void testOutlookNew() throws Exception {
    Metadata msgMetadata = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    Parser autoParser = new AutoDetectParser();
    try (InputStream msg = OutlookParserTest.class.getResourceAsStream("/test-documents/test-outlook2003.msg")) {
        autoParser.parse(msg, bodyHandler, msgMetadata, new ParseContext());
    }

    assertEquals("application/vnd.ms-outlook", msgMetadata.get(Metadata.CONTENT_TYPE));
    assertEquals("Welcome to Microsoft Office Outlook 2003", msgMetadata.get(TikaCoreProperties.TITLE));

    // Phrases that must appear in the extracted body text
    String bodyText = bodyHandler.toString();
    String[] expectedPhrases = {"Outlook 2003", "Streamlined Mail Experience", "Navigation Pane"};
    for (String phrase : expectedPhrases) {
        assertContains(phrase, bodyText);
    }

    //make sure these are parallel
    assertEquals("", msgMetadata.get(Message.MESSAGE_TO_EMAIL));
    assertEquals("New Outlook User", msgMetadata.get(Message.MESSAGE_TO_NAME));
    assertEquals("New Outlook User", msgMetadata.get(Message.MESSAGE_TO_DISPLAY_NAME));
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class SXWPFExtractorTest method testMacrosInDocm.
/**
 * Exercises macro handling for a .docm file in three configurations:
 * <ol>
 *   <li>the shared {@code parseContext}, where no {@code text/x-vbasic}
 *       entry may appear in the results;</li>
 *   <li>a {@link ParseContext} whose {@link OfficeParserConfig} enables macro
 *       extraction and the SAX docx extractor;</li>
 *   <li>a parser built from the {@code tika-config-sax-macros.xml} config file.</li>
 * </ol>
 * Every run must also report the event-based Word extractor in "X-Parsed-By".
 */
@Test
public void testMacrosInDocm() throws Exception {
    Metadata parsedBy = new Metadata();
    parsedBy.add("X-Parsed-By", "org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor");

    //test default is "don't extract macros"
    List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm", parseContext);
    for (Metadata metadata : metadataList) {
        // Constant-first comparison: an entry without a content type is
        // simply "not a macro" rather than a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    assertContainsAtLeast(parsedBy, metadataList);

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    officeParserConfig.setUseSAXDocxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    metadataList = getRecursiveMetadata("testWORD_macros.docm", context);

    //check that content came out of the .docm file
    assertContains("quick", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
    assertContainsAtLeast(parsedBy, metadataList);

    // Minimum metadata expected on the extracted macro entry
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, metadataList);

    //test configuring via config file
    TikaConfig tikaConfig;
    // Close the config stream once the configuration has been built from it.
    try (java.io.InputStream configStream = this.getClass().getResourceAsStream("tika-config-sax-macros.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    metadataList = getRecursiveMetadata("testWORD_macros.docm", parser);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class SXWPFExtractorTest method setUp.
/**
 * Rebuilds the shared {@code parseContext} before each test, registering an
 * {@link OfficeParserConfig} that has the SAX docx extractor switched on.
 */
@Before
public void setUp() {
    OfficeParserConfig saxDocxConfig = new OfficeParserConfig();
    saxDocxConfig.setUseSAXDocxExtractor(true);

    parseContext = new ParseContext();
    parseContext.set(OfficeParserConfig.class, saxDocxConfig);
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class Word2006MLParserTest method testSkipDeletedAndMoveFrom.
/**
 * Parses the Word 2006 ML sample with deleted and move-from content both
 * set to be included, then checks that "frog" is present and that
 * "Second paragraph" occurs twice in the XML output.
 */
@Test
public void testSkipDeletedAndMoveFrom() throws Exception {
    // NOTE(review): the method name says "skip", yet both flags below are
    // set to include the content -- confirm the name is merely historical.
    OfficeParserConfig config = new OfficeParserConfig();
    config.setIncludeDeletedContent(true);
    config.setIncludeMoveFromContent(true);

    ParseContext context = new ParseContext();
    context.set(OfficeParserConfig.class, config);

    XMLResult result = getXML("testWORD_2006ml.xml", context);
    assertContains("frog", result.xml);
    assertContainsCount("Second paragraph", result.xml, 2);
}
Aggregations