use of org.apache.tika.parser.ParseContext in project tika by apache.
the class OOXMLParserTest method testNoFormat.
/**
 * TIKA-1044 - Handle word documents where parts of the
 * text have no formatting or styles applied to them.
 * <p>
 * Parses {@code testWORD_no_format.docx} with an {@link OOXMLParser} and checks
 * that the expected sentence is present in the text collected by the handler.
 *
 * @throws Exception if the document cannot be read or parsed
 */
@Test
public void testNoFormat() throws Exception {
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // Resolve the fixture through this test class (as the other tests in this class do)
    // rather than through the unrelated WordParserTest, so this test does not depend on
    // a sibling test class merely to locate a classpath resource.
    try (InputStream stream = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD_no_format.docx")) {
        new OOXMLParser().parse(stream, handler, metadata, new ParseContext());
    }
    String content = handler.toString();
    assertContains("This is a piece of text that causes an exception", content);
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class OOXMLParserTest method testProtectedExcelSheets.
/**
 * Parses a workbook in which some, but not all, sheets are protected
 * (see TIKA-364) and checks both the reported content type and the
 * value stored under the "protected" metadata key.
 *
 * @throws Exception if the workbook cannot be read or parsed
 */
@Test
public void testProtectedExcelSheets() throws Exception {
    final String expectedType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    Metadata meta = new Metadata();
    try (InputStream workbook = OOXMLParserTest.class.getResourceAsStream("/test-documents/protectedSheets.xlsx")) {
        Parser autoDetect = new AutoDetectParser();
        autoDetect.parse(workbook, new BodyContentHandler(), meta, new ParseContext());

        // Both checks run while the stream is still open, as in the original test.
        assertEquals(expectedType, meta.get(Metadata.CONTENT_TYPE));
        assertEquals("true", meta.get(TikaMetadataKeys.PROTECTED));
    }
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class OOXMLParserTest method testPowerPointMetadataEarly.
/**
 * Test that the metadata is already extracted when the body is processed.
 * See TIKA-1109
 * <p>
 * For every listed PowerPoint extension, {@code testPPT.<extension>} is parsed with a
 * handler whose {@code startDocument()} asserts on the content type, title, creator and
 * author held in the metadata at that moment. After each parse the test also verifies
 * that {@code startDocument()} was actually invoked, so the checks cannot be skipped
 * silently.
 *
 * @throws Exception if a test document cannot be read or parsed
 */
@Test
public void testPowerPointMetadataEarly() throws Exception {
    String[] extensions = { "pptx", "pptm", "ppsm", "ppsx", "potm" };
    // Expected content types, index-aligned with the extensions array
    String[] mimeTypes = {
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.ms-powerpoint.presentation.macroenabled.12",
        "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
        "application/vnd.ms-powerpoint.template.macroenabled.12"
    };
    for (int i = 0; i < extensions.length; i++) {
        final String filename = "testPPT." + extensions[i];
        final String expectedMimeType = mimeTypes[i];
        Parser parser = new AutoDetectParser();
        final Metadata metadata = new Metadata();
        // One-element array so the anonymous class below can record that its callback ran
        final boolean[] startDocumentCalled = { false };
        ContentHandler handler = new BodyContentHandler() {
            @Override
            public void startDocument() {
                startDocumentCalled[0] = true;
                assertEquals("Mime-type checking for " + filename, expectedMimeType, metadata.get(Metadata.CONTENT_TYPE));
                assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
                assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
                assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
            }
        };
        ParseContext context = new ParseContext();
        try (InputStream input = getTestDocument(filename)) {
            parser.parse(input, handler, metadata, context);
        }
        // Without this check the test would pass vacuously if the callback never fired
        assertTrue("startDocument() was never called for " + filename, startDocumentCalled[0]);
    }
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class OOXMLParserTest method testUnsupportedPowerPoint.
/**
 * For the PowerPoint formats we don't currently support, ensure that
 * parsing does not break and that the content type is still reported.
 *
 * @throws Exception if a test document cannot be read or parsed
 */
@Test
public void testUnsupportedPowerPoint() throws Exception {
    String[] suffixes = { "xps", "thmx" };
    // Index-aligned with suffixes. Open question carried over from the
    // original code: "Is this right?"
    String[] expectedTypes = {
        "application/vnd.ms-xpsdocument",
        "application/vnd.openxmlformats-officedocument"
    };
    for (int n = 0; n < suffixes.length; n++) {
        String docName = "testPPT." + suffixes[n];

        Metadata meta = new Metadata();
        meta.set(Metadata.RESOURCE_NAME_KEY, docName);

        try (InputStream doc = getTestDocument(docName)) {
            Parser autoDetect = new AutoDetectParser();
            autoDetect.parse(doc, new BodyContentHandler(), meta, new ParseContext());
            // The content type is the only thing checked for these formats
            assertEquals("Mime-type checking for " + docName, expectedTypes[n], meta.get(Metadata.CONTENT_TYPE));
        }
    }
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class OOXMLParserTest method testEmbeddedPDF.
// TIKA-989: serializes the parsed document to XML and checks that the text
// snippets and the two embedded-object placeholders are all present and
// occur in the expected order.
@Test
public void testEmbeddedPDF() throws Exception {
    StringWriter xmlOut = new StringWriter();
    TransformerHandler xmlSink = ((SAXTransformerFactory) SAXTransformerFactory.newInstance()).newTransformerHandler();
    xmlSink.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    xmlSink.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
    xmlSink.setResult(new StreamResult(xmlOut));

    try (InputStream docx = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD_embedded_pdf.docx")) {
        new OOXMLParser().parse(docx, xmlSink, new Metadata(), new ParseContext());
    }

    String xml = xmlOut.toString();
    // Fragments listed in the order they are expected to appear in the output
    String[] markers = {
        "Here is the pdf file:",
        "<div class=\"embedded\" id=\"rId5\"/>",
        "Bye Bye",
        "<div class=\"embedded\" id=\"rId6\"/>",
        "Bye for real."
    };
    int[] offsets = new int[markers.length];
    for (int n = 0; n < markers.length; n++) {
        offsets[n] = xml.indexOf(markers[n]);
    }

    // Every fragment must be present ...
    for (int offset : offsets) {
        assertTrue(offset != -1);
    }
    // ... and each one must start before the next
    for (int n = 1; n < offsets.length; n++) {
        assertTrue(offsets[n - 1] < offsets[n]);
    }
}
Aggregations