Example use of org.apache.tika.TikaTest in the tika project by Apache.
Class PDFParserTest, method testSkipBadPage.
/**
 * Parses the same damaged PDF twice and checks what each run reports.
 *
 * <p>Pass 1 uses a {@link ParseContext} with no {@link PDFParserConfig}
 * set: the parse must end in a {@link TikaException}, the metadata must
 * carry exactly one exception warning mentioning "Unknown dir", and the
 * extracted text must contain "1309.61".
 *
 * <p>Pass 2 registers a config with
 * {@code setCatchIntermediateIOExceptions(false)}: the parse must still end
 * in a {@link TikaException}, but no exception warning may be recorded and
 * "1309.61" must be absent from the extracted text.
 *
 * @throws Exception if anything other than the expected TikaException
 *                   escapes the parse or the resource handling
 */
@Test
public void testSkipBadPage() throws Exception {
    // The sample document originates from the govdocs1 corpus.
    final String badPagePdf = "/test-documents/testPDF_bad_page_303226.pdf";
    final String expectedSnippet = "1309.61";

    // The TikaTest convenience helpers are not usable here because the
    // parse is expected to finish with an exception.
    ParseContext parseContext = new ParseContext();
    Parser parser = new AutoDetectParser();

    // ---- Pass 1: no PDFParserConfig in the context ----
    Metadata firstMetadata = new Metadata();
    ContentHandler firstHandler = new BodyContentHandler(-1);
    boolean firstThrew = false;
    try (InputStream stream = getResourceAsStream(badPagePdf)) {
        parser.parse(stream, firstHandler, firstMetadata, parseContext);
    } catch (TikaException expected) {
        // Only record that it happened; the assertions below inspect the
        // handler and metadata that were filled in before the throw.
        firstThrew = true;
    }
    String firstText = firstHandler.toString();
    assertTrue("Should have thrown exception", firstThrew);
    assertEquals(1,
            firstMetadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
    assertContains("Unknown dir",
            firstMetadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING));
    assertContains(expectedSnippet, firstText);

    // ---- Pass 2: ask the PDF parser to throw immediately ----
    PDFParserConfig failFastConfig = new PDFParserConfig();
    failFastConfig.setCatchIntermediateIOExceptions(false);
    parseContext.set(PDFParserConfig.class, failFastConfig);

    Metadata secondMetadata = new Metadata();
    ContentHandler secondHandler = new BodyContentHandler(-1);
    boolean secondThrew = false;
    try (InputStream stream = getResourceAsStream(badPagePdf)) {
        parser.parse(stream, secondHandler, secondMetadata, parseContext);
    } catch (TikaException expected) {
        secondThrew = true;
    }
    String secondText = secondHandler.toString();
    assertTrue("Should have thrown exception", secondThrew);
    assertEquals(0,
            secondMetadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
    assertNotContained(expectedSnippet, secondText);
}
Aggregations