use of org.xml.sax.ContentHandler in project tika by apache.
the class HtmlParserTest method testParseEmpty.
/**
 * Parses a zero-length input with {@link HtmlParser} and checks that the
 * body handler ends up holding an empty string.
 */
@Test
public void testParseEmpty() throws Exception {
    byte[] noBytes = new byte[0];
    ContentHandler body = new BodyContentHandler();
    HtmlParser htmlParser = new HtmlParser();
    htmlParser.parse(new ByteArrayInputStream(noBytes), body, new Metadata(), new ParseContext());
    assertEquals("", body.toString());
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class HtmlParserTest method testBoilerplateWithMarkup.
/**
 * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
 * <p>
 * Parses the boilerplate.html fixture through a {@code BoilerpipeContentHandler}
 * with markup output enabled, then checks the serialized result for the expected
 * elements and for the absence of the "boilerplate" and "footer" text.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
 */
@Test
public void testBoilerplateWithMarkup() throws Exception {
    String path = "/test-documents/boilerplate.html";
    Metadata metadata = new Metadata();
    StringWriter sw = new StringWriter();
    ContentHandler ch = makeHtmlTransformer(sw);
    BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
    bpch.setIncludeMarkup(true);
    // Close the fixture stream once parsing is done instead of leaking it.
    // Fully qualified so this method does not depend on an extra import.
    try (java.io.InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
        new HtmlParser().parse(stream, bpch, metadata, new ParseContext());
    }
    String content = sw.toString();
    assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>"));
    assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
    assertTrue("Has real content", content.contains("<p>This is the real meat"));
    assertTrue("Ends with appropriate HTML", content.endsWith("</p></body></html>"));
    assertFalse(content.contains("boilerplate"));
    assertFalse(content.contains("footer"));
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class Pkcs7ParserTest method testDetachedSignature.
/**
 * Runs {@link Pkcs7Parser} over the testDetached.p7s fixture.
 * <p>
 * A {@link NullPointerException} fails the test outright. A
 * {@link TikaException} is accepted only when its string form mentions
 * "cannot parse detached pkcs7 signature". The test also passes when the
 * parse completes without throwing.
 */
public void testDetachedSignature() throws Exception {
    try (InputStream detached = Pkcs7ParserTest.class.getResourceAsStream("/test-documents/testDetached.p7s")) {
        ContentHandler sink = new BodyContentHandler();
        Metadata meta = new Metadata();
        Pkcs7Parser pkcs7 = new Pkcs7Parser();
        pkcs7.parse(detached, sink, meta, new ParseContext());
    } catch (NullPointerException unexpected) {
        fail("should not get NPE");
    } catch (TikaException expected) {
        String description = expected.toString();
        assertTrue(description.contains("cannot parse detached pkcs7 signature"));
    }
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class ForkParserIntegrationTest method testParsingErrorInForkedParserShouldBeReported.
/**
 * TIKA-831 Parsers throwing errors should be caught and
 * properly reported.
 * <p>
 * Wraps a {@code BrokenParser} in a {@link ForkParser}, parses a text fixture
 * and expects a {@link TikaException} whose cause equals the parser's
 * {@code err} field. The second scenario (a non-serializable error) is set up
 * but its assertions are currently disabled.
 */
@Test
public void testParsingErrorInForkedParserShouldBeReported() throws Exception {
    BrokenParser brokenParser = new BrokenParser();
    ForkParser parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
    // With a serializable error, we'll get that back.
    // The fixture stream is scoped to this attempt so it is always closed.
    try (InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.txt")) {
        ContentHandler output = new BodyContentHandler();
        ParseContext context = new ParseContext();
        parser.parse(stream, output, new Metadata(), context);
        fail("Expected TikaException caused by Error");
    } catch (TikaException e) {
        assertEquals(brokenParser.err, e.getCause());
    } finally {
        parser.close();
    }
    // With a non serializable one, we'll get something else
    // TODO Fix this test. When re-enabling, open a fresh stream for this
    // second parse (the one above is closed) and close the new parser.
    brokenParser = new BrokenParser();
    brokenParser.re = new WontBeSerializedError("Can't Serialize");
    parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
    // try {
    // ContentHandler output = new BodyContentHandler();
    // ParseContext context = new ParseContext();
    // parser.parse(stream, output, new Metadata(), context);
    // fail("Expected TikaException caused by Error");
    // } catch (TikaException e) {
    // assertEquals(TikaException.class, e.getCause().getClass());
    // assertEquals("Bang!", e.getCause().getMessage());
    // }
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class ForkParserIntegrationTest method testParserHandlingOfNonSerializable.
/**
 * If we supply a non serializable object on the ParseContext,
 * check we get a helpful exception back.
 * <p>
 * An anonymous {@code Detector} is placed on the context; the parse is then
 * expected to fail with a {@link TikaException} whose cause is a
 * {@link NotSerializableException} and whose message names the ParseContext.
 */
@Test
public void testParserHandlingOfNonSerializable() throws Exception {
    ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
    ParseContext context = new ParseContext();
    // The anonymous class is the non-serializable object under test.
    context.set(Detector.class, new Detector() {
        @Override
        public MediaType detect(InputStream input, Metadata metadata) {
            return MediaType.OCTET_STREAM;
        }
    });
    // try-with-resources closes the fixture stream however the parse ends.
    try (InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt")) {
        ContentHandler output = new BodyContentHandler();
        parser.parse(stream, output, new Metadata(), context);
        fail("Should have blown up with a non serializable ParseContext");
    } catch (TikaException e) {
        // Check the right details
        assertNotNull(e.getCause());
        assertEquals(NotSerializableException.class, e.getCause().getClass());
        assertEquals("Unable to serialize ParseContext to pass to the Forked Parser", e.getMessage());
    } finally {
        parser.close();
    }
}
Aggregations