use of javax.xml.transform.sax.TransformerHandler in project tika by apache.
the class OutlookParserTest method testOutlookHTMLfromRTF.
/**
 * Parses the Outlook 2003 sample message and serialises the resulting SAX
 * events as indented XML, then checks that text and links from the HTML
 * body are present and that only one body element was emitted.
 */
@Test
public void testOutlookHTMLfromRTF() throws Exception {
    // Serialise everything the parser emits into an in-memory buffer.
    // Output properties are configured before the result is attached.
    StringWriter xhtmlBuffer = new StringWriter();
    TransformerHandler serializer =
            ((SAXTransformerFactory) SAXTransformerFactory.newInstance()).newTransformerHandler();
    serializer.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    serializer.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setResult(new StreamResult(xhtmlBuffer));

    Parser autoDetect = new AutoDetectParser();
    Metadata meta = new Metadata();
    try (InputStream msg =
            OutlookParserTest.class.getResourceAsStream("/test-documents/test-outlook2003.msg")) {
        autoDetect.parse(msg, serializer, meta, new ParseContext());
    }

    // Collapse the whitespace that indentation inserts right after <p>
    String raw = xhtmlBuffer.toString();
    String content = raw.replaceAll("<p>\\s+", "<p>");

    // The HTML version should have been processed, so formatting and links survive
    assertContains("<dd>New Outlook User</dd>", content);
    assertContains("designed <i>to help you", content);
    assertContains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", content);

    // Newsletter link: the text surrounding it, then the link target itself
    String[] newsletterFragments = {
        "sign up for a free subscription",
        "Office Newsletter",
        "newsletter will be sent to you",
        "http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033"
    };
    for (String fragment : newsletterFragments) {
        assertContains(fragment, content);
    }

    // Exactly one opening and one closing body tag: no nested html documents
    int piecesAroundBodyOpen = content.split("<body>").length;
    int piecesAroundBodyClose = content.split("<\\/body>").length;
    assertEquals(2, piecesAroundBodyOpen);
    assertEquals(2, piecesAroundBodyClose);
}
use of javax.xml.transform.sax.TransformerHandler in project sling by apache.
the class SlingTransformer method setXMLConsumer.
/**
 * Places a freshly created transformer handler in front of the given
 * consumer: the handler's output is sent to {@code consumer}, and the
 * handler itself (wrapped in an {@code XMLConsumerAdapter}) is what gets
 * registered with the superclass.
 *
 * @param consumer receiver of the transformation output
 * @throws RuntimeException if the transformer handler cannot be created
 */
@Override
protected void setXMLConsumer(XMLConsumer consumer) {
    TransformerHandler handler;
    try {
        handler = this.createTransformerHandler();
    } catch (Exception ex) {
        throw new RuntimeException("Could not initialize transformer handler.", ex);
    }

    // Forward any logic sheet parameters to the underlying transformer
    final Map<String, Object> parameters = this.getLogicSheetParameters();
    if (parameters != null) {
        final Transformer target = handler.getTransformer();
        for (Entry<String, Object> parameter : parameters.entrySet()) {
            final String parameterName = parameter.getKey();
            final Object parameterValue = parameter.getValue();
            target.setParameter(parameterName, parameterValue);
        }
    }

    // The consumer receives both content and lexical events of the output
    final SAXResult output = new SAXResult(consumer);
    output.setLexicalHandler(consumer);
    handler.setResult(output);

    // A TransformerHandler is also a LexicalHandler, so the same object
    // is passed for both adapter arguments
    super.setXMLConsumer(new XMLConsumerAdapter(handler, handler));
}
use of javax.xml.transform.sax.TransformerHandler in project tika by apache.
the class TikaGUI method getHtmlHandler.
/**
 * Creates and returns a content handler that turns XHTML input to
 * simplified HTML output that can be correctly parsed and displayed
 * by {@link JEditorPane}.
 * <p>
 * The returned content handler is set to output <code>html</code>
 * to the given writer. The XHTML namespace is removed from the output
 * to prevent the serializer from using the {@code <tag/>} empty element
 * syntax that causes extra "&gt;" characters to be displayed.
 * The {@code <head>} tags are dropped to prevent the serializer from
 * generating a {@code <META>} content type tag that makes
 * {@link JEditorPane} fail thinking that the document character set
 * is inconsistent.
 * <p>
 * Additionally, it will use ImageSavingParser to re-write embedded:(image)
 * image links to be file:///(temporary file) so that they can be loaded.
 * The rewrite is done on a private copy of the attributes, so the
 * {@link Attributes} object handed to {@code startElement} is never
 * modified.
 *
 * @param writer output writer
 * @return HTML content handler
 * @throws TransformerConfigurationException if an error occurs
 */
private ContentHandler getHtmlHandler(Writer writer) throws TransformerConfigurationException {
    SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    TransformerHandler handler = factory.newTransformerHandler();
    handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
    handler.setResult(new StreamResult(writer));
    return new ContentHandlerDecorator(handler) {
        @Override
        public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException {
            // Strip the XHTML namespace so the serializer emits plain HTML
            if (XHTMLContentHandler.XHTML.equals(uri)) {
                uri = null;
            }
            // The head start tag itself is never forwarded
            if ("head".equals(localName)) {
                return;
            }
            if (!"img".equals(localName)) {
                super.startElement(uri, localName, name, atts);
                return;
            }

            // Always work on a copy: the incoming attributes belong to the
            // caller and must not be altered, even when they already are
            // an AttributesImpl instance.
            AttributesImpl newAttrs = new AttributesImpl(atts);
            for (int i = 0; i < newAttrs.getLength(); i++) {
                if (!"src".equals(newAttrs.getLocalName(i))) {
                    continue;
                }
                String src = newAttrs.getValue(i);
                if (src != null && src.startsWith("embedded:")) {
                    // Everything after the first ':' is the embedded resource name
                    String filename = src.substring(src.indexOf(':') + 1);
                    try {
                        File img = imageParser.requestSave(filename);
                        String newSrc = img.toURI().toString();
                        newAttrs.setValue(i, newSrc);
                    } catch (IOException e) {
                        System.err.println("Error creating temp image file " + filename);
                        // The html viewer will show a broken image too to alert them
                    }
                }
            }
            super.startElement(uri, localName, name, newAttrs);
        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            if (XHTMLContentHandler.XHTML.equals(uri)) {
                uri = null;
            }
            // Mirror startElement: the head end tag is dropped as well
            if (!"head".equals(localName)) {
                super.endElement(uri, localName, name);
            }
        }

        // Prefix mappings are intentionally not forwarded to the serializer
        @Override
        public void startPrefixMapping(String prefix, String uri) {
        }

        @Override
        public void endPrefixMapping(String prefix) {
        }
    };
}
use of javax.xml.transform.sax.TransformerHandler in project tika by apache.
the class TikaGUI method getXmlContentHandler.
/**
 * Creates a content handler that serialises the SAX events it receives
 * as XML text written to the given writer.
 *
 * @param writer destination of the serialised XML
 * @return content handler producing XML output
 * @throws TransformerConfigurationException if the handler cannot be created
 */
private ContentHandler getXmlContentHandler(Writer writer) throws TransformerConfigurationException {
    TransformerHandler xmlSerializer =
            ((SAXTransformerFactory) SAXTransformerFactory.newInstance()).newTransformerHandler();
    // Select the output method before the result is attached
    xmlSerializer.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    StreamResult sink = new StreamResult(writer);
    xmlSerializer.setResult(sink);
    return xmlSerializer;
}
use of javax.xml.transform.sax.TransformerHandler in project tika by apache.
the class HtmlParserTest method makeHtmlTransformer.
/**
 * Builds a ContentHandler that serialises incoming SAX events as HTML text
 * (no indentation, "utf-8" output encoding) and sends it to the supplied
 * writer, which is typically a StringWriter.
 *
 * @param writer Destination for the generated HTML text.
 * @return ContentHandler that can be handed to parse() methods.
 * @throws Exception if the transformer handler cannot be created or configured
 */
private ContentHandler makeHtmlTransformer(Writer writer) throws Exception {
    TransformerHandler htmlSerializer =
            ((SAXTransformerFactory) SAXTransformerFactory.newInstance()).newTransformerHandler();

    // Applied in this order, and before the result is attached
    String[][] outputSettings = {
        {OutputKeys.METHOD, "html"},
        {OutputKeys.INDENT, "no"},
        {OutputKeys.ENCODING, "utf-8"}
    };
    for (String[] setting : outputSettings) {
        htmlSerializer.getTransformer().setOutputProperty(setting[0], setting[1]);
    }

    htmlSerializer.setResult(new StreamResult(writer));
    return htmlSerializer;
}
Aggregations