use of org.apache.tika.sax.OfflineContentHandler in project tika by apache.
the class XmlRootExtractor method extractRootElement.
/**
 * Runs a namespace-aware, non-validating SAX parse over the given stream and
 * returns the root element recorded by the {@code ExtractorHandler}.
 * <p>
 * The stream is wrapped in a {@code CloseShieldInputStream} before being handed
 * to the parser, and the handler is wrapped in an {@code OfflineContentHandler}.
 * Any {@link Exception} raised while configuring the factory or parsing is
 * ignored; the method then returns whatever the handler holds at that point.
 *
 * @param stream the XML input to inspect
 * @return the value of the handler's {@code rootElement} field after the parse attempt
 * @since Apache Tika 0.9
 */
public QName extractRootElement(InputStream stream) {
    final ExtractorHandler rootCapture = new ExtractorHandler();
    final InputStream shielded = new CloseShieldInputStream(stream);
    final OfflineContentHandler offline = new OfflineContentHandler(rootCapture);
    try {
        final SAXParserFactory saxFactory = SAXParserFactory.newInstance();
        saxFactory.setNamespaceAware(true);
        saxFactory.setValidating(false);
        try {
            saxFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
        } catch (SAXNotRecognizedException unsupported) {
            // TIKA-271 and TIKA-1000: Some XML parsers do not support the
            // secure-processing feature, even though it's required by JAXP in
            // Java 5. Ignoring the exception is fine here; deployments without
            // this feature are inherently vulnerable to XML denial-of-service
            // attacks.
        }
        saxFactory.newSAXParser().parse(shielded, offline);
    } catch (Exception ignore) {
        // Best effort: fall through and report whatever the handler captured.
    }
    return rootCapture.rootElement;
}
use of org.apache.tika.sax.OfflineContentHandler in project tika by apache.
the class DIFParser method parse.
/**
 * Parses the stream as XML and emits the result inside a single XHTML
 * {@code <p>} element.
 * <p>
 * The XHTML document and the {@code p} element are opened before parsing and
 * closed in a {@code finally} block, so both end events are emitted even when
 * the parse fails.
 *
 * @param stream   the input; wrapped in a {@code CloseShieldInputStream} before parsing
 * @param handler  receives the XHTML output
 * @param metadata passed to the XHTML wrapper and to {@code getContentHandler}
 * @param context  supplies the SAX parser
 * @throws TikaException wrapping a {@link SAXException} from the parse with the
 *                       message "XML parse error", unless
 *                       {@code tagged.throwIfCauseOf} throws first
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.startElement("p");

    // NOTE(review): this tags the caller's handler directly rather than the
    // xhtml wrapper -- confirm that is intended.
    final TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                getContentHandler(taggedHandler, metadata, context))));
    } catch (SAXException saxFailure) {
        // Presumably rethrows when the failure originated in the tagged
        // handler; anything else is reported as a Tika parse error.
        taggedHandler.throwIfCauseOf(saxFailure);
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
use of org.apache.tika.sax.OfflineContentHandler in project tika by apache.
the class EpubContentParser method parse.
/**
 * Feeds the stream to the SAX parser obtained from the parse context, sending
 * events to the given handler through an {@code OfflineContentHandler}.
 * The stream is wrapped in a {@code CloseShieldInputStream} before parsing.
 *
 * @param stream   the XML input
 * @param handler  receives the SAX events
 * @param metadata not used by this method
 * @param context  supplies the SAX parser
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    context.getSAXParser().parse(
            new CloseShieldInputStream(stream),
            new OfflineContentHandler(handler));
}
use of org.apache.tika.sax.OfflineContentHandler in project tika by apache.
the class Word2006MLParser method parse.
/**
 * Parses the stream as XML, routing SAX events through a
 * {@code Word2006MLDocHandler} that writes to an XHTML wrapper around the
 * caller's handler.
 * <p>
 * {@code configure(context)} runs first, then the XHTML document is started;
 * {@code endDocument} is emitted from a {@code finally} block, so it is called
 * even when the parse fails.
 *
 * @param stream   the input; wrapped in a {@code CloseShieldInputStream} before parsing
 * @param handler  receives the XHTML output
 * @param metadata passed to the XHTML wrapper and to the document handler
 * @param context  supplies the SAX parser and is passed to {@code configure}
 * @throws TikaException wrapping any {@link SAXException} from the parse with
 *                       the message "XML parse error"
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Set OfficeParserConfig if the user hasn't specified one.
    configure(context);

    final XHTMLContentHandler xhtmlOut = new XHTMLContentHandler(handler, metadata);
    xhtmlOut.startDocument();
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                new Word2006MLDocHandler(xhtmlOut, metadata, context))));
    } catch (SAXException saxFailure) {
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtmlOut.endDocument();
    }
}
use of org.apache.tika.sax.OfflineContentHandler in project tika by apache.
the class AbstractXML2003Parser method parse.
/**
 * Parses the stream as XML, using the content handler returned by
 * {@code getContentHandler} for a tagged view of the XHTML output.
 * <p>
 * {@code setContentType(metadata)} runs first, then the XHTML document is
 * started; {@code endDocument} is emitted from a {@code finally} block, so it
 * is called even when the parse fails.
 *
 * @param stream   the input; wrapped in a {@code CloseShieldInputStream} before parsing
 * @param handler  receives the XHTML output
 * @param metadata passed to {@code setContentType}, the XHTML wrapper and
 *                 {@code getContentHandler}
 * @param context  supplies the SAX parser
 * @throws TikaException wrapping a {@link SAXException} from the parse with the
 *                       message "XML parse error", unless
 *                       {@code tagged.throwIfCauseOf} throws first
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    setContentType(metadata);

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    final TaggedContentHandler taggedXhtml = new TaggedContentHandler(xhtml);
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                getContentHandler(taggedXhtml, metadata, context))));
    } catch (SAXException saxFailure) {
        // Presumably rethrows when the failure originated in the tagged
        // handler; anything else is reported as a Tika parse error.
        taggedXhtml.throwIfCauseOf(saxFailure);
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtml.endDocument();
    }
}
Aggregations