use of org.apache.tika.sax.OfflineContentHandler in project tika by apache.
the class XMLParser method parse.
/**
 * Runs the given stream through the SAX parser obtained from {@code context}
 * and emits the result inside a single XHTML {@code <p>} element.
 *
 * <p>If {@code metadata} carries no content type yet, it is set to
 * {@code application/xml} before parsing starts.
 *
 * @param stream   the XML input; it is wrapped in a CloseShieldInputStream
 *                 before being handed to the parser (presumably so the parser
 *                 cannot close the caller's stream - confirm against that class)
 * @param handler  downstream handler that receives the XHTML events
 * @param metadata document metadata, read and possibly updated with a content type
 * @param context  supplies the SAX parser
 * @throws IOException   declared for stream or parser I/O failures
 * @throws SAXException  may be rethrown by the tagged handler, or raised while
 *                       opening/closing the XHTML document
 * @throws TikaException wraps any other SAXException raised during parsing
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Supply a default content type only when the caller has not set one.
    if (null == metadata.get(Metadata.CONTENT_TYPE)) {
        metadata.set(Metadata.CONTENT_TYPE, "application/xml");
    }

    final XHTMLContentHandler xhtmlOut = new XHTMLContentHandler(handler, metadata);
    xhtmlOut.startDocument();
    xhtmlOut.startElement("p");

    // The tagged wrapper gets first chance to rethrow a SAXException in the
    // catch block below, before it is wrapped as a TikaException.
    final TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                getContentHandler(taggedHandler, metadata, context))));
    } catch (SAXException parseFailure) {
        taggedHandler.throwIfCauseOf(parseFailure);
        throw new TikaException("XML parse error", parseFailure);
    } finally {
        // Always close the paragraph and the document, even on failure.
        xhtmlOut.endElement("p");
        xhtmlOut.endDocument();
    }
}
use of org.apache.tika.sax.OfflineContentHandler in project tika by apache.
the class TesseractOCRParser method extractHOCROutput.
/**
 * Parses hOCR output from {@code is} and forwards it, via a
 * HOCRPassThroughHandler, into {@code xhtml} inside a
 * {@code <div class="ocr">} element.
 *
 * @param is           stream containing the hOCR markup to parse
 * @param parseContext source of the SAX parser; a fresh ParseContext is used
 *                     when this is {@code null}
 * @param xhtml        handler that receives the surrounding div and the
 *                     passed-through content
 * @throws TikaException declared by the SAX parser lookup
 * @throws IOException   declared by the SAX parse call
 * @throws SAXException  declared by the SAX parse call and the XHTML element calls
 */
private void extractHOCROutput(InputStream is, ParseContext parseContext, XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException {
    final ParseContext effectiveContext = (parseContext != null) ? parseContext : new ParseContext();
    final SAXParser saxParser = effectiveContext.getSAXParser();

    xhtml.startElement("div", "class", "ocr");
    saxParser.parse(
            is,
            new OfflineContentHandler(new HOCRPassThroughHandler(xhtml)));
    xhtml.endElement("div");
}
use of org.apache.tika.sax.OfflineContentHandler in project tika by apache.
the class SXSLFPowerPointExtractorDecorator method handleSlidePart.
/**
 * Emits the content of one slide part followed by its related parts.
 *
 * <p>The slide itself is parsed into a {@code <div class="slide-content">}.
 * Afterwards the slide layout, notes, notes master and comments related to the
 * slide are processed, in that order, through {@code handleBasicRelatedParts}.
 *
 * <p>A TikaException raised while parsing the slide is not propagated; its
 * stack trace is added to the metadata as a warning instead.
 *
 * @param slidePart the package part holding the slide XML
 * @param xhtml     handler that receives the generated XHTML
 * @throws IOException  declared for failures opening or reading the slide stream
 * @throws SAXException declared for failures while parsing or emitting XHTML events
 */
private void handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml) throws IOException, SAXException {
    final Map<String, String> linkedRels = loadLinkedRelationships(slidePart, false, metadata);

    xhtml.startElement("div", "class", "slide-content");
    try (InputStream slideStream = slidePart.getInputStream()) {
        context.getSAXParser().parse(
                new CloseShieldInputStream(slideStream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                new OOXMLWordAndPowerPointTextHandler(
                                        new OOXMLTikaBodyPartHandler(xhtml), linkedRels))));
    } catch (TikaException ex) {
        // Record the problem and carry on with the related parts.
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(ex));
    }
    xhtml.endElement("div");

    // Each related-part pass gets its own freshly built handler chain.
    handleBasicRelatedParts(
            XSLFRelation.SLIDE_LAYOUT.getRelation(),
            "slide-master-content",
            slidePart,
            new PlaceHolderSkipper(
                    new OOXMLWordAndPowerPointTextHandler(
                            new OOXMLTikaBodyPartHandler(xhtml), linkedRels)));
    handleBasicRelatedParts(
            XSLFRelation.NOTES.getRelation(),
            "slide-notes",
            slidePart,
            new OOXMLWordAndPowerPointTextHandler(
                    new OOXMLTikaBodyPartHandler(xhtml), linkedRels));
    handleBasicRelatedParts(
            XSLFRelation.NOTES_MASTER.getRelation(),
            "slide-notes-master",
            slidePart,
            new OOXMLWordAndPowerPointTextHandler(
                    new OOXMLTikaBodyPartHandler(xhtml), linkedRels));
    handleBasicRelatedParts(
            XSLFRelation.COMMENTS.getRelation(),
            null,
            slidePart,
            new XSLFCommentsHandler(xhtml));
}
use of org.apache.tika.sax.OfflineContentHandler in project tika by apache.
the class SXSLFPowerPointExtractorDecorator method handleBasicRelatedParts.
/**
 * Handles the parts related to {@code parentPart} by a given relationship type
 * (comments, master, notes, etc.).
 *
 * <p>When at least one such relationship exists, a single {@code <div>} is
 * opened on {@code contentHandler}, every related part is SAX-parsed into that
 * handler, and the {@code <div>} is closed. When there are none, or the lookup
 * fails, nothing is emitted.
 *
 * <p>Failures to look up relationships, resolve a related part, open its
 * stream, or obtain a parser are recorded in the metadata as warnings and do
 * not abort processing of the remaining parts.
 *
 * @param contentType     relationship type used to look up the related parts
 * @param xhtmlClassLabel value for the {@code class} attribute of the wrapping
 *                        {@code <div>}; may be {@code null}, in which case the
 *                        {@code <div>} is emitted without a class attribute
 * @param parentPart      the part whose related parts are processed
 * @param contentHandler  handler that receives the wrapping {@code <div>} and
 *                        the parsed content of each related part
 * @throws SAXException if emitting the {@code <div>} or parsing a related part fails
 */
private void handleBasicRelatedParts(String contentType, String xhtmlClassLabel, PackagePart parentPart, ContentHandler contentHandler) throws SAXException {
    PackageRelationshipCollection relatedPartPRC = null;
    try {
        relatedPartPRC = parentPart.getRelationshipsByType(contentType);
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }
    if (relatedPartPRC == null || relatedPartPRC.size() <= 0) {
        return;
    }

    AttributesImpl attributes = new AttributesImpl();
    // Callers may pass a null label (the comments relation does). Adding it
    // anyway would produce an attribute whose value is null, which consumers
    // cannot tell apart from a missing attribute and may not tolerate.
    if (xhtmlClassLabel != null) {
        attributes.addAttribute("", "class", "class", "CDATA", xhtmlClassLabel);
    }
    contentHandler.startElement("", "div", "div", attributes);
    for (int i = 0; i < relatedPartPRC.size(); i++) {
        PackageRelationship relatedPartPackageRelationship = relatedPartPRC.getRelationship(i);
        try {
            PackagePart relatedPartPart = parentPart.getRelatedPart(relatedPartPackageRelationship);
            try (InputStream stream = relatedPartPart.getInputStream()) {
                context.getSAXParser().parse(stream, new OfflineContentHandler(new EmbeddedContentHandler(contentHandler)));
            } catch (IOException | TikaException e) {
                // Best effort: note the failure and move on to the next part.
                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
            }
        } catch (InvalidFormatException e) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
        }
    }
    contentHandler.endElement("", "div", "div");
}
use of org.apache.tika.sax.OfflineContentHandler in project tika by apache.
the class Word2006MLParser method parse.
/**
 * Parses a Word 2006 XML document from {@code stream} and writes it as XHTML
 * to {@code handler}.
 *
 * <p>The XHTML document is always ended, even if parsing fails part way.
 *
 * @param stream   the document input; wrapped in a CloseShieldInputStream
 *                 before being handed to the SAX parser
 * @param handler  downstream handler that receives the XHTML events
 * @param metadata document metadata passed to the XHTML and document handlers
 * @param context  supplies the SAX parser and is passed to {@code configure}
 * @throws IOException   declared for stream or parser I/O failures
 * @throws SAXException  declared for failures starting or ending the XHTML document
 * @throws TikaException wraps any SAXException raised during the SAX parse
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Set an OfficeParserConfig if the user hasn't specified one.
    configure(context);

    final XHTMLContentHandler xhtmlOut = new XHTMLContentHandler(handler, metadata);
    xhtmlOut.startDocument();
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                new Word2006MLDocHandler(xhtmlOut, metadata, context))));
    } catch (SAXException saxFailure) {
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtmlOut.endDocument();
    }
}
Aggregations