use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.
the class AbstractOOXMLExtractor method handleEmbeddedOLE.
/**
 * Handles an embedded OLE object in the document.
 * <p>
 * The part is opened as a POIFS (OLE2) container and dispatched on its layout:
 * <ul>
 * <li>a root holding "CONTENTS", "Ole" and "CompObj" entries has its "CONTENTS"
 * stream passed to the embedded extractor (TIKA-704, OLE 2.0);</li>
 * <li>a container detected as {@code OLE10_NATIVE} has its label, command and
 * file name copied into the embedded metadata and its data buffer passed to the
 * embedded extractor (TIKA-704, OLE 1.0);</li>
 * <li>anything else is delegated to
 * {@link #handleEmbeddedFile(PackagePart, ContentHandler, String)}.</li>
 * </ul>
 * Parts with a known size below three 512-byte blocks are skipped. Failures to
 * open the container, and IOExceptions raised while processing it, are recorded
 * on {@code parentMetadata} instead of being propagated.
 *
 * @param part           the package part holding the OLE object
 * @param handler        handler that receives the embedded document's content
 * @param rel            relationship id, stored as the embedded relationship id
 * @param parentMetadata metadata of the containing document, used to record
 *                       embedded stream exceptions
 * @throws IOException  if closing the POIFS container or the embedded stream fails
 * @throws SAXException if the content handler fails while the embedded
 *                      document is being processed
 */
private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel, Metadata parentMetadata) throws IOException, SAXException {
    // A POIFSFileSystem needs to be at least 3 blocks big to be valid
    long partSize = part.getSize();
    if (partSize >= 0 && partSize < 512 * 3) {
        // Too small, skip
        return;
    }

    // Open the POIFS (OLE2) structure. The part stream is opened exactly once
    // and closed by try-with-resources whether or not the container loads.
    POIFSFileSystem fs;
    try (InputStream partStream = part.getInputStream()) {
        fs = new POIFSFileSystem(partStream);
    } catch (Exception e) {
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
        return;
    }

    TikaInputStream stream = null;
    try {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
        DirectoryNode root = fs.getRoot();
        POIFSDocumentType type = POIFSDocumentType.detectType(root);
        if (root.hasEntry("CONTENTS") && root.hasEntry("Ole") && root.hasEntry("CompObj")) {
            // TIKA-704: OLE 2.0 embedded non-Office document?
            //TODO: figure out if the equivalent of OLE 1.0's
            //getCommand() and getFileName() exist for OLE 2.0 to populate
            //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
            stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS"));
            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else if (POIFSDocumentType.OLE10_NATIVE == type) {
            // TIKA-704: OLE 1.0 embedded document
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs);
            if (ole.getLabel() != null) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
            }
            if (ole.getCommand() != null) {
                metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
            }
            if (ole.getFileName() != null) {
                metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
            }
            byte[] data = ole.getDataBuffer();
            if (data != null) {
                stream = TikaInputStream.get(data);
            }
            if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else {
            handleEmbeddedFile(part, handler, rel);
        }
    } catch (FileNotFoundException ignored) {
        // There was no CONTENTS entry, so skip this part
    } catch (Ole10NativeException ignored) {
        // Could not process an OLE 1.0 entry, so skip this part
    } catch (IOException e) {
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
    } finally {
        // Close the embedded stream first, and always close the container
        // afterwards, so a failure in one close() cannot leak the other.
        try {
            if (stream != null) {
                stream.close();
            }
        } finally {
            fs.close();
        }
    }
}
use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.
the class AbstractOOXMLExtractor method handleEmbeddedFile.
/**
 * Passes an embedded file of the document to the embedded document extractor.
 * <p>
 * The embedded metadata carries the relationship id, a resource name taken
 * from the last '/'-separated segment of the part name, and the part's
 * content type. The part is only opened if the extractor agrees to parse it.
 *
 * @param part    the package part holding the embedded file
 * @param handler handler that receives the embedded document's content
 * @param rel     relationship id, stored as the embedded relationship id
 * @throws SAXException if the embedded extractor's content handling fails
 * @throws IOException  if the part's stream cannot be opened, read or closed
 */
protected void handleEmbeddedFile(PackagePart part, ContentHandler handler, String rel) throws SAXException, IOException {
    final Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

    // Resource name: whatever follows the last '/' of the part name
    final String partPath = part.getPartName().getName();
    final int lastSlash = partPath.lastIndexOf('/');
    embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, partPath.substring(lastSlash + 1));

    // Content type as declared by the package part
    embeddedMetadata.set(Metadata.CONTENT_TYPE, part.getContentType());

    // Nothing to do unless the extractor wants this embedded document
    if (!embeddedExtractor.shouldParseEmbedded(embeddedMetadata)) {
        return;
    }

    // Call the recursing handler
    try (TikaInputStream embeddedStream = TikaInputStream.get(part.getInputStream())) {
        embeddedExtractor.parseEmbedded(embeddedStream, new EmbeddedContentHandler(handler), embeddedMetadata, false);
    }
}
use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.
the class Word2006MLParser method parse.
/**
 * Parses the stream with the SAX parser obtained from the parse context and
 * writes the result as XHTML to the given handler.
 * <p>
 * The input is wrapped in a CloseShieldInputStream before it is handed to the
 * SAX parser. The XHTML document is always ended, even when parsing fails.
 *
 * @param stream   the document to parse
 * @param handler  receiver of the XHTML SAX events
 * @param metadata document metadata
 * @param context  parse context supplying the SAX parser
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if starting or ending the XHTML document fails
 * @throws TikaException wrapping any SAXException raised during the SAX parse
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Set OfficeParserConfig if the user hasn't specified one
    configure(context);

    final XHTMLContentHandler xhtmlOut = new XHTMLContentHandler(handler, metadata);
    xhtmlOut.startDocument();
    try {
        final InputStream shielded = new CloseShieldInputStream(stream);
        context.getSAXParser().parse(
                shielded,
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                new Word2006MLDocHandler(xhtmlOut, metadata, context))));
    } catch (SAXException saxFailure) {
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtmlOut.endDocument();
    }
}
use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.
the class AbstractXML2003Parser method parse.
/**
 * Parses the stream with the SAX parser obtained from the parse context and
 * writes the result as XHTML to the given handler.
 * <p>
 * The XHTML output is wrapped in a TaggedContentHandler so that a
 * SAXException caused by that handler is rethrown as-is via
 * {@code throwIfCauseOf}; any other SAXException is wrapped in a
 * TikaException. The XHTML document is always ended, even when parsing fails.
 *
 * @param stream   the document to parse
 * @param handler  receiver of the XHTML SAX events
 * @param metadata document metadata; the content type is set on it first
 * @param context  parse context supplying the SAX parser
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the tagged output handler was the cause of the failure
 * @throws TikaException wrapping any other SAXException raised during the parse
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    setContentType(metadata);

    final XHTMLContentHandler xhtmlOut = new XHTMLContentHandler(handler, metadata);
    xhtmlOut.startDocument();

    final TaggedContentHandler taggedOut = new TaggedContentHandler(xhtmlOut);
    try {
        final InputStream shielded = new CloseShieldInputStream(stream);
        context.getSAXParser().parse(
                shielded,
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                getContentHandler(taggedOut, metadata, context))));
    } catch (SAXException saxFailure) {
        // Failures that originate from the output handler keep their type
        taggedOut.throwIfCauseOf(saxFailure);
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtmlOut.endDocument();
    }
}
use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.
the class OpenDocumentParser method handleZipEntry.
/**
 * Dispatches a single zip entry of an OpenDocument package by its name.
 * <ul>
 * <li>"mimetype": the entry's text (read as UTF-8) becomes the content type;</li>
 * <li>the metadata entry (META_NAME): parsed by the metadata parser;</li>
 * <li>names ending in "content.xml" or "styles.xml": parsed by the content
 * parser, via parseInternal when it is an OpenDocumentContentParser;</li>
 * <li>names containing "Thumbnails/" or "Pictures/": offered to the embedded
 * document extractor;</li>
 * <li>everything else is ignored, as is a null entry.</li>
 * </ul>
 *
 * @param entry    the zip entry to handle; may be null
 * @param zip      stream positioned at the entry's data
 * @param metadata metadata of the containing document
 * @param context  parse context
 * @param handler  handler that receives the parsed content
 * @throws IOException   if the entry cannot be read
 * @throws SAXException  if content handling fails
 * @throws TikaException if one of the delegated parsers fails
 */
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, SAXException, TikaException {
    if (entry == null) {
        return;
    }
    final String entryName = entry.getName();

    if (entryName.equals("mimetype")) {
        metadata.set(Metadata.CONTENT_TYPE, IOUtils.toString(zip, UTF_8));
        return;
    }

    if (entryName.equals(META_NAME)) {
        meta.parse(zip, new DefaultHandler(), metadata, context);
        return;
    }

    // content.xml and styles.xml are handled the same way
    if (entryName.endsWith("content.xml") || entryName.endsWith("styles.xml")) {
        if (content instanceof OpenDocumentContentParser) {
            ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
        } else {
            // Foreign content parser was set:
            content.parse(zip, handler, metadata, context);
        }
        return;
    }

    // Scrape everything under Thumbnails/ and Pictures/; skip anything else
    final boolean isPicture = entryName.contains("Pictures/");
    if (!isPicture && !entryName.contains("Thumbnails/")) {
        return;
    }

    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entryName);
    // Only pictures get an embedded resource type; thumbnails are not tagged.
    if (isPicture) {
        embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
    }
    if (extractor.shouldParseEmbedded(embeddedMetadata)) {
        extractor.parseEmbedded(zip, new EmbeddedContentHandler(handler), embeddedMetadata, false);
    }
}
Aggregations