use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.
the class RTFEmbObjHandler method extractObj.
/**
 * Passes the raw bytes of one embedded object to the embedded-document parser.
 *
 * <p>The content length is always recorded on {@code metadata}. If no resource
 * name is set yet, one is synthesised: {@code thumbnail_<n><ext>} (and the
 * thumbnail flag is set) when currently inside an object in the PICT state,
 * otherwise {@code file_<n><ext>}.
 *
 * @param bytes    embedded object content; {@code null} makes this a no-op
 * @param handler  handler that receives the embedded document's SAX events
 * @param metadata metadata for the embedded object; updated in place
 * @throws SAXException  propagated from the embedded parse
 * @throws IOException   if closing the stream fails (an IOException thrown by the
 *                       embedded parse itself is recorded on the metadata instead)
 * @throws TikaException propagated from the embedded parse
 */
private void extractObj(byte[] bytes, ContentHandler handler, Metadata metadata) throws SAXException, IOException, TikaException {
    if (bytes == null) {
        return;
    }
    metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length));
    if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
        return;
    }
    // try-with-resources covers the naming step too, so the stream is released
    // even when extension detection or a metadata update throws.
    try (TikaInputStream stream = TikaInputStream.get(bytes)) {
        if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) {
            String extension = embeddedDocumentUtil.getExtension(stream, metadata);
            if (inObject && state == EMB_STATE.PICT) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_" + thumbCount++ + extension);
                metadata.set(RTFMetadata.THUMBNAIL, "true");
            } else {
                metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + extension);
            }
        }
        try {
            embeddedDocumentUtil.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
        } catch (IOException e) {
            // A broken embedded stream must not abort the parent document: record and continue.
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        }
    }
}
use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.
the class DIFParser method parse.
/**
 * Parses the stream with the SAX parser supplied by {@code context}, emitting the
 * result inside a single XHTML {@code <p>} element.
 *
 * <p>The paragraph and document are always closed in the {@code finally} block,
 * whether or not the XML parse succeeds.
 *
 * @param stream   input to parse; wrapped in a {@code CloseShieldInputStream}
 *                 (presumably so the SAX parser cannot close the caller's stream)
 * @param handler  receiver of the generated XHTML events
 * @param metadata document metadata, passed to the XHTML and content handlers
 * @param context  parse context providing the SAX parser
 * @throws IOException   propagated from reading the stream
 * @throws SAXException  propagated from the XHTML output calls, or rethrown by
 *                       {@code throwIfCauseOf} for a failure it attributes to
 *                       the tagged handler
 * @throws TikaException with message "XML parse error" for any other SAX failure
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final XHTMLContentHandler body = new XHTMLContentHandler(handler, metadata);
    body.startDocument();
    body.startElement("p");
    TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                getContentHandler(taggedHandler, metadata, context))));
    } catch (SAXException e) {
        // Let the tagged handler rethrow its own failures first; anything left
        // is reported as an XML parse error.
        taggedHandler.throwIfCauseOf(e);
        throw new TikaException("XML parse error", e);
    } finally {
        body.endElement("p");
        body.endDocument();
    }
}
use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.
the class OfficeParser method parse.
/**
 * Extracts metadata and text from an OLE2 document rooted at {@code root}.
 *
 * <p>Summary metadata is parsed first, then the document type is detected and the
 * matching extractor is invoked. Encrypted documents are decrypted and the
 * resulting stream is handed to the OOXML parser.
 *
 * @param root     root directory node of the OLE2 container
 * @param context  parse context; consulted for a {@code Locale} and a {@code PasswordProvider}
 * @param metadata document metadata, populated in place
 * @param xhtml    handler that receives the extracted content
 * @throws IOException   propagated from the extractors or from closing the decrypted stream
 * @throws SAXException  propagated from the extractors
 * @throws TikaException propagated from the extractors; an {@code EncryptedDocumentException}
 *                       is thrown when the password does not verify or decryption fails
 */
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    // Parse summary entries first, to make metadata available early
    new SummaryExtractor(metadata).parseSummaries(root);
    // Parse remaining document entries
    POIFSDocumentType type = POIFSDocumentType.detectType(root);
    if (type != POIFSDocumentType.UNKNOWN) {
        setType(metadata, type.getType());
    }
    switch (type) {
        case SOLIDWORKS_PART:
        case SOLIDWORKS_ASSEMBLY:
        case SOLIDWORKS_DRAWING:
            // No content extraction is performed here for these types
            break;
        case PUBLISHER:
            PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root);
            xhtml.element("p", publisherTextExtractor.getText());
            break;
        case WORDDOCUMENT:
            new WordExtractor(context, metadata).parse(root, xhtml);
            break;
        case POWERPOINT:
            new HSLFExtractor(context, metadata).parse(root, xhtml);
            break;
        case WORKBOOK:
        case XLR:
            Locale locale = context.get(Locale.class, Locale.getDefault());
            new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
            break;
        case PROJECT:
            // We currently can't do anything beyond the metadata
            break;
        case VISIO:
            VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root);
            for (String text : visioTextExtractor.getAllText()) {
                xhtml.element("p", text);
            }
            break;
        case OUTLOOK:
            OutlookExtractor extractor = new OutlookExtractor(root, context);
            extractor.parse(xhtml, metadata);
            break;
        case ENCRYPTED:
            EncryptionInfo info = new EncryptionInfo(root);
            Decryptor d = Decryptor.getInstance(info);
            try {
                // By default, use the default Office Password
                String password = Decryptor.DEFAULT_PASSWORD;
                // If they supplied a Password Provider, ask that for the password,
                // and use the provider given one if available (stick with default if not)
                PasswordProvider passwordProvider = context.get(PasswordProvider.class);
                if (passwordProvider != null) {
                    String suppliedPassword = passwordProvider.getPassword(metadata);
                    if (suppliedPassword != null) {
                        password = suppliedPassword;
                    }
                }
                // Check if we've the right password or not
                if (!d.verifyPassword(password)) {
                    throw new EncryptedDocumentException();
                }
                // Decrypt the OLE2 stream, and delegate the resulting OOXML
                // file to the regular OOXML parser for normal handling.
                // The decrypted stream is opened here, so it is closed here too.
                OOXMLParser parser = new OOXMLParser();
                try (java.io.InputStream decrypted = d.getDataStream(root)) {
                    parser.parse(decrypted, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata, context);
                }
            } catch (GeneralSecurityException ex) {
                throw new EncryptedDocumentException(ex);
            }
            // Explicit break: do not rely on falling through into default
            break;
        default:
            // For any other type only the metadata is extracted, which happened above
            break;
    }
}
use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.
the class AbstractOOXMLExtractor method handleThumbnail.
/**
 * Emits a placeholder {@code <div class="embedded">} for every thumbnail
 * relationship of the package and, if the embedded extractor accepts it, parses
 * the thumbnail as an embedded document.
 *
 * <p>Thumbnail handling is best effort: any failure is swallowed so that it
 * cannot break extraction of the main document.
 *
 * @param handler handler that receives the placeholder element and the
 *                embedded document's SAX events
 */
private void handleThumbnail(ContentHandler handler) {
    try {
        OPCPackage opcPackage = extractor.getPackage();
        for (PackageRelationship rel : opcPackage.getRelationshipsByType(PackageRelationshipTypes.THUMBNAIL)) {
            PackagePart tPart = opcPackage.getPart(rel);
            // try-with-resources: the part stream is closed even if a later step throws
            try (InputStream tStream = tPart.getInputStream()) {
                Metadata thumbnailMetadata = new Metadata();
                String thumbName = tPart.getPartName().getName();
                thumbnailMetadata.set(Metadata.RESOURCE_NAME_KEY, thumbName);
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute(XHTML, "class", "class", "CDATA", "embedded");
                attributes.addAttribute(XHTML, "id", "id", "CDATA", thumbName);
                handler.startElement(XHTML, "div", "div", attributes);
                handler.endElement(XHTML, "div", "div");
                thumbnailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, thumbName);
                thumbnailMetadata.set(Metadata.CONTENT_TYPE, tPart.getContentType());
                thumbnailMetadata.set(TikaCoreProperties.TITLE, thumbName);
                if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
                    embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream), new EmbeddedContentHandler(handler), thumbnailMetadata, false);
                }
            }
        }
    } catch (Exception ignored) {
        // Deliberately ignored: a missing or unreadable thumbnail must not fail the parse.
    }
}
use of org.apache.tika.sax.EmbeddedContentHandler in project tika by apache.
the class EMFParser method handleWMF.
/**
 * Offers the WMF payload carried by an EMF comment record to the embedded
 * document extractor.
 *
 * <p>Metadata tagged with the WMF media type is created for the payload; nothing
 * is parsed if the extractor declines it.
 *
 * @param comment                   comment record supplying the WMF input stream
 * @param contentHandler            handler that receives the embedded document's SAX events
 * @param embeddedDocumentExtractor extractor that decides on and performs the embedded parse
 * @throws IOException   propagated from the embedded parse or from closing the stream
 * @throws SAXException  propagated from the embedded parse
 * @throws TikaException declared by this method; not thrown directly in this body
 */
private void handleWMF(HemfCommentPublic.WindowsMetafile comment, ContentHandler contentHandler, EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException, TikaException {
    Metadata wmfMetadata = new Metadata();
    wmfMetadata.set(Metadata.CONTENT_TYPE, WMF_MEDIA_TYPE.toString());
    if (!embeddedDocumentExtractor.shouldParseEmbedded(wmfMetadata)) {
        return;
    }
    try (InputStream wmfStream = TikaInputStream.get(comment.getWmfInputStream())) {
        embeddedDocumentExtractor.parseEmbedded(
                wmfStream,
                new EmbeddedContentHandler(contentHandler),
                wmfMetadata,
                false);
    }
}
Aggregations