Use of org.apache.tika.extractor.EmbeddedDocumentExtractor in the Apache Tika project: class OpenDocumentParser, method handleZipEntry.
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
                            ParseContext context, EndDocumentShieldingContentHandler handler)
        throws IOException, SAXException, TikaException {
    if (entry == null) {
        return;
    }
    if (entry.getName().equals("mimetype")) {
        String type = IOUtils.toString(zip, UTF_8);
        metadata.set(Metadata.CONTENT_TYPE, type);
    } else if (entry.getName().equals(META_NAME)) {
        meta.parse(zip, new DefaultHandler(), metadata, context);
    } else if (entry.getName().endsWith("content.xml")) {
        if (content instanceof OpenDocumentContentParser) {
            ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
        } else {
            // Foreign content parser was set:
            content.parse(zip, handler, metadata, context);
        }
    } else if (entry.getName().endsWith("styles.xml")) {
        if (content instanceof OpenDocumentContentParser) {
            ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
        } else {
            // Foreign content parser was set:
            content.parse(zip, handler, metadata, context);
        }
    } else {
        String embeddedName = entry.getName();
        // Scrape everything under Thumbnails/ and Pictures/
        if (embeddedName.contains("Thumbnails/") || embeddedName.contains("Pictures/")) {
            EmbeddedDocumentExtractor embeddedDocumentExtractor =
                    EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
            Metadata embeddedMetadata = new Metadata();
            embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
            /* if (embeddedName.startsWith("Thumbnails/")) {
                embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                        TikaCoreProperties.EmbeddedResourceType.THUMBNAIL);
            } */
            if (embeddedName.contains("Pictures/")) {
                embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE,
                        TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
            }
            if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
                embeddedDocumentExtractor.parseEmbedded(
                        zip, new EmbeddedContentHandler(handler), embeddedMetadata, false);
            }
        }
    }
}
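The extractor used above is whatever EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context) finds in the ParseContext, falling back to Tika's default parsing extractor. Below is a minimal sketch, not taken from the Tika sources, of registering a custom EmbeddedDocumentExtractor so that the Thumbnails/ and Pictures/ entries handled above are routed to it; the class name and the file sample.odt are placeholders for this example.

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

public class PictureNameCollector implements EmbeddedDocumentExtractor {

    @Override
    public boolean shouldParseEmbedded(Metadata metadata) {
        // Accept any entry for which the ODF parser recorded a resource name.
        return metadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME) != null;
    }

    @Override
    public void parseEmbedded(InputStream stream, ContentHandler handler,
                              Metadata metadata, boolean outputHtml) throws IOException {
        // For illustration, just report the embedded entry's name instead of parsing it.
        System.out.println("embedded entry: "
                + metadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
    }

    public static void main(String[] args) throws Exception {
        ParseContext context = new ParseContext();
        context.set(EmbeddedDocumentExtractor.class, new PictureNameCollector());
        try (InputStream in = Files.newInputStream(Paths.get("sample.odt"))) {
            new AutoDetectParser().parse(in, new BodyContentHandler(-1), new Metadata(), context);
        }
    }
}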
Use of org.apache.tika.extractor.EmbeddedDocumentExtractor in the Apache Tika project: class BinaryDataHandler, method endPart.
@Override
public void endPart() throws SAXException, TikaException {
    if (hasData()) {
        EmbeddedDocumentExtractor embeddedDocumentExtractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
        Metadata embeddedMetadata = new Metadata();
        try (TikaInputStream stream = TikaInputStream.get(getInputStream())) {
            embeddedDocumentExtractor.parseEmbedded(stream, handler, embeddedMetadata, false);
        } catch (IOException e) {
            throw new TikaException("error in finishing part", e);
        }
        buffer.setLength(0);
    }
}
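Note that endPart() calls parseEmbedded() without first consulting shouldParseEmbedded(), so an extractor that wants to skip these binary parts has to do so inside parseEmbedded() itself. A minimal sketch of such a skipping extractor, assuming it is registered in the ParseContext as in the previous example:

import java.io.IOException;
import java.io.InputStream;

import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.xml.sax.ContentHandler;

public class SkippingEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {

    @Override
    public boolean shouldParseEmbedded(Metadata metadata) {
        return false; // advisory only; as shown above, not every call site checks it
    }

    @Override
    public void parseEmbedded(InputStream stream, ContentHandler handler,
                              Metadata metadata, boolean outputHtml) throws IOException {
        // Deliberately ignore the payload; the caller closes the stream.
    }
}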
Use of org.apache.tika.extractor.EmbeddedDocumentExtractor in the Apache Tika project: class CompressorParser, method parse.
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                  ParseContext context) throws IOException, SAXException, TikaException {
    // The underlying document stream should not be closed when the
    // compressor stream is closed later, so shield it.
    if (stream.markSupported()) {
        stream = new CloseShieldInputStream(stream);
    } else {
        // Ensure that the stream supports the mark feature
        stream = new BufferedInputStream(new CloseShieldInputStream(stream));
    }
    CompressorInputStream cis;
    try {
        CompressorParserOptions options = context.get(CompressorParserOptions.class,
                new CompressorParserOptions() {
                    public boolean decompressConcatenated(Metadata metadata) {
                        return false;
                    }
                });
        CompressorStreamFactory factory =
                new CompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb);
        cis = factory.createCompressorInputStream(stream);
    } catch (CompressorException e) {
        if (e.getCause() != null && e.getCause() instanceof MemoryLimitException) {
            throw new TikaMemoryLimitException(e.getMessage());
        }
        throw new TikaException("Unable to uncompress document stream", e);
    }
    MediaType type = getMediaType(cis);
    if (!type.equals(MediaType.OCTET_STREAM)) {
        metadata.set(CONTENT_TYPE, type.toString());
    }
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try {
        Metadata entrydata = new Metadata();
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (name != null) {
            if (name.endsWith(".tbz")) {
                name = name.substring(0, name.length() - 4) + ".tar";
            } else if (name.endsWith(".tbz2")) {
                name = name.substring(0, name.length() - 5) + ".tar";
            } else if (name.endsWith(".bz")) {
                name = name.substring(0, name.length() - 3);
            } else if (name.endsWith(".bz2")) {
                name = name.substring(0, name.length() - 4);
            } else if (name.endsWith(".xz")) {
                name = name.substring(0, name.length() - 3);
            } else if (name.endsWith(".zlib")) {
                name = name.substring(0, name.length() - 5);
            } else if (name.endsWith(".pack")) {
                name = name.substring(0, name.length() - 5);
            } else if (name.length() > 0) {
                name = GzipUtils.getUncompressedFilename(name);
            }
            entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
        }
        // Use the delegate parser to parse the compressed document
        EmbeddedDocumentExtractor extractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        if (extractor.shouldParseEmbedded(entrydata)) {
            extractor.parseEmbedded(cis, xhtml, entrydata, true);
        }
    } finally {
        cis.close();
    }
    xhtml.endDocument();
}
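The CompressorParserOptions lookup above defaults to not decompressing concatenated streams. A usage sketch, not from the Tika sources, that overrides this through the ParseContext; the path logs.gz is a placeholder.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pkg.CompressorParserOptions;
import org.apache.tika.sax.BodyContentHandler;

public class ConcatenatedGzipExample {
    public static void main(String[] args) throws Exception {
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        // Make the delegate parser available for the decompressed payload.
        context.set(Parser.class, parser);
        context.set(CompressorParserOptions.class, new CompressorParserOptions() {
            @Override
            public boolean decompressConcatenated(Metadata metadata) {
                return true;
            }
        });
        try (InputStream in = Files.newInputStream(Paths.get("logs.gz"))) {
            BodyContentHandler text = new BodyContentHandler(-1);
            parser.parse(in, text, new Metadata(), context);
            System.out.println(text);
        }
    }
}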
Use of org.apache.tika.extractor.EmbeddedDocumentExtractor in the Apache Tika project: class PackageParser, method parse.
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                  ParseContext context) throws IOException, SAXException, TikaException {
    // Lazily load the MediaTypeRegistry at parse time. We only want to call
    // getDefaultConfig() once, and we can't load it statically because of the
    // ForkParser.
    TikaConfig config = context.get(TikaConfig.class);
    MediaTypeRegistry mediaTypeRegistry = null;
    if (config != null) {
        mediaTypeRegistry = config.getMediaTypeRegistry();
    } else {
        if (bufferedMediaTypeRegistry == null) {
            // Buffer this for next time.
            synchronized (lock) {
                // Now that we're locked, check again.
                if (bufferedMediaTypeRegistry == null) {
                    bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
                }
            }
        }
        mediaTypeRegistry = bufferedMediaTypeRegistry;
    }
    // Ensure that the stream supports the mark feature
    if (!stream.markSupported()) {
        stream = new BufferedInputStream(stream);
    }
    TemporaryResources tmp = new TemporaryResources();
    ArchiveInputStream ais = null;
    try {
        ArchiveStreamFactory factory =
                context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
        // At the end we want to close the archive stream to release
        // any associated resources, but the underlying document stream
        // should not be closed
        ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
    } catch (StreamingNotSupportedException sne) {
        // Most archive formats work on streams, but a few need files
        if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
            // Rework as a file, and wrap
            stream.reset();
            TikaInputStream tstream = TikaInputStream.get(stream, tmp);
            // Seven Zip supports passwords, was one given?
            String password = null;
            PasswordProvider provider = context.get(PasswordProvider.class);
            if (provider != null) {
                password = provider.getPassword(metadata);
            }
            SevenZFile sevenz;
            if (password == null) {
                sevenz = new SevenZFile(tstream.getFile());
            } else {
                sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
            }
            // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
            ais = new SevenZWrapper(sevenz);
        } else {
            tmp.close();
            throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
        }
    } catch (ArchiveException e) {
        tmp.close();
        throw new TikaException("Unable to unpack document stream", e);
    }
    updateMediaType(ais, mediaTypeRegistry, metadata);
    // Use the delegate parser to parse the contained document
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try {
        ArchiveEntry entry = ais.getNextEntry();
        while (entry != null) {
            if (!entry.isDirectory()) {
                parseEntry(ais, entry, extractor, metadata, xhtml);
            }
            entry = ais.getNextEntry();
        }
    } catch (UnsupportedZipFeatureException zfe) {
        // If it's an encrypted document of unknown password, report as such
        if (zfe.getFeature() == Feature.ENCRYPTION) {
            throw new EncryptedDocumentException(zfe);
        }
        // Otherwise throw the exception
        throw new TikaException("UnsupportedZipFeature", zfe);
    } catch (PasswordRequiredException pre) {
        throw new EncryptedDocumentException(pre);
    } finally {
        ais.close();
        tmp.close();
    }
    xhtml.endDocument();
}
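For a password-protected 7z archive, the StreamingNotSupportedException branch above asks the ParseContext for a PasswordProvider. A minimal sketch, not from the Tika sources, of supplying one; the path archive.7z and the password are placeholders.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.sax.BodyContentHandler;

public class SevenZPasswordExample {
    public static void main(String[] args) throws Exception {
        ParseContext context = new ParseContext();
        context.set(PasswordProvider.class, new PasswordProvider() {
            @Override
            public String getPassword(Metadata metadata) {
                return "secret"; // placeholder password for this sketch
            }
        });
        try (InputStream in = Files.newInputStream(Paths.get("archive.7z"))) {
            BodyContentHandler text = new BodyContentHandler(-1);
            new AutoDetectParser().parse(in, text, new Metadata(), context);
            System.out.println(text);
        }
    }
}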
Use of org.apache.tika.extractor.EmbeddedDocumentExtractor in the Apache Tika project: class AppleSingleFileParser, method parse.
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                  ParseContext context) throws IOException, SAXException, TikaException {
    EmbeddedDocumentExtractor ex = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    short numEntries = readThroughNumEntries(stream);
    long bytesRead = 26;
    List<FieldInfo> fieldInfoList = getSortedFieldInfoList(stream, numEntries);
    bytesRead += 12 * numEntries;
    Metadata embeddedMetadata = new Metadata();
    bytesRead = processFieldEntries(stream, fieldInfoList, embeddedMetadata, bytesRead);
    FieldInfo contentFieldInfo = getContentFieldInfo(fieldInfoList);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    if (contentFieldInfo != null) {
        long diff = contentFieldInfo.offset - bytesRead;
        IOUtils.skipFully(stream, diff);
        if (ex.shouldParseEmbedded(embeddedMetadata)) {
            // TODO: we should probably add a read-limiting wrapper around this
            // stream to ensure that no more than contentFieldInfo.length bytes
            // are read
            ex.parseEmbedded(new CloseShieldInputStream(stream), xhtml, embeddedMetadata, false);
        }
    }
    xhtml.endDocument();
}
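One way the TODO above could be addressed, sketched here as an assumption rather than the actual Tika fix, is a small helper built on commons-io's BoundedInputStream that both caps reads at the declared entry length and keeps the outer stream open. The class and method names are hypothetical.

import java.io.InputStream;

import org.apache.commons.io.input.BoundedInputStream;

final class ReadLimit {

    private ReadLimit() {
    }

    // Returns a view of 'stream' that exposes at most 'length' bytes and does
    // not close the underlying stream when the view is closed.
    static InputStream limit(InputStream stream, long length) {
        BoundedInputStream bounded = new BoundedInputStream(stream, length);
        bounded.setPropagateClose(false);
        return bounded;
    }
}

Inside the method above, ex.parseEmbedded(...) could then be handed ReadLimit.limit(stream, contentFieldInfo.length) in place of new CloseShieldInputStream(stream).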