use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class PackageParser method parse.
/**
 * Opens the given stream as an archive and passes every non-directory
 * entry to {@code parseEntry}, writing the result as XHTML to the handler.
 * <p>
 * Most formats are read directly from the stream. Seven Zip cannot be
 * streamed, so the document is reworked as a file first and opened with a
 * password if a {@link PasswordProvider} in the context supplies one.
 * <p>
 * The archive stream and the temporary resources are released on every
 * exit path, including failures while opening the archive or before the
 * first entry is read.
 *
 * @param stream   the document stream; wrapped in a buffer if it does not support mark
 * @param handler  receives the XHTML SAX events
 * @param metadata document metadata; passed to the password provider and {@code updateMediaType}
 * @param context  consulted for TikaConfig, ArchiveStreamFactory, PasswordProvider
 *                 and the embedded document extractor
 * @throws IOException   on a read failure or a failure closing resources
 * @throws SAXException  if the handler fails
 * @throws TikaException if the archive cannot be unpacked, uses an unsupported
 *                       feature, or is encrypted (EncryptedDocumentException)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    //lazily load the MediaTypeRegistry at parse time
    //only want to call getDefaultConfig() once, and can't
    //load statically because of the ForkParser
    TikaConfig config = context.get(TikaConfig.class);
    MediaTypeRegistry mediaTypeRegistry = null;
    if (config != null) {
        mediaTypeRegistry = config.getMediaTypeRegistry();
    } else {
        if (bufferedMediaTypeRegistry == null) {
            //buffer this for next time.
            synchronized (lock) {
                //now that we're locked, check again
                if (bufferedMediaTypeRegistry == null) {
                    bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
                }
            }
        }
        mediaTypeRegistry = bufferedMediaTypeRegistry;
    }
    // Ensure that the stream supports the mark feature
    if (!stream.markSupported()) {
        stream = new BufferedInputStream(stream);
    }
    TemporaryResources tmp = new TemporaryResources();
    ArchiveInputStream ais = null;
    XHTMLContentHandler xhtml;
    // Everything that can fail after tmp exists runs inside this try, so
    // the finally below is the single place where resources are released.
    try {
        try {
            ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
            // At the end we want to close the archive stream to release
            // any associated resources, but the underlying document stream
            // should not be closed
            ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
        } catch (StreamingNotSupportedException sne) {
            // Most archive formats work on streams, but a few need files
            if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
                // Rework as a file, and wrap
                stream.reset();
                TikaInputStream tstream = TikaInputStream.get(stream, tmp);
                // Seven Zip supports passwords, was one given?
                String password = null;
                PasswordProvider provider = context.get(PasswordProvider.class);
                if (provider != null) {
                    password = provider.getPassword(metadata);
                }
                SevenZFile sevenz;
                if (password == null) {
                    sevenz = new SevenZFile(tstream.getFile());
                } else {
                    sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
                }
                // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
                ais = new SevenZWrapper(sevenz);
            } else {
                throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
            }
        } catch (ArchiveException e) {
            throw new TikaException("Unable to unpack document stream", e);
        }
        updateMediaType(ais, mediaTypeRegistry, metadata);
        // Use the delegate parser to parse the contained document
        EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        try {
            ArchiveEntry entry = ais.getNextEntry();
            while (entry != null) {
                if (!entry.isDirectory()) {
                    parseEntry(ais, entry, extractor, metadata, xhtml);
                }
                entry = ais.getNextEntry();
            }
        } catch (UnsupportedZipFeatureException zfe) {
            // If it's an encrypted document of unknown password, report as such
            if (zfe.getFeature() == Feature.ENCRYPTION) {
                throw new EncryptedDocumentException(zfe);
            }
            // Otherwise throw the exception
            throw new TikaException("UnsupportedZipFeature", zfe);
        } catch (PasswordRequiredException pre) {
            throw new EncryptedDocumentException(pre);
        }
    } finally {
        // Nested so that a failing archive close cannot skip the
        // temporary-resource cleanup.
        try {
            if (ais != null) {
                ais.close();
            }
        } finally {
            tmp.close();
        }
    }
    xhtml.endDocument();
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class PackageParser method parseEntry.
/**
 * Handles a single archive entry.
 * <p>
 * If the archive can read the entry's data, the entry metadata is built
 * and, when the extractor wants it, the entry is parsed as an embedded
 * document. Otherwise an exception is recorded on the parent metadata
 * (encrypted zip entries and any non-zip entry) and the entry name, if
 * non-empty, is written as a paragraph.
 *
 * @param archive        the archive stream, positioned at the entry
 * @param entry          the entry to handle
 * @param extractor      decides whether to parse, and parses, the embedded entry
 * @param parentMetadata metadata of the containing document; receives recorded exceptions
 * @param xhtml          output handler
 */
private void parseEntry(ArchiveInputStream archive, ArchiveEntry entry, EmbeddedDocumentExtractor extractor, Metadata parentMetadata, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    String entryName = entry.getName();

    // Unreadable entry: record why (where known), emit its name, and stop.
    if (!archive.canReadEntryData(entry)) {
        String label = (entryName != null) ? entryName : "";
        if (!(entry instanceof ZipArchiveEntry)) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(new TikaException("Can't read archive stream (" + label + ")"), parentMetadata);
        } else if (((ZipArchiveEntry) entry).getGeneralPurposeBit().usesEncryption()) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(new EncryptedDocumentException("stream (" + label + ") is encrypted"), parentMetadata);
        }
        if (!label.isEmpty()) {
            xhtml.element("p", label);
        }
        return;
    }

    // Fetch the metadata on the entry contained in the archive
    Metadata entryMetadata = handleEntryMetadata(entryName, null, entry.getLastModifiedDate(), entry.getSize(), xhtml);

    // Recurse into the entry only if desired
    if (!extractor.shouldParseEmbedded(entryMetadata)) {
        return;
    }

    // For detectors to work, we need a mark/reset supporting
    // InputStream, which ArchiveInputStream isn't, so wrap
    TemporaryResources spool = new TemporaryResources();
    try {
        TikaInputStream wrapped = TikaInputStream.get(archive, spool);
        extractor.parseEmbedded(wrapped, xhtml, entryMetadata, true);
    } finally {
        spool.dispose();
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class RTFEmbObjHandler method extractObj.
/**
 * Parses an embedded object held in memory, if the embedded document
 * utility says it should be parsed.
 * <p>
 * Sets the content length on the metadata, and assigns a generated
 * resource name ({@code thumbnail_N} or {@code file_N} plus the detected
 * extension) when none is present. An {@link IOException} raised while
 * parsing the embedded document is recorded on the metadata rather than
 * rethrown.
 *
 * @param bytes    the embedded object's bytes; {@code null} is ignored
 * @param handler  parent handler, wrapped in an EmbeddedContentHandler
 * @param metadata metadata for the embedded object; updated in place
 */
private void extractObj(byte[] bytes, ContentHandler handler, Metadata metadata) throws SAXException, IOException, TikaException {
    if (bytes == null) {
        return;
    }
    metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length));
    if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
        return;
    }
    TikaInputStream stream = TikaInputStream.get(bytes);
    // The try starts right after the stream is created, so it is closed
    // even if name detection below fails.
    try {
        if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) {
            String extension = embeddedDocumentUtil.getExtension(stream, metadata);
            if (inObject && state == EMB_STATE.PICT) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, "thumbnail_" + thumbCount++ + extension);
                metadata.set(RTFMetadata.THUMBNAIL, "true");
            } else {
                metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + extension);
            }
        }
        // Only IOExceptions from the embedded parse are recorded and
        // swallowed; failures above still propagate to the caller.
        try {
            embeddedDocumentUtil.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
        } catch (IOException e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        }
    } finally {
        stream.close();
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class PDFParser method parse.
/**
 * Loads the PDF, records its metadata, runs the access check and, when a
 * handler is supplied, writes its content via the XFA, OCR or text
 * extraction path chosen by the {@link PDFParserConfig}.
 * <p>
 * An invalid password marks the document as encrypted in the metadata and
 * surfaces as an EncryptedDocumentException. The loaded document is always
 * closed.
 *
 * @param stream   the PDF stream; a file-backed TikaInputStream is loaded from its file
 * @param handler  receives the extracted content; may be {@code null} to skip extraction
 * @param metadata populated with encryption flag, content type and document metadata
 * @param context  consulted for the PDFParserConfig and the password
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
    PDDocument document = null;
    try {
        // PDFBox can process entirely in memory, or can use a temp file
        // for unpacked / processed resources.
        // Decide which to do based on if we're reading from a file or not already
        //TODO: make this configurable via MemoryUsageSetting
        TikaInputStream tis = TikaInputStream.cast(stream);
        String password = getPassword(metadata, context);
        boolean fileBacked = tis != null && tis.hasFile();
        if (fileBacked) {
            // File based -- send file directly to PDFBox
            document = PDDocument.load(tis.getPath().toFile(), password);
        } else {
            document = PDDocument.load(new CloseShieldInputStream(stream), password);
        }

        metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(document.isEncrypted()));
        metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
        extractMetadata(document, metadata, context);
        localConfig.getAccessChecker().check(metadata);

        // No handler means metadata only; the finally block still closes the document.
        if (handler == null) {
            return;
        }
        if (shouldHandleXFAOnly(document, localConfig)) {
            handleXFAOnly(document, handler, metadata, context);
            return;
        }
        if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
            metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
            OCR2XHTML.process(document, handler, context, metadata, localConfig);
            return;
        }
        if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
            metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
        }
        PDF2XHTML.process(document, handler, context, metadata, localConfig);
    } catch (InvalidPasswordException e) {
        metadata.set(PDF.IS_ENCRYPTED, "true");
        throw new EncryptedDocumentException(e);
    } finally {
        if (document != null) {
            document.close();
        }
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class CommonsDigester method digest.
/**
 * Computes the configured digests for the stream and stores them via
 * {@code digestEach} / {@code digestFile}.
 * <p>
 * Strategy:
 * <ol>
 *   <li>If the stream is a file-backed TikaInputStream whose length
 *       exceeds {@code markLimit}, digest the underlying file directly.</li>
 *   <li>Otherwise digest in memory using mark/reset, bounded by
 *       {@code markLimit}.</li>
 *   <li>If the bound was hit before the stream ended, spool to a file
 *       and digest that instead.</li>
 * </ol>
 *
 * @param is           the stream to digest
 * @param m            metadata passed to the digest helpers
 * @param parseContext unused here
 * @throws IOException on a read failure, or wrapping a TikaException raised
 *                     while disposing of temporary resources
 */
@Override
public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
    TikaInputStream tis = TikaInputStream.cast(is);
    //just digest the underlying file if it is already on disk
    //and too big for mark/reset.
    if (tis != null && tis.hasFile() && tis.getLength() > markLimit) {
        digestFile(tis.getFile(), m);
        return;
    }
    //try the usual mark/reset stuff.
    //however, if you actually hit the bound,
    //then stop and spool to file via TikaInputStream
    SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, is);
    boolean finishedStream = false;
    for (DigestAlgorithm algorithm : algorithms) {
        bis.mark(markLimit + 1);
        finishedStream = digestEach(algorithm, bis, m);
        bis.reset();
        if (!finishedStream) {
            break;
        }
    }
    if (finishedStream) {
        return;
    }
    //spool to File and digest that.
    if (tis != null) {
        digestFile(tis.getFile(), m);
        return;
    }
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tmpTikaInputStream = TikaInputStream.get(is, tmp);
        digestFile(tmpTikaInputStream.getFile(), m);
    } finally {
        try {
            tmp.dispose();
        } catch (TikaException e) {
            throw new IOExceptionWithCause(e);
        }
    }
}
Aggregations