Use of org.apache.tika.io.TemporaryResources in project tika by apache.
The class TesseractOCRParser, method parse.
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    FileOutputStream fos = null;
    TikaInputStream tis = null;
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
        // Spool the image to a temporary PNG file so it can be handed to
        // the stream-based parse() overload. Note that, as written, the
        // source image is never drawn onto bImage, so the spooled PNG
        // contains only blank pixels.
        File file = tmp.createTemporaryFile();
        fos = new FileOutputStream(file);
        ImageIO.write(bImage, "png", fos);
        tis = TikaInputStream.get(file);
        parse(tis, handler, metadata, context);
    } finally {
        // dispose() deletes the temporary file; the streams still need
        // to be closed explicitly.
        tmp.dispose();
        if (tis != null) {
            tis.close();
        }
        if (fos != null) {
            fos.close();
        }
    }
}
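TemporaryResources tracks resources for deferred cleanup: createTemporaryFile() hands back a file that a later dispose() deletes. Below is a minimal standalone sketch of that lifecycle; the class name and file contents are illustrative, not from the Tika sources.

import java.io.File;
import java.io.FileOutputStream;
import org.apache.tika.io.TemporaryResources;

public class TempFileExample {
    public static void main(String[] args) throws Exception {
        TemporaryResources tmp = new TemporaryResources();
        try {
            // The file is tracked by tmp and deleted on dispose()
            File scratch = tmp.createTemporaryFile();
            try (FileOutputStream fos = new FileOutputStream(scratch)) {
                fos.write("scratch data".getBytes("UTF-8"));
            }
            System.out.println("Wrote " + scratch.length() + " bytes to " + scratch);
        } finally {
            tmp.dispose(); // deletes the temporary file
        }
    }
}

Note that dispose() can throw TikaException if cleanup fails, which is why the CommonsDigester example further down wraps the call and rethrows as an IOException.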
Use of org.apache.tika.io.TemporaryResources in project tika by apache.
The class PackageParser, method parse.
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // Lazily load the MediaTypeRegistry at parse time. We only want to
    // call getDefaultConfig() once, and we can't load it statically
    // because of the ForkParser.
    TikaConfig config = context.get(TikaConfig.class);
    MediaTypeRegistry mediaTypeRegistry = null;
    if (config != null) {
        mediaTypeRegistry = config.getMediaTypeRegistry();
    } else {
        if (bufferedMediaTypeRegistry == null) {
            // Buffer this for next time.
            synchronized (lock) {
                // Now that we're locked, check again.
                if (bufferedMediaTypeRegistry == null) {
                    bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
                }
            }
        }
        mediaTypeRegistry = bufferedMediaTypeRegistry;
    }
    // Ensure that the stream supports the mark feature
    if (!stream.markSupported()) {
        stream = new BufferedInputStream(stream);
    }
    TemporaryResources tmp = new TemporaryResources();
    ArchiveInputStream ais = null;
    try {
        ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
        // At the end we want to close the archive stream to release
        // any associated resources, but the underlying document stream
        // should not be closed.
        ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
    } catch (StreamingNotSupportedException sne) {
        // Most archive formats work on streams, but a few need files
        if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
            // Rework as a file, and wrap
            stream.reset();
            TikaInputStream tstream = TikaInputStream.get(stream, tmp);
            // Seven Zip supports passwords: was one given?
            String password = null;
            PasswordProvider provider = context.get(PasswordProvider.class);
            if (provider != null) {
                password = provider.getPassword(metadata);
            }
            SevenZFile sevenz;
            if (password == null) {
                sevenz = new SevenZFile(tstream.getFile());
            } else {
                sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
            }
            // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
            ais = new SevenZWrapper(sevenz);
        } else {
            tmp.close();
            throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
        }
    } catch (ArchiveException e) {
        tmp.close();
        throw new TikaException("Unable to unpack document stream", e);
    }
    updateMediaType(ais, mediaTypeRegistry, metadata);
    // Use the delegate parser to parse the contained document
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try {
        ArchiveEntry entry = ais.getNextEntry();
        while (entry != null) {
            if (!entry.isDirectory()) {
                parseEntry(ais, entry, extractor, metadata, xhtml);
            }
            entry = ais.getNextEntry();
        }
    } catch (UnsupportedZipFeatureException zfe) {
        // If it's an encrypted document of unknown password, report as such
        if (zfe.getFeature() == Feature.ENCRYPTION) {
            throw new EncryptedDocumentException(zfe);
        }
        // Otherwise rethrow as a TikaException
        throw new TikaException("UnsupportedZipFeature", zfe);
    } catch (PasswordRequiredException pre) {
        throw new EncryptedDocumentException(pre);
    } finally {
        ais.close();
        tmp.close();
    }
    xhtml.endDocument();
}
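The lazy MediaTypeRegistry lookup at the top of this method is the double-checked locking idiom: an unsynchronized first check, then a second check under the lock before the one-time getDefaultConfig() call. Below is a minimal standalone sketch of the same pattern; the class and field names are illustrative, and note that for the idiom to be safe under the Java memory model the cached field must be declared volatile.

import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MediaTypeRegistry;

public class RegistryCache {
    private static final Object LOCK = new Object();
    // volatile is what makes double-checked locking safe: it guarantees
    // a fully constructed registry is visible to other threads
    private static volatile MediaTypeRegistry cached;

    public static MediaTypeRegistry get() {
        if (cached == null) {             // first check, no locking
            synchronized (LOCK) {
                if (cached == null) {     // second check, under the lock
                    cached = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
                }
            }
        }
        return cached;
    }
}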
Use of org.apache.tika.io.TemporaryResources in project tika by apache.
The class PackageParser, method parseEntry.
private void parseEntry(ArchiveInputStream archive, ArchiveEntry entry, EmbeddedDocumentExtractor extractor,
        Metadata parentMetadata, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    String name = entry.getName();
    if (archive.canReadEntryData(entry)) {
        // Fetch the metadata on the entry contained in the archive
        Metadata entrydata = handleEntryMetadata(name, null, entry.getLastModifiedDate(), entry.getSize(), xhtml);
        // Recurse into the entry if desired
        if (extractor.shouldParseEmbedded(entrydata)) {
            // For detectors to work, we need a mark/reset supporting
            // InputStream, which ArchiveInputStream isn't, so wrap
            TemporaryResources tmp = new TemporaryResources();
            try {
                TikaInputStream tis = TikaInputStream.get(archive, tmp);
                extractor.parseEmbedded(tis, xhtml, entrydata, true);
            } finally {
                tmp.dispose();
            }
        }
    } else {
        name = (name == null) ? "" : name;
        if (entry instanceof ZipArchiveEntry) {
            boolean usesEncryption = ((ZipArchiveEntry) entry).getGeneralPurposeBit().usesEncryption();
            if (usesEncryption) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(
                        new EncryptedDocumentException("stream (" + name + ") is encrypted"), parentMetadata);
            }
        } else {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(
                    new TikaException("Can't read archive stream (" + name + ")"), parentMetadata);
        }
        if (name.length() > 0) {
            xhtml.element("p", name);
        }
    }
}
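parseEntry delegates the recurse-or-skip decision to the EmbeddedDocumentExtractor pulled from the ParseContext. Below is a minimal sketch of a custom extractor that accepts every entry and parses it with AutoDetectParser; the class name and the printing behavior are illustrative, and real implementations usually write into the supplied ContentHandler (as Tika's own ParsingEmbeddedDocumentExtractor does) rather than printing.

import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

public class PrintingExtractor implements EmbeddedDocumentExtractor {
    private final AutoDetectParser parser = new AutoDetectParser();

    @Override
    public boolean shouldParseEmbedded(Metadata metadata) {
        return true; // recurse into every archive entry
    }

    @Override
    public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)
            throws SAXException, IOException {
        // Ignores the supplied handler and collects plain text instead;
        // -1 disables BodyContentHandler's write limit
        BodyContentHandler text = new BodyContentHandler(-1);
        try {
            parser.parse(stream, text, metadata, new ParseContext());
        } catch (TikaException e) {
            throw new IOException(e);
        }
        System.out.println(metadata.get(Metadata.RESOURCE_NAME_KEY) + ": " + text);
    }
}

A caller would register it with parseContext.set(EmbeddedDocumentExtractor.class, new PrintingExtractor()) before invoking the PackageParser, which is where EmbeddedDocumentUtil.getEmbeddedDocumentExtractor looks it up.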
Use of org.apache.tika.io.TemporaryResources in project tika by apache.
The class CommonsDigester, method digest.
@Override
public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
    TikaInputStream tis = TikaInputStream.cast(is);
    if (tis != null && tis.hasFile()) {
        long sz = -1;
        if (tis.hasFile()) {
            sz = tis.getLength();
        }
        // If the file is larger than the mark limit,
        // just digest the underlying file directly.
        if (sz > markLimit) {
            digestFile(tis.getFile(), m);
            return;
        }
    }
    // Try the usual mark/reset approach; however, if we actually hit
    // the bound, stop and spool to a file via TikaInputStream instead.
    SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, is);
    boolean finishedStream = false;
    for (DigestAlgorithm algorithm : algorithms) {
        bis.mark(markLimit + 1);
        finishedStream = digestEach(algorithm, bis, m);
        bis.reset();
        if (!finishedStream) {
            break;
        }
    }
    // The bound was hit: spool the stream to a file and digest that.
    if (!finishedStream) {
        if (tis != null) {
            digestFile(tis.getFile(), m);
        } else {
            TemporaryResources tmp = new TemporaryResources();
            try {
                TikaInputStream tmpTikaInputStream = TikaInputStream.get(is, tmp);
                digestFile(tmpTikaInputStream.getFile(), m);
            } finally {
                try {
                    tmp.dispose();
                } catch (TikaException e) {
                    throw new IOExceptionWithCause(e);
                }
            }
        }
    }
}
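The loop above digests the first markLimit bytes once per algorithm, rewinding with mark/reset between passes so the stream is read only once from the source. Below is a minimal standalone sketch of the same idea using java.security.MessageDigest directly; the class name and buffer size are illustrative.

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class MarkResetDigest {
    // Hash the first markLimit bytes of the stream with each algorithm,
    // using mark/reset to rewind between passes.
    public static void digestAll(InputStream raw, int markLimit, String... algorithms)
            throws IOException, NoSuchAlgorithmException {
        InputStream is = raw.markSupported() ? raw : new BufferedInputStream(raw);
        byte[] buf = new byte[8192];
        for (String algorithm : algorithms) {
            is.mark(markLimit + 1);
            MessageDigest digest = MessageDigest.getInstance(algorithm);
            int remaining = markLimit;
            int n;
            while (remaining > 0 && (n = is.read(buf, 0, Math.min(buf.length, remaining))) != -1) {
                digest.update(buf, 0, n);
                remaining -= n;
            }
            System.out.println(algorithm + ": " + toHex(digest.digest()));
            is.reset();
        }
    }

    private static String toHex(byte[] bytes) {
        StringBuilder sb = new StringBuilder();
        for (byte b : bytes) {
            sb.append(String.format("%02x", b));
        }
        return sb.toString();
    }
}

For example, digestAll(stream, 1_000_000, "MD5", "SHA-256") hashes the first megabyte twice without re-reading the source.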
Use of org.apache.tika.io.TemporaryResources in project tika by apache.
The class PooledTimeSeriesParser, method parse.
/**
* Parses a document stream into a sequence of XHTML SAX events. Fills in
* related document metadata in the given metadata object.
* <p>
* The given document stream is consumed but not closed by this method. The
* responsibility to close the stream remains on the caller.
* <p>
* Information about the parsing context can be passed in the context
* parameter. See the parser implementations for the kinds of context
* information they expect.
*
* @param stream the document stream (input)
* @param handler handler for the XHTML SAX events (output)
* @param metadata document metadata (input and output)
* @param context parse context
* @throws IOException if the document stream could not be read
* @throws SAXException if the SAX events could not be processed
* @throws TikaException if the document could not be parsed
* @since Apache Tika 0.5
*/
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    if (!isAvailable) {
        LOG.warn("PooledTimeSeries not installed!");
        return;
    }
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    TemporaryResources tmp = new TemporaryResources();
    try {
        // Spool the stream to a file so the external PoT tool can read it
        TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
        File input = tikaStream.getFile();
        String cmdOutput = computePoT(input);
        try (InputStream ofStream = new FileInputStream(new File(input.getAbsoluteFile() + ".of.txt"))) {
            try (InputStream ogStream = new FileInputStream(new File(input.getAbsoluteFile() + ".hog.txt"))) {
                extractHeaderOutput(ofStream, metadata, "of");
                extractHeaderOutput(ogStream, metadata, "og");
                xhtml.startDocument();
                doExtract(ofStream, xhtml, "Histogram of Optical Flows (HOF)",
                        metadata.get("of_frames"), metadata.get("of_vecSize"));
                doExtract(ogStream, xhtml, "Histogram of Oriented Gradients (HOG)",
                        metadata.get("og_frames"), metadata.get("og_vecSize"));
                xhtml.endDocument();
            }
        }
        // Temporary workaround for TIKA-1445 - until we can specify
        // composite parsers with strategies (e.g. Composite, Try In Turn),
        // always send the image onwards to the regular parser to have
        // the metadata for it extracted as well
        _TMP_VIDEO_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
    } finally {
        tmp.dispose();
    }
}
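The TikaInputStream.get(stream, tmp) plus getFile() pattern that recurs throughout these examples spools the stream's contents to a temporary file registered with the TemporaryResources, so that dispose() deletes it afterwards. Below is a minimal standalone sketch of just that pattern; the class name and input path are illustrative.

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;

public class SpoolExample {
    public static void main(String[] args) throws Exception {
        try (InputStream stream = new FileInputStream("/tmp/input.bin")) {
            TemporaryResources tmp = new TemporaryResources();
            try {
                // getFile() forces the stream's contents into a temporary
                // file, which tools that need a java.io.File can then use
                TikaInputStream tis = TikaInputStream.get(stream, tmp);
                File spooled = tis.getFile();
                System.out.println("Spooled to " + spooled + " (" + spooled.length() + " bytes)");
            } finally {
                // Deletes the spooled file and closes tracked resources
                tmp.dispose();
            }
        }
    }
}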