use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class ExternalParser method parse.
/**
 * Executes the configured external command and passes the given document
 * stream as a simple XHTML document to the given SAX content handler.
 * Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)}
 * has been called to set patterns.
 * <p>
 * The incoming stream is wrapped as a {@link TikaInputStream} backed by a
 * fresh {@link TemporaryResources} instance, which is always disposed
 * before this method returns.
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
    final TemporaryResources resources = new TemporaryResources();
    try {
        // Wrap the caller's stream so the delegate can work with a TikaInputStream.
        final TikaInputStream tikaStream = TikaInputStream.get(stream, resources);
        parse(tikaStream, xhtmlHandler, metadata, resources);
    } finally {
        // Release whatever was registered with the temporary resources, success or failure.
        resources.dispose();
    }
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class AutoDetectParser method parse.
/**
 * Detects the media type of the given document, records it in the metadata
 * and then delegates the actual parsing to the superclass.
 * <p>
 * When a content handler is supplied it is wrapped in a
 * {@link SecureContentHandler} (TIKA-216: zip bomb prevention). If the parse
 * context has no {@link EmbeddedDocumentExtractor}, one is installed, and this
 * parser is registered as the context {@link Parser} unless the caller
 * already provided one.
 *
 * @param stream   the document stream to parse
 * @param handler  receiver of the SAX events; may be {@code null}, in which
 *                 case no secure wrapper is created
 * @param metadata document metadata; {@code Metadata.CONTENT_TYPE} is set to
 *                 the detected type
 * @param context  parse context, possibly updated as described above
 * @throws IOException   propagated from detection or the delegated parse
 * @throws SAXException  propagated from the delegated parse
 * @throws TikaException propagated from the delegated parse, the secure
 *                       handler check, or disposal of temporary resources
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        // Automatically detect the MIME type of the document
        MediaType type = detector.detect(tis, metadata);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        // TIKA-216: Zip bomb prevention
        SecureContentHandler sch = handler != null ? new SecureContentHandler(handler, tis) : null;
        // Set up embedded document handling only if the caller hasn't specified an extractor.
        if (context.get(EmbeddedDocumentExtractor.class) == null) {
            Parser p = context.get(Parser.class);
            if (p == null) {
                context.set(Parser.class, this);
            }
            context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context));
        }
        try {
            // Parse the document
            super.parse(tis, sch, metadata, context);
        } catch (SAXException e) {
            // Convert zip bomb exceptions to TikaExceptions.
            // sch is null when no handler was supplied; guard so that a
            // NullPointerException cannot mask the original SAXException.
            if (sch != null) {
                sch.throwIfCauseOf(e);
            }
            throw e;
        }
    } finally {
        tmp.dispose();
    }
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class DigestingParser method parse.
/**
 * Runs the configured digester (if any) over the stream and then hands the
 * same stream to the superclass implementation for parsing.
 * <p>
 * The stream is wrapped as a {@link TikaInputStream} tied to a local
 * {@link TemporaryResources} instance, which is disposed when parsing ends.
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final TemporaryResources resources = new TemporaryResources();
    final TikaInputStream tikaStream = TikaInputStream.get(stream, resources);
    try {
        // Digesting is optional: skip it when no digester was configured.
        if (null != digester) {
            digester.digest(tikaStream, metadata, context);
        }
        super.parse(tikaStream, handler, metadata, context);
    } finally {
        // Always release the temporary resources, whether or not parsing succeeded.
        resources.dispose();
    }
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class ZipContainerDetector method detect.
/**
 * Detects the container or compression format of the given stream.
 * <p>
 * Up to 1024 leading bytes are peeked and matched against the known archive
 * formats. A zip archive is inspected further only when the caller passed a
 * {@link TikaInputStream}; any other recognised archive type is returned
 * directly, and an unrecognised prefix falls back to compressor detection.
 *
 * @param input    the document stream, or {@code null} if unavailable
 * @param metadata document metadata (not consulted by this method)
 * @return the detected type; {@code MediaType.OCTET_STREAM} when
 *         {@code input} is {@code null}
 * @throws IOException propagated from reading the stream or from the
 *                     format-specific detectors
 */
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
    // Without access to the document there is nothing to inspect.
    if (input == null) {
        return MediaType.OCTET_STREAM;
    }

    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream tikaStream = TikaInputStream.get(input, resources);

        // 1024 bytes is enough for all known formats
        final byte[] header = new byte[1024];
        final int available = tikaStream.peek(header);
        final MediaType archiveType = detectArchiveFormat(header, available);

        final boolean zipFromTikaStream =
                PackageParser.isZipArchive(archiveType) && TikaInputStream.isTikaInputStream(input);
        if (zipFromTikaStream) {
            return detectZipFormat(tikaStream);
        }
        if (archiveType.equals(MediaType.OCTET_STREAM)) {
            // Not a recognised archive: try the compressor formats instead.
            return detectCompressorFormat(header, available);
        }
        return archiveType;
    } finally {
        try {
            resources.dispose();
        } catch (TikaException ignored) {
            // Best effort: a cleanup failure must not replace the detection result.
        }
    }
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class NetCDFParser method parse.
/**
 * Parses a NetCDF document: global attributes are copied into the metadata,
 * and the dimensions and variables are written as XHTML lists.
 * <p>
 * The NetCDF library is opened by file path, so the stream is first
 * resolved to a file through {@link TikaInputStream}. Temporary resources
 * are created (and later disposed) here only when the caller did not
 * already pass a {@link TikaInputStream}.
 *
 * @param stream   the document stream to parse
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata receives "File-Type-Description" and the global attributes
 * @param context  parse context (not consulted by this method)
 * @throws IOException   if closing the NetCDF file fails
 * @throws SAXException  if the content handler rejects an event
 * @throws TikaException if reading the NetCDF file fails (wrapping the
 *                       underlying {@link IOException}) or temporary
 *                       resources cannot be disposed
 * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
 *      org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
 *      org.apache.tika.parser.ParseContext)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Only own (and later dispose) temporary resources when the caller did not hand us a TikaInputStream.
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
    TikaInputStream tis = TikaInputStream.get(stream, tmp);
    NetcdfFile ncFile = null;
    try {
        ncFile = NetcdfFile.open(tis.getFile().getAbsolutePath());
        metadata.set("File-Type-Description", ncFile.getFileTypeDescription());
        // first parse out the set of global attributes
        for (Attribute attr : ncFile.getGlobalAttributes()) {
            Property property = resolveMetadataKey(attr.getFullName());
            if (attr.getDataType().isString()) {
                metadata.add(property, attr.getStringValue());
            } else if (attr.getDataType().isNumeric()) {
                // NOTE(review): intValue() looks lossy for floating-point or long
                // attributes; kept as-is to preserve the emitted metadata. Confirm intent.
                int value = attr.getNumericValue().intValue();
                metadata.add(property, String.valueOf(value));
            }
        }
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.newline();
        // List every dimension as "name = length".
        xhtml.element("h1", "dimensions");
        xhtml.startElement("ul");
        xhtml.newline();
        for (Dimension dim : ncFile.getDimensions()) {
            xhtml.element("li", dim.getFullName() + " = " + dim.getLength());
        }
        xhtml.endElement("ul");
        // List every variable, with its attributes as a nested list.
        xhtml.element("h1", "variables");
        xhtml.startElement("ul");
        xhtml.newline();
        for (Variable var : ncFile.getVariables()) {
            xhtml.startElement("li");
            xhtml.characters(var.getDataType() + " " + var.getNameAndDimensions());
            xhtml.newline();
            List<Attribute> attributes = var.getAttributes();
            if (!attributes.isEmpty()) {
                xhtml.startElement("ul");
                for (Attribute element : attributes) {
                    xhtml.element("li", element.toString());
                }
                xhtml.endElement("ul");
            }
            xhtml.endElement("li");
        }
        xhtml.endElement("ul");
        xhtml.endDocument();
    } catch (IOException e) {
        throw new TikaException("NetCDF parse error", e);
    } finally {
        try {
            if (ncFile != null) {
                ncFile.close();
            }
        } finally {
            // Dispose even when close() throws, so temporary resources are never leaked.
            if (tmp != null) {
                tmp.dispose();
            }
        }
    }
}
Aggregations