Use of org.apache.tika.io.TikaInputStream in the Apache Tika project.
Class JackcessParser, method parse.
/**
 * Opens the input as a read-only Jackcess {@code Database} and hands it to a
 * {@code JackcessExtractor}, which writes the extracted content to the XHTML handler.
 *
 * <p>If a {@code PasswordProvider} is present in the parse context, the password it
 * supplies for this document is passed to the {@code CryptCodecProvider}.
 *
 * @param stream   the document stream; it is not closed by this method
 * @param handler  receives the XHTML SAX events
 * @param metadata document metadata, passed to the password provider and the extractor
 * @param context  parse context, consulted for an optional {@code PasswordProvider}
 * @throws IOException   if the database file cannot be read or opened
 * @throws SAXException  if the content handler fails
 * @throws TikaException if the password is rejected ({@code EncryptedDocumentException})
 *                       or temporary resources cannot be disposed
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // getFile() below may spool a plain InputStream to a temporary file. This parser does not
    // close the stream it was given, so it has to own (and dispose) the temporary resources
    // itself unless the caller already supplied a TikaInputStream.
    org.apache.tika.io.TemporaryResources tmp =
            TikaInputStream.isTikaInputStream(stream) ? null : new org.apache.tika.io.TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        Database db = null;
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        String password = null;
        PasswordProvider passwordProvider = context.get(PasswordProvider.class);
        if (passwordProvider != null) {
            password = passwordProvider.getPassword(metadata);
        }
        try {
            // Always install a CryptCodecProvider, even without a password, to ensure an
            // encryption/wrong password exception vs. the more generic
            // "need right codec" error message.
            CryptCodecProvider codecProvider =
                    (password == null) ? new CryptCodecProvider() : new CryptCodecProvider(password);
            db = new DatabaseBuilder(tis.getFile()).setCodecProvider(codecProvider).setReadOnly(true).open();
            //just in case
            db.setLinkResolver(IGNORE_LINK_RESOLVER);
            JackcessExtractor ex = new JackcessExtractor(metadata, context, locale);
            ex.parse(db, xhtml);
        } catch (IllegalStateException e) {
            // A wrong password surfaces as an IllegalStateException; translate only that case.
            if (e.getMessage() != null && e.getMessage().contains("Incorrect password")) {
                throw new EncryptedDocumentException(e);
            }
            throw e;
        } finally {
            if (db != null) {
                try {
                    db.close();
                } catch (IOException ignored) {
                    //swallow = silent close
                }
            }
        }
        xhtml.endDocument();
    } finally {
        if (tmp != null) {
            tmp.dispose();
        }
    }
}
Use of org.apache.tika.io.TikaInputStream in the Apache Tika project.
Class AbstractOOXMLExtractor, method handleEmbeddedOLE.
/**
 * Handles an embedded OLE object in the document.
 *
 * <p>The part is opened as a POIFS (OLE2) file system and dispatched on its content:
 * <ul>
 *   <li>entries {@code CONTENTS}, {@code Ole} and {@code CompObj} all present: the
 *       {@code CONTENTS} stream is passed to the embedded extractor;</li>
 *   <li>detected type {@code OLE10_NATIVE}: label, command and file name are copied into
 *       the metadata and the data buffer, if any, is passed to the embedded extractor;</li>
 *   <li>anything else: delegated to {@link #handleEmbeddedFile}.</li>
 * </ul>
 * Failures to open or read the embedded object are recorded on {@code parentMetadata}
 * rather than propagated.
 *
 * @param part           the package part holding the OLE object
 * @param handler        content handler that receives the embedded document's output
 * @param rel            relationship id stored as {@code EMBEDDED_RELATIONSHIP_ID}
 * @param parentMetadata metadata of the containing document, used to record exceptions
 * @throws IOException  if closing the file system or the embedded stream fails
 * @throws SAXException if the content handler fails
 */
private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel, Metadata parentMetadata) throws IOException, SAXException {
    // A POIFSFileSystem needs to be at least 3 blocks big to be valid
    if (part.getSize() >= 0 && part.getSize() < 512 * 3) {
        // Too small, skip
        return;
    }
    // Open the POIFS (OLE2) structure and process.
    // Only one part stream is opened here; a second, unused one would never be closed.
    POIFSFileSystem fs = null;
    try {
        fs = new POIFSFileSystem(part.getInputStream());
    } catch (Exception e) {
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
        return;
    }
    TikaInputStream stream = null;
    try {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
        DirectoryNode root = fs.getRoot();
        POIFSDocumentType type = POIFSDocumentType.detectType(root);
        if (root.hasEntry("CONTENTS") && root.hasEntry("Ole") && root.hasEntry("CompObj")) {
            // TIKA-704: OLE 2.0 embedded non-Office document?
            //TODO: figure out if the equivalent of OLE 1.0's
            //getCommand() and getFileName() exist for OLE 2.0 to populate
            //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
            stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS"));
            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else if (POIFSDocumentType.OLE10_NATIVE == type) {
            // TIKA-704: OLE 1.0 embedded document
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs);
            if (ole.getLabel() != null) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
            }
            if (ole.getCommand() != null) {
                metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
            }
            if (ole.getFileName() != null) {
                metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
            }
            byte[] data = ole.getDataBuffer();
            if (data != null) {
                stream = TikaInputStream.get(data);
            }
            if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else {
            handleEmbeddedFile(part, handler, rel);
        }
    } catch (FileNotFoundException e) {
        // There was no CONTENTS entry, so skip this part
    } catch (Ole10NativeException e) {
        // Could not process an OLE 1.0 entry, so skip this part
    } catch (IOException e) {
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
    } finally {
        // Nested so that a failure closing the file system cannot skip closing the stream.
        try {
            fs.close();
        } finally {
            if (stream != null) {
                stream.close();
            }
        }
    }
}
Use of org.apache.tika.io.TikaInputStream in the Apache Tika project.
Class AbstractOOXMLExtractor, method handleEmbeddedFile.
/**
 * Handles an embedded file in the document.
 *
 * <p>Builds metadata for the part (relationship id, resource name taken from the last
 * path segment of the part name, and content type) and, if the embedded extractor
 * accepts it, parses the part's stream through an {@code EmbeddedContentHandler}.
 *
 * @param part    the package part containing the embedded file
 * @param handler content handler that receives the embedded document's output
 * @param rel     relationship id stored as {@code EMBEDDED_RELATIONSHIP_ID}
 * @throws SAXException if the content handler fails
 * @throws IOException  if the part's stream cannot be read or closed
 */
protected void handleEmbeddedFile(PackagePart part, ContentHandler handler, String rel) throws SAXException, IOException {
    Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);

    // Resource name = everything after the final '/' of the part name
    String partName = part.getPartName().getName();
    int lastSlash = partName.lastIndexOf('/');
    embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, partName.substring(lastSlash + 1));

    embeddedMetadata.set(Metadata.CONTENT_TYPE, part.getContentType());

    // Nothing more to do when the extractor declines this embedded document
    if (!embeddedExtractor.shouldParseEmbedded(embeddedMetadata)) {
        return;
    }

    // Call the recursing handler
    try (TikaInputStream embedded = TikaInputStream.get(part.getInputStream())) {
        ContentHandler embeddedHandler = new EmbeddedContentHandler(handler);
        embeddedExtractor.parseEmbedded(embedded, embeddedHandler, embeddedMetadata, false);
    }
}
Use of org.apache.tika.io.TikaInputStream in the Apache Tika project.
Class NetCDFParser, method parse.
/*
 * Opens the input as a NetcdfFile, copies its global attributes into the metadata,
 * and emits an XHTML document listing the file's dimensions and its variables
 * (each variable followed by its attributes, if any).
 *
 * An IOException raised while reading or writing is rethrown as a TikaException
 * ("NetCDF parse error"). The NetcdfFile is always closed, and temporary resources
 * created here (when the caller did not pass a TikaInputStream) are always disposed.
 *
 * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
 * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
 * org.apache.tika.parser.ParseContext)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Own the temporary resources only when the caller did not already supply a TikaInputStream.
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
    TikaInputStream tis = TikaInputStream.get(stream, tmp);
    NetcdfFile ncFile = null;
    try {
        ncFile = NetcdfFile.open(tis.getFile().getAbsolutePath());
        metadata.set("File-Type-Description", ncFile.getFileTypeDescription());
        // first parse out the set of global attributes
        for (Attribute attr : ncFile.getGlobalAttributes()) {
            Property property = resolveMetadataKey(attr.getFullName());
            if (attr.getDataType().isString()) {
                metadata.add(property, attr.getStringValue());
            } else if (attr.getDataType().isNumeric()) {
                // NOTE(review): intValue() discards any fractional part or out-of-int-range
                // magnitude of the numeric attribute - confirm this is intended.
                int value = attr.getNumericValue().intValue();
                metadata.add(property, String.valueOf(value));
            }
        }
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.newline();
        xhtml.element("h1", "dimensions");
        xhtml.startElement("ul");
        xhtml.newline();
        for (Dimension dim : ncFile.getDimensions()) {
            xhtml.element("li", dim.getFullName() + " = " + dim.getLength());
        }
        xhtml.endElement("ul");
        xhtml.element("h1", "variables");
        xhtml.startElement("ul");
        xhtml.newline();
        for (Variable var : ncFile.getVariables()) {
            xhtml.startElement("li");
            xhtml.characters(var.getDataType() + " " + var.getNameAndDimensions());
            xhtml.newline();
            List<Attribute> attributes = var.getAttributes();
            if (!attributes.isEmpty()) {
                xhtml.startElement("ul");
                for (Attribute element : attributes) {
                    xhtml.element("li", element.toString());
                }
                xhtml.endElement("ul");
            }
            xhtml.endElement("li");
        }
        xhtml.endElement("ul");
        xhtml.endDocument();
    } catch (IOException e) {
        throw new TikaException("NetCDF parse error", e);
    } finally {
        // Nested so that a failure closing the NetCDF file cannot skip disposing the
        // temporary resources.
        try {
            if (ncFile != null) {
                ncFile.close();
            }
        } finally {
            if (tmp != null) {
                tmp.dispose();
            }
        }
    }
}
Use of org.apache.tika.io.TikaInputStream in the Apache Tika project.
Class TesseractOCRParser, method parse.
/**
 * Runs OCR on the file behind the given stream and writes the recognised output to
 * the XHTML handler.
 *
 * <p>Nothing is done unless the stream length lies within the configured minimum and
 * maximum OCR file sizes. When image processing is enabled and ImageMagick is
 * available, OCR runs on a preprocessed temporary copy of the input; otherwise it
 * runs on the input file directly. The OCR output file, if one was produced, is
 * deleted before returning.
 *
 * @param tikaInputStream  the input, accessed as a file
 * @param tmpOCROutputFile base path for the OCR output; the output type is appended as an extension
 * @param parseContext     parse context, forwarded to the hOCR extraction
 * @param xhtml            handler that receives the extracted output
 * @param config           OCR configuration (size limits, image processing, output type)
 * @throws IOException   on file access failures
 * @throws SAXException  if the content handler fails
 * @throws TikaException if OCR or temporary-resource disposal fails
 */
private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, ParseContext parseContext, XHTMLContentHandler xhtml, TesseractOCRConfig config) throws IOException, SAXException, TikaException {
    File ocrResult = null;
    try {
        File source = tikaInputStream.getFile();
        long length = tikaInputStream.getLength();

        // Outside the configured size window: skip OCR entirely
        if (length < config.getMinFileSizeToOcr() || length > config.getMaxFileSizeToOcr()) {
            return;
        }

        // Process image if ImageMagick Tool is present
        boolean preprocess = config.isEnableImageProcessing() == 1 && hasImageMagick(config);
        if (!preprocess) {
            doOCR(source, tmpOCROutputFile, config);
        } else {
            // copy the contents of the original input file into a temporary file
            // which will be preprocessed for OCR
            TemporaryResources scratch = new TemporaryResources();
            try {
                File workingCopy = scratch.createTemporaryFile();
                FileUtils.copyFile(source, workingCopy);
                processImage(workingCopy, config);
                doOCR(workingCopy, tmpOCROutputFile, config);
            } finally {
                scratch.dispose();
            }
        }

        // Tesseract appends the output type (.txt or .hocr) to output file name
        String basePath = tmpOCROutputFile.getAbsolutePath();
        String extension = config.getOutputType().toString().toLowerCase(Locale.US);
        ocrResult = new File(basePath + "." + extension);

        if (!ocrResult.exists()) {
            return;
        }

        try (InputStream ocrStream = new FileInputStream(ocrResult)) {
            boolean isHocr = config.getOutputType().equals(TesseractOCRConfig.OUTPUT_TYPE.HOCR);
            if (isHocr) {
                extractHOCROutput(ocrStream, parseContext, xhtml);
            } else {
                extractOutput(ocrStream, xhtml);
            }
        }
    } finally {
        if (ocrResult != null) {
            ocrResult.delete();
        }
    }
}
Aggregations