use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class GDALParser method parse.
/**
 * Runs the external {@code gdalinfo} tool against the input and reports its
 * output both as metadata and as document content.
 *
 * <p>If {@code gdalinfo} is not available (per {@link ExternalParser#check}),
 * the method returns without touching the handler or the metadata.
 *
 * @param stream   the document to analyse
 * @param handler  receives the textual output of the command
 * @param metadata populated from the command output
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading the stream or running the command fails
 * @throws SAXException  if the handler rejects the generated content
 * @throws TikaException if parsing fails or temporary resources cannot be
 *                       released
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    if (!ExternalParser.check("gdalinfo")) {
        return;
    }
    // first set up and run GDAL
    // process the command
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        String runCommand = processCommand(tis);
        String output = execCommand(new String[] { runCommand });
        // now extract the actual metadata params
        // from the GDAL output in the content stream
        // to do this, we need to literally process the output
        // from the invoked command b/c we can't read metadata and
        // output text from the handler in ExternalParser
        // at the same time, so for now, we can't use the
        // ExternalParser to do this and I've had to bring some of
        // that functionality directly into this class
        // TODO: investigate a way to do both using ExternalParser
        extractMetFromOutput(output, metadata);
        applyPatternsToOutput(output, metadata, getPatterns());
        // make the content handler and provide output there
        // now that we have metadata
        processOutput(handler, metadata, output);
    } finally {
        // Release anything registered with tmp (e.g. a spooled temp file)
        // even when the command or the handler throws.
        tmp.dispose();
    }
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class TiffParser method parse.
/**
 * Extracts TIFF metadata from the stream and emits an empty XHTML document.
 *
 * <p>Metadata is gathered in two passes: {@code ImageMetadataExtractor}
 * works on the file backing the stream, then {@code JempboxExtractor} reads
 * the stream itself. Temporary resources are disposed before any XHTML
 * events are sent.
 *
 * @param stream   the TIFF document
 * @param handler  receives the (empty) XHTML document events
 * @param metadata populated by the two extractors
 * @param context  parse context (not used by this method)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream tikaStream = TikaInputStream.get(stream, resources);

        // File-based pass first, then the stream-based pass.
        ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
        imageExtractor.parseTiff(tikaStream.getFile());

        JempboxExtractor xmpExtractor = new JempboxExtractor(metadata);
        xmpExtractor.parse(tikaStream);
    } finally {
        resources.dispose();
    }

    // No textual content is produced; only open and close the document.
    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.endDocument();
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class OutlookPSTParser method parseMailAttachments.
/**
 * Writes one entry per attachment of {@code email} to {@code xhtml} and
 * hands each attachment to the embedded-document extractor.
 *
 * <p>For every attachment a {@code <p>} with the file name is emitted,
 * followed by a {@code <div class="embedded" id="filename">} that wraps
 * whatever the extractor produces for it.
 *
 * @param xhtml             handler receiving the generated markup
 * @param email             message whose attachments are enumerated
 * @param embeddedExtractor decides whether to parse each attachment and
 *                          performs the parse
 * @throws TikaException if any step fails for an attachment; the original
 *                       exception is preserved as the cause
 */
private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor) throws TikaException {
    int numberOfAttachments = email.getNumberOfAttachments();
    for (int i = 0; i < numberOfAttachments; i++) {
        try {
            PSTAttachment attach = email.getAttachment(i);
            // Get the filename; both long and short filenames can be used for attachments
            String filename = attach.getLongFilename();
            if (filename.isEmpty()) {
                filename = attach.getFilename();
            }
            xhtml.element("p", filename);
            Metadata attachMeta = new Metadata();
            attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
            attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);
            AttributesImpl attributes = new AttributesImpl();
            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
            attributes.addAttribute("", "id", "id", "CDATA", filename);
            xhtml.startElement("div", attributes);
            if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
                // Temporary resources are scoped to this single attachment.
                TemporaryResources tmp = new TemporaryResources();
                try {
                    TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp);
                    embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
                } finally {
                    tmp.dispose();
                }
            }
            xhtml.endElement("div");
        } catch (Exception e) {
            throw new TikaException("Unable to unpack document stream", e);
        }
    }
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class ISArchiveParser method parse.
/**
 * Parses an ISA-Tab archive: the investigation files found next to the
 * study file, the study file itself, and the assays.
 *
 * <p>If {@code location} has not been set, it is derived from the parent
 * directory of the file backing the stream. Investigation files are those
 * in {@code location} whose names match {@code i_.+\.txt}.
 *
 * @param stream   the study file
 * @param handler  receives the XHTML events
 * @param metadata passed through to the investigation/study/assay parsers
 * @param context  passed through to the investigation/study/assay parsers
 * @throws IOException   if the stream cannot be read or spooled to a file
 * @throws SAXException  if the handler rejects the generated content
 * @throws TikaException if parsing fails or temporary resources cannot be
 *                       released
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Only own (and later dispose) temporary resources when the caller did
    // not already hand us a TikaInputStream.
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
    TikaInputStream tis = TikaInputStream.get(stream, tmp);
    try {
        if (this.location == null) {
            this.location = tis.getFile().getParent() + File.separator;
        }
        this.studyFileName = tis.getFile().getName();
        File locationFile = new File(location);
        String[] investigationList = locationFile.list(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.matches("i_.+\\.txt");
            }
        });
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        parseInvestigation(investigationList, xhtml, metadata, context);
        // Read the study through tis, not the raw stream: getFile() above may
        // have spooled (consumed) the raw stream when it was wrapped here.
        // When the caller passed a TikaInputStream, tis is that same object.
        parseStudy(tis, xhtml, metadata, context);
        parseAssay(xhtml, metadata, context);
        xhtml.endDocument();
    } finally {
        if (tmp != null) {
            tmp.dispose();
        }
    }
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class JournalParser method parse.
/**
 * Parses a journal article twice: first through {@code GrobidRESTParser},
 * addressed by the path of the file backing the stream, then through
 * {@code PDFParser} on the same file.
 *
 * @param stream   the document to parse
 * @param handler  receives the output of both parsers
 * @param metadata populated by both parsers
 * @param context  passed through to both parsers
 * @throws IOException   if the stream cannot be spooled or the file read
 * @throws SAXException  if the handler rejects the generated content
 * @throws TikaException if either parser fails or temporary resources
 *                       cannot be released
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        File tmpFile = tis.getFile();
        GrobidRESTParser grobidParser = new GrobidRESTParser();
        grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);
        PDFParser parser = new PDFParser();
        InputStream pdfStream = new FileInputStream(tmpFile);
        try {
            parser.parse(pdfStream, handler, metadata, context);
        } finally {
            // Close the file handle before the backing file is disposed.
            pdfStream.close();
        }
    } finally {
        // Release anything registered with tmp (e.g. a spooled temp file)
        // even when one of the parsers throws.
        tmp.dispose();
    }
}
Aggregations