use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class TesseractOCRParser method parse.
/**
 * Parses the given stream with the OCR pipeline and emits the result to
 * {@code handler} wrapped in a full XHTML document (startDocument/endDocument).
 *
 * <p>Returns without emitting anything when {@code hasTesseract(config)} is false.
 * Before the OCR step the same stream is passed to {@code _TMP_IMAGE_METADATA_PARSER}
 * so that image metadata is collected into {@code metadata} (workaround for TIKA-1445).
 *
 * @param stream       input to process
 * @param handler      receiver of the SAX events
 * @param metadata     metadata object passed on to the image metadata parser and the XHTML handler
 * @param parseContext context; may carry a {@code TesseractOCRConfig}, else {@code DEFAULT_CONFIG} is used
 * @throws IOException   propagated from stream handling or the delegated parse calls
 * @throws SAXException  propagated from the content handlers
 * @throws TikaException propagated from the delegated parse calls
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
    final TesseractOCRConfig ocrConfig = parseContext.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
    // Should only happen if someone directly calls this parser,
    // not via DefaultParser or similar.
    if (!hasTesseract(ocrConfig)) {
        return;
    }

    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream tis = TikaInputStream.get(stream, resources);
        // Trigger the spooling to a tmp file if the stream wasn't
        // already a TikaInputStream that contained a file.
        tis.getPath();

        // This is the text output file name specified on the tesseract
        // command line. The actual output file name will have a suffix added.
        final File ocrOutputFile = resources.createTemporaryFile();

        // Temporary workaround for TIKA-1445 - until we can specify
        // composite parsers with strategies (eg Composite, Try In Turn),
        // always send the image onwards to the regular parser to have
        // the metadata for them extracted as well.
        _TMP_IMAGE_METADATA_PARSER.parse(tis, new DefaultHandler(), metadata, parseContext);

        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        parse(tis, ocrOutputFile, parseContext, xhtml, ocrConfig);
        xhtml.endDocument();
    } finally {
        // Always release the temporary resources, whether or not parsing succeeded.
        resources.dispose();
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class TesseractOCRParser method parseInline.
/**
 * Use this to parse content without starting a new document.
 * This appends SAX events to {@code xhtml} without re-adding the metadata,
 * body start, etc.
 *
 * <p>Returns without doing anything when {@code hasTesseract(config)} is false.
 *
 * @param stream       input stream to process
 * @param xhtml        handler that receives the SAX events
 * @param parseContext parse context forwarded to the internal parse call
 * @param config       TesseractOCRConfig to use for this parse
 * @throws IOException   propagated from stream handling or the internal parse call
 * @throws SAXException  propagated from the content handler
 * @throws TikaException propagated from the internal parse call
 */
public void parseInline(InputStream stream, XHTMLContentHandler xhtml, ParseContext parseContext, TesseractOCRConfig config) throws IOException, SAXException, TikaException {
    // Should only happen if someone directly calls this parser,
    // not via DefaultParser or similar.
    if (!hasTesseract(config)) {
        return;
    }

    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream tis = TikaInputStream.get(stream, resources);
        // Temporary file handed to the internal parse call as its second argument.
        final File tempFile = resources.createTemporaryFile();
        parse(tis, tempFile, parseContext, xhtml, config);
    } finally {
        // Always release the temporary resources, whether or not parsing succeeded.
        resources.dispose();
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class AbstractPDF2XHTML method extractPDEmbeddedFile.
/**
 * Passes one embedded PDF file to the embedded document extractor and then
 * writes an empty {@code div} (class "embedded", id = resolved file name) to
 * {@code xhtml}.
 *
 * <p>The resource name is resolved as {@code fileName}, falling back to
 * {@code unicodeFileName} and then {@code displayName} whenever the current
 * candidate is null or blank. Nothing happens when {@code file} is null or when
 * the extractor declines the entry. If opening the embedded stream fails, the
 * exception is recorded against the parent metadata and the method returns.
 *
 * @param displayName     last-resort name for the attachment
 * @param unicodeFileName second-choice name for the attachment
 * @param fileName        preferred name for the attachment
 * @param file            embedded file to extract; may be null
 * @param attributes      attribute set that the "class" and "id" attributes are added to
 * @throws SAXException  propagated from the content handler or the extractor
 * @throws IOException   propagated from the extractor
 * @throws TikaException declared by the signature; TODO confirm which call raises it
 */
private void extractPDEmbeddedFile(String displayName, String unicodeFileName, String fileName, PDEmbeddedFile file, AttributesImpl attributes) throws SAXException, IOException, TikaException {
    if (file == null) {
        //skip silently
        return;
    }

    // Fallback chain: fileName -> unicodeFileName -> displayName.
    String resolvedName = fileName;
    if (resolvedName == null || "".equals(resolvedName.trim())) {
        resolvedName = unicodeFileName;
    }
    if (resolvedName == null || "".equals(resolvedName.trim())) {
        resolvedName = displayName;
    }

    // TODO: other metadata?
    final Metadata attachmentMetadata = new Metadata();
    attachmentMetadata.set(Metadata.RESOURCE_NAME_KEY, resolvedName);
    attachmentMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
    attachmentMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
    attachmentMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
    attachmentMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, resolvedName);

    final boolean wanted = embeddedDocumentExtractor.shouldParseEmbedded(attachmentMetadata);
    if (!wanted) {
        return;
    }

    final TikaInputStream embeddedStream;
    try {
        embeddedStream = TikaInputStream.get(file.createInputStream());
    } catch (IOException e) {
        //store this exception in the parent's metadata
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        return;
    }

    try {
        final EmbeddedContentHandler embeddedHandler = new EmbeddedContentHandler(xhtml);
        embeddedDocumentExtractor.parseEmbedded(embeddedStream, embeddedHandler, attachmentMetadata, false);

        // Marker element for the attachment; note this adds to the caller's attribute set.
        attributes.addAttribute("", "class", "class", "CDATA", "embedded");
        attributes.addAttribute("", "id", "id", "CDATA", resolvedName);
        xhtml.startElement("div", attributes);
        xhtml.endElement("div");
    } finally {
        // Close failures are deliberately ignored here.
        IOUtils.closeQuietly(embeddedStream);
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class RarParser method parse.
/**
 * Parses a RAR archive: emits an XHTML document to {@code handler} and hands
 * every non-directory entry to the embedded document extractor obtained from
 * {@code context}.
 *
 * <p>Iteration over the entries stops early if the current thread is interrupted.
 *
 * @param stream   archive content
 * @param handler  receiver of the SAX events
 * @param metadata metadata passed to the XHTML content handler
 * @param context  parse context used to look up the embedded document extractor
 * @throws IOException   propagated from stream handling, the archive, or the extractor
 * @throws SAXException  propagated from the content handlers
 * @throws TikaException wraps any {@code RarException}; an
 *                       {@code EncryptedDocumentException} is thrown when the archive
 *                       reports itself as encrypted
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    try (TemporaryResources tmp = new TemporaryResources()) {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        Archive rar = new Archive(tis.getFile());
        // The archive is closed inside the try-with-resources body on purpose:
        // resources of a try-with-resources statement are closed BEFORE its
        // catch/finally blocks run, so closing the archive in an outer finally
        // would release tmp (and presumably any temp file backing tis.getFile())
        // while the archive still had that file open.
        try {
            if (rar.isEncrypted()) {
                throw new EncryptedDocumentException();
            }
            //Without this BodyContentHandler does not work
            xhtml.element("div", " ");
            FileHeader header = rar.nextFileHeader();
            while (header != null && !Thread.currentThread().isInterrupted()) {
                if (!header.isDirectory()) {
                    try (InputStream subFile = rar.getInputStream(header)) {
                        // Use the secondary name only when the wide-character name is empty.
                        Metadata entrydata = PackageParser.handleEntryMetadata("".equals(header.getFileNameW()) ? header.getFileNameString() : header.getFileNameW(), header.getCTime(), header.getMTime(), header.getFullUnpackSize(), xhtml);
                        if (extractor.shouldParseEmbedded(entrydata)) {
                            extractor.parseEmbedded(subFile, handler, entrydata, true);
                        }
                    }
                }
                header = rar.nextFileHeader();
            }
        } finally {
            rar.close();
        }
    } catch (RarException e) {
        throw new TikaException("RarParser Exception", e);
    }
    xhtml.endDocument();
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class ParsingEmbeddedDocumentExtractor method parseEmbedded.
/**
 * Parses an embedded document with {@code DELEGATING_PARSER}, forwarding its
 * body content to {@code handler}.
 *
 * <p>When {@code outputHtml} is true the output is wrapped in a
 * {@code div class="package-entry"}, and a non-empty resource name from
 * {@code metadata} is written as an {@code h1} heading first. The caller's
 * stream is wrapped in a {@code CloseShieldInputStream} before being handed to
 * the delegate parser. {@code EncryptedDocumentException} and
 * {@code TikaException} from the delegate are swallowed, so the entry's
 * content is simply skipped.
 *
 * @param stream     embedded document content
 * @param handler    receiver of the SAX events
 * @param metadata   metadata of the embedded document; its resource name is used for the heading
 * @param outputHtml whether to emit the wrapping div and the heading
 * @throws SAXException propagated from the content handler or the delegate parser
 * @throws IOException  propagated from the delegate parser or from releasing temporary resources
 */
public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
    if (outputHtml) {
        final AttributesImpl wrapperAttrs = new AttributesImpl();
        wrapperAttrs.addAttribute("", "class", "class", "CDATA", "package-entry");
        handler.startElement(XHTML, "div", "div", wrapperAttrs);
    }

    final String entryName = metadata.get(Metadata.RESOURCE_NAME_KEY);
    final boolean hasName = entryName != null && !entryName.isEmpty();
    if (outputHtml && hasName) {
        final char[] title = entryName.toCharArray();
        handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
        handler.characters(title, 0, title.length);
        handler.endElement(XHTML, "h1", "h1");
    }

    // Use the delegate parser to parse this entry
    try (TemporaryResources tmp = new TemporaryResources()) {
        final TikaInputStream shielded = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
        // Carry over an already-open container from the caller's stream, if any.
        if (stream instanceof TikaInputStream) {
            final Object openContainer = ((TikaInputStream) stream).getOpenContainer();
            if (openContainer != null) {
                shielded.setOpenContainer(openContainer);
            }
        }
        final ContentHandler embeddedHandler = new EmbeddedContentHandler(new BodyContentHandler(handler));
        DELEGATING_PARSER.parse(shielded, embeddedHandler, metadata, context);
    } catch (EncryptedDocumentException ede) {
        // TODO: can we log a warning that we lack the password?
        // For now, just skip the content
    } catch (TikaException e) {
        // TODO: can we log a warning somehow?
        // Could not parse the entry, just skip the content
    }

    if (outputHtml) {
        handler.endElement(XHTML, "div", "div");
    }
}
Aggregations