use of org.apache.tika.exception.TikaException in project tika by apache.
the class CompressorParser method parse.
/**
 * Decompresses the given stream and hands the uncompressed content to the
 * embedded document extractor obtained from the parse context.
 *
 * <p>The caller's stream is never closed by this method: it is wrapped in a
 * {@code CloseShieldInputStream} before being given to the decompressor.
 *
 * @param stream   the compressed document stream; left open on return
 * @param handler  receives the XHTML SAX events
 * @param metadata document metadata; the content type is set here when the
 *                 detected type is not {@code application/octet-stream}
 * @param context  parse context; may carry a {@code CompressorParserOptions}
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the handler fails
 * @throws TikaException if the stream cannot be decompressed (a
 *                       {@code TikaMemoryLimitException} when the
 *                       decompressor's memory limit was exceeded)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // At the end we want to close the compression stream to release
    // any associated resources, but the underlying document stream
    // should not be closed
    if (stream.markSupported()) {
        stream = new CloseShieldInputStream(stream);
    } else {
        // Ensure that the stream supports the mark feature
        stream = new BufferedInputStream(new CloseShieldInputStream(stream));
    }

    CompressorInputStream cis;
    try {
        // Default options: do not decompress concatenated streams
        CompressorParserOptions options = context.get(CompressorParserOptions.class, new CompressorParserOptions() {
            @Override
            public boolean decompressConcatenated(Metadata metadata) {
                return false;
            }
        });
        CompressorStreamFactory factory = new CompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb);
        cis = factory.createCompressorInputStream(stream);
    } catch (CompressorException e) {
        // instanceof is already false for null, no separate null check needed
        if (e.getCause() instanceof MemoryLimitException) {
            throw new TikaMemoryLimitException(e.getMessage());
        }
        throw new TikaException("Unable to uncompress document stream", e);
    }

    XHTMLContentHandler xhtml;
    // Everything that touches cis lives inside this try so that the
    // decompressor is released even when type detection or startDocument()
    // fails, not only when the embedded parse fails.
    try {
        MediaType type = getMediaType(cis);
        if (!type.equals(MediaType.OCTET_STREAM)) {
            metadata.set(CONTENT_TYPE, type.toString());
        }

        xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        Metadata entrydata = new Metadata();
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (name != null) {
            entrydata.set(Metadata.RESOURCE_NAME_KEY, uncompressedResourceName(name));
        }

        // Use the delegate parser to parse the compressed document
        EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        if (extractor.shouldParseEmbedded(entrydata)) {
            extractor.parseEmbedded(cis, xhtml, entrydata, true);
        }
    } finally {
        cis.close();
    }
    xhtml.endDocument();
}

/**
 * Derives the resource name of the uncompressed content from the name of
 * the compressed resource by rewriting or stripping a known suffix.
 *
 * @param name non-null name of the compressed resource
 * @return the name to report for the contained document
 */
private static String uncompressedResourceName(String name) {
    if (name.endsWith(".tbz")) {
        return name.substring(0, name.length() - 4) + ".tar";
    }
    if (name.endsWith(".tbz2")) {
        return name.substring(0, name.length() - 5) + ".tar";
    }
    if (name.endsWith(".bz")) {
        return name.substring(0, name.length() - 3);
    }
    if (name.endsWith(".bz2")) {
        return name.substring(0, name.length() - 4);
    }
    if (name.endsWith(".xz")) {
        return name.substring(0, name.length() - 3);
    }
    if (name.endsWith(".zlib")) {
        return name.substring(0, name.length() - 5);
    }
    if (name.endsWith(".pack")) {
        return name.substring(0, name.length() - 5);
    }
    if (name.length() > 0) {
        // Any other non-empty name is delegated to GzipUtils
        return GzipUtils.getUncompressedFilename(name);
    }
    return name;
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class PackageParser method parse.
/**
 * Opens the given stream as an archive and passes every non-directory
 * entry to {@code parseEntry}.
 *
 * <p>Archive formats that cannot be streamed are only supported for 7z:
 * the stream is reset, spooled through a {@code TikaInputStream} backed by
 * temporary resources, and opened as a {@code SevenZFile} (with the
 * password from a {@code PasswordProvider} in the context, if any).
 *
 * <p>The archive stream and the temporary resources are always released
 * before this method returns or throws; the caller's stream is not closed.
 *
 * @param stream   the archive document stream; left open on return
 * @param handler  receives the XHTML SAX events
 * @param metadata document metadata
 * @param context  parse context; may carry a {@code TikaConfig}, an
 *                 {@code ArchiveStreamFactory} and a {@code PasswordProvider}
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the handler fails
 * @throws TikaException if the archive cannot be unpacked; an
 *                       {@code EncryptedDocumentException} when entries are
 *                       encrypted and cannot be read
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    //lazily load the MediaTypeRegistry at parse time
    //only want to call getDefaultConfig() once, and can't
    //load statically because of the ForkParser
    TikaConfig config = context.get(TikaConfig.class);
    MediaTypeRegistry mediaTypeRegistry = null;
    if (config != null) {
        mediaTypeRegistry = config.getMediaTypeRegistry();
    } else {
        // NOTE(review): the unsynchronized first check is only safe if
        // bufferedMediaTypeRegistry is declared volatile - confirm at the field.
        if (bufferedMediaTypeRegistry == null) {
            //buffer this for next time.
            synchronized (lock) {
                //now that we're locked, check again
                if (bufferedMediaTypeRegistry == null) {
                    bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
                }
            }
        }
        mediaTypeRegistry = bufferedMediaTypeRegistry;
    }

    // Ensure that the stream supports the mark feature
    if (!stream.markSupported()) {
        stream = new BufferedInputStream(stream);
    }

    TemporaryResources tmp = new TemporaryResources();
    ArchiveInputStream ais = null;
    XHTMLContentHandler xhtml;
    // One outer try/finally owns the cleanup of both ais and tmp, so that
    // no failure path (including the 7z fallback, media type update and
    // startDocument) can leave the temporary resources behind.
    try {
        try {
            ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
            // At the end we want to close the archive stream to release
            // any associated resources, but the underlying document stream
            // should not be closed
            ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
        } catch (StreamingNotSupportedException sne) {
            // Most archive formats work on streams, but a few need files.
            // Constant-first equals keeps a null format from raising an NPE.
            if (!ArchiveStreamFactory.SEVEN_Z.equals(sne.getFormat())) {
                throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
            }
            // Rework as a file, and wrap
            stream.reset();
            TikaInputStream tstream = TikaInputStream.get(stream, tmp);

            // Seven Zip supports passwords, was one given?
            String password = null;
            PasswordProvider provider = context.get(PasswordProvider.class);
            if (provider != null) {
                password = provider.getPassword(metadata);
            }

            SevenZFile sevenz;
            if (password == null) {
                sevenz = new SevenZFile(tstream.getFile());
            } else {
                sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
            }

            // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
            ais = new SevenZWrapper(sevenz);
        } catch (ArchiveException e) {
            throw new TikaException("Unable to unpack document stream", e);
        }

        updateMediaType(ais, mediaTypeRegistry, metadata);

        // Use the delegate parser to parse the contained document
        EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);

        xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        try {
            ArchiveEntry entry = ais.getNextEntry();
            while (entry != null) {
                if (!entry.isDirectory()) {
                    parseEntry(ais, entry, extractor, metadata, xhtml);
                }
                entry = ais.getNextEntry();
            }
        } catch (UnsupportedZipFeatureException zfe) {
            // If it's an encrypted document of unknown password, report as such
            if (zfe.getFeature() == Feature.ENCRYPTION) {
                throw new EncryptedDocumentException(zfe);
            }
            // Otherwise throw the exception
            throw new TikaException("UnsupportedZipFeature", zfe);
        } catch (PasswordRequiredException pre) {
            throw new EncryptedDocumentException(pre);
        }
    } finally {
        try {
            if (ais != null) {
                ais.close();
            }
        } finally {
            // Runs even when closing the archive stream fails
            tmp.close();
        }
    }
    xhtml.endDocument();
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class PackageParser method parseEntry.
/**
 * Handles a single archive entry.
 *
 * <p>If the archive can read the entry's data, the entry metadata is built
 * via {@code handleEntryMetadata} and, when the extractor agrees, the
 * entry is parsed as an embedded document. Otherwise an exception is
 * recorded on the parent metadata (for zip entries only when the entry is
 * encrypted) and the entry name, if any, is written as a paragraph.
 *
 * @param archive        archive stream positioned at {@code entry}
 * @param entry          the entry to handle
 * @param extractor      decides whether, and performs, the embedded parse
 * @param parentMetadata metadata of the containing archive
 * @param xhtml          output handler
 */
private void parseEntry(ArchiveInputStream archive, ArchiveEntry entry, EmbeddedDocumentExtractor extractor, Metadata parentMetadata, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    String name = entry.getName();

    if (!archive.canReadEntryData(entry)) {
        String label = (name != null) ? name : "";
        if (!(entry instanceof ZipArchiveEntry)) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(new TikaException("Can't read archive stream (" + label + ")"), parentMetadata);
        } else if (((ZipArchiveEntry) entry).getGeneralPurposeBit().usesEncryption()) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(new EncryptedDocumentException("stream (" + label + ") is encrypted"), parentMetadata);
        }
        if (!label.isEmpty()) {
            xhtml.element("p", label);
        }
        return;
    }

    // Fetch the metadata on the entry contained in the archive
    Metadata entrydata = handleEntryMetadata(name, null, entry.getLastModifiedDate(), entry.getSize(), xhtml);

    // Recurse into the entry only if desired
    if (!extractor.shouldParseEmbedded(entrydata)) {
        return;
    }

    // For detectors to work, we need a mark/reset supporting
    // InputStream, which ArchiveInputStream isn't, so wrap
    TemporaryResources entryResources = new TemporaryResources();
    try {
        extractor.parseEmbedded(TikaInputStream.get(archive, entryResources), xhtml, entrydata, true);
    } finally {
        entryResources.dispose();
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class TensorflowRESTRecogniser method recognise.
/**
 * Posts the image bytes to the REST endpoint returned by
 * {@code getApiUri(metadata)} and converts the JSON reply into recognised
 * objects.
 *
 * <p>This is best effort: any exception raised while talking to the
 * service or parsing its reply is logged at WARN level and the objects
 * collected so far (possibly none) are returned.
 *
 * @param stream   image content; read fully into memory
 * @param handler  not used by this method
 * @param metadata passed to {@code getApiUri} to build the request URI
 * @param context  not used by this method
 * @return the recognised objects, never {@code null}
 */
@Override
public List<RecognisedObject> recognise(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    List<RecognisedObject> recObjs = new ArrayList<>();
    DefaultHttpClient client = null;
    try {
        client = new DefaultHttpClient();
        HttpPost request = new HttpPost(getApiUri(metadata));
        try (ByteArrayOutputStream byteStream = new ByteArrayOutputStream()) {
            //TODO: convert this to stream, this might cause OOM issue
            // InputStreamEntity is not working
            // request.setEntity(new InputStreamEntity(stream, -1));
            IOUtils.copy(stream, byteStream);
            request.setEntity(new ByteArrayEntity(byteStream.toByteArray()));
        }
        HttpResponse response = client.execute(request);
        try (InputStream reply = response.getEntity().getContent()) {
            // Decode explicitly as UTF-8 rather than relying on a default charset
            String replyMessage = IOUtils.toString(reply, "UTF-8");
            if (response.getStatusLine().getStatusCode() == 200) {
                JSONObject jReply = new JSONObject(replyMessage);
                JSONArray jClasses = jReply.getJSONArray("classnames");
                JSONArray jConfidence = jReply.getJSONArray("confidence");
                if (jClasses.length() != jConfidence.length()) {
                    LOG.warn("Classes of size {} is not equal to confidence of size {}", jClasses.length(), jConfidence.length());
                }
                // Only pair up entries present in both arrays; a mismatch has
                // already been logged above and must not abort recognition.
                int pairs = Math.min(jClasses.length(), jConfidence.length());
                for (int i = 0; i < pairs; i++) {
                    RecognisedObject recObj = new RecognisedObject(jClasses.getString(i), LABEL_LANG, jClasses.getString(i), jConfidence.getDouble(i));
                    recObjs.add(recObj);
                }
            } else {
                LOG.warn("Status = {}", response.getStatusLine());
                LOG.warn("Response = {}", replyMessage);
            }
        }
    } catch (Exception e) {
        // Deliberately best effort: log and fall through to return what we have
        LOG.warn(e.getMessage(), e);
    } finally {
        if (client != null) {
            // Release the connections held by this per-call client
            client.getConnectionManager().shutdown();
        }
    }
    LOG.debug("Num Objects found {}", recObjs.size());
    return recObjs;
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class OCR2XHTML method processPage.
/**
 * Processes one PDF page by running {@code startPage}, OCR on the current
 * page, and {@code endPage} in that order.
 *
 * <p>An {@link IOException} raised by any of those steps is passed to
 * {@code handleCatchableIOE}; a {@code TikaException} or
 * {@code SAXException} is rethrown wrapped in an
 * {@code IOExceptionWithCause}.
 *
 * @param pdPage the page to process
 * @throws IOException wrapping a Tika or SAX failure, or as thrown by
 *                     {@code handleCatchableIOE}
 */
@Override
public void processPage(PDPage pdPage) throws IOException {
    try {
        startPage(pdPage);
        doOCROnCurrentPage();
        endPage(pdPage);
    } catch (IOException ioFailure) {
        handleCatchableIOE(ioFailure);
    } catch (TikaException | SAXException nonIoFailure) {
        throw new IOExceptionWithCause(nonIoFailure);
    }
}
Aggregations