Use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
Class PDFParser, method handleXFAOnly.
/**
 * Emits the content of the document's XFA form as XHTML.
 *
 * <p>The XFA bytes are read from the AcroForm of the document catalog and
 * handed to an {@code XFAExtractor}, which writes into an
 * {@code XHTMLContentHandler} wrapped around {@code handler}.
 * NOTE(review): no null check is made on the AcroForm or its XFA resource;
 * presumably the caller only invokes this when both exist - confirm.
 *
 * @param pdDocument document whose XFA form is extracted
 * @param handler    downstream SAX handler receiving the XHTML events
 * @param metadata   metadata passed to the XHTML handler and the extractor
 * @param context    parse context passed to the extractor
 * @throws SAXException  if the handler rejects an event
 * @throws IOException   if the XFA bytes cannot be read
 * @throws TikaException if the XFA XML cannot be processed
 */
private void handleXFAOnly(PDDocument pdDocument, ContentHandler handler, Metadata metadata, ParseContext context) throws SAXException, IOException, TikaException {
    XFAExtractor xfaExtractor = new XFAExtractor();
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    // Fetched only after startDocument(), so a failure here happens at the
    // same point in the SAX event sequence as before.
    byte[] xfaBytes = pdDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes();
    try (InputStream xfaStream = new ByteArrayInputStream(xfaBytes)) {
        xfaExtractor.extract(xfaStream, xhtml, metadata, context);
    } catch (XMLStreamException xmlError) {
        throw new TikaException("XML error in XFA", xmlError);
    }
    xhtml.endDocument();
}
Use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
Class CompressorParser, method parse.
/**
 * Decompresses the given stream and hands the decompressed content to the
 * embedded document extractor.
 *
 * <p>The caller's stream is wrapped so that closing the decompressor never
 * closes it. The decompressor itself is closed on every exit path once it
 * has been created, including failures while detecting the media type or
 * starting the XHTML document.
 *
 * @param stream   compressed input; not closed by this method
 * @param handler  downstream SAX handler receiving the XHTML events
 * @param metadata document metadata; its content type is set when the
 *                 detected type is not {@code application/octet-stream}
 * @param context  parse context; may supply {@code CompressorParserOptions}
 * @throws IOException   if reading or closing the decompressor fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if the stream cannot be uncompressed or the
 *                       configured memory limit is exceeded
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // should not be closed
    if (stream.markSupported()) {
        stream = new CloseShieldInputStream(stream);
    } else {
        // Ensure that the stream supports the mark feature
        stream = new BufferedInputStream(new CloseShieldInputStream(stream));
    }
    CompressorInputStream cis;
    try {
        // Default: do not decompress concatenated streams unless the context says otherwise.
        CompressorParserOptions options = context.get(CompressorParserOptions.class, new CompressorParserOptions() {
            public boolean decompressConcatenated(Metadata metadata) {
                return false;
            }
        });
        CompressorStreamFactory factory = new CompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb);
        cis = factory.createCompressorInputStream(stream);
    } catch (CompressorException e) {
        // instanceof is already false for a null cause, so no separate null check is needed.
        if (e.getCause() instanceof MemoryLimitException) {
            throw new TikaMemoryLimitException(e.getMessage());
        }
        throw new TikaException("Unable to uncompress document stream", e);
    }
    XHTMLContentHandler xhtml;
    try {
        // Everything that touches cis lives inside this try so that the
        // decompressor is released even if type detection or
        // startDocument() fails.
        MediaType type = getMediaType(cis);
        if (!type.equals(MediaType.OCTET_STREAM)) {
            metadata.set(CONTENT_TYPE, type.toString());
        }
        xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        Metadata entrydata = new Metadata();
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (name != null) {
            // Derive the name of the contained resource by dropping (or, for
            // tar shorthands, replacing) the compression suffix.
            if (name.endsWith(".tbz")) {
                name = name.substring(0, name.length() - 4) + ".tar";
            } else if (name.endsWith(".tbz2")) {
                name = name.substring(0, name.length() - 5) + ".tar";
            } else if (name.endsWith(".bz")) {
                name = name.substring(0, name.length() - 3);
            } else if (name.endsWith(".bz2")) {
                name = name.substring(0, name.length() - 4);
            } else if (name.endsWith(".xz")) {
                name = name.substring(0, name.length() - 3);
            } else if (name.endsWith(".zlib")) {
                name = name.substring(0, name.length() - 5);
            } else if (name.endsWith(".pack")) {
                name = name.substring(0, name.length() - 5);
            } else if (name.length() > 0) {
                name = GzipUtils.getUncompressedFilename(name);
            }
            entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
        }
        // Use the delegate parser to parse the compressed document
        EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        if (extractor.shouldParseEmbedded(entrydata)) {
            extractor.parseEmbedded(cis, xhtml, entrydata, true);
        }
    } finally {
        cis.close();
    }
    // Kept after the close so the event order matches the previous behaviour.
    xhtml.endDocument();
}
Use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
Class PackageParser, method parse.
/**
 * Opens the given stream as an archive and passes every non-directory entry
 * to {@code parseEntry}.
 *
 * <p>Streaming formats are read directly from a close-shielded view of the
 * caller's stream. 7z is not streamable, so the input is handed to
 * {@code TikaInputStream} together with a {@code TemporaryResources} and
 * opened as a file, optionally with a password from the context's
 * {@code PasswordProvider}.
 *
 * <p>The temporary resources are closed on every exit path, and the archive
 * stream is closed on every exit path once it has been created; a failure
 * while closing the archive stream no longer prevents the temporary
 * resources from being closed.
 *
 * @param stream   archive input; not closed by this method
 * @param handler  downstream SAX handler receiving the XHTML events
 * @param metadata document metadata, passed on to media-type update,
 *                 password lookup and entry parsing
 * @param context  parse context; may supply {@code TikaConfig},
 *                 {@code ArchiveStreamFactory} and {@code PasswordProvider}
 * @throws IOException   if reading or closing a resource fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if the archive cannot be unpacked, uses an
 *                       unsupported zip feature, or is encrypted without a
 *                       usable password
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    //lazily load the MediaTypeRegistry at parse time
    //only want to call getDefaultConfig() once, and can't
    //load statically because of the ForkParser
    TikaConfig config = context.get(TikaConfig.class);
    MediaTypeRegistry mediaTypeRegistry = null;
    if (config != null) {
        mediaTypeRegistry = config.getMediaTypeRegistry();
    } else {
        // NOTE(review): check-lock-check on a field declared outside this
        // view; confirm the field is declared so that this is safe.
        if (bufferedMediaTypeRegistry == null) {
            //buffer this for next time.
            synchronized (lock) {
                //now that we're locked, check again
                if (bufferedMediaTypeRegistry == null) {
                    bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
                }
            }
        }
        mediaTypeRegistry = bufferedMediaTypeRegistry;
    }
    // Ensure that the stream supports the mark feature
    if (!stream.markSupported()) {
        stream = new BufferedInputStream(stream);
    }
    TemporaryResources tmp = new TemporaryResources();
    XHTMLContentHandler xhtml;
    try {
        ArchiveInputStream ais;
        try {
            ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
            // At the end we want to close the archive stream to release
            // any associated resources, but the underlying document stream
            // should not be closed
            ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
        } catch (StreamingNotSupportedException sne) {
            // Most archive formats work on streams, but a few need files
            if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
                // Rework as a file, and wrap
                stream.reset();
                TikaInputStream tstream = TikaInputStream.get(stream, tmp);
                // Seven Zip supports passwords, was one given?
                String password = null;
                PasswordProvider provider = context.get(PasswordProvider.class);
                if (provider != null) {
                    password = provider.getPassword(metadata);
                }
                SevenZFile sevenz;
                if (password == null) {
                    sevenz = new SevenZFile(tstream.getFile());
                } else {
                    sevenz = new SevenZFile(tstream.getFile(), password.getBytes("UnicodeLittleUnmarked"));
                }
                // Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
                ais = new SevenZWrapper(sevenz);
            } else {
                // tmp is released by the enclosing finally
                throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
            }
        } catch (ArchiveException e) {
            // tmp is released by the enclosing finally
            throw new TikaException("Unable to unpack document stream", e);
        }
        try {
            updateMediaType(ais, mediaTypeRegistry, metadata);
            // Use the delegate parser to parse the contained document
            EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
            xhtml = new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();
            // The exception translation below deliberately covers only the
            // entry loop, exactly as before.
            try {
                ArchiveEntry entry = ais.getNextEntry();
                while (entry != null) {
                    if (!entry.isDirectory()) {
                        parseEntry(ais, entry, extractor, metadata, xhtml);
                    }
                    entry = ais.getNextEntry();
                }
            } catch (UnsupportedZipFeatureException zfe) {
                // If it's an encrypted document of unknown password, report as such
                if (zfe.getFeature() == Feature.ENCRYPTION) {
                    throw new EncryptedDocumentException(zfe);
                }
                // Otherwise throw the exception
                throw new TikaException("UnsupportedZipFeature", zfe);
            } catch (PasswordRequiredException pre) {
                throw new EncryptedDocumentException(pre);
            }
        } finally {
            ais.close();
        }
    } finally {
        // Runs even when ais.close() throws or the archive never opened.
        tmp.close();
    }
    // Kept after both closes so the event order matches the previous behaviour.
    xhtml.endDocument();
}
Use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
Class TXTParser, method parse.
/**
 * Parses plain text: detects the character encoding, records it in the
 * metadata, and emits the whole text inside a single {@code <p>} element.
 *
 * @param stream   text input; shielded so that closing the reader does not
 *                 close it
 * @param handler  downstream SAX handler receiving the XHTML events
 * @param metadata document metadata; an existing content type is kept as the
 *                 base type and extended with the detected charset
 * @param context  parse context used to look up the encoding detector
 * @throws IOException   if reading the text fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if the encoding cannot be determined
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Automatically detect the character encoding
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        // Start from text/plain, but prefer an already-detected content type;
        // it could be a subclass of text/plain such as vcal, etc.
        MediaType baseType = MediaType.TEXT_PLAIN;
        String declaredMime = metadata.get(Metadata.CONTENT_TYPE);
        MediaType declaredType = (declaredMime == null) ? null : MediaType.parse(declaredMime);
        if (declaredType != null) {
            baseType = declaredType;
        }

        Charset detected = reader.getCharset();
        metadata.set(Metadata.CONTENT_TYPE, new MediaType(baseType, detected).toString());
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, detected.name());

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");
        // Copy the decoded text through in fixed-size chunks.
        char[] chunk = new char[4096];
        int count;
        while ((count = reader.read(chunk)) != -1) {
            xhtml.characters(chunk, 0, count);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
Use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
Class OpenDocumentParser, method parse.
/**
 * Parses an OpenDocument package, reading it either as a {@code ZipFile} or
 * as a {@code ZipInputStream}.
 *
 * <p>A {@code ZipFile} is used when the input is a {@code TikaInputStream}
 * that already holds an open zip container or is backed by a file;
 * otherwise the stream itself is read as a zip stream. The real
 * end-of-document event is held back until all parts have been handled.
 *
 * @param stream      OpenDocument input
 * @param baseHandler downstream SAX handler receiving the XHTML events
 * @param metadata    document metadata
 * @param context     parse context
 * @throws IOException   if the zip cannot be read or closed
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if the content cannot be parsed
 */
public void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Open the Zip stream
    // Use a File if we can, and an already open zip is even better
    TikaInputStream tis = (stream instanceof TikaInputStream) ? (TikaInputStream) stream : null;
    Object container = (tis == null) ? null : tis.getOpenContainer();

    ZipFile zipFile = null;
    ZipInputStream zipStream = null;
    if (container instanceof ZipFile) {
        zipFile = (ZipFile) container;
    } else if (tis != null && tis.hasFile()) {
        zipFile = new ZipFile(tis.getFile());
    } else {
        zipStream = new ZipInputStream(stream);
    }

    // Prepare to handle the content
    XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
    // As we don't know which of the metadata or the content
    // we'll hit first, catch the endDocument call initially
    EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml);

    if (zipFile == null) {
        try {
            handleZipStream(zipStream, metadata, context, handler);
        } finally {
            //Do we want to close silently == catch an exception here?
            // NOTE(review): this presumably also closes the caller's stream,
            // since the zip stream wraps it directly - confirm whether a
            // close shield is wanted here.
            zipStream.close();
        }
    } else {
        try {
            handleZipFile(zipFile, metadata, context, handler);
        } finally {
            //Do we want to close silently == catch an exception here?
            zipFile.close();
        }
    }

    // Only now call the end document
    if (handler.getEndDocumentWasCalled()) {
        handler.reallyEndDocument();
    }
}
Aggregations