use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class PSDParser method parse.
/**
 * Parses the header and Image Resources section of a PSD/PSB stream.
 *
 * <p>Validates the "8BPS" signature and the version (1 or 2), records the
 * image height, width, bit depth and colour mode in {@code metadata}, adds
 * the caption resource (if present) as a description, and finally emits an
 * empty XHTML document to {@code handler}.
 *
 * @param stream   the stream to read, positioned at the start of the file
 * @param handler  receives the (empty) XHTML document
 * @param metadata populated with the extracted properties
 * @param context  parse context (not used by this method)
 * @throws IOException   if the stream cannot be read or ends inside the
 *                       Color Mode section
 * @throws SAXException  if the handler rejects the XHTML events
 * @throws TikaException if the signature or version is not recognised
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Check for the magic header signature
    byte[] signature = new byte[4];
    IOUtils.readFully(stream, signature);
    boolean signatureOk = signature[0] == (byte) '8' && signature[1] == (byte) 'B'
            && signature[2] == (byte) 'P' && signature[3] == (byte) 'S';
    if (!signatureOk) {
        throw new TikaException("PSD/PSB magic signature invalid");
    }

    // Check the version; only 1 and 2 are supported
    int version = EndianUtils.readUShortBE(stream);
    if (version != 1 && version != 2) {
        throw new TikaException("Invalid PSD/PSB version " + version);
    }

    // Skip the reserved block
    IOUtils.readFully(stream, new byte[6]);

    // Number of channels in the image. The value is read only to advance
    // the stream past it.
    // TODO Identify a suitable metadata key for this
    EndianUtils.readUShortBE(stream);

    // Height comes before width in the header
    int height = EndianUtils.readIntBE(stream);
    int width = EndianUtils.readIntBE(stream);
    metadata.set(TIFF.IMAGE_LENGTH, height);
    metadata.set(TIFF.IMAGE_WIDTH, width);

    // Depth (bits per channel)
    int depth = EndianUtils.readUShortBE(stream);
    metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(depth));

    // Colour mode, eg Bitmap or RGB. The value comes straight from the file,
    // so it must be range-checked before being used as an array index;
    // unknown modes are simply not recorded.
    int colorMode = EndianUtils.readUShortBE(stream);
    if (colorMode >= 0 && colorMode < Photoshop._COLOR_MODE_CHOICES_INDEXED.length) {
        metadata.set(Photoshop.COLOR_MODE, Photoshop._COLOR_MODE_CHOICES_INDEXED[colorMode]);
    }

    // Next is the Color Mode section
    // We don't care about this bit, but it has to be skipped completely:
    // InputStream.skip() is allowed to skip fewer bytes than requested.
    long colorModeSectionSize = EndianUtils.readIntBE(stream);
    long remaining = colorModeSectionSize;
    while (remaining > 0) {
        long skipped = stream.skip(remaining);
        if (skipped > 0) {
            remaining -= skipped;
        } else if (stream.read() >= 0) {
            // skip() made no progress but the stream is not at its end
            remaining--;
        } else {
            throw new IOException("PSD/PSB stream ended inside the Color Mode section");
        }
    }

    // Next is the Image Resources section
    // Check for certain interesting keys here
    long imageResourcesSectionSize = EndianUtils.readIntBE(stream);
    long read = 0;
    while (read < imageResourcesSectionSize) {
        ResourceBlock rb = new ResourceBlock(stream);
        read += rb.totalLength;

        // Is it one we can do something useful with?
        if (rb.id == ResourceBlock.ID_CAPTION) {
            metadata.add(TikaCoreProperties.DESCRIPTION, rb.getDataAsString());
        } else if (rb.id == ResourceBlock.ID_EXIF_1) {
            // TODO Parse the EXIF info via ImageMetadataExtractor
        } else if (rb.id == ResourceBlock.ID_EXIF_3) {
            // TODO Parse the EXIF info via ImageMetadataExtractor
        } else if (rb.id == ResourceBlock.ID_XMP) {
            // TODO Parse the XMP info via ImageMetadataExtractor
        }
    }

    // Next is the Layer and Mask Info
    // Finally we have Image Data
    // We can't do anything with these parts

    // We don't have any helpful text, sorry...
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class IptcAnpaParser method parse.
/**
 * Loads the properties from {@code stream}, copies them into
 * {@code metadata}, and writes an XHTML document containing a single
 * paragraph with the "body" property (after it has been passed through
 * {@code clean}).
 *
 * @param stream   the stream to read the properties from
 * @param handler  receives the XHTML document
 * @param metadata populated from the loaded properties
 * @param context  parse context (not used by this method)
 * @throws IOException   declared by the parser contract
 * @throws SAXException  if the handler rejects the XHTML events
 * @throws TikaException declared by the parser contract
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final HashMap<String, String> fields = this.loadProperties(stream);
    this.setMetadata(metadata, fields);

    final XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
    out.startDocument();

    // TODO: put body content here
    out.startElement("p");
    final String text = clean(fields.get("body"));
    // The paragraph is emitted even when there is no text to put in it.
    if (text != null) {
        out.characters(text);
    }
    out.endElement("p");

    out.endDocument();
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class ISArchiveParser method parse.
/**
 * Parses an ISA-Tab study file together with its sibling investigation and
 * assay files.
 *
 * <p>The input is wrapped in a {@link TikaInputStream} so that it can be
 * addressed as a file on disk. If no location has been set yet, the
 * directory of that file becomes the location that is scanned for
 * investigation files (names matching {@code i_.+\.txt}).
 *
 * @param stream   the study file to parse
 * @param handler  receives the XHTML document
 * @param metadata passed on to the investigation, study and assay parsing
 * @param context  passed on to the investigation, study and assay parsing
 * @throws IOException   if the input cannot be read or spooled to a file
 * @throws SAXException  if the handler rejects the XHTML events
 * @throws TikaException declared by the parser contract
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Temporary resources are only created (and later disposed) by us when
    // the caller did not already hand in a TikaInputStream.
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
    TikaInputStream tis = TikaInputStream.get(stream, tmp);
    try {
        File studyFile = tis.getFile();
        if (this.location == null) {
            this.location = studyFile.getParent() + File.separator;
        }
        this.studyFileName = studyFile.getName();

        File locationFile = new File(location);
        String[] investigationList = locationFile.list(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.matches("i_.+\\.txt");
            }
        });

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        parseInvestigation(investigationList, xhtml, metadata, context);
        // Read the study through the TikaInputStream: obtaining the file
        // above may already have drained the raw stream into a temp file,
        // in which case the raw stream has nothing left to read. When the
        // caller supplied a TikaInputStream, tis and stream are the same
        // object, so this is unchanged for that case.
        parseStudy(tis, xhtml, metadata, context);
        parseAssay(xhtml, metadata, context);
        xhtml.endDocument();
    } finally {
        if (tmp != null) {
            tmp.dispose();
        }
    }
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class IWorkPackageParser method parse.
/**
 * Walks the entries of an iWork zip package and, for every entry listed in
 * {@code IWORK_CONTENT_ENTRIES} whose document type can be detected, emits
 * an XHTML document for it.
 *
 * <p>The detected type is added to {@code metadata} as a content type.
 * Keynote, Numbers and Pages entries are SAX-parsed through their dedicated
 * content handlers; encrypted entries produce an empty document.
 *
 * @param stream   the zip package to read
 * @param handler  receives the XHTML document(s)
 * @param metadata receives the content type; also passed to the handlers
 * @param context  supplies the SAX parser
 * @throws IOException   if the archive cannot be read
 * @throws SAXException  if SAX parsing or the handler fails
 * @throws TikaException if a detected type has no handling here
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    ZipArchiveInputStream archive = new ZipArchiveInputStream(stream);

    for (ZipArchiveEntry current = archive.getNextZipEntry();
            current != null;
            current = archive.getNextZipEntry()) {
        // Only the known content entries are of interest
        if (!IWORK_CONTENT_ENTRIES.contains(current.getName())) {
            continue;
        }

        // Mark before detection and reset afterwards, so that the SAX
        // parser below sees the entry from its first byte.
        InputStream buffered = new BufferedInputStream(archive, 4096);
        buffered.mark(4096);
        IWORKDocumentType detected = IWORKDocumentType.detectType(buffered);
        buffered.reset();
        if (detected == null) {
            continue;
        }

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        ContentHandler delegate;
        switch (detected) {
            case KEYNOTE:
                delegate = new KeynoteContentHandler(xhtml, metadata);
                break;
            case NUMBERS:
                delegate = new NumbersContentHandler(xhtml, metadata);
                break;
            case PAGES:
                delegate = new PagesContentHandler(xhtml, metadata);
                break;
            case ENCRYPTED:
                // We can't do anything for the file right now
                delegate = null;
                break;
            default:
                throw new TikaException("Unhandled iWorks file " + detected);
        }

        metadata.add(Metadata.CONTENT_TYPE, detected.getType().toString());

        xhtml.startDocument();
        if (delegate != null) {
            InputStream shielded = new CloseShieldInputStream(buffered);
            context.getSAXParser().parse(shielded, new OfflineContentHandler(delegate));
        }
        xhtml.endDocument();
    }
    // Don't close the zip InputStream (TIKA-1117).
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class AbstractDBParser method parse.
/**
 * Opens a database connection for the given stream and writes every table
 * as an XHTML {@code <table>}: a header row built from the table reader's
 * headers, followed by the rows the reader emits.
 *
 * <p>Each table name is also added to {@code metadata}. The connection is
 * closed before this method returns, whether or not parsing succeeded.
 *
 * @param stream   the database to read
 * @param handler  receives the XHTML document
 * @param metadata receives the table names
 * @param context  passed on to the connection and table readers
 * @throws IOException   if the table names cannot be listed (wrapping the
 *                       underlying {@link SQLException}) or reading fails
 * @throws SAXException  if the handler rejects the XHTML events
 * @throws TikaException declared by the parser contract
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    connection = getConnection(stream, metadata, context);

    List<String> tableNames;
    try {
        tableNames = getTableNames(connection, metadata, context);
    } catch (SQLException e) {
        try {
            close();
        } catch (IOException | SQLException closeFailure) {
            // Keep the original failure as the one that is reported; a
            // failure to close must not replace it.
            e.addSuppressed(closeFailure);
        }
        throw new IOExceptionWithCause(e);
    }

    for (String tableName : tableNames) {
        //add table names to parent metadata
        metadata.add(Database.TABLE_NAME, tableName);
    }

    XHTMLContentHandler xHandler = new XHTMLContentHandler(handler, metadata);
    boolean documentStarted = false;
    try {
        // Started inside the try so that the connection is still closed
        // if the handler fails right away.
        xHandler.startDocument();
        documentStarted = true;

        for (String tableName : tableNames) {
            JDBCTableReader tableReader = getTableReader(connection, tableName, context);
            xHandler.startElement("table", "name", tableReader.getTableName());

            xHandler.startElement("thead");
            xHandler.startElement("tr");
            for (String header : tableReader.getHeaders()) {
                xHandler.startElement("th");
                xHandler.characters(header);
                xHandler.endElement("th");
            }
            xHandler.endElement("tr");
            xHandler.endElement("thead");

            xHandler.startElement("tbody");
            while (tableReader.nextRow(xHandler, context)) {
                //no-op: nextRow writes the row to the handler itself
            }
            xHandler.endElement("tbody");

            xHandler.endElement("table");
        }
    } finally {
        try {
            close();
        } catch (IOException | SQLException ignored) {
            // Best effort: a failure to close must not hide a parse result
            // or a parse failure.
        }
        if (documentStarted) {
            xHandler.endDocument();
        }
    }
}
Aggregations