Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
The class JpegParser, method parse.
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        new ImageMetadataExtractor(metadata).parseJpeg(tis.getFile());
        new JempboxExtractor(metadata).parse(tis);
    } finally {
        tmp.dispose();
    }
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
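For context, a minimal sketch of how a caller might drive this parser. Since JPEGs carry no text, the emitted XHTML body is empty and the interesting output is the populated Metadata. This is not taken from the Tika sources; the JpegParser package is assumed to follow the Tika 1.x layout, and the photo.jpg path is a made-up example.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.jpeg.JpegParser;   // package assumed (Tika 1.x layout)
import org.apache.tika.sax.ToXMLContentHandler;

public class JpegParserDemo {
    public static void main(String[] args) throws Exception {
        Metadata metadata = new Metadata();
        ToXMLContentHandler handler = new ToXMLContentHandler();
        try (InputStream in = Files.newInputStream(Paths.get("photo.jpg"))) {  // hypothetical sample path
            new JpegParser().parse(in, handler, metadata, new ParseContext());
        }
        // The XHTML body is empty for JPEGs; the extracted EXIF/XMP fields end up in Metadata
        for (String name : metadata.names()) {
            System.out.println(name + " = " + metadata.get(name));
        }
    }
}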
Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
The class GeographicInformationParser, method parse.
@Override
public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
    metadata.set(Metadata.CONTENT_TYPE, geoInfoType);
    DataStore dataStore = null;
    DefaultMetadata defaultMetadata = null;
    XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(contentHandler, metadata);
    // Only create TemporaryResources when this method wraps the stream itself,
    // so the finally block only cleans up temp files created here.
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(inputStream) ? null : new TemporaryResources();
    try {
        TikaInputStream tikaInputStream = TikaInputStream.get(inputStream, tmp);
        File file = tikaInputStream.getFile();
        dataStore = DataStores.open(file);
        defaultMetadata = new DefaultMetadata(dataStore.getMetadata());
        if (defaultMetadata != null)
            extract(xhtmlContentHandler, metadata, defaultMetadata);
    } catch (UnsupportedStorageException e) {
        throw new TikaException("UnsupportedStorageException", e);
    } catch (DataStoreException e) {
        throw new TikaException("DataStoreException", e);
    } finally {
        if (tmp != null) {
            tmp.dispose();
        }
    }
}
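The extract(...) helper is not shown in this snippet. As a rough, hypothetical illustration of the XHTMLContentHandler calls such a helper relies on (the element names and values below are made up; this is not the Tika implementation):

// Hypothetical sketch: emitting a small structured summary through XHTMLContentHandler.
private void emitSummary(XHTMLContentHandler xhtml) throws SAXException {
    xhtml.startDocument();                          // writes the <html><head>...<body> preamble
    xhtml.element("h1", "Geographic metadata");     // convenience: start tag + characters + end tag
    xhtml.startElement("ul");
    xhtml.element("li", "title: example dataset");  // made-up values for illustration
    xhtml.element("li", "CRS: EPSG:4326");
    xhtml.endElement("ul");
    xhtml.endDocument();                            // closes <body></html> and ends the SAX document
}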
Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
The class GribParser, method parse.
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Set MIME type as grib2
    metadata.set(Metadata.CONTENT_TYPE, GRIB_MIME_TYPE);
    TikaInputStream tis = TikaInputStream.get(stream, new TemporaryResources());
    File gribFile = tis.getFile();
    try {
        NetcdfFile ncFile = NetcdfDataset.openFile(gribFile.getAbsolutePath(), null);
        // First parse out the set of global attributes
        for (Attribute attr : ncFile.getGlobalAttributes()) {
            Property property = resolveMetadataKey(attr.getFullName());
            if (attr.getDataType().isString()) {
                metadata.add(property, attr.getStringValue());
            } else if (attr.getDataType().isNumeric()) {
                int value = attr.getNumericValue().intValue();
                metadata.add(property, String.valueOf(value));
            }
        }
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.newline();
        xhtml.startElement("ul");
        xhtml.characters("dimensions:");
        xhtml.newline();
        for (Dimension dim : ncFile.getDimensions()) {
            xhtml.element("li", dim.getFullName() + "=" + String.valueOf(dim.getLength()) + ";");
            xhtml.newline();
        }
        xhtml.startElement("ul");
        xhtml.characters("variables:");
        xhtml.newline();
        for (Variable var : ncFile.getVariables()) {
            xhtml.element("p", String.valueOf(var.getDataType()) + var.getNameAndDimensions() + ";");
            for (Attribute element : var.getAttributes()) {
                xhtml.element("li", " :" + element + ";");
                xhtml.newline();
            }
        }
        xhtml.endElement("ul");
        xhtml.endElement("ul");
        xhtml.endDocument();
    } catch (IOException e) {
        throw new TikaException("NetCDF parse error", e);
    }
}
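A minimal sketch of running this parser and inspecting the XHTML it produces; here the dimension and variable lists end up in the document body rather than in Metadata. The GribParser package is assumed to follow the Tika 1.x layout, and the forecast.grb2 path is a made-up example.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.grib.GribParser;   // package assumed (Tika 1.x layout)
import org.apache.tika.sax.ToXMLContentHandler;

public class GribParserDemo {
    public static void main(String[] args) throws Exception {
        Metadata metadata = new Metadata();
        ToXMLContentHandler handler = new ToXMLContentHandler();
        try (InputStream in = Files.newInputStream(Paths.get("forecast.grb2"))) {  // hypothetical sample path
            new GribParser().parse(in, handler, metadata, new ParseContext());
        }
        // The body holds the "dimensions:" and "variables:" lists built by the parse method above
        System.out.println(handler.toString());
    }
}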
Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
The class HDFParser, method parse.
/*
 * (non-Javadoc)
 *
 * @see
 * org.apache.tika.parser.netcdf.NetCDFParser#parse(java.io.InputStream,
 * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
 * org.apache.tika.parser.ParseContext)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    IOUtils.copy(stream, os);
    String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
    if (name == null) {
        name = "";
    }
    try {
        NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray());
        unravelStringMet(ncFile, null, metadata);
    } catch (IOException e) {
        throw new TikaException("HDF parse error", e);
    }
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
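unravelStringMet is a private helper of HDFParser and is not shown in this snippet. As a rough, hypothetical sketch (not the actual Tika implementation) of what copying string-valued attributes from a NetCDF group tree into Tika Metadata can look like with the ucar.nc2 API:

// Hypothetical sketch, not Tika's unravelStringMet: walk the group tree and
// copy every string-valued attribute into the Tika Metadata object.
private static void copyStringAttributes(NetcdfFile ncFile, Group group, Metadata metadata) {
    if (group == null) {
        group = ncFile.getRootGroup();
    }
    for (Attribute attr : group.getAttributes()) {
        if (attr.getDataType().isString()) {
            metadata.add(attr.getFullName(), attr.getStringValue());
        }
    }
    for (Group child : group.getGroups()) {
        copyStringAttributes(ncFile, child, metadata);   // recurse into nested groups
    }
}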
Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
The class BPGParser, method parse.
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Check for the magic header signature
    byte[] signature = new byte[4];
    IOUtils.readFully(stream, signature);
    if (signature[0] == (byte) 'B' && signature[1] == (byte) 'P' && signature[2] == (byte) 'G' && signature[3] == (byte) 0xfb) {
        // Good, signature found
    } else {
        throw new TikaException("BPG magic signature invalid");
    }
    // Grab and decode the first byte
    int pdf = stream.read();
    // Pixel format: Greyscale / 4:2:0 / 4:2:2 / 4:4:4
    int pixelFormat = pdf & 0x7;
    // TODO Identify a suitable metadata key for this
    // Is there an alpha plane as well as a colour plane?
    boolean hasAlphaPlane1 = (pdf & 0x8) == 0x8;
    // TODO Identify a suitable metadata key for this+hasAlphaPlane2
    // Bit depth minus 8
    int bitDepth = (pdf >> 4) + 8;
    metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(bitDepth));
    // Grab and decode the second byte
    int cer = stream.read();
    // Colour Space: YCbCr / RGB / YCgCo / YCbCrK / CMYK
    int colourSpace = cer & 0x15;
    switch(colourSpace) {
        case 0:
            metadata.set(Photoshop.COLOR_MODE, "YCbCr Colour");
            break;
        case 1:
            metadata.set(Photoshop.COLOR_MODE, "RGB Colour");
            break;
        case 2:
            metadata.set(Photoshop.COLOR_MODE, "YCgCo Colour");
            break;
        case 3:
            metadata.set(Photoshop.COLOR_MODE, "YCbCrK Colour");
            break;
        case 4:
            metadata.set(Photoshop.COLOR_MODE, "CMYK Colour");
            break;
    }
    // Are there extensions or not?
    boolean hasExtensions = (cer & 16) == 16;
    // Is the Alpha Plane 2 flag set?
    boolean hasAlphaPlane2 = (cer & 32) == 32;
    // cer then holds 2 more booleans - limited range, reserved
    // Width and height next
    int width = (int) EndianUtils.readUE7(stream);
    int height = (int) EndianUtils.readUE7(stream);
    metadata.set(TIFF.IMAGE_LENGTH, height);
    metadata.set(TIFF.IMAGE_WIDTH, width);
    // Picture Data length
    EndianUtils.readUE7(stream);
    // Extension Data Length, if extensions present
    long extensionDataLength = 0;
    if (hasExtensions)
        extensionDataLength = EndianUtils.readUE7(stream);
    // Alpha Data Length, if alpha used
    long alphaDataLength = 0;
    if (hasAlphaPlane1 || hasAlphaPlane2)
        alphaDataLength = EndianUtils.readUE7(stream);
    // Extension Data
    if (hasExtensions) {
        long extensionsDataSeen = 0;
        ImageMetadataExtractor metadataExtractor = new ImageMetadataExtractor(metadata);
        while (extensionsDataSeen < extensionDataLength) {
            int extensionType = (int) EndianUtils.readUE7(stream);
            int extensionLength = (int) EndianUtils.readUE7(stream);
            switch(extensionType) {
                case EXTENSION_TAG_EXIF:
                    metadataExtractor.parseRawExif(stream, extensionLength, true);
                    break;
                case EXTENSION_TAG_XMP:
                    handleXMP(stream, extensionLength, metadataExtractor);
                    break;
                default:
                    stream.skip(extensionLength);
            }
            extensionsDataSeen += extensionLength;
        }
    }
    // HEVC Header + Data
    // Alpha HEVC Header + Data
    // We can't do anything with these parts
    // We don't have any helpful text, sorry...
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
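EndianUtils.readUE7 reads BPG's ue7(v) encoding: an unsigned integer split into 7-bit groups, most significant group first, where the top bit of every byte except the last is set. A minimal standalone sketch of that decoding, assuming this is what the helper does; it is not the Tika implementation.

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;

final class UE7 {
    // Sketch of ue7 decoding as described in the BPG format specification:
    // big-endian base-128, the high bit of each byte means "more bytes follow".
    static long read(InputStream stream) throws IOException {
        long value = 0;
        int b;
        do {
            b = stream.read();
            if (b == -1) {
                throw new EOFException("Unexpected end of stream inside ue7 value");
            }
            value = (value << 7) | (b & 0x7f);   // append the next 7 payload bits
        } while ((b & 0x80) != 0);               // continuation bit set, keep reading
        return value;
    }
}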