Search in sources :

Example 41 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class JpegParser method parse.

/**
 * Collects metadata from a JPEG stream and emits an empty XHTML document.
 *
 * <p>The stream is wrapped as a {@code TikaInputStream} so its content can be
 * handed to {@code ImageMetadataExtractor#parseJpeg} as a file; the same
 * wrapped stream is then passed to {@code JempboxExtractor#parse}. No text
 * content is written to the handler.
 *
 * @param stream   the JPEG data to inspect
 * @param handler  receives the (empty) XHTML document events
 * @param metadata populated by the two extractors
 * @param context  parse context (not used by this method)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream tikaStream = TikaInputStream.get(stream, resources);

        final ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
        imageExtractor.parseJpeg(tikaStream.getFile());

        final JempboxExtractor xmpExtractor = new JempboxExtractor(metadata);
        xmpExtractor.parse(tikaStream);
    } finally {
        // Always release whatever the TikaInputStream registered with the resources.
        resources.dispose();
    }

    // Nothing textual to report: open and immediately close the document.
    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.endDocument();
}
Also used : JempboxExtractor(org.apache.tika.parser.image.xmp.JempboxExtractor) ImageMetadataExtractor(org.apache.tika.parser.image.ImageMetadataExtractor) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 42 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class GeographicInformationParser method parse.

/**
 * Opens the input as a geographic data store and extracts its metadata.
 *
 * <p>The content type is set to {@code geoInfoType}, the stream is spooled to
 * a file via {@code TikaInputStream}, the file is opened with
 * {@code DataStores.open}, and the store's metadata is passed to
 * {@code extract} together with the XHTML handler.
 *
 * @param inputStream    the data to parse
 * @param contentHandler receives the XHTML events produced by {@code extract}
 * @param metadata       receives the content type and extracted values
 * @param parseContext   parse context (not used by this method)
 * @throws TikaException if the storage format is unsupported or the data
 *                       store reports an error while opening, reading or closing
 */
@Override
public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
    metadata.set(Metadata.CONTENT_TYPE, geoInfoType);
    XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(contentHandler, metadata);
    // Temporary resources are only created (and later disposed) here when the
    // caller did not already supply a TikaInputStream.
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(inputStream) ? null : new TemporaryResources();
    try {
        TikaInputStream tikaInputStream = TikaInputStream.get(inputStream, tmp);
        File file = tikaInputStream.getFile();
        // Close the data store before the temporary resources are disposed in
        // the outer finally. A failure on close is a DataStoreException and is
        // translated by the catch clause below.
        try (DataStore dataStore = DataStores.open(file)) {
            DefaultMetadata defaultMetadata = new DefaultMetadata(dataStore.getMetadata());
            extract(xhtmlContentHandler, metadata, defaultMetadata);
        }
    } catch (UnsupportedStorageException e) {
        throw new TikaException("UnsupportedStorageException", e);
    } catch (DataStoreException e) {
        throw new TikaException("DataStoreException", e);
    } finally {
        if (tmp != null) {
            tmp.dispose();
        }
    }
}
Also used : DataStoreException(org.apache.sis.storage.DataStoreException) TikaException(org.apache.tika.exception.TikaException) DataStore(org.apache.sis.storage.DataStore) DefaultMetadata(org.apache.sis.metadata.iso.DefaultMetadata) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) File(java.io.File) UnsupportedStorageException(org.apache.sis.storage.UnsupportedStorageException)

Example 43 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class GribParser method parse.

/**
 * Parses a GRIB file: copies its global attributes into the metadata and
 * writes its dimensions and variables to the handler as XHTML lists.
 *
 * <p>The stream is spooled to a file through {@code TikaInputStream} because
 * the file is opened by absolute path. Both the opened {@code NetcdfFile} and
 * the temporary resources are released before this method returns, whether
 * or not parsing succeeds.
 *
 * @param stream   the GRIB data
 * @param handler  receives the XHTML events
 * @param metadata receives the content type and the global attributes
 * @param context  parse context (not used by this method)
 * @throws TikaException if opening, reading or closing the file fails with an
 *                       {@code IOException}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    //Set MIME type as grib2
    metadata.set(Metadata.CONTENT_TYPE, GRIB_MIME_TYPE);
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        File gribFile = tis.getFile();
        try {
            NetcdfFile ncFile = NetcdfDataset.openFile(gribFile.getAbsolutePath(), null);
            try {
                // first parse out the set of global attributes
                for (Attribute attr : ncFile.getGlobalAttributes()) {
                    Property property = resolveMetadataKey(attr.getFullName());
                    if (attr.getDataType().isString()) {
                        metadata.add(property, attr.getStringValue());
                    } else if (attr.getDataType().isNumeric()) {
                        int value = attr.getNumericValue().intValue();
                        metadata.add(property, String.valueOf(value));
                    }
                }
                XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
                xhtml.startDocument();
                xhtml.newline();
                xhtml.startElement("ul");
                xhtml.characters("dimensions:");
                xhtml.newline();
                for (Dimension dim : ncFile.getDimensions()) {
                    xhtml.element("li", dim.getFullName() + "=" + dim.getLength() + ";");
                    xhtml.newline();
                }
                // The variables list is opened inside the dimensions list; both
                // are closed together below. Kept as-is so the output is unchanged.
                xhtml.startElement("ul");
                xhtml.characters("variables:");
                xhtml.newline();
                for (Variable var : ncFile.getVariables()) {
                    xhtml.element("p", String.valueOf(var.getDataType()) + var.getNameAndDimensions() + ";");
                    for (Attribute element : var.getAttributes()) {
                        xhtml.element("li", " :" + element + ";");
                        xhtml.newline();
                    }
                }
                xhtml.endElement("ul");
                xhtml.endElement("ul");
                xhtml.endDocument();
            } finally {
                // Release the file handle before the temp file is deleted.
                ncFile.close();
            }
        } catch (IOException e) {
            throw new TikaException("NetCDF parse error", e);
        }
    } finally {
        // Remove the temporary file created when the stream was spooled to disk.
        tmp.dispose();
    }
}
Also used : NetcdfFile(ucar.nc2.NetcdfFile) Variable(ucar.nc2.Variable) TikaException(org.apache.tika.exception.TikaException) Attribute(ucar.nc2.Attribute) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) Dimension(ucar.nc2.Dimension) IOException(java.io.IOException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) File(java.io.File) NetcdfFile(ucar.nc2.NetcdfFile) Property(org.apache.tika.metadata.Property)

Example 44 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class HDFParser method parse.

/*
 * (non-Javadoc)
 *
 * Buffers the whole stream in memory, opens it as an in-memory NetcdfFile
 * named after the RESOURCE_NAME_KEY metadata value (empty string when that
 * value is absent), passes the file to unravelStringMet to populate the
 * metadata, and finally emits an empty XHTML document. The NetcdfFile is
 * closed once unravelStringMet returns or throws.
 *
 * An IOException raised while opening, reading or closing the NetcdfFile is
 * rethrown as a TikaException("HDF parse error").
 *
 * @see
 * org.apache.tika.parser.netcdf.NetCDFParser#parse(java.io.InputStream,
 * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
 * org.apache.tika.parser.ParseContext)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    IOUtils.copy(stream, os);
    String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
    if (name == null) {
        name = "";
    }
    try {
        NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray());
        try {
            unravelStringMet(ncFile, null, metadata);
        } finally {
            // Release whatever the library holds for the opened file.
            ncFile.close();
        }
    } catch (IOException e) {
        throw new TikaException("HDF parse error", e);
    }
    // No text content is produced; only the metadata is of interest.
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
Also used : NetcdfFile(ucar.nc2.NetcdfFile) TikaException(org.apache.tika.exception.TikaException) ByteArrayOutputStream(java.io.ByteArrayOutputStream) IOException(java.io.IOException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 45 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class BPGParser method parse.

/**
 * Reads the header of a BPG image and records what it finds as metadata;
 * no text content is produced.
 *
 * <p>Steps, in stream order: verify the 4-byte magic signature, decode two
 * flag bytes (bit depth, colour space, extension/alpha flags), read width and
 * height, read the length fields, then walk the extension data handing EXIF
 * and XMP payloads to {@code ImageMetadataExtractor} / {@code handleXMP}.
 *
 * @param stream   the BPG data, positioned at the start of the file
 * @param handler  receives an empty XHTML document
 * @param metadata receives bits-per-sample, colour mode, width, height and
 *                 anything the EXIF/XMP handlers add
 * @param context  parse context (not used by this method)
 * @throws TikaException if the signature is wrong, or the stream ends inside
 *                       the two flag bytes or inside a skipped extension
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Check for the magic header signature
    byte[] signature = new byte[4];
    IOUtils.readFully(stream, signature);
    boolean signatureFound = signature[0] == (byte) 'B' && signature[1] == (byte) 'P' && signature[2] == (byte) 'G' && signature[3] == (byte) 0xfb;
    if (!signatureFound) {
        throw new TikaException("BPG magic signature invalid");
    }
    // Grab and decode the first byte
    int pdf = stream.read();
    if (pdf < 0) {
        // read() returns -1 at end of stream; decoding that as flags would
        // record a bogus bit depth.
        throw new TikaException("BPG header truncated");
    }
    // Pixel format: Greyscale / 4:2:0 / 4:2:2 / 4:4:4
    int pixelFormat = pdf & 0x7;
    // TODO Identify a suitable metadata key for this
    // Is there an alpha plane as well as a colour plane?
    boolean hasAlphaPlane1 = (pdf & 0x8) == 0x8;
    // TODO Identify a suitable metadata key for this+hasAlphaPlane2
    // Bit depth minus 8
    int bitDepth = (pdf >> 4) + 8;
    metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(bitDepth));
    // Grab and decode the second byte
    int cer = stream.read();
    if (cer < 0) {
        throw new TikaException("BPG header truncated");
    }
    // Colour Space: YCbCr / RGB / YCgCo / YCbCrK / CMYK
    // NOTE(review): 0x15 is binary 10101, so bits 1 and 3 are masked out
    // (cases 2 and 3 below can never match) and bit 4 is shared with the
    // extensions flag tested further down. This looks unintended; confirm the
    // header bit layout against the BPG specification before changing it,
    // since the metadata callers currently see depends on these masks.
    int colourSpace = cer & 0x15;
    switch(colourSpace) {
        case 0:
            metadata.set(Photoshop.COLOR_MODE, "YCbCr Colour");
            break;
        case 1:
            metadata.set(Photoshop.COLOR_MODE, "RGB Colour");
            break;
        case 2:
            metadata.set(Photoshop.COLOR_MODE, "YCgCo Colour");
            break;
        case 3:
            metadata.set(Photoshop.COLOR_MODE, "YCbCrK Colour");
            break;
        case 4:
            metadata.set(Photoshop.COLOR_MODE, "CMYK Colour");
            break;
    }
    // Are there extensions or not?
    boolean hasExtensions = (cer & 16) == 16;
    // Is the Alpha Plane 2 flag set?
    boolean hasAlphaPlane2 = (cer & 32) == 32;
    // cer then holds 2 more booleans - limited range, reserved
    // Width and height next
    int width = (int) EndianUtils.readUE7(stream);
    int height = (int) EndianUtils.readUE7(stream);
    metadata.set(TIFF.IMAGE_LENGTH, height);
    metadata.set(TIFF.IMAGE_WIDTH, width);
    // Picture Data length (value unused, but it must be read to advance the stream)
    EndianUtils.readUE7(stream);
    // Extension Data Length, if extensions present
    long extensionDataLength = 0;
    if (hasExtensions) {
        extensionDataLength = EndianUtils.readUE7(stream);
    }
    // Alpha Data Length, if alpha used (value unused, read only to advance the stream)
    long alphaDataLength = 0;
    if (hasAlphaPlane1 || hasAlphaPlane2) {
        alphaDataLength = EndianUtils.readUE7(stream);
    }
    // Extension Data
    if (hasExtensions) {
        long extensionsDataSeen = 0;
        ImageMetadataExtractor metadataExtractor = new ImageMetadataExtractor(metadata);
        while (extensionsDataSeen < extensionDataLength) {
            int extensionType = (int) EndianUtils.readUE7(stream);
            int extensionLength = (int) EndianUtils.readUE7(stream);
            switch(extensionType) {
                case EXTENSION_TAG_EXIF:
                    metadataExtractor.parseRawExif(stream, extensionLength, true);
                    break;
                case EXTENSION_TAG_XMP:
                    handleXMP(stream, extensionLength, metadataExtractor);
                    break;
                default: {
                    // InputStream.skip may skip fewer bytes than requested, so
                    // keep going until the whole unknown extension is consumed.
                    long remaining = extensionLength;
                    while (remaining > 0) {
                        long skipped = stream.skip(remaining);
                        if (skipped <= 0) {
                            // No progress from skip(): use read() to tell a
                            // stream that cannot skip apart from end of stream.
                            if (stream.read() < 0) {
                                throw new TikaException("BPG extension data truncated");
                            }
                            skipped = 1;
                        }
                        remaining -= skipped;
                    }
                    break;
                }
            }
            extensionsDataSeen += extensionLength;
        }
    }
    // HEVC Header + Data
    // Alpha HEVC Header + Data
    // We can't do anything with these parts
    // We don't have any helpful text, sorry...
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
Also used : TikaException(org.apache.tika.exception.TikaException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Aggregations

XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)72 TikaException (org.apache.tika.exception.TikaException)26 TikaInputStream (org.apache.tika.io.TikaInputStream)22 TemporaryResources (org.apache.tika.io.TemporaryResources)14 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)13 IOException (java.io.IOException)12 SAXException (org.xml.sax.SAXException)9 File (java.io.File)6 EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor)6 Metadata (org.apache.tika.metadata.Metadata)6 BufferedInputStream (java.io.BufferedInputStream)5 InputStream (java.io.InputStream)5 EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 Charset (java.nio.charset.Charset)4 ArrayList (java.util.ArrayList)4 Map (java.util.Map)4 MediaType (org.apache.tika.mime.MediaType)4 OfflineContentHandler (org.apache.tika.sax.OfflineContentHandler)4 InputStreamReader (java.io.InputStreamReader)3