Search in sources :

Example 46 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class FeedParser method parse.

/**
 * Reads a syndication feed from {@code stream} and writes it out as XHTML.
 *
 * <p>The feed title and description are stored in {@code metadata} under
 * {@link TikaCoreProperties#TITLE} and {@link TikaCoreProperties#DESCRIPTION},
 * then emitted as an {@code h1} and a {@code p}. Every entry that has a link
 * becomes an {@code li} holding an anchor with the entry title, followed by the
 * tag-stripped entry description when one exists. Entries without a link are
 * skipped entirely.
 *
 * @param stream   the feed to read; it is wrapped in a {@code CloseShieldInputStream}
 *                 before being handed to the feed reader
 * @param handler  receives the generated XHTML SAX events
 * @param metadata receives the feed title and description
 * @param context  parse context (not read by this method)
 * @throws TikaException if the feed reader reports a {@code FeedException}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // NOTE(review): the original author left an open question here about setting the encoding.
    try {
        InputSource source = new InputSource(new CloseShieldInputStream(stream));
        SyndFeed feed = new SyndFeedInput().build(source);

        String feedTitle = stripTags(feed.getTitleEx());
        String feedDescription = stripTags(feed.getDescriptionEx());
        metadata.set(TikaCoreProperties.TITLE, feedTitle);
        metadata.set(TikaCoreProperties.DESCRIPTION, feedDescription);
        // NOTE(review): other feed fields are not yet copied into the metadata.

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.element("h1", feedTitle);
        xhtml.element("p", feedDescription);

        xhtml.startElement("ul");
        for (Object item : feed.getEntries()) {
            SyndEntry entry = (SyndEntry) item;
            String href = entry.getLink();
            if (href == null) {
                // Without a link there is nothing to anchor the entry to.
                continue;
            }
            xhtml.startElement("li");
            xhtml.startElement("a", "href", href);
            xhtml.characters(stripTags(entry.getTitleEx()));
            xhtml.endElement("a");
            SyndContent summary = entry.getDescription();
            if (summary != null) {
                xhtml.newline();
                xhtml.characters(stripTags(summary));
            }
            xhtml.endElement("li");
        }
        xhtml.endElement("ul");

        xhtml.endDocument();
    } catch (FeedException cause) {
        throw new TikaException("RSS parse error", cause);
    }
}
Also used : SyndFeed(com.rometools.rome.feed.synd.SyndFeed) InputSource(org.xml.sax.InputSource) TikaException(org.apache.tika.exception.TikaException) SyndContent(com.rometools.rome.feed.synd.SyndContent) SyndFeedInput(com.rometools.rome.io.SyndFeedInput) SyndEntry(com.rometools.rome.feed.synd.SyndEntry) FeedException(com.rometools.rome.io.FeedException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 47 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class AdobeFontMetricParser method parse.

/**
 * Parses an Adobe Font Metrics file with FontBox, records the font properties
 * in {@code metadata}, and emits the file's remaining comments as XHTML.
 *
 * <p>When at least one comment remains after {@code extractCreationDate} has
 * run, the output contains an {@code h1} "Comments" heading followed by a
 * {@code div class="comments"} with one {@code p} per comment. Otherwise the
 * document is empty.
 *
 * @param stream   the AFM data to parse
 * @param handler  receives the generated XHTML SAX events
 * @param metadata receives content type, title and font properties
 * @param context  parse context (not read by this method)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // FontBox does the actual AFM parsing.
    FontMetrics fontMetrics = new AFMParser(stream).parse();

    // Work on a private mutable copy: extractCreationDate modifies the list it receives.
    List<String> comments = new ArrayList<>(fontMetrics.getComments());
    extractCreationDate(metadata, comments);

    metadata.set(Metadata.CONTENT_TYPE, AFM_TYPE.toString());
    metadata.set(TikaCoreProperties.TITLE, fontMetrics.getFullName());

    // Font-specific metadata.
    String averageCharWidth = Float.toString(fontMetrics.getAverageCharacterWidth());
    addMetadataByString(metadata, MET_AVG_CHAR_WIDTH, averageCharWidth);
    String afmVersion = Float.toString(fontMetrics.getAFMVersion());
    addMetadataByString(metadata, MET_DOC_VERSION, afmVersion);
    addMetadataByString(metadata, MET_FONT_NAME, fontMetrics.getFontName());
    addMetadataByString(metadata, MET_FONT_FULL_NAME, fontMetrics.getFullName());
    addMetadataByString(metadata, MET_FONT_FAMILY_NAME, fontMetrics.getFamilyName());
    addMetadataByString(metadata, MET_FONT_VERSION, fontMetrics.getFontVersion());
    addMetadataByString(metadata, MET_FONT_WEIGHT, fontMetrics.getWeight());
    addMetadataByString(metadata, MET_FONT_NOTICE, fontMetrics.getNotice());
    String underlineThickness = Float.toString(fontMetrics.getUnderlineThickness());
    addMetadataByString(metadata, MET_FONT_UNDERLINE_THICKNESS, underlineThickness);

    // Whatever comments are left become the textual content.
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    if (!comments.isEmpty()) {
        xhtml.element("h1", "Comments");
        xhtml.startElement("div", "class", "comments");
        for (String line : comments) {
            xhtml.element("p", line);
        }
        xhtml.endElement("div");
    }
    xhtml.endDocument();
}
Also used : AFMParser(org.apache.fontbox.afm.AFMParser) FontMetrics(org.apache.fontbox.afm.FontMetrics) ArrayList(java.util.ArrayList) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 48 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class GDALParser method processOutput.

/**
 * Emits {@code output} as the text of a single {@code p} element inside an
 * XHTML document written to {@code handler}.
 *
 * <p>The text is encoded to UTF-8 bytes and decoded again through a reader,
 * then forwarded in chunks of up to 1024 characters. {@code endDocument()} is
 * always invoked, even when writing the body fails.
 *
 * @param handler  receives the generated XHTML SAX events
 * @param metadata passed to the {@code XHTMLContentHandler}
 * @param output   the text to emit
 */
private void processOutput(ContentHandler handler, Metadata metadata, String output) throws SAXException, IOException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    InputStream encoded = new ByteArrayInputStream(output.getBytes(UTF_8));
    try (Reader text = new InputStreamReader(encoded, UTF_8)) {
        xhtml.startDocument();
        xhtml.startElement("p");

        char[] chunk = new char[1024];
        int count;
        while ((count = text.read(chunk)) != -1) {
            xhtml.characters(chunk, 0, count);
        }

        xhtml.endElement("p");
    } finally {
        // Close the document regardless of whether the body was written completely.
        xhtml.endDocument();
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) ByteArrayInputStream(java.io.ByteArrayInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 49 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class ICNSParser method parse.

/**
 * Parses an ICNS icon container and records a summary of the icons it holds.
 *
 * <p>The method checks the four-byte {@code icns} signature, reads the declared
 * length, loads that many bytes, and then walks the sub-icon entries. Each entry
 * starts with a four-byte type identifier followed by a four-byte big-endian
 * length that is used to step to the next entry. Recognised types are sorted into
 * plain icons and icon masks; their counts and dimensions are written to
 * {@code metadata}. The emitted XHTML document is empty.
 *
 * <p>Walking stops at the first unrecognised type, and also at the first entry
 * whose length is not positive or would step past the end of the data, so that
 * malformed input cannot loop forever or index outside the buffer.
 *
 * @param stream   the ICNS data to parse
 * @param handler  receives an empty XHTML document
 * @param metadata receives the content type and icon summaries
 * @param context  parse context (not read by this method)
 * @throws TikaException if the signature is wrong or the declared length is negative
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    byte[] header = new byte[4];
    // Extract magic bytes
    IOUtils.readFully(stream, header, 0, 4);
    boolean signatureFound = header[0] == (byte) 'i' && header[1] == (byte) 'c' && header[2] == (byte) 'n' && header[3] == (byte) 's';
    if (!signatureFound) {
        throw new TikaException("ICNS magic signature invalid");
    }
    // Extract image size/length of bytes in file
    IOUtils.readFully(stream, header, 0, 4);
    int image_length = java.nio.ByteBuffer.wrap(header).getInt();
    if (image_length < 0) {
        // The value comes straight from the file; reject it as a parse failure
        // rather than letting the array allocation throw an unchecked exception.
        throw new TikaException("ICNS image length invalid: " + image_length);
    }
    // NOTE(review): the declared length is not capped, so a large value still
    // allocates a large buffer — consider adding an upper bound.
    byte[] full_file = new byte[image_length];
    IOUtils.readFully(stream, full_file);
    ArrayList<ICNSType> icons = new ArrayList<>();
    ArrayList<ICNSType> icon_masks = new ArrayList<>();
    byte[] tempByteArray = new byte[4];
    // Presumably the declared length includes the 8 header bytes already
    // consumed above, hence the reduced bound — TODO confirm against the format.
    final int limit = image_length - 8;
    int offset = 0;
    while (offset < limit) {
        // Read the ResType/OSType identifier for the sub-icon
        System.arraycopy(full_file, offset, tempByteArray, 0, 4);
        ICNSType icnstype = findIconType(tempByteArray);
        if (icnstype == null) {
            // No more icons left
            break;
        }
        if (icnstype.hasMask()) {
            icon_masks.add(icnstype);
        } else {
            icons.add(icnstype);
        }
        // Read the sub-icon length (big-endian int right after the identifier)
        int icon_length = java.nio.ByteBuffer.wrap(full_file, offset + 4, 4).getInt();
        if (icon_length <= 0 || icon_length > limit - offset) {
            // A non-positive length would never advance (or would move backwards),
            // and an oversized one would overflow or step past the data.
            break;
        }
        offset += icon_length;
    }
    StringBuilder icon_details = new StringBuilder();
    for (ICNSType icon : icons) {
        String bitsPerPixel = (icon.getBitsPerPixel() != 0) ? icon.getBitsPerPixel() + " bpp" : "JPEG 2000 or PNG format";
        String dimensions = icon.getHeight() + "x" + icon.getWidth();
        if (icon.hasRetinaDisplay()) {
            dimensions = dimensions + "@2X";
        }
        if (icon_details.length() > 0) {
            icon_details.append(", ");
        }
        icon_details.append(dimensions).append(" (").append(bitsPerPixel).append(")");
    }
    StringBuilder iconmask_details = new StringBuilder();
    for (ICNSType icon : icon_masks) {
        String dimensions = icon.getHeight() + "x" + icon.getWidth();
        String bitsPerPixel = icon.getBitsPerPixel() + " bpp";
        if (iconmask_details.length() > 0) {
            iconmask_details.append(", ");
        }
        iconmask_details.append(dimensions).append(" (").append(bitsPerPixel).append(")");
    }
    metadata.set(Metadata.CONTENT_TYPE, ICNS_MIME_TYPE);
    if (!icons.isEmpty()) {
        metadata.set("Icon count", String.valueOf(icons.size()));
        metadata.set("Icon details", icon_details.toString());
    }
    if (!icon_masks.isEmpty()) {
        metadata.set("Masked icon count", String.valueOf(icon_masks.size()));
        metadata.set("Masked icon details", iconmask_details.toString());
    }
    // Only metadata is extracted; the document body is intentionally empty.
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
Also used : TikaException(org.apache.tika.exception.TikaException) ArrayList(java.util.ArrayList) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 50 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class ImageParser method parse.

/**
 * Extracts image metadata through {@code javax.imageio} and emits an empty
 * XHTML document.
 *
 * <p>The content type already present in {@code metadata} selects the
 * {@code ImageReader}. When no content type is set, or no reader is registered
 * for it, no dimensions are read. Width and height of the first image are stored
 * under both the {@code Metadata} constants and the plain {@code "width"} /
 * {@code "height"} keys, and a few ImageIO-specific tags are copied to general
 * Tika properties.
 *
 * @param stream   the image data; wrapped in a {@code CloseShieldInputStream}
 *                 before being handed to ImageIO
 * @param handler  receives an empty XHTML document
 * @param metadata read for the content type, and receives the extracted values
 * @param context  parse context (not read by this method)
 * @throws TikaException if ImageIO reports an {@code IIOException}, except for the
 *                       message "Unexpected block type 0!" on {@code image/gif}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    String type = metadata.get(Metadata.CONTENT_TYPE);
    if (type != null) {
        // Swap the old BMP type for the main one before asking ImageIO for readers.
        if (OLD_BMP_TYPE.toString().equals(type)) {
            type = MAIN_BMP_TYPE.toString();
        }
        try {
            Iterator<ImageReader> candidates = ImageIO.getImageReadersByMIMEType(type);
            if (candidates.hasNext()) {
                ImageReader imageReader = candidates.next();
                // The image stream is closed first, then the reader is disposed.
                try (ImageInputStream pixels = ImageIO.createImageInputStream(new CloseShieldInputStream(stream))) {
                    imageReader.setInput(pixels);
                    metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(imageReader.getWidth(0)));
                    metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(imageReader.getHeight(0)));
                    metadata.set("height", Integer.toString(imageReader.getHeight(0)));
                    metadata.set("width", Integer.toString(imageReader.getWidth(0)));
                    loadMetadata(imageReader.getImageMetadata(0), metadata);
                } finally {
                    imageReader.dispose();
                }
            }
            // Map selected ImageIO-namespace tags onto the general Tika properties.
            setIfPresent(metadata, "CommentExtensions CommentExtension", TikaCoreProperties.COMMENTS);
            setIfPresent(metadata, "markerSequence com", TikaCoreProperties.COMMENTS);
            setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE);
        } catch (IIOException e) {
            // This one specific GIF failure is deliberately ignored; anything else is a parse error.
            boolean ignorableGifFailure = "Unexpected block type 0!".equals(e.getMessage()) && type.equals("image/gif");
            if (!ignorableGifFailure) {
                throw new TikaException(type + " parse error", e);
            }
        }
    }
    // Only metadata is extracted; the document body is empty.
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
Also used : TikaException(org.apache.tika.exception.TikaException) ImageInputStream(javax.imageio.stream.ImageInputStream) IIOException(javax.imageio.IIOException) ImageReader(javax.imageio.ImageReader) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Aggregations

XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)72 TikaException (org.apache.tika.exception.TikaException)26 TikaInputStream (org.apache.tika.io.TikaInputStream)22 TemporaryResources (org.apache.tika.io.TemporaryResources)14 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)13 IOException (java.io.IOException)12 SAXException (org.xml.sax.SAXException)9 File (java.io.File)6 EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor)6 Metadata (org.apache.tika.metadata.Metadata)6 BufferedInputStream (java.io.BufferedInputStream)5 InputStream (java.io.InputStream)5 EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 Charset (java.nio.charset.Charset)4 ArrayList (java.util.ArrayList)4 Map (java.util.Map)4 MediaType (org.apache.tika.mime.MediaType)4 OfflineContentHandler (org.apache.tika.sax.OfflineContentHandler)4 InputStreamReader (java.io.InputStreamReader)3