Search in sources :

Example 96 with AttributesImpl

Use of org.xml.sax.helpers.AttributesImpl in the Apache Tika project.

In the class PDF2XHTML, method extractImages.

/**
 * Walks the XObjects in {@code resources}. Form XObjects are descended into recursively;
 * for each image XObject an empty {@code img} element is written to {@code xhtml} and,
 * if the embedded document extractor accepts it, the image bytes are handed over for parsing.
 * <p>
 * Returns immediately when {@code resources} is null or inline image extraction is
 * switched off in {@code config}.
 *
 * @param resources    resources whose XObjects are inspected; may be null
 * @param seenThisPage COS objects already visited; every visited stream is added to it
 * @throws SAXException if writing to the XHTML handler or parsing the embedded image fails
 * @throws IOException  if an I/O failure is propagated by one of the called helpers
 */
private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException {
    if (resources == null || !config.getExtractInlineImages()) {
        return;
    }
    for (COSName xObjectName : resources.getXObjectNames()) {
        PDXObject xObject;
        try {
            xObject = resources.getXObject(xObjectName);
        } catch (MissingImageReaderException e) {
            EmbeddedDocumentUtil.recordException(e, metadata);
            continue;
        } catch (IOException e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
            continue;
        }
        if (xObject == null) {
            continue;
        }
        COSStream stream = xObject.getCOSObject();
        // Skip streams already visited, which avoids infinite recursion (TIKA-1742).
        if (!seenThisPage.add(stream)) {
            continue;
        }
        if (xObject instanceof PDFormXObject) {
            extractImages(((PDFormXObject) xObject).getResources(), seenThisPage);
            continue;
        }
        if (!(xObject instanceof PDImageXObject)) {
            continue;
        }
        PDImageXObject image = (PDImageXObject) xObject;
        Metadata imageMetadata = new Metadata();

        // Map the image suffix to a content type and to the file extension used in the name.
        String suffix = image.getSuffix();
        if (suffix == null || "png".equals(suffix)) {
            imageMetadata.set(Metadata.CONTENT_TYPE, "image/png");
            suffix = "png";
        } else if ("jpg".equals(suffix)) {
            imageMetadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
        } else if ("tiff".equals(suffix)) {
            imageMetadata.set(Metadata.CONTENT_TYPE, "image/tiff");
            suffix = "tif";
        } else if ("jpx".equals(suffix)) {
            imageMetadata.set(Metadata.CONTENT_TYPE, "image/jp2");
        } else if ("jb2".equals(suffix)) {
            imageMetadata.set(Metadata.CONTENT_TYPE, "image/x-jbig2");
        }
        //TODO: determine if we need to add more image types; for any other suffix
        // no content type is set and the suffix is used unchanged as the extension.

        // Reuse the number recorded for this stream, otherwise take the next counter value.
        Integer imageNumber = processedInlineImages.get(stream);
        if (imageNumber == null) {
            imageNumber = inlineImageCounter++;
        }
        String imageFileName = "image" + imageNumber + "." + suffix;
        imageMetadata.set(Metadata.RESOURCE_NAME_KEY, imageFileName);

        // Output the img tag
        AttributesImpl imgAttributes = new AttributesImpl();
        imgAttributes.addAttribute("", "src", "src", "CDATA", "embedded:" + imageFileName);
        imgAttributes.addAttribute("", "alt", "alt", "CDATA", imageFileName);
        xhtml.startElement("img", imgAttributes);
        xhtml.endElement("img");

        // When only unique images are wanted, stop here for a stream that was already
        // processed; otherwise remember it together with its number.
        if (config.getExtractUniqueInlineImagesOnly()) {
            if (processedInlineImages.containsKey(stream)) {
                continue;
            }
            processedInlineImages.put(stream, imageNumber);
        }

        imageMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
        if (!embeddedDocumentExtractor.shouldParseEmbedded(imageMetadata)) {
            continue;
        }
        ByteArrayOutputStream imageBytes = new ByteArrayOutputStream();
        try {
            //TODO: handle image.getMetadata()?
            try {
                writeToBuffer(image, suffix, imageBytes);
            } catch (IOException e) {
                // The image could not be written out: record the failure and move on.
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
                continue;
            }
            try (InputStream embeddedIs = TikaInputStream.get(imageBytes.toByteArray())) {
                embeddedDocumentExtractor.parseEmbedded(embeddedIs, new EmbeddedContentHandler(xhtml), imageMetadata, false);
            }
        } catch (IOException e) {
            handleCatchableIOE(e);
        }
    }
}
Also used : COSStream(org.apache.pdfbox.cos.COSStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) IOException(java.io.IOException) ByteArrayOutputStream(java.io.ByteArrayOutputStream) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) PDImageXObject(org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) MissingImageReaderException(org.apache.pdfbox.filter.MissingImageReaderException) AttributesImpl(org.xml.sax.helpers.AttributesImpl) COSName(org.apache.pdfbox.cos.COSName) PDFormXObject(org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject) PDXObject(org.apache.pdfbox.pdmodel.graphics.PDXObject)

Example 97 with AttributesImpl

Use of org.xml.sax.helpers.AttributesImpl in the Apache Tika project.

In the class PackageParser, method handleEntryMetadata.

/**
 * Builds the metadata for one entry and, when the entry has a non-empty name, writes an
 * empty {@code div} with class {@code embedded} and the name as its id to {@code xhtml}.
 *
 * @param name       entry name; backslashes are converted to forward slashes; may be null
 * @param createAt   creation date, recorded only when non-null
 * @param modifiedAt modification date, recorded only when non-null
 * @param size       entry size, recorded as content length only when non-null
 * @param xhtml      handler that receives the marker {@code div}
 * @return the populated metadata object (never null)
 * @throws SAXException if writing the {@code div} to {@code xhtml} fails
 */
protected static Metadata handleEntryMetadata(String name, Date createAt, Date modifiedAt, Long size, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    final Metadata entryMetadata = new Metadata();
    if (createAt != null) {
        entryMetadata.set(TikaCoreProperties.CREATED, createAt);
    }
    if (modifiedAt != null) {
        entryMetadata.set(TikaCoreProperties.MODIFIED, modifiedAt);
    }
    if (size != null) {
        entryMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
    }

    // Without a usable name there is nothing further to record or emit.
    if (name == null || name.isEmpty()) {
        return entryMetadata;
    }

    final String normalizedName = name.replace('\\', '/');
    entryMetadata.set(Metadata.RESOURCE_NAME_KEY, normalizedName);

    final AttributesImpl divAttributes = new AttributesImpl();
    divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
    divAttributes.addAttribute("", "id", "id", "CDATA", normalizedName);
    xhtml.startElement("div", divAttributes);
    xhtml.endElement("div");

    entryMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, normalizedName);
    return entryMetadata;
}
Also used : AttributesImpl(org.xml.sax.helpers.AttributesImpl) Metadata(org.apache.tika.metadata.Metadata)

Example 98 with AttributesImpl

Use of org.xml.sax.helpers.AttributesImpl in the Apache Sling project.

In the class SimpleXmlSerializationManager, method buildSerializationData.

/**
 * Serializes the String-valued properties of {@code resource} into an indented XML document.
 * <p>
 * Properties are written in the natural order of their keys (they are copied into a
 * {@link TreeMap}), one {@code TAG_PROPERTY} element per property, wrapped in a single
 * {@code TAG_RESOURCE} element. Properties whose value is not a {@code String}, including
 * {@code null} values, are not serialized; a message is printed to {@code System.err} instead.
 *
 * @param contentSyncRoot not used by this implementation
 * @param resource        the resource to serialize; may be null
 * @return the serialization data, or {@code null} when {@code resource} is null or has no
 *         properties
 * @throws RuntimeException wrapping any transformer configuration or SAX failure
 */
@Override
public SerializationData buildSerializationData(File contentSyncRoot, ResourceProxy resource) throws SerializationException {
    if (resource == null) {
        return null;
    }
    Map<String, Object> content = resource.getProperties();
    if (content == null || content.isEmpty()) {
        return null;
    }
    try {
        SAXTransformerFactory f = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        ByteArrayOutputStream result = new ByteArrayOutputStream();
        StreamResult sr = new StreamResult(result);
        TransformerHandler handler = f.newTransformerHandler();
        Transformer t = handler.getTransformer();
        t.setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(sr);
        handler.startDocument();
        startElement(handler, TAG_RESOURCE);
        // Copy into a TreeMap so the properties are emitted in sorted key order.
        Set<Entry<String, Object>> entrySet = new TreeMap<>(content).entrySet();
        for (Map.Entry<String, Object> property : entrySet) {
            Object value = property.getValue();
            if (value instanceof String) {
                String tagName = property.getKey();
                String tagValue = (String) value;
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute("", ATT_PROPERTY_NAME, ATT_PROPERTY_NAME, null, tagName);
                handler.startElement("", TAG_PROPERTY, TAG_PROPERTY, attributes);
                handler.characters(tagValue.toCharArray(), 0, tagValue.length());
                handler.endElement("", TAG_PROPERTY, TAG_PROPERTY);
            } else {
                // TODO multi-valued properties, other primitives
                // A null value fails the instanceof test and ends up here, so it must not be
                // dereferenced when building the diagnostic message.
                String typeDescription = value == null ? "null" : String.valueOf(value.getClass());
                System.err.println("Can't yet handle property " + property.getKey() + " of type " + typeDescription);
            }
        }
        endElement(handler, TAG_RESOURCE);
        handler.endDocument();
        // TODO - also add the serialization type
        return new SerializationData(resource.getPath(), CONTENT_XML, result.toByteArray(), null);
    } catch (TransformerConfigurationException | TransformerFactoryConfigurationError | SAXException e) {
        // TODO proper exception handling
        throw new RuntimeException(e);
    }
}
Also used : TransformerFactoryConfigurationError(javax.xml.transform.TransformerFactoryConfigurationError) TransformerHandler(javax.xml.transform.sax.TransformerHandler) Transformer(javax.xml.transform.Transformer) StreamResult(javax.xml.transform.stream.StreamResult) TransformerConfigurationException(javax.xml.transform.TransformerConfigurationException) SAXTransformerFactory(javax.xml.transform.sax.SAXTransformerFactory) ByteArrayOutputStream(java.io.ByteArrayOutputStream) SAXException(org.xml.sax.SAXException) Entry(java.util.Map.Entry) AttributesImpl(org.xml.sax.helpers.AttributesImpl) SerializationData(org.apache.sling.ide.serialization.SerializationData) HashMap(java.util.HashMap) Map(java.util.Map) TreeMap(java.util.TreeMap)

Example 99 with AttributesImpl

Use of org.xml.sax.helpers.AttributesImpl in the Apache Tika project.

In the class ParsingEmbeddedDocumentExtractor, method parseEmbedded.

/**
 * Parses an embedded document with the delegating parser and forwards its body content
 * to {@code handler}.
 * <p>
 * When {@code outputHtml} is true the content is wrapped in a {@code div} of class
 * {@code package-entry}, preceded by an {@code h1} holding the resource name if one is set
 * in {@code metadata}. Failures reported as {@code EncryptedDocumentException} or
 * {@code TikaException} are deliberately ignored, so the entry's content is simply skipped.
 *
 * @param stream     the embedded document; it is shielded from being closed here
 * @param handler    receiver of the generated SAX events
 * @param metadata   metadata of the embedded document, passed on to the parser
 * @param outputHtml whether to emit the wrapping {@code div} and the {@code h1} title
 * @throws SAXException if the handler rejects an event
 * @throws IOException  if reading the stream fails
 */
public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
    if (outputHtml) {
        AttributesImpl wrapperAttributes = new AttributesImpl();
        wrapperAttributes.addAttribute("", "class", "class", "CDATA", "package-entry");
        handler.startElement(XHTML, "div", "div", wrapperAttributes);
    }

    String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
    boolean hasTitle = resourceName != null && !resourceName.isEmpty();
    if (outputHtml && hasTitle) {
        handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
        handler.characters(resourceName.toCharArray(), 0, resourceName.length());
        handler.endElement(XHTML, "h1", "h1");
    }

    // Use the delegate parser to parse this entry
    try (TemporaryResources temporaryResources = new TemporaryResources()) {
        final TikaInputStream shieldedStream = TikaInputStream.get(new CloseShieldInputStream(stream), temporaryResources);
        if (stream instanceof TikaInputStream) {
            // Carry over an already opened container so the new stream can use it as well.
            final Object openContainer = ((TikaInputStream) stream).getOpenContainer();
            if (openContainer != null) {
                shieldedStream.setOpenContainer(openContainer);
            }
        }
        ContentHandler embeddedBodyHandler = new EmbeddedContentHandler(new BodyContentHandler(handler));
        DELEGATING_PARSER.parse(shieldedStream, embeddedBodyHandler, metadata, context);
    } catch (EncryptedDocumentException ignored) {
        // TODO: can we log a warning that we lack the password?
        // For now, just skip the content
    } catch (TikaException ignored) {
        // TODO: can we log a warning somehow?
        // Could not parse the entry, just skip the content
    }

    if (outputHtml) {
        handler.endElement(XHTML, "div", "div");
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) AttributesImpl(org.xml.sax.helpers.AttributesImpl) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) TikaException(org.apache.tika.exception.TikaException) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) CloseShieldInputStream(org.apache.tika.io.CloseShieldInputStream)

Example 100 with AttributesImpl

Use of org.xml.sax.helpers.AttributesImpl in the Apache Tika project.

In the class ContentHandlerResource, method internalProcess.

/**
 * Reads one serialized SAX event from {@code input} and replays it on {@code handler}.
 * <p>
 * The first unsigned byte selects the event type (one of the {@code ContentHandlerProxy}
 * constants); the event's arguments follow in the order they are read below. An
 * unrecognized type byte is ignored without reading anything further.
 *
 * @param input stream positioned at the start of an event record
 * @throws IOException  if the record cannot be read
 * @throws SAXException if the handler rejects the replayed event
 */
private void internalProcess(DataInputStream input) throws IOException, SAXException {
    int type = input.readUnsignedByte();
    if (type == ContentHandlerProxy.START_DOCUMENT) {
        handler.startDocument();
    } else if (type == ContentHandlerProxy.END_DOCUMENT) {
        handler.endDocument();
    } else if (type == ContentHandlerProxy.START_PREFIX_MAPPING) {
        handler.startPrefixMapping(readString(input), readString(input));
    } else if (type == ContentHandlerProxy.END_PREFIX_MAPPING) {
        handler.endPrefixMapping(readString(input));
    } else if (type == ContentHandlerProxy.START_ELEMENT) {
        String uri = readString(input);
        String localName = readString(input);
        String qName = readString(input);
        // A negative count leaves the attributes as null; zero or more yields a
        // (possibly empty) attribute list.
        AttributesImpl atts = null;
        int n = input.readInt();
        if (n >= 0) {
            atts = new AttributesImpl();
            for (int i = 0; i < n; i++) {
                // Five strings per attribute: uri, localName, qName, type, value.
                atts.addAttribute(readString(input), readString(input), readString(input), readString(input), readString(input));
            }
        }
        handler.startElement(uri, localName, qName, atts);
    } else if (type == ContentHandlerProxy.END_ELEMENT) {
        String uri = readString(input);
        String localName = readString(input);
        String qName = readString(input);
        handler.endElement(uri, localName, qName);
    } else if (type == ContentHandlerProxy.CHARACTERS) {
        char[] ch = readCharacters(input);
        handler.characters(ch, 0, ch.length);
    } else if (type == ContentHandlerProxy.IGNORABLE_WHITESPACE) {
        char[] ch = readCharacters(input);
        // Replay as ignorable whitespace rather than as character data, so the handler
        // receives the same kind of event that was recorded.
        handler.ignorableWhitespace(ch, 0, ch.length);
    } else if (type == ContentHandlerProxy.PROCESSING_INSTRUCTION) {
        handler.processingInstruction(readString(input), readString(input));
    } else if (type == ContentHandlerProxy.SKIPPED_ENTITY) {
        handler.skippedEntity(readString(input));
    }
}
Also used : AttributesImpl(org.xml.sax.helpers.AttributesImpl)

Aggregations

AttributesImpl (org.xml.sax.helpers.AttributesImpl)310 SAXException (org.xml.sax.SAXException)53 Test (org.junit.Test)34 DiskWriteAttributesImpl (org.apache.geode.internal.cache.DiskWriteAttributesImpl)23 PartitionAttributesImpl (org.apache.geode.internal.cache.PartitionAttributesImpl)23 ContentHandler (org.xml.sax.ContentHandler)21 Attributes (org.xml.sax.Attributes)17 PreparedStatement (java.sql.PreparedStatement)16 ResultSet (java.sql.ResultSet)16 Map (java.util.Map)16 PackOut (org.adempiere.pipo.PackOut)16 IOException (java.io.IOException)15 POSaveFailedException (org.adempiere.pipo.exception.POSaveFailedException)12 Iterator (java.util.Iterator)11 TransformerHandler (javax.xml.transform.sax.TransformerHandler)11 StreamResult (javax.xml.transform.stream.StreamResult)11 Metadata (org.apache.tika.metadata.Metadata)11 File (java.io.File)9 SAXTransformerFactory (javax.xml.transform.sax.SAXTransformerFactory)9 DatabaseAccessException (org.adempiere.pipo.exception.DatabaseAccessException)9