use of org.xml.sax.helpers.AttributesImpl in project tika by apache.
the class PDF2XHTML method extractImages.
/**
 * Walks the XObjects of the given resources, emitting an {@code <img>} element for every
 * image XObject found and handing the image bytes to the embedded document extractor.
 * Form XObjects are descended into recursively.
 *
 * @param resources    resources to scan; a {@code null} value is a no-op
 * @param seenThisPage streams already visited; used to break cycles between
 *                     form XObjects (TIKA-1742) and updated as streams are visited
 * @throws SAXException if writing to the XHTML handler fails
 * @throws IOException  if an I/O problem is propagated by the callees
 */
private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException {
    if (resources == null) {
        return;
    }
    if (!config.getExtractInlineImages()) {
        return;
    }

    for (COSName xObjectName : resources.getXObjectNames()) {
        PDXObject xObject = null;
        try {
            xObject = resources.getXObject(xObjectName);
        } catch (MissingImageReaderException e) {
            EmbeddedDocumentUtil.recordException(e, metadata);
            continue;
        } catch (IOException e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
            continue;
        }
        if (xObject == null) {
            continue;
        }

        COSStream stream = xObject.getCOSObject();
        // avoid infinite recursion TIKA-1742: add() reports false when the stream was already seen
        if (!seenThisPage.add(stream)) {
            continue;
        }

        if (xObject instanceof PDFormXObject) {
            extractImages(((PDFormXObject) xObject).getResources(), seenThisPage);
            continue;
        }
        if (!(xObject instanceof PDImageXObject)) {
            continue;
        }

        PDImageXObject image = (PDImageXObject) xObject;
        Metadata embeddedMetadata = new Metadata();

        // Map the image suffix to a content type; a missing suffix is treated as png.
        String suffix = image.getSuffix();
        if (suffix == null) {
            suffix = "png";
        }
        switch (suffix) {
            case "png":
                embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/png");
                break;
            case "jpg":
                embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
                break;
            case "tiff":
                embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff");
                suffix = "tif";
                break;
            case "jpx":
                embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2");
                break;
            case "jb2":
                embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/x-jbig2");
                break;
            default:
                //TODO: determine if we need to add more image types
                break;
        }

        // Reuse the number of an already-processed stream, otherwise allocate the next one.
        Integer imageNumber = processedInlineImages.get(stream);
        if (imageNumber == null) {
            imageNumber = inlineImageCounter++;
        }
        String fileName = "image" + imageNumber + "." + suffix;
        embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);

        // Output the img tag
        AttributesImpl imgAttributes = new AttributesImpl();
        imgAttributes.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
        imgAttributes.addAttribute("", "alt", "alt", "CDATA", fileName);
        xhtml.startElement("img", imgAttributes);
        xhtml.endElement("img");

        // When only unique images are wanted, skip streams that were already processed.
        if (config.getExtractUniqueInlineImagesOnly()) {
            if (processedInlineImages.containsKey(stream)) {
                continue;
            }
            processedInlineImages.put(stream, imageNumber);
        }

        embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
        if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
            continue;
        }

        //TODO: handle image.getMetadata()?
        ByteArrayOutputStream imageBytes = new ByteArrayOutputStream();
        try {
            writeToBuffer(image, suffix, imageBytes);
        } catch (IOException e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
            continue;
        }
        try (InputStream embeddedIs = TikaInputStream.get(imageBytes.toByteArray())) {
            embeddedDocumentExtractor.parseEmbedded(embeddedIs, new EmbeddedContentHandler(xhtml), embeddedMetadata, false);
        } catch (IOException e) {
            handleCatchableIOE(e);
        }
    }
}
use of org.xml.sax.helpers.AttributesImpl in project tika by apache.
the class PackageParser method handleEntryMetadata.
/**
 * Builds the metadata for one archive entry and, when the entry has a name,
 * writes an empty {@code <div class="embedded" id="...">} marker for it.
 *
 * @param name       entry name; backslashes are converted to forward slashes before use
 * @param createAt   creation date, or {@code null} to leave it unset
 * @param modifiedAt modification date, or {@code null} to leave it unset
 * @param size       entry size, or {@code null} to leave the content length unset
 * @param xhtml      handler that receives the marker element
 * @return the populated metadata; never {@code null}
 */
protected static Metadata handleEntryMetadata(String name, Date createAt, Date modifiedAt, Long size, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    final Metadata entryMetadata = new Metadata();

    if (createAt != null) {
        entryMetadata.set(TikaCoreProperties.CREATED, createAt);
    }
    if (modifiedAt != null) {
        entryMetadata.set(TikaCoreProperties.MODIFIED, modifiedAt);
    }
    if (size != null) {
        entryMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
    }

    // Nameless entries get neither a resource name nor a marker element.
    if (name == null || name.isEmpty()) {
        return entryMetadata;
    }

    final String normalizedName = name.replace("\\", "/");
    entryMetadata.set(Metadata.RESOURCE_NAME_KEY, normalizedName);

    final AttributesImpl divAttributes = new AttributesImpl();
    divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
    divAttributes.addAttribute("", "id", "id", "CDATA", normalizedName);
    xhtml.startElement("div", divAttributes);
    xhtml.endElement("div");

    entryMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, normalizedName);
    return entryMetadata;
}
use of org.xml.sax.helpers.AttributesImpl in project sling by apache.
the class SimpleXmlSerializationManager method buildSerializationData.
/**
 * Serializes the String-valued properties of a resource into a simple XML document.
 *
 * <p>Properties are written in key order, one element per property. Values that are
 * not {@code String}s (including {@code null} values) are not serialized; a diagnostic
 * is printed to {@code System.err} for each of them instead.</p>
 *
 * @param contentSyncRoot not used by this implementation
 * @param resource        the resource to serialize
 * @return the serialization data, or {@code null} when {@code resource} is {@code null}
 *         or has no properties
 * @throws SerializationException declared by the interface; failures in this
 *         implementation currently surface as {@link RuntimeException}
 */
@Override
public SerializationData buildSerializationData(File contentSyncRoot, ResourceProxy resource) throws SerializationException {
    if (resource == null) {
        return null;
    }
    Map<String, Object> content = resource.getProperties();
    if (content == null || content.isEmpty()) {
        return null;
    }
    try {
        SAXTransformerFactory f = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        ByteArrayOutputStream result = new ByteArrayOutputStream();
        StreamResult sr = new StreamResult(result);
        TransformerHandler handler = f.newTransformerHandler();
        Transformer t = handler.getTransformer();
        t.setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(sr);
        handler.startDocument();
        startElement(handler, TAG_RESOURCE);
        // Copy into a TreeMap so properties are emitted in a stable, sorted order.
        Set<Entry<String, Object>> entrySet = new TreeMap<>(content).entrySet();
        for (Map.Entry<String, Object> property : entrySet) {
            Object value = property.getValue();
            if (value instanceof String) {
                String tagName = property.getKey();
                String tagValue = (String) value;
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute("", ATT_PROPERTY_NAME, ATT_PROPERTY_NAME, null, tagName);
                handler.startElement("", TAG_PROPERTY, TAG_PROPERTY, attributes);
                handler.characters(tagValue.toCharArray(), 0, tagValue.length());
                handler.endElement("", TAG_PROPERTY, TAG_PROPERTY);
            } else {
                // TODO multi-valued properties, other primitives
                // A null value also lands here (instanceof is false for null), so the
                // type must be resolved null-safely to avoid a NullPointerException.
                String typeDescription = value == null ? "null" : String.valueOf(value.getClass());
                System.err.println("Can't yet handle property " + property.getKey() + " of type " + typeDescription);
            }
        }
        endElement(handler, TAG_RESOURCE);
        handler.endDocument();
        // TODO - also add the serialization type
        return new SerializationData(resource.getPath(), CONTENT_XML, result.toByteArray(), null);
    } catch (TransformerConfigurationException | TransformerFactoryConfigurationError | SAXException e) {
        // TODO proper exception handling
        throw new RuntimeException(e);
    }
}
use of org.xml.sax.helpers.AttributesImpl in project tika by apache.
the class ParsingEmbeddedDocumentExtractor method parseEmbedded.
/**
 * Parses an embedded document with the delegating parser, optionally wrapping the
 * output in a {@code <div class="package-entry">} with an {@code <h1>} title taken
 * from the resource name.
 *
 * @param stream     the embedded document; it is shielded from being closed here
 * @param handler    receives the SAX events for the embedded content
 * @param metadata   metadata of the embedded document; the resource name is read from it
 * @param outputHtml whether to emit the wrapping div and title
 * @throws SAXException if the handler fails
 * @throws IOException  if reading the stream fails
 */
public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
    if (outputHtml) {
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "package-entry");
        handler.startElement(XHTML, "div", "div", divAttributes);
    }

    String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
    boolean hasResourceName = resourceName != null && !resourceName.isEmpty();
    if (hasResourceName && outputHtml) {
        handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
        char[] title = resourceName.toCharArray();
        handler.characters(title, 0, title.length);
        handler.endElement(XHTML, "h1", "h1");
    }

    // Use the delegate parser to parse this entry
    try (TemporaryResources tmp = new TemporaryResources()) {
        final TikaInputStream shielded = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
        // Carry over an already-open container from the incoming stream, if there is one.
        final Object openContainer = (stream instanceof TikaInputStream)
                ? ((TikaInputStream) stream).getOpenContainer()
                : null;
        if (openContainer != null) {
            shielded.setOpenContainer(openContainer);
        }
        DELEGATING_PARSER.parse(shielded, new EmbeddedContentHandler(new BodyContentHandler(handler)), metadata, context);
    } catch (EncryptedDocumentException ignored) {
        // TODO: can we log a warning that we lack the password?
        // For now, just skip the content
    } catch (TikaException ignored) {
        // TODO: can we log a warning somehow?
        // Could not parse the entry, just skip the content
    }

    if (outputHtml) {
        handler.endElement(XHTML, "div", "div");
    }
}
use of org.xml.sax.helpers.AttributesImpl in project tika by apache.
the class ContentHandlerResource method internalProcess.
/**
 * Reads one serialized SAX event from the input and replays it on the wrapped handler.
 *
 * <p>The first unsigned byte selects the event type (one of the
 * {@code ContentHandlerProxy} constants); the remaining payload is read according
 * to that type. An unrecognized type byte is ignored without reading any payload.</p>
 *
 * @param input stream positioned at the start of an event
 * @throws IOException  if reading the event fails
 * @throws SAXException if the wrapped handler rejects the event
 */
private void internalProcess(DataInputStream input) throws IOException, SAXException {
    int type = input.readUnsignedByte();
    if (type == ContentHandlerProxy.START_DOCUMENT) {
        handler.startDocument();
    } else if (type == ContentHandlerProxy.END_DOCUMENT) {
        handler.endDocument();
    } else if (type == ContentHandlerProxy.START_PREFIX_MAPPING) {
        handler.startPrefixMapping(readString(input), readString(input));
    } else if (type == ContentHandlerProxy.END_PREFIX_MAPPING) {
        handler.endPrefixMapping(readString(input));
    } else if (type == ContentHandlerProxy.START_ELEMENT) {
        String uri = readString(input);
        String localName = readString(input);
        String qName = readString(input);
        // A negative attribute count leaves atts as null; otherwise n attributes follow,
        // each as five strings in addAttribute() argument order.
        AttributesImpl atts = null;
        int n = input.readInt();
        if (n >= 0) {
            atts = new AttributesImpl();
            for (int i = 0; i < n; i++) {
                atts.addAttribute(readString(input), readString(input), readString(input), readString(input), readString(input));
            }
        }
        handler.startElement(uri, localName, qName, atts);
    } else if (type == ContentHandlerProxy.END_ELEMENT) {
        String uri = readString(input);
        String localName = readString(input);
        String qName = readString(input);
        handler.endElement(uri, localName, qName);
    } else if (type == ContentHandlerProxy.CHARACTERS) {
        char[] ch = readCharacters(input);
        handler.characters(ch, 0, ch.length);
    } else if (type == ContentHandlerProxy.IGNORABLE_WHITESPACE) {
        char[] ch = readCharacters(input);
        // Replay as ignorable whitespace, not as character content, so the event
        // keeps the meaning it had when it was recorded.
        handler.ignorableWhitespace(ch, 0, ch.length);
    } else if (type == ContentHandlerProxy.PROCESSING_INSTRUCTION) {
        handler.processingInstruction(readString(input), readString(input));
    } else if (type == ContentHandlerProxy.SKIPPED_ENTITY) {
        handler.skippedEntity(readString(input));
    }
}
Aggregations