Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
Class FeedParser, method parse.
/**
 * Reads a syndication feed from {@code stream} and renders it as XHTML.
 * <p>
 * The tag-stripped feed title and description are stored in the metadata and
 * written as an {@code h1} and a {@code p}. Every entry that has a link is
 * then written as a list item: an anchor holding the tag-stripped entry title,
 * followed (when the entry has a description) by a newline and the
 * tag-stripped description. Entries without a link are skipped.
 *
 * @param stream   feed source; shielded from being closed by the feed reader
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata receives the feed title and description
 * @param context  parse context (not used by this method)
 * @throws TikaException wrapping any {@code FeedException} raised while reading the feed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Open question carried over from the original author: set the encoding?
    try {
        SyndFeedInput feedInput = new SyndFeedInput();
        SyndFeed feed = feedInput.build(new InputSource(new CloseShieldInputStream(stream)));

        String feedTitle = stripTags(feed.getTitleEx());
        String feedDescription = stripTags(feed.getDescriptionEx());
        metadata.set(TikaCoreProperties.TITLE, feedTitle);
        metadata.set(TikaCoreProperties.DESCRIPTION, feedDescription);
        // Open item carried over from the original author: store the other fields in the metadata

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.element("h1", feedTitle);
        xhtml.element("p", feedDescription);

        xhtml.startElement("ul");
        for (Object item : feed.getEntries()) {
            SyndEntry entry = (SyndEntry) item;
            String href = entry.getLink();
            if (href == null) {
                // Nothing to link to, so the entry is left out of the list.
                continue;
            }
            xhtml.startElement("li");
            xhtml.startElement("a", "href", href);
            xhtml.characters(stripTags(entry.getTitleEx()));
            xhtml.endElement("a");
            SyndContent summary = entry.getDescription();
            if (summary != null) {
                xhtml.newline();
                xhtml.characters(stripTags(summary));
            }
            xhtml.endElement("li");
        }
        xhtml.endElement("ul");

        xhtml.endDocument();
    } catch (FeedException feedFailure) {
        throw new TikaException("RSS parse error", feedFailure);
    }
}
Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
Class AdobeFontMetricParser, method parse.
/**
 * Parses an Adobe Font Metrics file with FontBox's {@code AFMParser}, copies
 * the font properties into the metadata and writes the remaining file comments
 * as XHTML.
 * <p>
 * The comment list is copied before being handed to
 * {@code extractCreationDate}, because that helper modifies the list it is
 * given. Whatever comments are left afterwards are written as paragraphs
 * inside a {@code div class="comments"} under a "Comments" heading; when none
 * are left the document body is empty.
 *
 * @param stream   AFM data to parse
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata receives content type, title and font properties
 * @param context  parse context (not used by this method)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Let FontBox do the actual parsing.
    FontMetrics fontMetrics = new AFMParser(stream).parse();

    // Work on a private, mutable copy: extractCreationDate modifies the list.
    List<String> comments = new ArrayList<>(fontMetrics.getComments());
    extractCreationDate(metadata, comments);

    metadata.set(Metadata.CONTENT_TYPE, AFM_TYPE.toString());
    metadata.set(TikaCoreProperties.TITLE, fontMetrics.getFullName());

    // Font-specific properties.
    addMetadataByString(metadata, MET_AVG_CHAR_WIDTH, Float.toString(fontMetrics.getAverageCharacterWidth()));
    addMetadataByString(metadata, MET_DOC_VERSION, Float.toString(fontMetrics.getAFMVersion()));
    addMetadataByString(metadata, MET_FONT_NAME, fontMetrics.getFontName());
    addMetadataByString(metadata, MET_FONT_FULL_NAME, fontMetrics.getFullName());
    addMetadataByString(metadata, MET_FONT_FAMILY_NAME, fontMetrics.getFamilyName());
    addMetadataByString(metadata, MET_FONT_VERSION, fontMetrics.getFontVersion());
    addMetadataByString(metadata, MET_FONT_WEIGHT, fontMetrics.getWeight());
    addMetadataByString(metadata, MET_FONT_NOTICE, fontMetrics.getNotice());
    addMetadataByString(metadata, MET_FONT_UNDERLINE_THICKNESS, Float.toString(fontMetrics.getUnderlineThickness()));

    // The comments that remain become the textual content of the document.
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    if (!comments.isEmpty()) {
        xhtml.element("h1", "Comments");
        xhtml.startElement("div", "class", "comments");
        for (String line : comments) {
            xhtml.element("p", line);
        }
        xhtml.endElement("div");
    }
    xhtml.endDocument();
}
Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
Class GDALParser, method processOutput.
/**
 * Writes {@code output} as the text of a single {@code p} element in an XHTML
 * document sent to {@code handler}.
 * <p>
 * The text is encoded to UTF-8 bytes and decoded again through a reader, then
 * forwarded in chunks of at most 1024 characters. {@code endDocument()} is
 * called from a {@code finally} block, so it runs even when streaming the text
 * fails part-way.
 *
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata metadata passed to the XHTML handler
 * @param output   text to emit
 */
private void processOutput(ContentHandler handler, Metadata metadata, String output) throws SAXException, IOException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    InputStream encoded = new ByteArrayInputStream(output.getBytes(UTF_8));
    try (Reader reader = new InputStreamReader(encoded, UTF_8)) {
        xhtml.startDocument();
        xhtml.startElement("p");
        char[] chunk = new char[1024];
        int count;
        while ((count = reader.read(chunk)) != -1) {
            xhtml.characters(chunk, 0, count);
        }
        xhtml.endElement("p");
    } finally {
        xhtml.endDocument();
    }
}
Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
Class ICNSParser, method parse.
/**
 * Parses an ICNS icon file: checks the {@code icns} magic signature, walks the
 * sub-icon entries, records summaries of the plain and masked icons in the
 * metadata and emits an empty XHTML document.
 * <p>
 * Each sub-icon entry is read as a 4-byte type identifier followed by a 4-byte
 * big-endian length that is used as the distance to the next entry. The walk
 * stops at the first unrecognised type, and at any entry whose length is not
 * positive or would move past the scanned region. A malformed file therefore
 * cannot make the loop spin forever or index outside the buffer.
 *
 * @param stream   ICNS data, positioned at the start of the file
 * @param handler  receiver of the (empty) XHTML document
 * @param metadata receives the content type and the icon summaries
 * @param context  parse context (not used by this method)
 * @throws TikaException if the magic signature does not match or the declared
 *                       image length is negative
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    byte[] header = new byte[4];

    // Magic signature: the first four bytes must spell "icns".
    IOUtils.readFully(stream, header, 0, 4);
    boolean signatureFound = header[0] == (byte) 'i' && header[1] == (byte) 'c'
            && header[2] == (byte) 'n' && header[3] == (byte) 's';
    if (!signatureFound) {
        throw new TikaException("ICNS magic signature invalid");
    }

    // Declared image size / number of bytes in the file (big-endian int).
    IOUtils.readFully(stream, header, 0, 4);
    int imageLength = java.nio.ByteBuffer.wrap(header).getInt();
    if (imageLength < 0) {
        // The value comes straight from the file; reject it as a parse error
        // instead of failing with NegativeArraySizeException below.
        throw new TikaException("ICNS image length invalid: " + imageLength);
    }
    // NOTE(review): the scan limit below subtracts 8, which suggests the declared
    // length already includes the 8 header bytes consumed above — confirm whether
    // the buffer is intentionally 8 bytes larger than the remaining data.
    byte[] fullFile = new byte[imageLength];
    IOUtils.readFully(stream, fullFile);

    ArrayList<ICNSType> icons = new ArrayList<>();
    ArrayList<ICNSType> iconMasks = new ArrayList<>();
    byte[] field = new byte[4];
    int scanLimit = imageLength - 8;
    int offset = 0;
    while (offset < scanLimit) {
        // ResType/OSType identifier of the sub-icon.
        System.arraycopy(fullFile, offset, field, 0, 4);
        ICNSType icnsType = findIconType(field);
        if (icnsType == null) {
            // Unknown type: no more icons left.
            break;
        }
        if (icnsType.hasMask()) {
            iconMasks.add(icnsType);
        } else {
            icons.add(icnsType);
        }

        // Sub-icon length, used as the stride to the next entry.
        System.arraycopy(fullFile, offset + 4, field, 0, 4);
        int iconLength = java.nio.ByteBuffer.wrap(field).getInt();
        if (iconLength <= 0 || iconLength >= scanLimit - offset) {
            // A non-positive stride would never advance (or would move backwards);
            // a stride reaching the limit ends the scan. Comparing against the
            // remaining distance also avoids int overflow in offset + iconLength.
            break;
        }
        offset += iconLength;
    }

    metadata.set(Metadata.CONTENT_TYPE, ICNS_MIME_TYPE);

    if (!icons.isEmpty()) {
        StringBuilder iconDetails = new StringBuilder();
        for (ICNSType icon : icons) {
            if (iconDetails.length() > 0) {
                iconDetails.append(", ");
            }
            iconDetails.append(icon.getHeight()).append("x").append(icon.getWidth());
            if (icon.hasRetinaDisplay()) {
                iconDetails.append("@2X");
            }
            iconDetails.append(" (");
            if (icon.getBitsPerPixel() != 0) {
                iconDetails.append(icon.getBitsPerPixel()).append(" bpp");
            } else {
                iconDetails.append("JPEG 2000 or PNG format");
            }
            iconDetails.append(")");
        }
        metadata.set("Icon count", String.valueOf(icons.size()));
        metadata.set("Icon details", iconDetails.toString());
    }

    if (!iconMasks.isEmpty()) {
        StringBuilder maskDetails = new StringBuilder();
        for (ICNSType mask : iconMasks) {
            if (maskDetails.length() > 0) {
                maskDetails.append(", ");
            }
            maskDetails.append(mask.getHeight()).append("x").append(mask.getWidth())
                    .append(" (").append(mask.getBitsPerPixel()).append(" bpp").append(")");
        }
        metadata.set("Masked icon count", String.valueOf(iconMasks.size()));
        metadata.set("Masked icon details", maskDetails.toString());
    }

    // No textual content: emit an empty document.
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
Class ImageParser, method parse.
/**
 * Extracts image dimensions and reader-provided metadata using the first
 * ImageIO reader registered for the content type found in {@code metadata},
 * then emits an empty XHTML document.
 * <p>
 * When no content type is set, or no reader is registered for it, no image
 * data is read. The old BMP type is mapped to the main BMP type before the
 * reader lookup. An {@code IIOException} whose message is exactly
 * "Unexpected block type 0!" is ignored for {@code image/gif}; any other
 * {@code IIOException} is rethrown as a {@link TikaException}.
 *
 * @param stream   image data; shielded from being closed by ImageIO
 * @param handler  receiver of the (empty) XHTML document
 * @param metadata supplies the content type and receives the image metadata
 * @param context  parse context (not used by this method)
 * @throws TikaException wrapping a non-ignorable {@code IIOException}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    String type = metadata.get(Metadata.CONTENT_TYPE);
    if (type != null) {
        // Map the old BMP type to the new one, so Java is happy.
        if (OLD_BMP_TYPE.toString().equals(type)) {
            type = MAIN_BMP_TYPE.toString();
        }
        try {
            Iterator<ImageReader> readers = ImageIO.getImageReadersByMIMEType(type);
            if (readers.hasNext()) {
                ImageReader reader = readers.next();
                try (ImageInputStream imageStream = ImageIO.createImageInputStream(new CloseShieldInputStream(stream))) {
                    reader.setInput(imageStream);
                    metadata.set(Metadata.IMAGE_WIDTH, String.valueOf(reader.getWidth(0)));
                    metadata.set(Metadata.IMAGE_LENGTH, String.valueOf(reader.getHeight(0)));
                    metadata.set("height", String.valueOf(reader.getHeight(0)));
                    metadata.set("width", String.valueOf(reader.getWidth(0)));
                    loadMetadata(reader.getImageMetadata(0), metadata);
                } finally {
                    // Runs after the image stream has been closed.
                    reader.dispose();
                }
            }
            // Translate certain metadata tags from the ImageIO-specific
            // namespace into the general Tika one.
            setIfPresent(metadata, "CommentExtensions CommentExtension", TikaCoreProperties.COMMENTS);
            setIfPresent(metadata, "markerSequence com", TikaCoreProperties.COMMENTS);
            setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE);
        } catch (IIOException e) {
            // This exact failure on a GIF is deliberately ignored; everything
            // else is reported as a parse error.
            boolean ignorableGifFailure = "Unexpected block type 0!".equals(e.getMessage())
                    && type.equals("image/gif");
            if (!ignorableGifFailure) {
                throw new TikaException(type + " parse error", e);
            }
        }
    }

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
Aggregations