use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class EpubParser method parse.
/**
 * Walks the zip entries of the given stream and dispatches each one by name.
 * <ul>
 *   <li>{@code mimetype}: its content is read as UTF-8, trimmed, and stored
 *       as the content type.</li>
 *   <li>{@code metadata.xml} and entries ending in {@code .opf}: handed to
 *       the {@code meta} parser with a throw-away handler, so only the
 *       metadata object is updated.</li>
 *   <li>entries ending in {@code .html} or {@code .xhtml}: handed to the
 *       {@code content} parser, writing into the shared XHTML output.</li>
 * </ul>
 * Entries with any other name are skipped.
 *
 * @param stream   the zip-packaged input
 * @param handler  receives the single XHTML document produced here
 * @param metadata updated with the content type and whatever the delegate parsers set
 * @param context  passed through unchanged to the delegate parsers
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // One document is opened up front and closed at the very end, because
    // the package may contain several XHTML files that all feed into it.
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    ContentHandler childHandler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));

    ZipInputStream zip = new ZipInputStream(stream);
    for (ZipEntry entry = zip.getNextEntry(); entry != null; entry = zip.getNextEntry()) {
        String name = entry.getName();
        if (name.equals("mimetype")) {
            String type = IOUtils.toString(zip, UTF_8);
            // The declared type often has trailing new lines
            metadata.set(Metadata.CONTENT_TYPE, type == null ? null : type.trim());
        } else if (name.equals("metadata.xml") || name.endsWith(".opf")) {
            meta.parse(zip, new DefaultHandler(), metadata, context);
        } else if (name.endsWith(".html") || name.endsWith(".xhtml")) {
            content.parse(zip, childHandler, metadata, context);
        }
    }

    // Close the document that was opened before the entry walk
    xhtml.endDocument();
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class ExecutableParser method parse.
/**
 * Reads the first four bytes of the stream and, based on that signature,
 * delegates to {@code parsePE} ("MZ" in the first two bytes) or
 * {@code parseELF} (0x7f followed by "ELF"). Any other signature yields an
 * empty document. The XHTML document is opened before the signature is read
 * and closed after the delegate returns.
 *
 * @param stream   the executable to inspect
 * @param handler  receives the XHTML document
 * @param metadata passed to the format-specific helper
 * @param context  not used by this method
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Only metadata is extracted for now
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    // Sniff the leading signature to find out which format this is
    byte[] magic = new byte[4];
    IOUtils.readFully(stream, magic);

    boolean looksLikePE = magic[0] == (byte) 'M'
            && magic[1] == (byte) 'Z';
    boolean looksLikeELF = magic[0] == (byte) 0x7f
            && magic[1] == (byte) 'E'
            && magic[2] == (byte) 'L'
            && magic[3] == (byte) 'F';

    if (looksLikePE) {
        parsePE(xhtml, metadata, stream, magic);
    } else if (looksLikeELF) {
        parseELF(xhtml, metadata, stream, magic);
    }

    // Close the document opened above
    xhtml.endDocument();
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class TrueTypeParser method parse.
/**
 * Parses a TrueType font with FontBox and copies its header and naming
 * details into {@code metadata}. No textual content is produced: the XHTML
 * document emitted at the end is empty.
 * <p>
 * When the stream is a {@code TikaInputStream} that already has a file, the
 * font is parsed from that file; otherwise it is parsed from the stream.
 * The font is always closed once its details have been read.
 * <p>
 * The header and naming tables are checked for null before use, so a font
 * lacking either table still yields the content type plus whatever details
 * are available, rather than failing with a NullPointerException.
 *
 * @param stream   the font data
 * @param handler  receives the (empty) XHTML document
 * @param metadata receives content type, dates, version and name records
 * @param context  not used by this method
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    TikaInputStream tis = TikaInputStream.cast(stream);
    // Ask FontBox to parse the file for us
    TrueTypeFont font = null;
    try {
        TTFParser parser = new TTFParser();
        if (tis != null && tis.hasFile()) {
            font = parser.parse(tis.getFile());
        } else {
            font = parser.parse(stream);
        }
        // Report the details of the font
        metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
        // FontBox documents the table accessors as returning null when the
        // table is absent, so guard before dereferencing.
        if (font.getHeader() != null) {
            metadata.set(TikaCoreProperties.CREATED, font.getHeader().getCreated());
            metadata.set(TikaCoreProperties.MODIFIED, font.getHeader().getModified());
            metadata.set(AdobeFontMetricParser.MET_DOC_VERSION, Float.toString(font.getHeader().getVersion()));
        }
        // Pull out the naming info
        NamingTable fontNaming = font.getNaming();
        if (fontNaming != null) {
            for (NameRecord nr : fontNaming.getNameRecords()) {
                int nameId = nr.getNameId();
                if (nameId == NameRecord.NAME_FONT_FAMILY_NAME) {
                    metadata.set(AdobeFontMetricParser.MET_FONT_FAMILY_NAME, nr.getString());
                }
                if (nameId == NameRecord.NAME_FONT_SUB_FAMILY_NAME) {
                    metadata.set(AdobeFontMetricParser.MET_FONT_SUB_FAMILY_NAME, nr.getString());
                }
                if (nameId == NameRecord.NAME_FULL_FONT_NAME) {
                    // The full font name doubles as the document title
                    metadata.set(AdobeFontMetricParser.MET_FONT_NAME, nr.getString());
                    metadata.set(TikaCoreProperties.TITLE, nr.getString());
                }
                if (nameId == NameRecord.NAME_POSTSCRIPT_NAME) {
                    metadata.set(AdobeFontMetricParser.MET_PS_NAME, nr.getString());
                }
                if (nameId == NameRecord.NAME_COPYRIGHT) {
                    metadata.set("Copyright", nr.getString());
                }
                if (nameId == NameRecord.NAME_TRADEMARK) {
                    metadata.set("Trademark", nr.getString());
                }
            }
        }
    } finally {
        if (font != null) {
            font.close();
        }
    }
    // For now, we only output metadata, no textual contents
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class DBFParser method parse.
/**
 * Reads a DBF table from the stream and writes it out as an XHTML table.
 * <p>
 * The content type (from the header's version) and, when present, the
 * last-modified date are recorded first. Up to
 * {@code ROWS_TO_BUFFER_FOR_CHARSET_DETECTION} rows are then copied into a
 * buffer and passed with the header to {@code getCharset}; the resulting
 * charset is stored as the content encoding and used to decode column names
 * and rows. The output is a {@code thead} holding one {@code th} per column,
 * followed by a {@code tbody} with the buffered rows and then every
 * remaining row from the reader.
 *
 * @param stream   the DBF data
 * @param handler  receives the XHTML table
 * @param metadata receives content type, modified date and encoding
 * @param context  not used by this method
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    DBFReader reader = DBFReader.open(stream);
    DBFFileHeader header = reader.getHeader();
    metadata.set(Metadata.CONTENT_TYPE, header.getVersion().getFullMimeString());

    Calendar lastModified = header.getLastModified();
    if (lastModified != null) {
        metadata.set(TikaCoreProperties.MODIFIED, lastModified);
    }

    // Buffer copies of the leading rows so the charset can be detected
    // before anything is written out.
    List<DBFRow> firstRows = new LinkedList<>();
    DBFRow row = reader.next();
    for (int buffered = 0; row != null && buffered < ROWS_TO_BUFFER_FOR_CHARSET_DETECTION; buffered++) {
        firstRows.add(row.deepCopy());
        row = reader.next();
    }
    Charset charset = getCharset(firstRows, header);
    metadata.set(Metadata.CONTENT_ENCODING, charset.toString());

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.startElement("table");

    // Column names
    xhtml.startElement("thead");
    for (DBFColumnHeader col : header.getCols()) {
        xhtml.startElement("th");
        xhtml.characters(col.getName(charset));
        xhtml.endElement("th");
    }
    xhtml.endElement("thead");

    xhtml.startElement("tbody");
    // Buffered rows go out first, in the order they were read ...
    for (DBFRow cachedRow : firstRows) {
        writeRow(cachedRow, charset, xhtml);
    }
    // ... then whatever is still left in the reader; `row` already holds
    // the first row that was read but not buffered (or null).
    for (; row != null; row = reader.next()) {
        writeRow(row, charset, xhtml);
    }
    xhtml.endElement("tbody");

    xhtml.endElement("table");
    xhtml.endDocument();
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class DWGParser method parse.
/**
 * Reads the 128-byte file header, takes its first six bytes (decoded as
 * US-ASCII) as the version tag, and extracts properties with the routine
 * matching that version:
 * <ul>
 *   <li>{@code AC1015}: {@code skipTo2000PropertyInfoSection}, then {@code get2000Props}</li>
 *   <li>{@code AC1018}: {@code skipToPropertyInfoSection}, then {@code get2004Props}</li>
 *   <li>{@code AC1021}, {@code AC1024}: {@code skipToPropertyInfoSection}, then {@code get2007and2010Props}</li>
 * </ul>
 * For each recognised version the content type is set before the skip is
 * attempted; properties are read only if the skip reports success.
 *
 * @param stream   the drawing data
 * @param handler  receives the XHTML document
 * @param metadata receives the content type and extracted properties
 * @param context  not used by this method
 * @throws TikaException if the version tag is none of the above; at that
 *         point the document has been started but is not ended
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    // The version tag sits at the very start of the header
    byte[] header = new byte[128];
    IOUtils.readFully(stream, header);
    String version = new String(header, 0, 6, "US-ASCII");

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    switch (version) {
        case "AC1015":
            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
            if (skipTo2000PropertyInfoSection(stream, header)) {
                get2000Props(stream, metadata, xhtml);
            }
            break;
        case "AC1018":
            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
            if (skipToPropertyInfoSection(stream, header)) {
                get2004Props(stream, metadata, xhtml);
            }
            break;
        case "AC1021":
        case "AC1024":
            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
            if (skipToPropertyInfoSection(stream, header)) {
                get2007and2010Props(stream, metadata, xhtml);
            }
            break;
        default:
            throw new TikaException("Unsupported AutoCAD drawing version: " + version);
    }

    xhtml.endDocument();
}
Aggregations