use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class PooledTimeSeriesParser method parse.
/**
 * Parses a document stream into a sequence of XHTML SAX events and fills in
 * related document metadata in the given metadata object.
 * <p>
 * If {@code isAvailable} is false a warning is logged and the method returns
 * without emitting any SAX events.
 * <p>
 * Otherwise the stream is wrapped in a {@code TikaInputStream} and obtained as
 * a file, {@code computePoT} is invoked on that file, and two sidecar files
 * named after the input's absolute path with the suffixes {@code .of.txt} and
 * {@code .hog.txt} are read (presumably written by {@code computePoT} -
 * confirm against that method). Finally the stream is handed on to the
 * regular video metadata parser.
 * <p>
 * The given document stream is consumed but not closed by this method. The
 * responsibility to close the stream remains on the caller.
 *
 * @param stream   the document stream (input)
 * @param handler  handler for the XHTML SAX events (output)
 * @param metadata document metadata (input and output)
 * @param context  parse context
 * @throws IOException   if the document stream could not be read
 * @throws SAXException  if the SAX events could not be processed
 * @throws TikaException if the document could not be parsed
 * @since Apache Tika 0.5
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    if (!isAvailable) {
        LOG.warn("PooledTimeSeries not installed!");
        return;
    }

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
        File input = tikaStream.getFile();

        // The value returned by computePoT is not used by this method.
        computePoT(input);

        // Both sidecar files share the input's absolute path as a prefix.
        String sidecarPrefix = input.getAbsoluteFile().toString();
        try (InputStream ofStream = new FileInputStream(new File(sidecarPrefix + ".of.txt"));
             InputStream ogStream = new FileInputStream(new File(sidecarPrefix + ".hog.txt"))) {
            // Headers are consumed first; the same streams are then passed on
            // to doExtract for the remaining content.
            extractHeaderOutput(ofStream, metadata, "of");
            extractHeaderOutput(ogStream, metadata, "og");

            xhtml.startDocument();

            String ofFrames = metadata.get("of_frames");
            String ofVecSize = metadata.get("of_vecSize");
            doExtract(ofStream, xhtml, "Histogram of Optical Flows (HOF)", ofFrames, ofVecSize);

            String ogFrames = metadata.get("og_frames");
            String ogVecSize = metadata.get("og_vecSize");
            doExtract(ogStream, xhtml, "Histogram of Oriented Gradients (HOG)", ogFrames, ogVecSize);

            xhtml.endDocument();
        }

        // Temporary workaround for TIKA-1445 - until we can specify
        // composite parsers with strategies (eg Composite, Try In Turn),
        // always send the image onwards to the regular parser to have
        // the metadata for them extracted as well
        _TMP_VIDEO_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
    } finally {
        // Always release the temporary resources tracked by tmp.
        tmp.dispose();
    }
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class PRTParser method parse.
/**
 * Reads the fixed-size lead-in of the stream (a 30 byte header, a 12 byte
 * date field and a 500 byte description field), records what it finds in the
 * metadata, then scans the remaining bytes for text markers.
 * <p>
 * Text types:
 * <pre>
 *   00 00 00 00 f0 [3b]f sz sz TEXT *view name*
 *   00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT *view name*
 *   (anything) e0 3f sz sz TEXT *view name*
 *   3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT *note entries*
 * </pre>
 * Note - all text is null terminated
 *
 * @param stream   the document stream (input)
 * @param handler  handler for the XHTML SAX events (output)
 * @param metadata document metadata (input and output)
 * @param context  parse context
 * @throws IOException   if the document stream could not be read
 * @throws SAXException  if the SAX events could not be processed
 * @throws TikaException if the document could not be parsed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    Last5 l5 = new Last5();

    // The first 30 bytes are read and discarded.
    IOUtils.readFully(stream, new byte[30]);

    // Try to get the creation date, which is YYYYMMDDhhmm
    byte[] dateBytes = new byte[12];
    IOUtils.readFully(stream, dateBytes);
    String dateStr = new String(dateBytes, US_ASCII);
    if (dateStr.startsWith("19") || dateStr.startsWith("20")) {
        // Re-shape YYYYMMDDhhmm into YYYY-MM-DDThh:mm:00
        StringBuilder iso = new StringBuilder(19);
        iso.append(dateStr, 0, 4).append("-");
        iso.append(dateStr, 4, 6).append("-");
        iso.append(dateStr, 6, 8).append("T");
        iso.append(dateStr, 8, 10).append(":");
        iso.append(dateStr, 10, 12).append(":00");
        String formattedDate = iso.toString();

        metadata.set(TikaCoreProperties.CREATED, formattedDate);
        // TODO Metadata.DATE is used as modified, should it be here?
        metadata.set(Metadata.DATE, formattedDate);
    }
    metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);

    // The description, if set, is the next up-to-500 bytes
    byte[] descBytes = new byte[500];
    IOUtils.readFully(stream, descBytes);
    String description = extractText(descBytes, true);
    if (description.length() > 0) {
        metadata.set(TikaCoreProperties.DESCRIPTION, description);
    }

    // Now look for text
    int current;
    while ((current = stream.read()) > -1) {
        boolean candidateMarker = current == 0xe0 || current == 0xe3 || current == 0xf0;
        if (!candidateMarker) {
            // Ordinary byte: only these are remembered in the look-back record
            l5.record(current);
            continue;
        }

        int next = stream.read();
        if (next != 0x3f && next != 0xbf) {
            // Second byte doesn't match, so nothing to extract here
            continue;
        }

        // Looks promising, check back for a suitable value
        if (current == 0xe3 && next == 0x3f) {
            if (l5.is33()) {
                // Bingo, note text
                handleNoteText(stream, xhtml);
            }
        } else if (l5.is00()) {
            // Likely view name
            handleViewName(current, next, stream, xhtml, l5);
        }
    }
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class QuattroProParser method parse.
/**
 * Emits the document as XHTML SAX events by delegating the text extraction
 * to a {@code QPWTextExtractor}, bracketed by the start and end of the XHTML
 * document.
 * <p>
 * When the metadata carries no content type yet, it is set to the string
 * form of {@code QP_9} before any events are emitted.
 *
 * @param stream   the document stream (input)
 * @param handler  handler for the XHTML SAX events (output)
 * @param metadata document metadata (input and output)
 * @param context  parse context
 * @throws IOException   if the document stream could not be read
 * @throws SAXException  if the SAX events could not be processed
 * @throws TikaException if the document could not be parsed
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Only supply a content type when none has been set already.
    String declaredType = metadata.get(Metadata.CONTENT_TYPE);
    if (declaredType == null) {
        metadata.set(Metadata.CONTENT_TYPE, QP_9.toString());
    }

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    new QPWTextExtractor().extract(stream, xhtml, metadata);
    xhtml.endDocument();
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class WordPerfectParser method parse.
/**
 * Wraps the stream in a {@code WPInputStream}, extracts the prefix area from
 * it, checks file support and applies metadata based on that prefix area,
 * and finally extracts the document area into an XHTML content handler
 * wrapped around the supplied handler.
 *
 * @param stream   the document stream (input)
 * @param handler  handler for the XHTML SAX events (output)
 * @param metadata document metadata (input and output)
 * @param context  parse context
 * @throws IOException   if the document stream could not be read
 * @throws SAXException  if the SAX events could not be processed
 * @throws TikaException if the document could not be parsed
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    WPInputStream wpIn = new WPInputStream(stream);
    WPPrefixArea prefix = WPPrefixAreaExtractor.extract(wpIn);

    ensureFileSupport(prefix, metadata);
    applyMetadata(prefix, metadata);

    // The XHTML handler is created only after the metadata has been applied.
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    extractDocumentArea(prefix, wpIn, xhtml);
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class ForkTestParser method parse.
/**
 * Reads a single byte from the stream, sets the content type to
 * {@code text/plain}, and emits an XHTML document whose only character
 * content is the fixed text {@code Hello, World!}.
 *
 * @param stream   the document stream (input); one byte is read from it
 * @param handler  handler for the XHTML SAX events (output)
 * @param metadata document metadata (input and output)
 * @param context  parse context
 * @throws IOException   if the document stream could not be read
 * @throws SAXException  if the SAX events could not be processed
 * @throws TikaException if the document could not be parsed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // One byte is read; its value is ignored.
    stream.read();

    metadata.set(Metadata.CONTENT_TYPE, "text/plain");
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

    final char[] greeting = "Hello, World!".toCharArray();
    xhtml.startDocument();
    xhtml.characters(greeting, 0, greeting.length);
    xhtml.endDocument();
}
Aggregations