Search in sources :

Example 66 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class PooledTimeSeriesParser method parse.

/**
 * Runs the PooledTimeSeries tool on the given document and reports its
 * output as XHTML SAX events.
 * <p>
 * When the tool is flagged as unavailable, a warning is logged and the
 * method returns without touching the stream. Otherwise the stream is
 * wrapped in a {@link TikaInputStream}, materialised as a file, and handed
 * to {@code computePoT}. The two result files written next to the input
 * ({@code <input>.of.txt} and {@code <input>.hog.txt}) are then read: their
 * headers go through {@code extractHeaderOutput} with the prefixes
 * {@code "of"} and {@code "og"}, and the remaining content is emitted via
 * {@code doExtract}. Finally the wrapped stream is forwarded to the
 * fallback video metadata parser.
 *
 * @param stream   the document stream (input)
 * @param handler  handler for the XHTML SAX events (output)
 * @param metadata document metadata (input and output)
 * @param context  parse context
 * @throws IOException   if the document stream or a result file could not be read
 * @throws SAXException  if the SAX events could not be processed
 * @throws TikaException if the document could not be parsed
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    if (!isAvailable) {
        LOG.warn("PooledTimeSeries not installed!");
        return;
    }
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
        File input = tikaStream.getFile();
        // The tool is run for the result files it leaves beside the input;
        // its return value is not needed here.
        computePoT(input);
        File flowFile = new File(input.getAbsoluteFile() + ".of.txt");
        File gradientFile = new File(input.getAbsoluteFile() + ".hog.txt");
        // Both result streams live in one try-with-resources; they are closed
        // in reverse order of declaration once extraction finishes.
        try (InputStream ofStream = new FileInputStream(flowFile);
             InputStream ogStream = new FileInputStream(gradientFile)) {
            // Headers first: the metadata keys read below are expected to be
            // populated by these two calls.
            extractHeaderOutput(ofStream, metadata, "of");
            extractHeaderOutput(ogStream, metadata, "og");
            xhtml.startDocument();
            doExtract(ofStream, xhtml, "Histogram of Optical Flows (HOF)", metadata.get("of_frames"), metadata.get("of_vecSize"));
            doExtract(ogStream, xhtml, "Histogram of Oriented Gradients (HOG)", metadata.get("og_frames"), metadata.get("og_vecSize"));
            xhtml.endDocument();
        }
        // Temporary workaround for TIKA-1445 - until we can specify
        //  composite parsers with strategies (eg Composite, Try In Turn),
        //  always send the image onwards to the regular parser to have
        //  the metadata for them extracted as well
        _TMP_VIDEO_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
    } finally {
        tmp.dispose();
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) File(java.io.File) FileInputStream(java.io.FileInputStream)

Example 67 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class PRTParser method parse.

/*
 * Text types:
 *   00 00 00 00 f0 [3b]f sz sz TEXT     *view name*
 *   00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT  *view name*
 *   (anything)  e0 3f sz sz TEXT    *view name*
 *   3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT    *note entries*
 *
 *  Note - all text is null terminated
 *
 * Parsing steps:
 *   1. Read a 30 byte header (its content is not inspected).
 *   2. Read 12 bytes and, if they start with "19" or "20", treat them as a
 *      YYYYMMDDhhmm creation date and store it in the metadata.
 *   3. Read 500 bytes and store any text extracted from them as the
 *      description.
 *   4. Scan the rest of the stream byte by byte for the marker pairs listed
 *      above, delegating to handleNoteText / handleViewName on a match.
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    Last5 l5 = new Last5();

    // Try to get the creation date, which is YYYYMMDDhhmm
    byte[] header = new byte[30];
    IOUtils.readFully(stream, header);
    byte[] date = new byte[12];
    IOUtils.readFully(stream, date);
    String stamp = new String(date, US_ASCII);
    boolean plausibleCentury = stamp.startsWith("19") || stamp.startsWith("20");
    if (plausibleCentury) {
        String day = String.join("-", stamp.substring(0, 4), stamp.substring(4, 6), stamp.substring(6, 8));
        String time = stamp.substring(8, 10) + ":" + stamp.substring(10, 12) + ":00";
        String formattedDate = day + "T" + time;
        metadata.set(TikaCoreProperties.CREATED, formattedDate);
        // TODO Metadata.DATE is used as modified, should it be here?
        metadata.set(Metadata.DATE, formattedDate);
    }
    metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);

    // The description, if set, is the next up-to-500 bytes
    byte[] desc = new byte[500];
    IOUtils.readFully(stream, desc);
    String description = extractText(desc, true);
    if (!description.isEmpty()) {
        metadata.set(TikaCoreProperties.DESCRIPTION, description);
    }

    // Now look for text
    int current;
    while ((current = stream.read()) > -1) {
        boolean candidate = current == 0xe0 || current == 0xe3 || current == 0xf0;
        if (!candidate) {
            // Only non-candidate bytes are remembered in the look-back window
            l5.record(current);
            continue;
        }

        int following = stream.read();
        if (following != 0x3f && following != 0xbf) {
            // Not one of the known marker pairs; both bytes are dropped
            continue;
        }

        // Looks promising, check back for a suitable value
        if (current == 0xe3 && following == 0x3f) {
            if (l5.is33()) {
                // Bingo, note text
                handleNoteText(stream, xhtml);
            }
            continue;
        }
        if (l5.is00()) {
            // Likely view name
            handleViewName(current, following, stream, xhtml, l5);
        }
    }
}
Also used : XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 68 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class QuattroProParser method parse.

/**
 * Extracts the text of a Quattro Pro document as XHTML SAX events.
 * <p>
 * If the metadata carries no content type yet, it is set to the string form
 * of {@code QP_9}. Extraction itself is delegated to a fresh
 * {@code QPWTextExtractor}, bracketed by the XHTML document start and end
 * events.
 *
 * @param stream   the document stream (input)
 * @param handler  handler for the XHTML SAX events (output)
 * @param metadata document metadata (input and output)
 * @param context  parse context (not used by this method)
 * @throws IOException   declared by the parser contract
 * @throws SAXException  declared by the parser contract
 * @throws TikaException declared by the parser contract
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Only fill in a content type when the caller has not supplied one
    boolean typeMissing = metadata.get(Metadata.CONTENT_TYPE) == null;
    if (typeMissing) {
        metadata.set(Metadata.CONTENT_TYPE, QP_9.toString());
    }

    XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    new QPWTextExtractor().extract(stream, document, metadata);
    document.endDocument();
}
Also used : XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 69 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class WordPerfectParser method parse.

/**
 * Parses a WordPerfect document.
 * <p>
 * The stream is wrapped in a {@code WPInputStream}, from which the prefix
 * area is extracted first. The prefix area is then used to check that the
 * file is supported and to populate the metadata, before the document area
 * is extracted into an XHTML handler wrapping the supplied one.
 *
 * @param stream   the document stream (input)
 * @param handler  handler for the XHTML SAX events (output)
 * @param metadata document metadata (input and output)
 * @param context  parse context (not used by this method)
 * @throws IOException   declared by the parser contract
 * @throws SAXException  declared by the parser contract
 * @throws TikaException declared by the parser contract
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    WPInputStream input = new WPInputStream(stream);
    WPPrefixArea prefix = WPPrefixAreaExtractor.extract(input);

    ensureFileSupport(prefix, metadata);
    applyMetadata(prefix, metadata);

    // The XHTML handler is created only after the support check and the
    // metadata have been applied.
    XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    extractDocumentArea(prefix, input, document);
}
Also used : XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 70 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class ForkTestParser method parse.

/**
 * Test parser that ignores the document content and always produces the
 * fixed text "Hello, World!".
 * <p>
 * A single byte is read from the stream and discarded, the content type is
 * set to {@code text/plain}, and the greeting is emitted between the XHTML
 * document start and end events.
 *
 * @param stream   the document stream; exactly one {@code read()} is issued
 * @param handler  handler for the XHTML SAX events (output)
 * @param metadata document metadata (content type is overwritten)
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading from the stream fails
 * @throws SAXException  if the SAX events could not be processed
 * @throws TikaException declared by the parser contract
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Touch the stream once; the value read is intentionally ignored
    stream.read();
    metadata.set(Metadata.CONTENT_TYPE, "text/plain");

    char[] greeting = "Hello, World!".toCharArray();
    XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.characters(greeting, 0, greeting.length);
    document.endDocument();
}
Also used : XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Aggregations

XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 72, TikaException (org.apache.tika.exception.TikaException): 26, TikaInputStream (org.apache.tika.io.TikaInputStream): 22, TemporaryResources (org.apache.tika.io.TemporaryResources): 14, CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream): 13, IOException (java.io.IOException): 12, SAXException (org.xml.sax.SAXException): 9, File (java.io.File): 6, EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor): 6, Metadata (org.apache.tika.metadata.Metadata): 6, BufferedInputStream (java.io.BufferedInputStream): 5, InputStream (java.io.InputStream): 5, EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler): 5, ByteArrayInputStream (java.io.ByteArrayInputStream): 4, Charset (java.nio.charset.Charset): 4, ArrayList (java.util.ArrayList): 4, Map (java.util.Map): 4, MediaType (org.apache.tika.mime.MediaType): 4, OfflineContentHandler (org.apache.tika.sax.OfflineContentHandler): 4, InputStreamReader (java.io.InputStreamReader): 3