Search in sources :

Example 11 with TeeContentHandler

use of org.apache.tika.sax.TeeContentHandler in project tika by apache.

the class LanguageDetectingParser method parse.

/**
 * Delegates parsing to the superclass while also routing the SAX events
 * through a {@link LanguageHandler}, then records the detected language.
 *
 * <p>The language is written to {@code metadata} under
 * {@link TikaCoreProperties#LANGUAGE} only when the detection result
 * reports itself as reasonably certain; otherwise the metadata is left
 * untouched by this method.
 *
 * @param stream   the document stream, passed through to the superclass parser
 * @param handler  the caller's handler; receives the same events as the language handler
 * @param metadata the metadata passed to the superclass parser and updated with the language
 * @param context  the parse context, passed through to the superclass parser
 * @throws SAXException  propagated from the superclass parse
 * @throws IOException   propagated from the superclass parse
 * @throws TikaException propagated from the superclass parse
 */
public void parse(InputStream stream, ContentHandler handler, final Metadata metadata, ParseContext context) throws SAXException, IOException, TikaException {
    final LanguageHandler detector = new LanguageHandler();
    // Fan the events out so the caller's handler and the detector see the same content.
    super.parse(stream, new TeeContentHandler(handler, detector), metadata, context);
    final LanguageResult detected = detector.getLanguage();
    if (!detected.isReasonablyCertain()) {
        // Low-confidence guess: do not record a language.
        return;
    }
    metadata.set(TikaCoreProperties.LANGUAGE, detected.getLanguage());
}
Also used : LanguageHandler(org.apache.tika.language.detect.LanguageHandler) LanguageResult(org.apache.tika.language.detect.LanguageResult) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Example 12 with TeeContentHandler

use of org.apache.tika.sax.TeeContentHandler in project tika by apache.

the class ForkParser method parse.

/**
 * Runs the parse through a {@link ForkClient} obtained from
 * {@code acquireClient()} and rethrows whatever failure the call reports.
 *
 * <p>The caller's handler is combined with a {@code MetadataContentHandler}
 * built around {@code metadata}, so both receive the events of the call.
 * The client is always handed back via {@code releaseClient}. It is flagged
 * as alive when the call returned or failed with a {@link TikaException},
 * and as not alive when the call failed with any other exception.
 *
 * @param stream   the document stream; must not be {@code null}
 * @param handler  the caller's content handler
 * @param metadata the document metadata
 * @param context  the parse context
 * @throws NullPointerException if {@code stream} is {@code null}
 * @throws IOException   if the call returned an {@code IOException}
 * @throws SAXException  if the call returned a {@code SAXException}
 * @throws TikaException if the call threw or returned a {@code TikaException},
 *                       if communication with the client failed with an
 *                       {@code IOException}, or if the call returned any other throwable
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    if (stream == null) {
        throw new NullPointerException("null stream");
    }
    ForkClient forkClient = acquireClient();
    boolean clientUsable = false;
    Throwable remoteFailure;
    try {
        ContentHandler metadataSink = new MetadataContentHandler(metadata);
        ContentHandler fanOut = new TeeContentHandler(handler, metadataSink);
        remoteFailure = forkClient.call("parse", stream, fanOut, metadata, context);
        clientUsable = true;
    } catch (TikaException localProblem) {
        // The failure originated on this side, so the client can still be used.
        clientUsable = true;
        throw localProblem;
    } catch (IOException brokenLink) {
        // The failure originated on the other side; the client stays flagged as not alive.
        throw new TikaException("Failed to communicate with a forked parser process."
                + " The process has most likely crashed due to some error"
                + " like running out of memory. A new process will be"
                + " started for the next parsing request.", brokenLink);
    } finally {
        releaseClient(forkClient, clientUsable);
    }

    // The call completed; surface any throwable it handed back.
    if (remoteFailure == null) {
        return;
    }
    if (remoteFailure instanceof IOException) {
        throw (IOException) remoteFailure;
    }
    if (remoteFailure instanceof SAXException) {
        throw (SAXException) remoteFailure;
    }
    if (remoteFailure instanceof TikaException) {
        throw (TikaException) remoteFailure;
    }
    throw new TikaException("Unexpected error in forked server process", remoteFailure);
}
Also used : TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ContentHandler(org.xml.sax.ContentHandler) SAXException(org.xml.sax.SAXException)

Example 13 with TeeContentHandler

use of org.apache.tika.sax.TeeContentHandler in project tika by apache.

the class TikaGUI method handleStream.

/**
 * Parses {@code input} and fills the GUI views with the results.
 *
 * <p>The stream is parsed once with a {@link TeeContentHandler} that feeds
 * the HTML, plain-text, main-text and XML buffers, and the metadata names
 * are listed in sorted order. If the stream supports mark/reset and can be
 * rewound, it is parsed a second time through a
 * {@link RecursiveParserWrapper} and the result is shown as pretty-printed
 * JSON. If it cannot be rewound, the JSON view gets an explanatory message.
 * In every non-exceptional path the "metadata" card is shown at the end.
 *
 * @param input the stream to parse; it is wrapped in a progress monitor and
 *              a {@link TikaInputStream} before use
 * @param md    the metadata passed to the first parse and then displayed
 * @throws Exception if either parse or the JSON serialization fails
 */
private void handleStream(InputStream input, Metadata md) throws Exception {
    StringWriter htmlBuffer = new StringWriter();
    StringWriter textBuffer = new StringWriter();
    StringWriter textMainBuffer = new StringWriter();
    StringWriter xmlBuffer = new StringWriter();
    StringBuilder metadataBuffer = new StringBuilder();
    // One parse pass feeds all four textual views.
    ContentHandler handler = new TeeContentHandler(getHtmlHandler(htmlBuffer), getTextContentHandler(textBuffer), getTextMainContentHandler(textMainBuffer), getXmlContentHandler(xmlBuffer));
    context.set(DocumentSelector.class, new ImageDocumentSelector());
    input = TikaInputStream.get(new ProgressMonitorInputStream(this, "Parsing stream", input));
    if (input.markSupported()) {
        // Remember the start of the stream so it can be rewound for the second parse.
        int mark = -1;
        if (input instanceof TikaInputStream) {
            TikaInputStream tikaInput = (TikaInputStream) input;
            if (tikaInput.hasFile()) {
                // Clamp instead of casting blindly: a length above Integer.MAX_VALUE
                // would otherwise wrap to an arbitrary (possibly negative) read limit.
                mark = (int) Math.min(tikaInput.getLength(), Integer.MAX_VALUE);
            }
        }
        if (mark == -1) {
            mark = MAX_MARK;
        }
        input.mark(mark);
    }
    parser.parse(input, handler, md, context);

    // Render the metadata as "name: value" lines, sorted by name.
    String[] names = md.names();
    Arrays.sort(names);
    for (String name : names) {
        for (String val : md.getValues(name)) {
            metadataBuffer.append(name);
            metadataBuffer.append(": ");
            metadataBuffer.append(val);
            metadataBuffer.append("\n");
        }
    }
    String name = md.get(Metadata.RESOURCE_NAME_KEY);
    if (name != null && name.length() > 0) {
        setTitle("Apache Tika: " + name);
    } else {
        setTitle("Apache Tika: unnamed document");
    }
    setText(metadata, metadataBuffer.toString());
    setText(xml, xmlBuffer.toString());
    setText(text, textBuffer.toString());
    setText(textMain, textMainBuffer.toString());
    setText(html, htmlBuffer.toString());

    // The recursive (JSON) view needs a second pass over the same bytes.
    if (!input.markSupported()) {
        setText(json, "InputStream does not support mark/reset for Recursive Parsing");
        layout.show(cards, "metadata");
        return;
    }
    boolean isReset = false;
    try {
        input.reset();
        isReset = true;
    } catch (IOException e) {
        // Rewinding failed; report it in the JSON view instead of aborting.
        setText(json, "Error during stream reset.\n" + "There's a limit of " + MAX_MARK + " bytes for this type of processing in the GUI.\n" + "Try the app with command line argument of -J.");
    }
    if (isReset) {
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
        // The second pass uses a fresh Metadata and ParseContext rather than the ones above.
        wrapper.parse(input, null, new Metadata(), new ParseContext());
        StringWriter jsonBuffer = new StringWriter();
        JsonMetadataList.setPrettyPrinting(true);
        JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer);
        setText(json, jsonBuffer.toString());
    }
    layout.show(cards, "metadata");
}
Also used : ProgressMonitorInputStream(javax.swing.ProgressMonitorInputStream) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) StringWriter(java.io.StringWriter) ParseContext(org.apache.tika.parser.ParseContext) TeeContentHandler(org.apache.tika.sax.TeeContentHandler)

Aggregations

TeeContentHandler (org.apache.tika.sax.TeeContentHandler): 13; ContentHandler (org.xml.sax.ContentHandler): 12; Metadata (org.apache.tika.metadata.Metadata): 4; ParseContext (org.apache.tika.parser.ParseContext): 4; BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 4; CompositeMatcher (org.apache.tika.sax.xpath.CompositeMatcher): 4; Matcher (org.apache.tika.sax.xpath.Matcher): 4; MatchingContentHandler (org.apache.tika.sax.xpath.MatchingContentHandler): 4; IOException (java.io.IOException): 3; InputStream (java.io.InputStream): 3; TikaInputStream (org.apache.tika.io.TikaInputStream): 3; AttributeMetadataHandler (org.apache.tika.parser.xml.AttributeMetadataHandler): 3; ElementMetadataHandler (org.apache.tika.parser.xml.ElementMetadataHandler): 3; ByteArrayInputStream (java.io.ByteArrayInputStream): 2; StringWriter (java.io.StringWriter): 2; AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 2; Parser (org.apache.tika.parser.Parser): 2; AttributeDependantMetadataHandler (org.apache.tika.parser.xml.AttributeDependantMetadataHandler): 2; LinkContentHandler (org.apache.tika.sax.LinkContentHandler): 2; ContentMetadataExtractor (ddf.catalog.content.operation.ContentMetadataExtractor): 1