Search in sources :

Example 6 with RecursiveParserWrapper

use of org.apache.tika.parser.RecursiveParserWrapper in project tika by apache.

the class TikaCLI method handleRecursiveJson.

/**
 * Parses the resource at {@code url} through a {@link RecursiveParserWrapper}
 * and writes the metadata list collected by the wrapper to {@code output}
 * as JSON.
 *
 * @param url    location of the document to parse
 * @param output stream that receives the JSON; it is flushed but not closed here
 * @throws IOException   if reading the input or writing the JSON fails
 * @throws SAXException  if the parse reports a SAX error
 * @throws TikaException if the parse fails
 */
private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException {
    Metadata rootMetadata = new Metadata();
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(parser, getContentHandlerFactory(type));
    try (InputStream stream = TikaInputStream.get(url, rootMetadata)) {
        // No handler is passed; the wrapper is constructed with its own handler factory.
        recursiveParser.parse(stream, null, rootMetadata, context);
    }

    JsonMetadataList.setPrettyPrinting(prettyPrint);
    Writer jsonOut = getOutputWriter(output, encoding);
    try {
        JsonMetadataList.toJson(recursiveParser.getMetadata(), jsonOut);
    } finally {
        // Flush even when serialization fails so partial output is not lost.
        jsonOut.flush();
    }
}
Also used : CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) DocumentInputStream(org.apache.poi.poifs.filesystem.DocumentInputStream) InputStream(java.io.InputStream) JsonMetadata(org.apache.tika.metadata.serialization.JsonMetadata) Metadata(org.apache.tika.metadata.Metadata) XMPMetadata(org.apache.tika.xmp.XMPMetadata) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) PrintWriter(java.io.PrintWriter) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter)

Example 7 with RecursiveParserWrapper

use of org.apache.tika.parser.RecursiveParserWrapper in project tika by apache.

the class TikaGUI method handleStream.

/**
 * Parses {@code input} once with the regular parser to fill the HTML, text,
 * main-text, XML and metadata views, and then -- if the stream can be
 * rewound -- a second time with a {@link RecursiveParserWrapper} to fill
 * the JSON view.
 *
 * @param input stream to parse; it is wrapped in a progress-monitoring
 *              {@link TikaInputStream} before use
 * @param md    metadata object populated by the first parse
 * @throws Exception if either parse or the JSON serialization fails
 */
private void handleStream(InputStream input, Metadata md) throws Exception {
    StringWriter htmlBuffer = new StringWriter();
    StringWriter textBuffer = new StringWriter();
    StringWriter textMainBuffer = new StringWriter();
    StringWriter xmlBuffer = new StringWriter();
    StringBuilder metadataBuffer = new StringBuilder();
    // One pass over the document feeds all four output formats.
    ContentHandler handler = new TeeContentHandler(getHtmlHandler(htmlBuffer), getTextContentHandler(textBuffer), getTextMainContentHandler(textMainBuffer), getXmlContentHandler(xmlBuffer));
    context.set(DocumentSelector.class, new ImageDocumentSelector());
    input = TikaInputStream.get(new ProgressMonitorInputStream(this, "Parsing stream", input));
    if (input.markSupported()) {
        int mark = -1;
        if (input instanceof TikaInputStream) {
            TikaInputStream tis = (TikaInputStream) input;
            if (tis.hasFile()) {
                long length = tis.getLength();
                // Use the length only when it fits in an int: a bare (int)
                // cast silently wraps for lengths above Integer.MAX_VALUE
                // and would hand mark() a bogus, possibly negative, limit.
                if (length >= 0 && length <= Integer.MAX_VALUE) {
                    mark = (int) length;
                }
            }
        }
        if (mark == -1) {
            mark = MAX_MARK;
        }
        // Remember the start of the stream so it can be re-read for the JSON view.
        input.mark(mark);
    }
    parser.parse(input, handler, md, context);

    // Render the metadata as sorted "name: value" lines, one per value.
    String[] names = md.names();
    Arrays.sort(names);
    for (String name : names) {
        for (String val : md.getValues(name)) {
            metadataBuffer.append(name);
            metadataBuffer.append(": ");
            metadataBuffer.append(val);
            metadataBuffer.append("\n");
        }
    }
    String name = md.get(Metadata.RESOURCE_NAME_KEY);
    if (name != null && name.length() > 0) {
        setTitle("Apache Tika: " + name);
    } else {
        setTitle("Apache Tika: unnamed document");
    }
    setText(metadata, metadataBuffer.toString());
    setText(xml, xmlBuffer.toString());
    setText(text, textBuffer.toString());
    setText(textMain, textMainBuffer.toString());
    setText(html, htmlBuffer.toString());

    // The recursive (JSON) view needs a second pass, which requires rewinding.
    if (!input.markSupported()) {
        setText(json, "InputStream does not support mark/reset for Recursive Parsing");
        layout.show(cards, "metadata");
        return;
    }
    boolean isReset = false;
    try {
        input.reset();
        isReset = true;
    } catch (IOException e) {
        // Reset failed; report the limit in the JSON pane and skip the second parse.
        setText(json, "Error during stream reset.\n" + "There's a limit of " + MAX_MARK + " bytes for this type of processing in the GUI.\n" + "Try the app with command line argument of -J.");
    }
    if (isReset) {
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
        // Fresh Metadata/ParseContext: the second pass does not reuse the first pass's state.
        wrapper.parse(input, null, new Metadata(), new ParseContext());
        StringWriter jsonBuffer = new StringWriter();
        JsonMetadataList.setPrettyPrinting(true);
        JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer);
        setText(json, jsonBuffer.toString());
    }
    layout.show(cards, "metadata");
}
Also used : ProgressMonitorInputStream(javax.swing.ProgressMonitorInputStream) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) StringWriter(java.io.StringWriter) ParseContext(org.apache.tika.parser.ParseContext) TeeContentHandler(org.apache.tika.sax.TeeContentHandler)

Example 8 with RecursiveParserWrapper

use of org.apache.tika.parser.RecursiveParserWrapper in project tika by apache.

the class RTFParserTest method testRegularImages.

// TIKA-1010: regular (i.e. not "embedded") images/picts inside an RTF document
@Test
public void testRegularImages() throws Exception {
    Parser autoDetect = new AutoDetectParser();
    ParseContext parseContext = new ParseContext();
    BasicContentHandlerFactory ignoreContent = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1);
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(autoDetect, ignoreContent);
    ContentHandler bodyHandler = new BodyContentHandler();

    Metadata containerMetadata = new Metadata();
    containerMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");
    try (TikaInputStream rtf = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
        wrapper.parse(rtf, bodyHandler, containerMetadata, parseContext);
    }

    List<Metadata> collected = wrapper.getMetadata();
    // index 1 is expected to be the image "testJPEG_EXIF_普林斯顿.jpg"
    Metadata exifJpeg = collected.get(1);
    // index 3 is expected to be the image "testJPEG_普林斯顿.jpg"
    Metadata plainJpeg = collected.get(3);

    assertTrue(exifJpeg != null);
    assertTrue(plainJpeg != null);
    assertTrue(Arrays.asList(exifJpeg.getValues("dc:subject")).contains("serbor"));
    assertTrue(plainJpeg.get("Comments").contains("Licensed to the Apache"));
    // metadata of one image must not leak into the next one
    assertFalse(Arrays.asList(plainJpeg.getValues("dc:subject")).contains("serbor"));
    assertEquals("false", plainJpeg.get(RTFMetadata.THUMBNAIL));
    assertEquals("false", exifJpeg.get(RTFMetadata.THUMBNAIL));
    assertEquals(49, plainJpeg.names().length);
    assertEquals(113, exifJpeg.names().length);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) RTFMetadata(org.apache.tika.metadata.RTFMetadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TikaInputStream(org.apache.tika.io.TikaInputStream) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) WriteOutContentHandler(org.apache.tika.sax.WriteOutContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 9 with RecursiveParserWrapper

use of org.apache.tika.parser.RecursiveParserWrapper in project tika by apache.

the class RecursiveMetadataResource method parseMetadata.

/**
 * Runs the request body through a {@link RecursiveParserWrapper} and returns
 * the metadata collected by the wrapper as a {@link MetadataList}.
 *
 * @param is              request body to parse
 * @param httpHeaders     request headers, used to fill the metadata and parse context
 * @param info            request URI information (used for logging and the parse path)
 * @param handlerTypeName name of the content handler type; resolved against
 *                        {@code DEFAULT_HANDLER_TYPE} by {@code parseHandlerType}
 * @return the metadata list produced by the recursive parse
 * @throws Exception if the parse fails
 */
private MetadataList parseMetadata(InputStream is, MultivaluedMap<String, String> httpHeaders, UriInfo info, String handlerTypeName) throws Exception {
    final Metadata rootMetadata = new Metadata();
    final ParseContext parseContext = new ParseContext();
    Parser baseParser = TikaResource.createParser();
    // TODO: parameterize choice of max chars/max embedded attachments
    BasicContentHandlerFactory.HANDLER_TYPE handlerType = BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
    BasicContentHandlerFactory handlerFactory = new BasicContentHandlerFactory(handlerType, -1);
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(baseParser, handlerFactory);

    TikaResource.fillMetadata(baseParser, rootMetadata, parseContext, httpHeaders);
    // no parser is handed to the context here: it is not needed to parse recursively
    TikaResource.fillParseContext(parseContext, httpHeaders, null);
    TikaResource.logRequest(LOG, info, rootMetadata);

    String requestPath = info.getPath();
    // Records the detected language on the root metadata once the document ends.
    LanguageHandler languageHandler = new LanguageHandler() {

        @Override
        public void endDocument() {
            rootMetadata.set("language", getLanguage().getLanguage());
        }
    };
    TikaResource.parse(recursiveParser, LOG, requestPath, is, languageHandler, rootMetadata, parseContext);
    return new MetadataList(recursiveParser.getMetadata());
}
Also used : MetadataList(org.apache.tika.server.MetadataList) LanguageHandler(org.apache.tika.language.detect.LanguageHandler) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) Parser(org.apache.tika.parser.Parser)

Example 10 with RecursiveParserWrapper

use of org.apache.tika.parser.RecursiveParserWrapper in project tika by apache.

the class ParsingExample method recursiveParserWrapperExample.

/**
 * For documents that may contain embedded documents, it might be helpful
 * to create a list of metadata objects, one for the container document and
 * one for each embedded document.  This allows easy access to both the
 * extracted content and the metadata of each embedded document.
 * Note that many document formats can contain embedded documents,
 * including traditional container formats -- zip, tar and others -- but also
 * common office document formats including: MSWord, MSExcel,
 * MSPowerPoint, RTF, PDF, MSG and several others.
 * <p>
 * The "content" format is determined by the ContentHandlerFactory, and
 * the content is stored in {@link org.apache.tika.parser.RecursiveParserWrapper#TIKA_CONTENT}
 * <p>
 * The drawback to the RecursiveParserWrapper is that it caches metadata and contents
 * in memory.  This should not be used on files whose contents are too big to be handled
 * in memory.
 *
 * @return a list of metadata objects, one each for the container file and each embedded file
 * @throws IOException if the example document cannot be found on the classpath or cannot be read
 * @throws SAXException if the parse reports a SAX error
 * @throws TikaException if the parse fails
 */
public List<Metadata> recursiveParserWrapperExample() throws IOException, SAXException, TikaException {
    Parser p = new AutoDetectParser();
    ContentHandlerFactory factory = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, factory);
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
    ParseContext context = new ParseContext();
    try (InputStream stream = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        // getResourceAsStream returns null (it does not throw) when the
        // resource is missing; fail with a clear message instead of passing
        // a null stream on to the parser.
        if (stream == null) {
            throw new IOException("Resource not found on classpath: test_recursive_embedded.docx");
        }
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    }
    return wrapper.getMetadata();
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) ContentHandlerFactory(org.apache.tika.sax.ContentHandlerFactory) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Aggregations

Metadata (org.apache.tika.metadata.Metadata)11 RecursiveParserWrapper (org.apache.tika.parser.RecursiveParserWrapper)11 ParseContext (org.apache.tika.parser.ParseContext)9 BasicContentHandlerFactory (org.apache.tika.sax.BasicContentHandlerFactory)9 Parser (org.apache.tika.parser.Parser)8 InputStream (java.io.InputStream)7 TikaInputStream (org.apache.tika.io.TikaInputStream)7 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)7 DefaultHandler (org.xml.sax.helpers.DefaultHandler)5 TikaTest (org.apache.tika.TikaTest)4 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)4 Test (org.junit.Test)4 OutputStreamWriter (java.io.OutputStreamWriter)2 Writer (java.io.Writer)2 EmptyParser (org.apache.tika.parser.EmptyParser)2 ContentHandler (org.xml.sax.ContentHandler)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 FileInputStream (java.io.FileInputStream)1 IOException (java.io.IOException)1 OutputStream (java.io.OutputStream)1