Search in sources:

Example 1 with RecursiveParserWrapper

Use of org.apache.tika.parser.RecursiveParserWrapper in the Apache Tika project.

From the class TikaTest, method getRecursiveMetadata.

/**
 * Parses a file from the {@code /test-documents/} resource folder with a
 * {@link RecursiveParserWrapper} and returns the metadata the wrapper collected.
 * <p>
 * The wrapper decorates a fresh {@link AutoDetectParser} and builds its content
 * handlers from a {@link BasicContentHandlerFactory} configured with the XML
 * handler type and {@code -1} as its second argument (presumably "no write
 * limit" -- confirm against the factory's documentation).
 *
 * @param filePath path of the file, relative to {@code /test-documents/}
 * @param context  parse context handed to the wrapper unchanged
 * @return the list returned by {@link RecursiveParserWrapper#getMetadata()} after the parse
 * @throws Exception if opening the resource or parsing fails
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
    Parser autoDetect = new AutoDetectParser();
    BasicContentHandlerFactory xmlHandlers = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(autoDetect, xmlHandlers);
    String resourcePath = "/test-documents/" + filePath;
    // The stream is only needed for the duration of the parse; the collected
    // metadata is read from the wrapper afterwards.
    try (InputStream input = getResourceAsStream(resourcePath)) {
        recursiveParser.parse(input, new DefaultHandler(), new Metadata(), context);
    }
    return recursiveParser.getMetadata();
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 2 with RecursiveParserWrapper

Use of org.apache.tika.parser.RecursiveParserWrapper in the Apache Tika project.

From the class TesseractOCRParserTest, method runOCR.

/**
 * Parses a classpath resource through a {@link RecursiveParserWrapper} with
 * Tesseract and PDF inline-image extraction configured, verifies the collected
 * metadata, and returns the concatenated content of every metadata entry.
 *
 * @param resource       classpath resource name, resolved against {@code TesseractOCRParserTest}
 * @param nonOCRContains strings that must each appear in the concatenated content
 * @param numMetadatas   expected number of metadata objects produced by the parse
 * @param handlerType    handler type given to the {@link BasicContentHandlerFactory}
 * @param outputType     output type set on the {@link TesseractOCRConfig}
 * @return the {@code RecursiveParserWrapper.TIKA_CONTENT} values of all metadata entries, concatenated in list order
 * @throws Exception if the resource cannot be read or the parse fails
 */
private String runOCR(String resource, String[] nonOCRContains, int numMetadatas, BasicContentHandlerFactory.HANDLER_TYPE handlerType, TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception {
    TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
    ocrConfig.setOutputType(outputType);
    // Held with its concrete type so the metadata list can be read without a cast.
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser(), new BasicContentHandlerFactory(handlerType, -1));
    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);

    ParseContext parseContext = new ParseContext();
    parseContext.set(TesseractOCRConfig.class, ocrConfig);
    // The wrapper itself is registered as the context's Parser.
    parseContext.set(Parser.class, wrapper);
    parseContext.set(PDFParserConfig.class, pdfConfig);

    try (InputStream in = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
        wrapper.parse(in, new DefaultHandler(), new Metadata(), parseContext);
    }

    List<Metadata> collected = wrapper.getMetadata();
    assertEquals(numMetadatas, collected.size());

    StringBuilder buffer = new StringBuilder();
    for (Metadata entry : collected) {
        buffer.append(entry.get(RecursiveParserWrapper.TIKA_CONTENT));
    }
    // Materialise the text once; it is used for every containment check and as the return value.
    String allContent = buffer.toString();
    for (String expected : nonOCRContains) {
        assertContains(expected, allContent);
    }

    // Both the first and the second metadata entry must carry more than ten keys.
    assertTrue(collected.get(0).names().length > 10);
    assertTrue(collected.get(1).names().length > 10);
    //test at least one value
    assertEquals("deflate", collected.get(1).get("Compression CompressionTypeName"));
    return allContent;
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) ExternalParser(org.apache.tika.parser.external.ExternalParser) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ImageParser(org.apache.tika.parser.image.ImageParser) DefaultParser(org.apache.tika.parser.DefaultParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) PDFParserConfig(org.apache.tika.parser.pdf.PDFParserConfig)

Example 3 with RecursiveParserWrapper

Use of org.apache.tika.parser.RecursiveParserWrapper in the Apache Tika project.

From the class RecursiveParserWrapperFSConsumer, method processFileResource.

/**
 * Parses one file resource with a {@link RecursiveParserWrapper} and writes the
 * resulting metadata list as JSON to the resource's output stream.
 * <p>
 * Any {@link Throwable} raised by the parse is recorded as a stack trace on the
 * first metadata entry (or on the container metadata if the list is empty), the
 * JSON is still written, and only afterwards is the failure reported: an
 * {@link Error} is rethrown, anything else yields {@code false}.
 *
 * @param fileResource the resource to process
 * @return {@code true} if the parse completed without throwing; {@code false} if
 *         the output or input stream could not be opened or the parse threw a
 *         non-{@code Error} throwable
 * @throws RuntimeException wrapping any exception raised while writing the JSON
 */
@Override
public boolean processFileResource(FileResource fileResource) {
    Parser delegate = parserFactory.getParser(tikaConfig);
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(delegate, contentHandlerFactory);
    ParseContext parseContext = new ParseContext();
    // The wrapper is always registered as the context's Parser; an earlier
    // "parseRecursively" guard around this call is no longer in effect.
    parseContext.set(Parser.class, recursiveParser);

    // Open the output stream before anything else.
    OutputStream os = getOutputStream(fsOSFactory, fileResource);
    if (os == null) {
        LOG.debug("Skipping: {}", fileResource.getMetadata().get(FSProperties.FS_REL_PATH));
        return false;
    }

    // Open the input stream before the parse: if the parse hangs or throws a
    // nasty exception, a zero byte output file is left behind so that the
    // batch runner can skip the problematic file on its next run.
    InputStream input = getInputStream(fileResource);
    if (input == null) {
        IOUtils.closeQuietly(os);
        return false;
    }

    Throwable parseFailure = null;
    List<Metadata> metadataList = null;
    Metadata containerMetadata = fileResource.getMetadata();
    try {
        parse(fileResource.getResourceId(), recursiveParser, input, new DefaultHandler(), containerMetadata, parseContext);
        metadataList = recursiveParser.getMetadata();
    } catch (Throwable t) {
        parseFailure = t;
        metadataList = recursiveParser.getMetadata();
        if (metadataList == null) {
            metadataList = new LinkedList<>();
        }
        // Attach the stack trace to the top metadata item, falling back to
        // the container metadata when nothing was collected.
        Metadata top = metadataList.size() == 0 ? containerMetadata : metadataList.remove(0);
        String filteredTrace = ExceptionUtils.getFilteredStackTrace(t);
        top.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime", filteredTrace);
        metadataList.add(0, top);
    } finally {
        IOUtils.closeQuietly(input);
    }

    Writer jsonWriter = null;
    try {
        jsonWriter = new OutputStreamWriter(os, getOutputEncoding());
        JsonMetadataList.toJson(metadataList, jsonWriter);
    } catch (Exception e) {
        // Failing to write the output is a stop-the-world condition.
        LOG.error("{}", getXMLifiedLogMsg(IO_OS + "json", fileResource.getResourceId(), e));
        throw new RuntimeException(e);
    } finally {
        flushAndClose(jsonWriter);
    }

    // Errors are propagated only after the JSON has been written; any other
    // parse failure is reported through the return value.
    if (parseFailure instanceof Error) {
        throw (Error) parseFailure;
    }
    return parseFailure == null;
}
Also used : InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) Metadata(org.apache.tika.metadata.Metadata) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) LinkedList(java.util.LinkedList) Parser(org.apache.tika.parser.Parser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) ParseContext(org.apache.tika.parser.ParseContext) OutputStreamWriter(java.io.OutputStreamWriter) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter)

Example 4 with RecursiveParserWrapper

Use of org.apache.tika.parser.RecursiveParserWrapper in the Apache Tika project.

From the class ParsingExample, method recursiveParserWrapperExample.

/**
 * Demonstrates collecting one {@link Metadata} object per document -- the
 * container plus every embedded document -- with a {@link RecursiveParserWrapper}.
 * This gives easy access to both the extracted content and the metadata of
 * each embedded document.
 * <p>
 * Many document formats can contain embedded documents: traditional container
 * formats (zip, tar and others) as well as common office document formats
 * including MSWord, MSExcel, MSPowerPoint, RTF, PDF, MSG and several others.
 * <p>
 * The format of the "content" is determined by the ContentHandlerFactory, and
 * the content itself is stored under
 * {@link org.apache.tika.parser.RecursiveParserWrapper#TIKA_CONTENT}.
 * <p>
 * Drawback: the RecursiveParserWrapper caches metadata and contents in memory,
 * so it should not be used on files whose contents are too big to be handled
 * in memory.
 *
 * @return a list of metadata objects, one for the container file and one for each embedded file
 * @throws IOException if reading the resource fails
 * @throws SAXException if SAX event handling fails during the parse
 * @throws TikaException if the parse itself fails
 */
public List<Metadata> recursiveParserWrapperExample() throws IOException, SAXException, TikaException {
    // The same name is used for the resource lookup and the resource-name metadata hint.
    final String resourceName = "test_recursive_embedded.docx";

    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(
            new AutoDetectParser(),
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1));

    Metadata containerMetadata = new Metadata();
    containerMetadata.set(Metadata.RESOURCE_NAME_KEY, resourceName);
    ParseContext parseContext = new ParseContext();

    try (InputStream input = ParsingExample.class.getResourceAsStream(resourceName)) {
        recursiveParser.parse(input, new DefaultHandler(), containerMetadata, parseContext);
    }
    return recursiveParser.getMetadata();
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) ContentHandlerFactory(org.apache.tika.sax.ContentHandlerFactory) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 5 with RecursiveParserWrapper

Use of org.apache.tika.parser.RecursiveParserWrapper in the Apache Tika project.

From the class PDFParserTest, method testEmbeddedFilesInChildren.

// TIKA-1228, TIKA-1268
/**
 * Checks that attachments nested inside a PDF are surfaced: first through the
 * plain XML output, then through a {@link RecursiveParserWrapper} whose metadata
 * list must contain the container plus four embedded resources with the
 * expected names and content types.
 */
@Test
public void testEmbeddedFilesInChildren() throws Exception {
    String containerXml = getXML("/testPDF_childAttachments.pdf").xml;
    //"regressiveness" exists only in Unit10.doc not in the container pdf document
    assertTrue(containerXml.contains("regressiveness"));

    // Content handlers of type IGNORE: this test only inspects the metadata.
    BasicContentHandlerFactory ignoringHandlers = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1);
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser(), ignoringHandlers);

    ParseContext parseContext = new ParseContext();
    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);
    pdfConfig.setExtractUniqueInlineImagesOnly(false);
    parseContext.set(PDFParserConfig.class, pdfConfig);
    parseContext.set(org.apache.tika.parser.Parser.class, wrapper);

    try (TikaInputStream pdfStream = TikaInputStream.get(getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"))) {
        wrapper.parse(pdfStream, new BodyContentHandler(-1), new Metadata(), parseContext);
    }

    List<Metadata> entries = wrapper.getMetadata();
    assertEquals(5, entries.size());

    // Resource names: the container (index 0) has none.
    assertNull(entries.get(0).get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("image0.jpg", entries.get(1).get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("Press Quality(1).joboptions", entries.get(3).get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("Unit10.doc", entries.get(4).get(Metadata.RESOURCE_NAME_KEY));

    // Content types of the four embedded entries.
    String expectedJpeg = MediaType.image("jpeg").toString();
    assertEquals(expectedJpeg, entries.get(1).get(Metadata.CONTENT_TYPE));
    String expectedTiff = MediaType.image("tiff").toString();
    assertEquals(expectedTiff, entries.get(2).get(Metadata.CONTENT_TYPE));
    assertEquals("text/plain; charset=ISO-8859-1", entries.get(3).get(Metadata.CONTENT_TYPE));
    assertEquals(TYPE_DOC.toString(), entries.get(4).get(Metadata.CONTENT_TYPE));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

Metadata (org.apache.tika.metadata.Metadata)11 RecursiveParserWrapper (org.apache.tika.parser.RecursiveParserWrapper)11 ParseContext (org.apache.tika.parser.ParseContext)9 BasicContentHandlerFactory (org.apache.tika.sax.BasicContentHandlerFactory)9 Parser (org.apache.tika.parser.Parser)8 InputStream (java.io.InputStream)7 TikaInputStream (org.apache.tika.io.TikaInputStream)7 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)7 DefaultHandler (org.xml.sax.helpers.DefaultHandler)5 TikaTest (org.apache.tika.TikaTest)4 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)4 Test (org.junit.Test)4 OutputStreamWriter (java.io.OutputStreamWriter)2 Writer (java.io.Writer)2 EmptyParser (org.apache.tika.parser.EmptyParser)2 ContentHandler (org.xml.sax.ContentHandler)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 FileInputStream (java.io.FileInputStream)1 IOException (java.io.IOException)1 OutputStream (java.io.OutputStream)1