Search in sources :

Example 6 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class RecursiveParserWrapperTest method testMaxEmbedded.

/**
 * Exercises {@code setMaxEmbeddedResources} in three scenarios against the same
 * document: the default (no limit set), an explicit limit, and a negative limit.
 * <p>
 * After every parse the metadata list is fetched again from the wrapper, so the
 * assertions do not depend on {@code getMetadata()} handing back the same live
 * list instance across parses. Each stream is closed via try-with-resources,
 * including when a parse or an assertion fails.
 *
 * @throws Exception if parsing the test document fails
 */
@Test
public void testMaxEmbedded() throws Exception {
    final String docPath = "/test-documents/test_recursive_embedded.docx";
    int maxEmbedded = 4;
    //including outer container file
    int totalNoLimit = 12;
    ParseContext context = new ParseContext();
    String limitReached = null;
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));

    //test default
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(docPath)) {
        wrapper.parse(stream, new DefaultHandler(), new Metadata(), context);
    }
    List<Metadata> list = wrapper.getMetadata();
    assertEquals(totalNoLimit, list.size());
    limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertNull(limitReached);
    wrapper.reset();

    //test setting value
    wrapper.setMaxEmbeddedResources(maxEmbedded);
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(docPath)) {
        wrapper.parse(stream, new DefaultHandler(), new Metadata(), context);
    }
    list = wrapper.getMetadata();
    //add 1 for outer container file
    assertEquals(maxEmbedded + 1, list.size());
    limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertEquals("true", limitReached);
    wrapper.reset();

    //test setting value < 0
    wrapper.setMaxEmbeddedResources(-2);
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(docPath)) {
        wrapper.parse(stream, new DefaultHandler(), new Metadata(), context);
    }
    // re-read the results of THIS parse rather than reusing the previous reference
    list = wrapper.getMetadata();
    assertEquals(totalNoLimit, list.size());
    limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
    assertNull(limitReached);
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Example 7 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class RecursiveParserWrapperTest method testPrimaryExcWEmbedded.

/**
 * Checks that when embedded content is handled and the parser then hits an
 * exception in the container document, the first element of the returned list
 * is the container document and the second is the embedded content.
 *
 * @throws Exception if anything other than the expected TikaException is thrown
 */
@Test
public void testPrimaryExcWEmbedded() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");
    ParseContext context = new ParseContext();
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true);
    String path = "/test-documents/mock/embedded_then_npe.xml";
    InputStream stream = null;
    boolean npe = false;
    try {
        stream = RecursiveParserWrapperTest.class.getResourceAsStream(path);
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    } catch (TikaException e) {
        // A TikaException without a cause must fail the "npe" assertion below,
        // not blow up here with an unrelated NullPointerException.
        Throwable cause = e.getCause();
        if (cause != null && NullPointerException.class.equals(cause.getClass())) {
            npe = true;
        }
    } finally {
        IOUtils.closeQuietly(stream);
    }
    assertTrue("npe", npe);
    List<Metadata> metadataList = wrapper.getMetadata();
    assertEquals(2, metadataList.size());
    Metadata outerMetadata = metadataList.get(0);
    Metadata embeddedMetadata = metadataList.get(1);
    // container document comes first
    assertContains("main_content", outerMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embedded_then_npe.xml", outerMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("Nikolai Lobachevsky", outerMetadata.get("author"));
    // embedded document comes second
    assertContains("some_embedded_content", embeddedMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embed1.xml", embeddedMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("embeddedAuthor", embeddedMetadata.get("author"));
}
Also used : TikaException(org.apache.tika.exception.TikaException) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Example 8 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class TesseractOCRParserTest method runOCR.

/**
 * Parses {@code resource} through a RecursiveParserWrapper configured with the
 * given handler type and OCR output type, checks the resulting metadata list,
 * and returns the concatenated content of every metadata entry.
 *
 * @param resource       classpath resource to parse
 * @param nonOCRContains strings that must each appear in the concatenated content
 * @param numMetadatas   expected number of metadata entries
 * @param handlerType    handler type passed to the BasicContentHandlerFactory
 * @param outputType     output type set on the TesseractOCRConfig
 * @return the concatenated TIKA_CONTENT values of all metadata entries
 * @throws Exception if parsing fails
 */
private String runOCR(String resource, String[] nonOCRContains, int numMetadatas, BasicContentHandlerFactory.HANDLER_TYPE handlerType, TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception {
    TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
    ocrConfig.setOutputType(outputType);
    // keep the concrete type so getMetadata() can be called without a cast
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(new AutoDetectParser(), new BasicContentHandlerFactory(handlerType, -1));
    PDFParserConfig pdfParserConfig = new PDFParserConfig();
    pdfParserConfig.setExtractInlineImages(true);

    ParseContext ctx = new ParseContext();
    ctx.set(TesseractOCRConfig.class, ocrConfig);
    ctx.set(Parser.class, recursiveParser);
    ctx.set(PDFParserConfig.class, pdfParserConfig);

    try (InputStream in = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
        recursiveParser.parse(in, new DefaultHandler(), new Metadata(), ctx);
    }

    List<Metadata> parsed = recursiveParser.getMetadata();
    assertEquals(numMetadatas, parsed.size());

    StringBuilder buffer = new StringBuilder();
    for (Metadata entry : parsed) {
        buffer.append(entry.get(RecursiveParserWrapper.TIKA_CONTENT));
    }
    String allContent = buffer.toString();

    for (String expected : nonOCRContains) {
        assertContains(expected, allContent);
    }

    Metadata container = parsed.get(0);
    Metadata firstEmbedded = parsed.get(1);
    assertTrue(container.names().length > 10);
    assertTrue(firstEmbedded.names().length > 10);
    //test at least one value
    assertEquals("deflate", firstEmbedded.get("Compression CompressionTypeName"));
    return allContent;
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) ExternalParser(org.apache.tika.parser.external.ExternalParser) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ImageParser(org.apache.tika.parser.image.ImageParser) DefaultParser(org.apache.tika.parser.DefaultParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) PDFParserConfig(org.apache.tika.parser.pdf.PDFParserConfig)

Example 9 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class RecursiveParserWrapperFSConsumerTest method testEmbeddedWithNPE.

/**
 * Runs a RecursiveParserWrapperFSConsumer over a single queued resource
 * ("embedded_with_npe.xml"), reads the JSON it wrote to the mock output stream,
 * and checks the four resulting metadata entries.
 *
 * @throws Exception if the consumer or the JSON deserialization fails
 */
@Test
public void testEmbeddedWithNPE() throws Exception {
    final String path = "/test-documents/embedded_with_npe.xml";
    final Metadata metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "embedded_with_npe.xml");

    // one real resource followed by the poison marker
    ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(2);
    FileResource testResource = new FileResource() {

        @Override
        public String getResourceId() {
            return "testFile";
        }

        @Override
        public Metadata getMetadata() {
            return metadata;
        }

        @Override
        public InputStream openInputStream() throws IOException {
            return this.getClass().getResourceAsStream(path);
        }
    };
    queue.add(testResource);
    queue.add(new PoisonFileResource());

    MockOSFactory mockOSFactory = new MockOSFactory();
    RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer(queue, new AutoDetectParserFactory(), new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), mockOSFactory, new TikaConfig());
    consumer.call();

    mockOSFactory.getStreams().get(0).flush();
    byte[] json = mockOSFactory.getStreams().get(0).toByteArray();
    List<Metadata> results = JsonMetadataList.fromJson(new InputStreamReader(new ByteArrayInputStream(json), UTF_8));

    assertEquals(4, results.size());
    assertContains("another null pointer", results.get(2).get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
    assertEquals("Nikolai Lobachevsky", results.get(0).get("author"));
    // entries 1..3 are the embedded documents
    for (int idx = 1; idx <= 3; idx++) {
        Metadata embedded = results.get(idx);
        assertEquals("embeddedAuthor" + idx, embedded.get("author"));
        assertContains("some_embedded_content" + idx, embedded.get(RecursiveParserWrapper.TIKA_CONTENT));
    }
}
Also used : RecursiveParserWrapperFSConsumer(org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaConfig(org.apache.tika.config.TikaConfig) InputStreamReader(java.io.InputStreamReader) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) IOException(java.io.IOException) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) ByteArrayInputStream(java.io.ByteArrayInputStream) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 10 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class ParsingExample method recursiveParserWrapperExample.

/**
 * Parses a document with a {@code RecursiveParserWrapper} to obtain a list of
 * metadata objects: one for the container document and one for each embedded
 * document. This gives easy access to both the extracted content and the
 * metadata of every embedded document.
 * <p>
 * Many document formats can contain embedded documents, including traditional
 * container formats -- zip, tar and others -- but also common office document
 * formats including: MSWord, MSExcel, MSPowerPoint, RTF, PDF, MSG and several
 * others.
 * <p>
 * The "content" format is determined by the ContentHandlerFactory, and
 * the content is stored in {@link org.apache.tika.parser.RecursiveParserWrapper#TIKA_CONTENT}
 * <p>
 * The drawback to the RecursiveParserWrapper is that it caches metadata and
 * contents in memory. It should not be used on files whose contents are too
 * big to be handled in memory.
 *
 * @return a list of metadata objects, one each for the container file and each embedded file
 * @throws IOException   propagated from parsing or from closing the input stream
 * @throws SAXException  propagated from the parse call
 * @throws TikaException propagated from the parse call
 */
public List<Metadata> recursiveParserWrapperExample() throws IOException, SAXException, TikaException {
    Parser autoDetect = new AutoDetectParser();
    // HTML handler with a write limit of -1
    RecursiveParserWrapper recursive = new RecursiveParserWrapper(autoDetect,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1));

    Metadata containerMetadata = new Metadata();
    containerMetadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
    ParseContext parseContext = new ParseContext();

    try (InputStream in = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        recursive.parse(in, new DefaultHandler(), containerMetadata, parseContext);
    }
    return recursive.getMetadata();
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) ContentHandlerFactory(org.apache.tika.sax.ContentHandlerFactory) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Aggregations

BasicContentHandlerFactory (org.apache.tika.sax.BasicContentHandlerFactory): 22, Metadata (org.apache.tika.metadata.Metadata): 21, Test (org.junit.Test): 16, InputStream (java.io.InputStream): 10, TikaInputStream (org.apache.tika.io.TikaInputStream): 9, RecursiveParserWrapper (org.apache.tika.parser.RecursiveParserWrapper): 9, ParseContext (org.apache.tika.parser.ParseContext): 8, AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 7, Parser (org.apache.tika.parser.Parser): 7, DefaultHandler (org.xml.sax.helpers.DefaultHandler): 7, TikaTest (org.apache.tika.TikaTest): 6, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 4, ByteArrayInputStream (java.io.ByteArrayInputStream): 3, IOException (java.io.IOException): 3, InputStreamReader (java.io.InputStreamReader): 2, ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue): 2, RecursiveParserWrapperFSConsumer (org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer): 2, TikaConfig (org.apache.tika.config.TikaConfig): 2, EmptyParser (org.apache.tika.parser.EmptyParser): 2, ContentHandler (org.xml.sax.ContentHandler): 2