Search in sources:

Example 36 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

The class RecursiveParserWrapperTest, method testCharLimit.

/**
 * Parses the recursive-embedded test document with a TEXT content handler factory
 * constructed with the value 60 (presumably a per-document character write limit --
 * confirm against BasicContentHandlerFactory), then checks that five metadata entries
 * are produced and that exactly one of them carries
 * {@code RecursiveParserWrapper.WRITE_LIMIT_REACHED} set to "true".
 *
 * @throws Exception if the test document cannot be read or parsed
 */
@Test
public void testCharLimit() throws Exception {
    ParseContext context = new ParseContext();
    Metadata metadata = new Metadata();
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
    // try-with-resources: the original never closed this stream, leaking it
    // whether the parse succeeded or threw.
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    }
    List<Metadata> list = wrapper.getMetadata();
    assertEquals(5, list.size());
    int wlr = 0;
    for (Metadata m : list) {
        // constant-first equals is null-safe, so no separate null check is needed
        if ("true".equals(m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED))) {
            wlr++;
        }
    }
    assertEquals(1, wlr);
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Example 37 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

The class RecursiveParserWrapperTest, method getMetadata.

/**
 * Runs a {@link RecursiveParserWrapper} over a test document and returns the
 * metadata list the wrapper collected.
 *
 * The document is looked up under "/test-documents/" using the value stored in
 * {@code metadata} under {@code Metadata.RESOURCE_NAME_KEY}; when that value is
 * absent, "test_recursive_embedded.docx" is used.
 *
 * @param metadata metadata passed to the parse call; also the source of the resource name
 * @param contentHandlerFactory factory handed to the wrapper's constructor
 * @param catchEmbeddedExceptions flag handed to the wrapper's constructor
 * @param digester if non-null, the underlying parser is wrapped in a {@link DigestingParser}
 * @return the list returned by {@code wrapper.getMetadata()} after parsing
 * @throws Exception if the resource cannot be resolved or parsing fails
 */
private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory, boolean catchEmbeddedExceptions, DigestingParser.Digester digester) throws Exception {
    ParseContext parseContext = new ParseContext();
    Parser autoDetect = new AutoDetectParser();
    Parser effectiveParser = (digester == null) ? autoDetect : new DigestingParser(autoDetect, digester);
    RecursiveParserWrapper recursiveWrapper = new RecursiveParserWrapper(effectiveParser, contentHandlerFactory, catchEmbeddedExceptions);

    String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
    String resourcePath = (resourceName != null)
            ? "/test-documents/" + resourceName
            : "/test-documents/test_recursive_embedded.docx";

    InputStream input = null;
    try {
        input = TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(resourcePath).toURI());
        recursiveWrapper.parse(input, new DefaultHandler(), metadata, parseContext);
    } finally {
        // best-effort close: any failure while closing is deliberately ignored
        IOUtils.closeQuietly(input);
    }
    return recursiveWrapper.getMetadata();
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 38 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

The class RecursiveParserWrapperTest, method testMaxEmbedded.

/**
 * Exercises {@code RecursiveParserWrapper.setMaxEmbeddedResources} in three phases
 * against the same test document:
 * <ol>
 *   <li>no limit set: all 12 documents (container included) are reported and
 *       {@code EMBEDDED_RESOURCE_LIMIT_REACHED} is absent on the container;</li>
 *   <li>limit of 4: 4 + 1 (container) documents are reported and the container
 *       carries {@code EMBEDDED_RESOURCE_LIMIT_REACHED} = "true";</li>
 *   <li>limit of -2: the results match the unlimited case again.</li>
 * </ol>
 *
 * @throws Exception if the test document cannot be read or parsed
 */
@Test
public void testMaxEmbedded() throws Exception {
    final String docPath = "/test-documents/test_recursive_embedded.docx";
    int maxEmbedded = 4;
    //including outer container file
    int totalNoLimit = 12;
    ParseContext context = new ParseContext();
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));

    //test default
    // try-with-resources closes each stream even when parse() throws
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(docPath)) {
        wrapper.parse(stream, new DefaultHandler(), new Metadata(), context);
    }
    List<Metadata> list = wrapper.getMetadata();
    assertEquals(totalNoLimit, list.size());
    assertNull(list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED));
    wrapper.reset();

    //test setting value
    wrapper.setMaxEmbeddedResources(maxEmbedded);
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(docPath)) {
        wrapper.parse(stream, new DefaultHandler(), new Metadata(), context);
    }
    list = wrapper.getMetadata();
    //add 1 for outer container file
    assertEquals(maxEmbedded + 1, list.size());
    assertEquals("true", list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED));
    wrapper.reset();

    //test setting value < 0
    wrapper.setMaxEmbeddedResources(-2);
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(docPath)) {
        wrapper.parse(stream, new DefaultHandler(), new Metadata(), context);
    }
    // Re-fetch the results: the original asserted against the list obtained in the
    // previous phase, which only checks this parse if the wrapper happens to hand
    // back the same mutable list instance every time.
    list = wrapper.getMetadata();
    assertEquals(totalNoLimit, list.size());
    assertNull(list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED));
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Example 39 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

The class RecursiveParserWrapperTest, method testPrimaryExcWEmbedded.

/**
 * Checks the wrapper's results when embedded content is handled first and the
 * parser then hits an exception in the container document: the parse must fail
 * with a {@link TikaException} whose cause is exactly a
 * {@link NullPointerException}, the first entry of the returned list must be the
 * container document, and the second must be the embedded content.
 *
 * @throws Exception on any failure other than the expected TikaException
 */
@Test
public void testPrimaryExcWEmbedded() throws Exception {
    final String resourcePath = "/test-documents/mock/embedded_then_npe.xml";

    Metadata containerInput = new Metadata();
    containerInput.set(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");
    ParseContext parseContext = new ParseContext();
    Parser autoDetect = new AutoDetectParser();
    RecursiveParserWrapper recursiveWrapper = new RecursiveParserWrapper(autoDetect, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true);

    boolean sawNpeCause = false;
    InputStream input = null;
    try {
        input = RecursiveParserWrapperTest.class.getResourceAsStream(resourcePath);
        recursiveWrapper.parse(input, new DefaultHandler(), containerInput, parseContext);
    } catch (TikaException e) {
        // exact class match on the cause, not instanceof
        sawNpeCause = e.getCause().getClass().equals(NullPointerException.class);
    } finally {
        IOUtils.closeQuietly(input);
    }
    assertTrue("npe", sawNpeCause);

    List<Metadata> results = recursiveWrapper.getMetadata();
    assertEquals(2, results.size());

    // entry 0: the container document
    Metadata container = results.get(0);
    assertContains("main_content", container.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embedded_then_npe.xml", container.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("Nikolai Lobachevsky", container.get("author"));

    // entry 1: the embedded document
    Metadata embedded = results.get(1);
    assertContains("some_embedded_content", embedded.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embed1.xml", embedded.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("embeddedAuthor", embedded.get("author"));
}
Also used : TikaException(org.apache.tika.exception.TikaException) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Example 40 with DefaultHandler

use of org.xml.sax.helpers.DefaultHandler in project tika by apache.

The class UnpackerResource, method process.

/**
 * Parses the incoming stream and returns a map of file name to bytes, filled by
 * the {@code MyEmbeddedDocumentExtractor} registered on the parse context.
 *
 * When {@code saveAll} is true, the map additionally receives the text written
 * through the content handler (under {@code TEXT_FILENAME}) and the output of
 * {@code metadataToCsv} (under {@code META_FILENAME}). When {@code saveAll} is
 * false and the extractor's counter is still zero after parsing, a
 * {@link WebApplicationException} with status NO_CONTENT is thrown.
 *
 * @param is stream to parse
 * @param httpHeaders request headers, forwarded to {@code TikaResource.fillMetadata}
 * @param info request URI info, used for logging and as the parse path
 * @param saveAll whether to also capture the text and the metadata
 * @return map of file name to content bytes
 * @throws Exception if parsing or metadata serialization fails
 */
private Map<String, byte[]> process(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info, boolean saveAll) throws Exception {
    Metadata metadata = new Metadata();
    ParseContext parseContext = new ParseContext();

    Parser parser = TikaResource.createParser();
    if (parser instanceof DigestingParser) {
        //no need to digest for unwrapping
        DigestingParser digesting = (DigestingParser) parser;
        parser = digesting.getWrappedParser();
    }

    TikaResource.fillMetadata(parser, metadata, parseContext, httpHeaders.getRequestHeaders());
    TikaResource.logRequest(LOG, info, metadata);

    ByteArrayOutputStream textBytes = new ByteArrayOutputStream();
    // text is only captured when everything is being saved; otherwise SAX events are discarded
    ContentHandler handler = saveAll
            ? new BodyContentHandler(new RichTextContentHandler(new OutputStreamWriter(textBytes, UTF_8)))
            : new DefaultHandler();

    Map<String, byte[]> extracted = new HashMap<>();
    MutableInt embeddedCount = new MutableInt();
    parseContext.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(embeddedCount, extracted));
    TikaResource.parse(parser, LOG, info.getPath(), is, handler, metadata, parseContext);

    if (!saveAll) {
        if (embeddedCount.intValue() == 0) {
            throw new WebApplicationException(Response.Status.NO_CONTENT);
        }
        return extracted;
    }

    extracted.put(TEXT_FILENAME, textBytes.toByteArray());
    ByteArrayOutputStream metadataBytes = new ByteArrayOutputStream();
    metadataToCsv(metadata, metadataBytes);
    extracted.put(META_FILENAME, metadataBytes.toByteArray());
    return extracted;
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) WebApplicationException(javax.ws.rs.WebApplicationException) HashMap(java.util.HashMap) Metadata(org.apache.tika.metadata.Metadata) DigestingParser(org.apache.tika.parser.DigestingParser) ByteArrayOutputStream(java.io.ByteArrayOutputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) DigestingParser(org.apache.tika.parser.DigestingParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) MutableInt(org.apache.commons.lang.mutable.MutableInt) ParseContext(org.apache.tika.parser.ParseContext) OutputStreamWriter(java.io.OutputStreamWriter)

Aggregations

DefaultHandler (org.xml.sax.helpers.DefaultHandler)148 InputStream (java.io.InputStream)65 Metadata (org.apache.tika.metadata.Metadata)59 ParseContext (org.apache.tika.parser.ParseContext)52 Test (org.junit.Test)44 Attributes (org.xml.sax.Attributes)41 SAXParser (javax.xml.parsers.SAXParser)40 SAXException (org.xml.sax.SAXException)39 ByteArrayInputStream (java.io.ByteArrayInputStream)32 SAXParserFactory (javax.xml.parsers.SAXParserFactory)29 IOException (java.io.IOException)26 InputSource (org.xml.sax.InputSource)23 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)22 Parser (org.apache.tika.parser.Parser)22 TikaInputStream (org.apache.tika.io.TikaInputStream)20 ContentHandler (org.xml.sax.ContentHandler)20 File (java.io.File)19 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)17 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)16 FileInputStream (java.io.FileInputStream)15