Search in sources :

Example 96 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class PowerPointParserTest method testMasterFooter.

/**
 * Runs {@link OfficeParser} over the {@code testPPT_masterFooter.ppt} test
 * document and checks the plain text collected by a {@link BodyContentHandler}:
 * the master footer text must be present, while the "Click to edit Master"
 * boilerplate and any asterisk (TIKA-1171) must be absent.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testMasterFooter() throws Exception {
    final String resourcePath = "/test-documents/testPPT_masterFooter.ppt";
    ContentHandler textSink = new BodyContentHandler();
    Metadata meta = new Metadata();
    try (InputStream in = PowerPointParserTest.class.getResourceAsStream(resourcePath)) {
        OfficeParser officeParser = new OfficeParser();
        ParseContext parseContext = new ParseContext();
        officeParser.parse(in, textSink, meta, parseContext);
    }
    String extracted = textSink.toString();

    // The footer defined on the master slide has to be extracted.
    assertContains("Master footer is here", extracted);

    // Placeholder boilerplate from the master must not leak into the output.
    int boilerplateIndex = extracted.indexOf("Click to edit Master");
    assertEquals(-1, boilerplateIndex);

    // TIKA-1171: no asterisk may appear anywhere in the extracted text.
    int asteriskIndex = extracted.indexOf("*");
    assertEquals(-1, asteriskIndex);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 97 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class PublisherParserTest method testPublisherParser.

/**
 * Parses the {@code testPUBLISHER.pub} test document with {@link OfficeParser}
 * and verifies both the metadata it reports (content type, title, creator,
 * author) and two fragments of the extracted body text.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testPublisherParser() throws Exception {
    final String resourcePath = "/test-documents/testPUBLISHER.pub";
    try (InputStream in = PublisherParserTest.class.getResourceAsStream(resourcePath)) {
        Metadata meta = new Metadata();
        ContentHandler textSink = new BodyContentHandler();
        OfficeParser officeParser = new OfficeParser();
        officeParser.parse(in, textSink, meta, new ParseContext());

        // Metadata reported by the parser.
        String contentType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals("application/x-mspublisher", contentType);
        String title = meta.get(TikaCoreProperties.TITLE);
        assertEquals(null, title);
        String creator = meta.get(TikaCoreProperties.CREATOR);
        assertEquals("Nick Burch", creator);
        String author = meta.get(Metadata.AUTHOR);
        assertEquals("Nick Burch", author);

        // Body text collected by the handler.
        String extracted = textSink.toString();
        assertContains("0123456789", extracted);
        assertContains("abcdef", extracted);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Example 98 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class ForkParserTest method testParallelParsing.

/**
 * Starts ten threads that each call {@code parse} on one shared
 * {@link ForkParser} with an empty input stream and their own
 * {@link BodyContentHandler}, then joins every thread and asserts that each
 * handler's trimmed text equals {@code "Hello, World!"}.
 *
 * <p>An exception thrown inside a worker is only printed; a failure then
 * shows up through the assertion on that worker's handler output.
 *
 * @throws Exception if joining a thread is interrupted or closing the parser fails
 */
@Test
public void testParallelParsing() throws Exception {
    final ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser());
    try {
        final ParseContext context = new ParseContext();
        Thread[] workers = new Thread[10];
        ContentHandler[] results = new ContentHandler[workers.length];

        int next = 0;
        while (next < workers.length) {
            final ContentHandler sink = new BodyContentHandler();
            results[next] = sink;
            Runnable job = new Runnable() {

                @Override
                public void run() {
                    try {
                        InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
                        parser.parse(emptyInput, sink, new Metadata(), context);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            };
            Thread worker = new Thread(job);
            workers[next] = worker;
            worker.start();
            next++;
        }

        // Wait for each worker in start order and check what it produced.
        for (int idx = 0; idx < workers.length; idx++) {
            workers[idx].join();
            String text = results[idx].toString().trim();
            assertEquals("Hello, World!", text);
        }
    } finally {
        parser.close();
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) PipedInputStream(java.io.PipedInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) IOException(java.io.IOException) TikaException(org.apache.tika.exception.TikaException) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test)

Example 99 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class ForkParserTest method testPoolSizeReached.

/**
 * Checks what happens when more parse calls are issued than
 * {@code parser.getPoolSize()} reports.
 *
 * <p>The test starts {@code getPoolSize()} threads, each parsing from a
 * {@link PipedInputStream} whose writing end is kept open, and then starts
 * one additional thread that parses an empty stream. It asserts that the
 * additional thread's handler is still empty one second later, and that it
 * contains {@code "Hello, World!"} once the pipes have been closed and all
 * threads have been joined.
 *
 * @throws Exception if a semaphore wait, sleep or join is interrupted, or
 *         if closing a pipe or the parser fails
 */
@Test
public void testPoolSizeReached() throws Exception {
    final ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser());
    try {
        // Starts with zero permits; permits are released by the piped
        // streams' read() override and by the extra "blocked" thread below.
        final Semaphore barrier = new Semaphore(0);
        Thread[] threads = new Thread[parser.getPoolSize()];
        PipedOutputStream[] pipes = new PipedOutputStream[threads.length];
        final ParseContext context = new ParseContext();
        for (int i = 0; i < threads.length; i++) {
            // Every call to read() releases one permit before delegating to
            // the normal piped read.
            final PipedInputStream input = new PipedInputStream() {

                @Override
                public synchronized int read() throws IOException {
                    barrier.release();
                    return super.read();
                }
            };
            // Keep the writing end so the test decides when the input ends.
            pipes[i] = new PipedOutputStream(input);
            threads[i] = new Thread() {

                public void run() {
                    try {
                        // Output of these background parses is never inspected.
                        ContentHandler o = new DefaultHandler();
                        parser.parse(input, o, new Metadata(), context);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            };
            threads[i].start();
        }
        // Wait until all the background parsers have been started
        barrier.acquire(parser.getPoolSize());
        final ContentHandler o = new BodyContentHandler();
        Thread blocked = new Thread() {

            public void run() {
                try {
                    // Signal that this thread is running before it calls parse.
                    barrier.release();
                    InputStream stream = new ByteArrayInputStream(new byte[0]);
                    parser.parse(stream, o, new Metadata(), context);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        };
        blocked.start();
        // Wait until the last thread is started, and then some to
        // make sure that it would have had a chance to start processing
        // data had it not been blocked.
        barrier.acquire();
        Thread.sleep(1000);
        // Nothing may have been written by the extra parse call yet.
        assertEquals("", o.toString());
        // Close each pipe and wait for its thread to finish.
        // NOTE(review): presumably closing the pipe ends the input so the
        // background parse can return - confirm against ForkParser.
        for (int i = 0; i < threads.length; i++) {
            pipes[i].close();
            threads[i].join();
        }
        blocked.join();
        // After the extra thread has finished, its handler holds the output.
        assertEquals("Hello, World!", o.toString().trim());
    } finally {
        parser.close();
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) PipedInputStream(java.io.PipedInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) PipedOutputStream(java.io.PipedOutputStream) Semaphore(java.util.concurrent.Semaphore) PipedInputStream(java.io.PipedInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) IOException(java.io.IOException) TikaException(org.apache.tika.exception.TikaException) DefaultHandler(org.xml.sax.helpers.DefaultHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test)

Example 100 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class BasicContentHandlerFactoryTest method testBody.

/**
 * Exercises {@link BasicContentHandlerFactory} with the {@code BODY} handler
 * type in four configurations: string-backed without a write limit,
 * string-backed with a write limit of 5, stream-backed without a write limit,
 * and stream-backed with a write limit of 5. For each one it checks the
 * runtime type of the handler returned by the factory and the text (or bytes)
 * that a {@code MockParser} run leaves behind.
 *
 * @throws Exception if parsing or one of the assertion helpers fails
 */
@Test
public void testBody() throws Exception {
    Parser unlimitedParser = new MockParser(OVER_DEFAULT);
    final BasicContentHandlerFactory.HANDLER_TYPE bodyType = BasicContentHandlerFactory.HANDLER_TYPE.BODY;

    // String-backed handler, no write limit.
    BasicContentHandlerFactory unlimitedFactory = new BasicContentHandlerFactory(bodyType, -1);
    ContentHandler unlimitedHandler = unlimitedFactory.getNewContentHandler();
    assertTrue(unlimitedHandler instanceof BodyContentHandler);
    unlimitedParser.parse(null, unlimitedHandler, null, null);
    String fullText = unlimitedHandler.toString();
    assertNotContains("title", fullText);
    assertContains("aaaaaaaaaa", fullText);
    assertTrue(fullText.length() > 110000);

    //now test write limit
    Parser limitedParser = new MockParser(10);
    BasicContentHandlerFactory limitedFactory = new BasicContentHandlerFactory(bodyType, 5);
    ContentHandler limitedHandler = limitedFactory.getNewContentHandler();
    assertTrue(limitedHandler instanceof BodyContentHandler);
    assertWriteLimitReached(limitedParser, (BodyContentHandler) limitedHandler);
    String truncatedText = limitedHandler.toString();
    assertNotContains("This ", truncatedText);
    assertContains("aaaa", truncatedText);

    //now test outputstream call
    Parser streamParser = new MockParser(OVER_DEFAULT);
    ByteArrayOutputStream unlimitedOut = new ByteArrayOutputStream();
    ContentHandler streamHandler =
            new BasicContentHandlerFactory(bodyType, -1).getNewContentHandler(unlimitedOut, ENCODING);
    assertTrue(streamHandler instanceof BodyContentHandler);
    streamParser.parse(null, streamHandler, null, null);
    // Nothing writes to the stream past this point, so one snapshot serves
    // all of the following checks.
    byte[] writtenBytes = unlimitedOut.toByteArray();
    assertNotContains("title", writtenBytes);
    assertContains("aaaaaaaaaa", writtenBytes);
    assertNotContains("<body", writtenBytes);
    assertNotContains("<html", writtenBytes);
    assertTrue(writtenBytes.length > 110000);

    // Stream-backed handler with a write limit of 5.
    Parser limitedStreamParser = new MockParser(10);
    ByteArrayOutputStream limitedOut = new ByteArrayOutputStream();
    ContentHandler limitedStreamHandler =
            new BasicContentHandlerFactory(bodyType, 5).getNewContentHandler(limitedOut, ENCODING);
    assertTrue(limitedStreamHandler instanceof WriteOutContentHandler);
    assertWriteLimitReached(limitedStreamParser, (WriteOutContentHandler) limitedStreamHandler);
    assertEquals(0, limitedOut.toByteArray().length);
}
Also used : ByteArrayOutputStream(java.io.ByteArrayOutputStream) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test)

Aggregations

ContentHandler (org.xml.sax.ContentHandler): 354, Metadata (org.apache.tika.metadata.Metadata): 229, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 229, InputStream (java.io.InputStream): 210, Test (org.junit.Test): 208, ParseContext (org.apache.tika.parser.ParseContext): 164, Parser (org.apache.tika.parser.Parser): 106, TikaTest (org.apache.tika.TikaTest): 103, AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 102, TikaInputStream (org.apache.tika.io.TikaInputStream): 75, ByteArrayInputStream (java.io.ByteArrayInputStream): 64, SAXException (org.xml.sax.SAXException): 40, IOException (java.io.IOException): 34, TeeContentHandler (org.apache.tika.sax.TeeContentHandler): 28, TikaException (org.apache.tika.exception.TikaException): 24, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 24, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 24, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 21, AttributesImpl (org.xml.sax.helpers.AttributesImpl): 21, InputSource (org.xml.sax.InputSource): 20