Search in sources :

Example 76 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class PowerPointParserTest method testPowerPointParser.

/**
 * Parses the {@code testPPT.ppt} test document with {@link OfficeParser} and
 * verifies the reported content type, the title and creator/author metadata,
 * and two phrases expected in the extracted body text.
 */
@Test
public void testPowerPointParser() throws Exception {
    final String resource = "/test-documents/testPPT.ppt";
    try (InputStream pptStream = PowerPointParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        OfficeParser officeParser = new OfficeParser();
        officeParser.parse(pptStream, body, meta, new ParseContext());

        // Metadata assertions.
        assertEquals("application/vnd.ms-powerpoint", meta.get(Metadata.CONTENT_TYPE));
        assertEquals("Sample Powerpoint Slide", meta.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", meta.get(TikaCoreProperties.CREATOR));
        assertEquals("Keith Bennett", meta.get(Metadata.AUTHOR));

        // Body text assertions.
        String text = body.toString();
        assertContains("Sample Powerpoint Slide", text);
        assertContains("Powerpoint X for Mac", text);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 77 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class PowerPointParserTest method testMasterText.

/**
 * TIKA-712: text placed on the master slide of PPT and PPTX files
 * should be extracted too.
 */
@Test
public void testMasterText() throws Exception {
    final int notFound = -1;
    ContentHandler body = new BodyContentHandler();
    Metadata meta = new Metadata();
    final String resource = "/test-documents/testPPT_masterText.ppt";
    try (InputStream pptStream = PowerPointParserTest.class.getResourceAsStream(resource)) {
        OfficeParser officeParser = new OfficeParser();
        officeParser.parse(pptStream, body, meta, new ParseContext());
    }

    String text = body.toString();
    assertContains("Text that I added to the master slide", text);
    // The placeholder boilerplate must not show up in the extracted text.
    assertEquals(notFound, text.indexOf("Click to edit Master"));
    // TIKA-1171: no asterisk is expected in the output.
    assertEquals(notFound, text.indexOf("*"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 78 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class PowerPointParserTest method testMasterFooter.

/**
 * Parses {@code testPPT_masterFooter.ppt} and checks that the master footer
 * text is extracted while the "Click to edit Master" boilerplate and any
 * asterisk are absent from the output.
 */
@Test
public void testMasterFooter() throws Exception {
    final int notFound = -1;
    ContentHandler body = new BodyContentHandler();
    Metadata meta = new Metadata();
    final String resource = "/test-documents/testPPT_masterFooter.ppt";
    try (InputStream pptStream = PowerPointParserTest.class.getResourceAsStream(resource)) {
        OfficeParser officeParser = new OfficeParser();
        officeParser.parse(pptStream, body, meta, new ParseContext());
    }

    String text = body.toString();
    assertContains("Master footer is here", text);
    // The placeholder boilerplate must not show up in the extracted text.
    assertEquals(notFound, text.indexOf("Click to edit Master"));
    // TIKA-1171: no asterisk is expected in the output.
    assertEquals(notFound, text.indexOf("*"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 79 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class PublisherParserTest method testPublisherParser.

/**
 * Parses the {@code testPUBLISHER.pub} test document with {@link OfficeParser}
 * and verifies the reported content type, that no title is set, the
 * creator/author metadata, and two strings expected in the extracted text.
 */
@Test
public void testPublisherParser() throws Exception {
    final String resource = "/test-documents/testPUBLISHER.pub";
    try (InputStream pubStream = PublisherParserTest.class.getResourceAsStream(resource)) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        OfficeParser officeParser = new OfficeParser();
        officeParser.parse(pubStream, body, meta, new ParseContext());

        // Metadata assertions; the title is expected to be absent.
        assertEquals("application/x-mspublisher", meta.get(Metadata.CONTENT_TYPE));
        String title = meta.get(TikaCoreProperties.TITLE);
        assertEquals(null, title);
        assertEquals("Nick Burch", meta.get(TikaCoreProperties.CREATOR));
        assertEquals("Nick Burch", meta.get(Metadata.AUTHOR));

        // Body text assertions.
        String text = body.toString();
        assertContains("0123456789", text);
        assertContains("abcdef", text);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Example 80 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class ForkParserTest method testParallelParsing.

/**
 * Runs ten concurrent parses of an empty stream through one shared
 * {@link ForkParser} and checks that each handler ends up with
 * "Hello, World!".
 *
 * Exceptions thrown inside the worker threads are collected and reported as
 * the cause of the test failure, rather than only being printed to stderr
 * and later showing up as an unrelated output mismatch.
 */
@Test
public void testParallelParsing() throws Exception {
    final ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser());
    try {
        final ParseContext context = new ParseContext();
        // Written from several worker threads at once, hence the synchronized wrapper.
        final java.util.List<Throwable> failures =
                java.util.Collections.synchronizedList(new java.util.ArrayList<Throwable>());
        Thread[] threads = new Thread[10];
        ContentHandler[] output = new ContentHandler[threads.length];
        for (int i = 0; i < threads.length; i++) {
            final ContentHandler o = new BodyContentHandler();
            output[i] = o;
            threads[i] = new Thread() {

                @Override
                public void run() {
                    try {
                        InputStream stream = new ByteArrayInputStream(new byte[0]);
                        parser.parse(stream, o, new Metadata(), context);
                    } catch (Exception e) {
                        // Keep the failure so the test thread can report it.
                        failures.add(e);
                    }
                }
            };
            threads[i].start();
        }
        // Wait for every worker before asserting so no thread is still
        // parsing when the parser is closed in the finally block.
        for (Thread thread : threads) {
            thread.join();
        }
        if (!failures.isEmpty()) {
            throw new AssertionError(
                    failures.size() + " of " + threads.length + " parsing threads failed",
                    failures.get(0));
        }
        for (int i = 0; i < threads.length; i++) {
            assertEquals("Hello, World!", output[i].toString().trim());
        }
    } finally {
        parser.close();
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) PipedInputStream(java.io.PipedInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) IOException(java.io.IOException) TikaException(org.apache.tika.exception.TikaException) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test)

Aggregations

ParseContext (org.apache.tika.parser.ParseContext)338 Metadata (org.apache.tika.metadata.Metadata)283 Test (org.junit.Test)260 InputStream (java.io.InputStream)195 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)195 TikaTest (org.apache.tika.TikaTest)186 ContentHandler (org.xml.sax.ContentHandler)164 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)118 Parser (org.apache.tika.parser.Parser)109 ByteArrayInputStream (java.io.ByteArrayInputStream)92 TikaInputStream (org.apache.tika.io.TikaInputStream)77 DefaultHandler (org.xml.sax.helpers.DefaultHandler)52 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)31 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)31 TikaException (org.apache.tika.exception.TikaException)30 StringWriter (java.io.StringWriter)26 IOException (java.io.IOException)25 SAXException (org.xml.sax.SAXException)25 CompositeParser (org.apache.tika.parser.CompositeParser)22 TeeContentHandler (org.apache.tika.sax.TeeContentHandler)20