Search in sources:

Example 66 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ODFParserTest method testOO3.

/**
 * Parses the OpenOffice 3 sample document with every parser returned by
 * {@code getParsers()} and checks the reported content type and that the
 * extracted body text contains the expected phrases.
 */
@Test
public void testOO3() throws Exception {
    for (Parser candidate : getParsers()) {
        try (InputStream odtStream = ODFParserTest.class.getResourceAsStream("/test-documents/testODFwithOOo3.odt")) {
            Metadata docMetadata = new Metadata();
            ContentHandler bodyHandler = new BodyContentHandler();
            candidate.parse(odtStream, bodyHandler, docMetadata, new ParseContext());

            assertEquals("application/vnd.oasis.opendocument.text", docMetadata.get(Metadata.CONTENT_TYPE));

            // Phrases that must appear in the extracted text, checked in this order.
            String[] expectedPhrases = {
                "Tika is part of the Lucene project.",
                "Solr",
                "one embedded",
                "Rectangle Title",
                "a blue background and dark border"
            };
            String extractedText = bodyHandler.toString();
            for (String phrase : expectedPhrases) {
                assertContains(phrase, extractedText);
            }
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OpenOfficeParser(org.apache.tika.parser.opendocument.OpenOfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 67 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ArParserTest method testEmbedded.

/**
 * Tests that the ParseContext parser is correctly fired for all the
 * embedded entries.
 * <p>
 * Two AR archives are parsed in turn with the same handler and metadata
 * objects; after each parse the tracker is expected to have recorded
 * exactly one entry.
 */
@Test
public void testEmbedded() throws Exception {
    // No explicit parser choice: detection is left to AutoDetectParser.
    Parser autoParser = new AutoDetectParser();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata meta = new Metadata();

    // First archive: expects a single entry named testTXT.txt.
    try (InputStream textArchive = ArParserTest.class.getResourceAsStream("/test-documents/testARofText.ar")) {
        autoParser.parse(textArchive, bodyHandler, meta, trackingContext);
    }
    assertEquals(1, tracker.filenames.size());
    assertEquals(1, tracker.mediatypes.size());
    assertEquals(1, tracker.modifiedAts.size());
    assertEquals("testTXT.txt", tracker.filenames.get(0));
    String textModifiedAt = tracker.modifiedAts.get(0);
    assertTrue("Modified at " + textModifiedAt, textModifiedAt.startsWith("201"));
    for (String mediaType : tracker.mediatypes) {
        assertNull(mediaType);
    }
    for (String createdAt : tracker.createdAts) {
        assertNull(createdAt);
    }

    // Reset the tracker so the second archive's results are checked on their own.
    tracker.reset();

    // Second archive: expects a single entry named testAU.au.
    try (InputStream soundArchive = ArParserTest.class.getResourceAsStream("/test-documents/testARofSND.ar")) {
        autoParser.parse(soundArchive, bodyHandler, meta, trackingContext);
    }
    assertEquals(1, tracker.filenames.size());
    assertEquals(1, tracker.mediatypes.size());
    assertEquals(1, tracker.modifiedAts.size());
    assertEquals("testAU.au", tracker.filenames.get(0));
    String soundModifiedAt = tracker.modifiedAts.get(0);
    assertTrue("Modified at " + soundModifiedAt, soundModifiedAt.startsWith("201"));
    for (String mediaType : tracker.mediatypes) {
        assertNull(mediaType);
    }
    for (String createdAt : tracker.createdAts) {
        assertNull(createdAt);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 68 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class Bzip2ParserTest method testEmbedded.

/**
 * Tests that the ParseContext parser is correctly
 * fired for all the embedded entries.
 * <p>
 * The bzip2 test archive is expected to yield exactly one embedded entry
 * (the tar file), with no name, media type or dates recorded for it.
 */
@Test
public void testEmbedded() throws Exception {
    // Should auto-detect!
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // Load the resource through this test's own class (as testBzip2Parsing does)
    // rather than through the unrelated ZipParserTest.
    try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream("/test-documents/test-documents.tbz2")) {
        parser.parse(stream, handler, metadata, trackingContext);
    }
    // Should find a single entry, for the (compressed) tar file
    assertEquals(1, tracker.filenames.size());
    assertEquals(1, tracker.mediatypes.size());
    assertEquals(1, tracker.modifiedAts.size());
    assertEquals(null, tracker.filenames.get(0));
    assertEquals(null, tracker.mediatypes.get(0));
    assertEquals(null, tracker.createdAts.get(0));
    assertEquals(null, tracker.modifiedAts.get(0));
    // Tar file starts with the directory name
    assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 69 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class Bzip2ParserTest method testBzip2Parsing.

/**
 * Parses the bzip2-compressed test archive with the recursing context and
 * checks the detected content type plus the presence of each embedded
 * file name and a text fragment from that file in the extracted content.
 */
@Test
public void testBzip2Parsing() throws Exception {
    // Should auto-detect!
    Parser autoParser = new AutoDetectParser();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata meta = new Metadata();
    try (InputStream archive = Bzip2ParserTest.class.getResourceAsStream("/test-documents/test-documents.tbz2")) {
        autoParser.parse(archive, bodyHandler, meta, recursingContext);
    }
    assertEquals("application/x-bzip2", meta.get(Metadata.CONTENT_TYPE));

    // Entry name followed by a fragment of that entry's text, checked in this order.
    String[] expectedFragments = {
        "test-documents/testEXCEL.xls",
        "Sample Excel Worksheet",
        "test-documents/testHTML.html",
        "Test Indexation Html",
        "test-documents/testOpenOffice2.odt",
        "This is a sample Open Office document",
        "test-documents/testPDF.pdf",
        "Apache Tika",
        "test-documents/testPPT.ppt",
        "Sample Powerpoint Slide",
        "test-documents/testRTF.rtf",
        "indexation Word",
        "test-documents/testTXT.txt",
        "Test d'indexation de Txt",
        "test-documents/testWORD.doc",
        "This is a sample Microsoft Word Document",
        "test-documents/testXML.xml",
        "Rida Benjelloun"
    };
    String extracted = bodyHandler.toString();
    for (String fragment : expectedFragments) {
        assertContains(fragment, extracted);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 70 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class CompressParserTest method testEmbedded.

/**
 * Tests that the ParseContext parser is correctly
 * fired for all the embedded entries.
 * <p>
 * The .tar.Z test archive is expected to yield exactly one embedded entry
 * (the tar file), with no name, media type or modification date recorded.
 */
@Test
public void testEmbedded() throws Exception {
    // Should auto-detect!
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // try-with-resources skips close() for a null resource, so a missing test
    // file no longer raises a NullPointerException from a finally block that
    // would hide the real failure. The resource is loaded through this test's
    // own class rather than through the unrelated ZipParserTest.
    try (InputStream stream = CompressParserTest.class.getResourceAsStream("/test-documents/test-documents.tar.Z")) {
        parser.parse(stream, handler, metadata, trackingContext);
    }
    // Should find a single entry, for the (compressed) tar file
    assertEquals(1, tracker.filenames.size());
    assertEquals(1, tracker.mediatypes.size());
    assertEquals(1, tracker.modifiedAts.size());
    assertEquals(null, tracker.filenames.get(0));
    assertEquals(null, tracker.mediatypes.get(0));
    assertEquals(null, tracker.modifiedAts.get(0));
    // Tar file starts with the directory name
    assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Aggregations

BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 261, Metadata (org.apache.tika.metadata.Metadata) 252, Test (org.junit.Test) 213, ContentHandler (org.xml.sax.ContentHandler) 206, InputStream (java.io.InputStream) 194, ParseContext (org.apache.tika.parser.ParseContext) 176, TikaTest (org.apache.tika.TikaTest) 117, AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 92, Parser (org.apache.tika.parser.Parser) 84, ByteArrayInputStream (java.io.ByteArrayInputStream) 66, TikaInputStream (org.apache.tika.io.TikaInputStream) 66, TikaException (org.apache.tika.exception.TikaException) 25, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest) 24, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest) 24, IOException (java.io.IOException) 23, EmptyParser (org.apache.tika.parser.EmptyParser) 15, OfficeParser (org.apache.tika.parser.microsoft.OfficeParser) 15, SAXException (org.xml.sax.SAXException) 15, MediaType (org.apache.tika.mime.MediaType) 11, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler) 10