Search in sources:

Example 41 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

The class Seven7ParserTest, method test7ZParsing.

/**
 * Parses the bundled {@code test-documents.7z} archive through
 * {@link AutoDetectParser} and verifies the detected content type as well as
 * a list of strings that must appear in the extracted body text.
 *
 * @throws Exception if the archive cannot be read or parsing fails
 */
@Test
public void test7ZParsing() throws Exception {
    // No concrete parser is selected here; detection is left to AutoDetectParser.
    Parser autoDetect = new AutoDetectParser();
    ContentHandler bodyText = new BodyContentHandler();
    Metadata meta = new Metadata();

    // Precondition: the 7zip media type must be reported as supported for this context.
    boolean sevenZipSupported = autoDetect.getSupportedTypes(recursingContext).contains(TYPE_7ZIP);
    assertTrue("No 7zip parser found", sevenZipSupported);

    try (InputStream archive = Seven7ParserTest.class.getResourceAsStream("/test-documents/test-documents.7z")) {
        autoDetect.parse(archive, bodyText, meta, recursingContext);
    }

    assertEquals(TYPE_7ZIP.toString(), meta.get(Metadata.CONTENT_TYPE));

    // Entry paths interleaved with text fragments, checked in this exact order
    // so the first missing one is the one reported.
    String[] expectedFragments = {
        "test-documents/testEXCEL.xls",
        "Sample Excel Worksheet",
        "test-documents/testHTML.html",
        "Test Indexation Html",
        "test-documents/testOpenOffice2.odt",
        "This is a sample Open Office document",
        "test-documents/testPDF.pdf",
        "Apache Tika",
        "test-documents/testPPT.ppt",
        "Sample Powerpoint Slide",
        "test-documents/testRTF.rtf",
        "indexation Word",
        "test-documents/testTXT.txt",
        "Test d'indexation de Txt",
        "test-documents/testWORD.doc",
        "This is a sample Microsoft Word Document",
        "test-documents/testXML.xml",
        "Rida Benjelloun"
    };
    String extracted = bodyText.toString();
    for (String fragment : expectedFragments) {
        assertContains(fragment, extracted);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 42 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

The class Seven7ParserTest, method testEmbedded.

/**
 * Checks that the parser registered in the ParseContext is invoked for
 * every embedded entry of the 7z archive, by inspecting what the
 * {@code tracker} recorded: file names, media types and modification times.
 *
 * @throws Exception if the archive cannot be read or parsing fails
 */
@Test
public void testEmbedded() throws Exception {
    // No concrete parser is selected here; detection is left to AutoDetectParser.
    Parser autoDetect = new AutoDetectParser();
    ContentHandler bodyText = new BodyContentHandler();
    Metadata meta = new Metadata();
    try (InputStream archive = Seven7ParserTest.class.getResourceAsStream("/test-documents/test-documents.7z")) {
        autoDetect.parse(archive, bodyText, meta, trackingContext);
    }

    // The archive's documents in the order the tracker is expected to see them;
    // the directory entry itself must not be reported.
    String[] expectedNames = {
        "test-documents/testEXCEL.xls",
        "test-documents/testHTML.html",
        "test-documents/testOpenOffice2.odt",
        "test-documents/testPDF.pdf",
        "test-documents/testPPT.ppt",
        "test-documents/testRTF.rtf",
        "test-documents/testTXT.txt",
        "test-documents/testWORD.doc",
        "test-documents/testXML.xml"
    };

    // All 9 documents must have been recorded in each of the three lists.
    assertEquals(9, tracker.filenames.size());
    assertEquals(9, tracker.mediatypes.size());
    assertEquals(9, tracker.modifiedAts.size());

    // Names are expected, content types are not: 7z doesn't store content types.
    for (int i = 0; i < expectedNames.length; i++) {
        assertEquals(expectedNames[i], tracker.filenames.get(i));
    }
    for (String mediaType : tracker.mediatypes) {
        assertNull(mediaType);
    }
    for (String modifiedAt : tracker.modifiedAts) {
        assertNotNull(modifiedAt);
        // Every recorded timestamp string must start with "20".
        assertTrue("Modified at " + modifiedAt, modifiedAt.startsWith("20"));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 43 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

The class ZlibParserTest, method testZlibParsing.

/**
 * Parses {@code testTXT.zlib} through {@link AutoDetectParser} and verifies
 * the reported content type plus two strings in the extracted body text.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testZlibParsing() throws Exception {
    // No concrete parser is selected here; detection is left to AutoDetectParser.
    Parser autoDetect = new AutoDetectParser();
    ContentHandler bodyText = new BodyContentHandler();
    Metadata meta = new Metadata();

    // NOTE(review): the resource is looked up via ZipParserTest rather than this
    // test's own class; the path is absolute, so this presumably resolves the
    // same file - confirm and consider using the enclosing class instead.
    try (InputStream compressed = ZipParserTest.class.getResourceAsStream("/test-documents/testTXT.zlib")) {
        autoDetect.parse(compressed, bodyText, meta, recursingContext);
    }

    String detectedType = meta.get(Metadata.CONTENT_TYPE);
    assertEquals("application/zlib", detectedType);

    String extracted = bodyText.toString();
    assertContains("Test d'indexation de Txt", extracted);
    assertContains("http://www.apache.org", extracted);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 44 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

The class PRTParserTest, method testPRTParserComplex.

/**
 * Parses the more complex CADKEY sample ({@code testCADKEY2.prt}) with
 * {@code PRTParser} and verifies the content type, the date and description
 * metadata, and a series of strings in the extracted body text.
 *
 * <p>Non-ASCII expectations are spelled with Unicode escapes so the test does
 * not depend on the encoding used to read or compile this source file.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testPRTParserComplex() throws Exception {
    try (InputStream input = getResourceAsStream("/test-documents/testCADKEY2.prt")) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        new PRTParser().parse(input, handler, metadata);
        assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
        // File has both a date and a description
        assertEquals("1997-04-01T08:59:00", metadata.get(Metadata.DATE));
        assertEquals("1997-04-01T08:59:00", metadata.get(Metadata.CREATION_DATE));
        assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n", metadata.get(TikaCoreProperties.DESCRIPTION));
        String contents = handler.toString();
        assertContains("ITEM", contents);
        assertContains("REQ.", contents);
        assertContains("DESCRIPTION", contents);
        assertContains("MAT'L", contents);
        assertContains("TOLERANCES UNLESS", contents);
        assertContains("FRACTIONS", contents);
        assertContains("ANGLES", contents);
        assertContains("Acme Corporation", contents);
        assertContains("DATE", contents);
        assertContains("CHANGE", contents);
        assertContains("DRAWN BY", contents);
        assertContains("SCALE", contents);
        assertContains("TIKA TEST DRAWING", contents);
        assertContains("TIKA LETTERS", contents);
        assertContains("5.82", contents);
        // Degrees: "112" followed by the degree sign (U+00B0)
        assertContains("112\u00b0", contents);
        assertContains("TIKA TEST LETTER", contents);
        assertContains("17.11", contents);
        // Diameter: the diameter sign (U+00D8), then U+FFFD, then "2.000".
        // NOTE(review): the U+FFFD replacement character is carried over from the
        // original literal; presumably the extracted text really contains it -
        // confirm against the parser output.
        assertContains("\u00d8\ufffd2.000", contents);
        assertContains("Diameter", contents);
        assertContains("The Apache Tika toolkit", contents);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 45 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

The class PRTParserTest, method testPRTParserBasics.

/**
 * Parses the simple CADKEY sample ({@code testCADKEY.prt}) with
 * {@code PRTParser} and verifies the content type, the creation date, the
 * absence of a description, and a list of strings in the extracted text.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testPRTParserBasics() throws Exception {
    try (InputStream prtStream = getResourceAsStream("/test-documents/testCADKEY.prt")) {
        Metadata meta = new Metadata();
        ContentHandler bodyText = new BodyContentHandler();
        new PRTParser().parse(prtStream, bodyText, meta);

        assertEquals("application/x-prt", meta.get(Metadata.CONTENT_TYPE));

        // The file carries a date, exposed under both property keys...
        assertEquals("2011-06-20T16:54:00", meta.get(TikaCoreProperties.CREATED));
        assertEquals("2011-06-20T16:54:00", meta.get(Metadata.CREATION_DATE));
        // ...but it has no description.
        assertEquals(null, meta.get(TikaCoreProperties.DESCRIPTION));

        // Checked in this order so the first missing fragment is the one reported.
        String[] expectedFragments = {
            "Front View",
            "Back View",
            "Bottom View",
            "Right View",
            "Left View",
            // "Isometric View" is deliberately absent: it can't be detected yet.
            "Axonometric View",
            "You've managed to extract all the text!",
            "This is more text",
            "Text Inside a PRT file"
        };
        String extracted = bodyText.toString();
        for (String fragment : expectedFragments) {
            assertContains(fragment, extracted);
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 251; Metadata (org.apache.tika.metadata.Metadata): 242; Test (org.junit.Test): 213; ContentHandler (org.xml.sax.ContentHandler): 202; InputStream (java.io.InputStream): 189; ParseContext (org.apache.tika.parser.ParseContext): 170; TikaTest (org.apache.tika.TikaTest): 117; AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 87; Parser (org.apache.tika.parser.Parser): 81; ByteArrayInputStream (java.io.ByteArrayInputStream): 65; TikaInputStream (org.apache.tika.io.TikaInputStream): 65; ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 24; WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 24; TikaException (org.apache.tika.exception.TikaException): 23; IOException (java.io.IOException): 17; OfficeParser (org.apache.tika.parser.microsoft.OfficeParser): 15; EmptyParser (org.apache.tika.parser.EmptyParser): 14; SAXException (org.xml.sax.SAXException): 13; MediaType (org.apache.tika.mime.MediaType): 10; XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 10