Search in sources :

Example 76 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class Mp3ParserTest method testMp3ParsingLyrics.

/**
 * Parses an MP3 that carries both lyrics and ID3v2 tags, then checks
 * that the metadata fields and the extracted body text are populated.
 */
@Test
public void testMp3ParsingLyrics() throws Exception {
    // Deliberately the generic parser: the MP3 format should be auto-detected
    Parser autoDetect = new AutoDetectParser();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata meta = new Metadata();
    try (InputStream mp3 =
            Mp3ParserTest.class.getResourceAsStream("/test-documents/testMP3lyrics.mp3")) {
        autoDetect.parse(mp3, bodyHandler, meta, new ParseContext());
    }

    // Tag-derived metadata
    assertEquals("audio/mpeg", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("Test Title", meta.get(TikaCoreProperties.TITLE));
    assertEquals("Test Artist", meta.get(TikaCoreProperties.CREATOR));
    assertEquals("Test Artist", meta.get(Metadata.AUTHOR));

    // Every tag value must also show up in the extracted text, checked in this order
    String text = bodyHandler.toString();
    String[] expectedSnippets = {
        "Test Title", "Test Artist", "Test Album", "2008", "Test Comment", "Rock"
    };
    for (String snippet : expectedSnippets) {
        assertContains(snippet, text);
    }

    // Audio stream properties
    assertEquals("MPEG 3 Layer III Version 1", meta.get("version"));
    assertEquals("44100", meta.get("samplerate"));
    assertEquals("2", meta.get("channels"));
    // Duration verification is delegated to a helper defined outside this snippet
    checkDuration(meta, 1);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 77 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ODFParserTest method testOO2Metadata.

/**
 * Like {@link #testOO2()}, but run against another OO2 file
 * whose metadata differs.
 */
@Test
public void testOO2Metadata() throws Exception {
    try (InputStream odf =
            ODFParserTest.class.getResourceAsStream("/test-documents/testOpenOffice2.odf")) {
        Metadata meta = new Metadata();
        ContentHandler sink = new BodyContentHandler();
        new OpenDocumentParser().parse(odf, sink, meta);

        // Type and dates
        assertEquals(
                "application/vnd.oasis.opendocument.formula",
                meta.get(Metadata.CONTENT_TYPE));
        assertEquals(null, meta.get(TikaCoreProperties.MODIFIED));
        assertEquals("2006-01-27T11:55:22", meta.get(Metadata.CREATION_DATE));

        // Title; the same text is expected under all three description/subject keys
        assertEquals(
                "The quick brown fox jumps over the lazy dog",
                meta.get(TikaCoreProperties.TITLE));
        assertEquals(
                "Gym class featuring a brown fox and lazy dog",
                meta.get(TikaCoreProperties.DESCRIPTION));
        assertEquals(
                "Gym class featuring a brown fox and lazy dog",
                meta.get(OfficeOpenXMLCore.SUBJECT));
        assertEquals(
                "Gym class featuring a brown fox and lazy dog",
                meta.get(Metadata.SUBJECT));

        // Editing information and generator
        assertEquals("PT0S", meta.get(Metadata.EDIT_TIME));
        assertEquals("1", meta.get("editing-cycles"));
        assertEquals(
                "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
                meta.get("generator"));
        assertEquals("Pangram, fox, dog", meta.get(Metadata.KEYWORDS));

        // User defined metadata
        assertEquals("Text 1", meta.get("custom:Info 1"));
        assertEquals("2", meta.get("custom:Info 2"));
        assertEquals("false", meta.get("custom:Info 3"));
        assertEquals("true", meta.get("custom:Info 4"));

        // No statistics present: first the Metadata constants ...
        assertEquals(null, meta.get(Metadata.PAGE_COUNT));
        assertEquals(null, meta.get(Metadata.PARAGRAPH_COUNT));
        assertEquals(null, meta.get(Metadata.WORD_COUNT));
        assertEquals(null, meta.get(Metadata.CHARACTER_COUNT));
        assertEquals(null, meta.get(Metadata.TABLE_COUNT));
        assertEquals(null, meta.get(Metadata.OBJECT_COUNT));
        assertEquals(null, meta.get(Metadata.IMAGE_COUNT));
        // ... then the plain "nb*" string keys, in the same order as before
        String[] absentKeys = {
            "nbTab", "nbObject", "nbImg", "nbPage", "nbPara", "nbWord", "nbCharacter"
        };
        for (String key : absentKeys) {
            assertEquals(null, meta.get(key));
        }

        // Note - contents of maths files not currently supported,
        // so the trimmed body text is expected to be empty
        assertEquals("", sink.toString().trim());
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 78 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ODFParserTest method testNullStylesInODTFooter.

// TIKA-1600: parsing must not be broken by a null pointer.
// NOTE(review): the fixture file name says TIKA-6000 while this comment cites
// TIKA-1600 - confirm which issue id is the intended one.
@Test
public void testNullStylesInODTFooter() throws Exception {
    Parser odfParser = new OpenDocumentParser();
    try (InputStream odt =
            ODFParserTest.class.getResourceAsStream("/test-documents/testODT-TIKA-6000.odt")) {
        Metadata meta = new Metadata();
        ContentHandler sink = new BodyContentHandler();
        // The parse context comes from a helper defined outside this snippet
        odfParser.parse(odt, sink, meta, getNonRecursingParseContext());

        assertEquals(
                "application/vnd.oasis.opendocument.text",
                meta.get(Metadata.CONTENT_TYPE));

        // Phrases that must appear in the extracted text, checked in this order
        String text = sink.toString();
        String[] expectedPhrases = {
            "Utilisation de ce document",
            "Copyright and License",
            "Changer la langue",
            "La page d’accueil permet de faire une recherche simple"
        };
        for (String phrase : expectedPhrases) {
            assertContains(phrase, text);
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) OpenOfficeParser(org.apache.tika.parser.opendocument.OpenOfficeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) EmptyParser(org.apache.tika.parser.EmptyParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 79 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ODFParserTest method testODTFooter.

/**
 * Parses testFooter.odt with the auto-detecting parser and checks that
 * the page text and the footer text both appear in the extracted content.
 */
@Test
public void testODTFooter() throws Exception {
    try (InputStream odt =
            ODFParserTest.class.getResourceAsStream("/test-documents/testFooter.odt")) {
        Metadata meta = new Metadata();
        ContentHandler sink = new BodyContentHandler();
        new AutoDetectParser().parse(odt, sink, meta);

        // Checked in this order: first-page text, second-page text, footer text
        String text = sink.toString();
        String[] expectedPhrases = {
            "Here is some text...",
            "Here is some text on page 2",
            "Here is footer text"
        };
        for (String phrase : expectedPhrases) {
            assertContains(phrase, text);
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 80 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ODFParserTest method testNPEFromFile.

/**
 * Parses testNPEOpenDocument.odt through a {@code TikaInputStream} obtained
 * from the resource URL and checks the detected type and a known phrase.
 */
@Test
public void testNPEFromFile() throws Exception {
    OpenDocumentParser odfParser = new OpenDocumentParser();
    try (TikaInputStream odt =
            TikaInputStream.get(getClass().getResource("/test-documents/testNPEOpenDocument.odt"))) {
        Metadata meta = new Metadata();
        ContentHandler sink = new BodyContentHandler();
        odfParser.parse(odt, sink, meta, new ParseContext());

        assertEquals(
                "application/vnd.oasis.opendocument.text",
                meta.get(Metadata.CONTENT_TYPE));
        // A phrase from the document body must appear in the extracted text
        assertContains("primero hay que generar un par de claves", sink.toString());
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) TikaInputStream(org.apache.tika.io.TikaInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

BodyContentHandler (org.apache.tika.sax.BodyContentHandler)261 Metadata (org.apache.tika.metadata.Metadata)252 Test (org.junit.Test)213 ContentHandler (org.xml.sax.ContentHandler)206 InputStream (java.io.InputStream)194 ParseContext (org.apache.tika.parser.ParseContext)176 TikaTest (org.apache.tika.TikaTest)117 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)92 Parser (org.apache.tika.parser.Parser)84 ByteArrayInputStream (java.io.ByteArrayInputStream)66 TikaInputStream (org.apache.tika.io.TikaInputStream)66 TikaException (org.apache.tika.exception.TikaException)25 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)24 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)24 IOException (java.io.IOException)23 EmptyParser (org.apache.tika.parser.EmptyParser)15 OfficeParser (org.apache.tika.parser.microsoft.OfficeParser)15 SAXException (org.xml.sax.SAXException)15 MediaType (org.apache.tika.mime.MediaType)11 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)10