Search in sources :

Example 71 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class Mp3ParserTest method testTIKA474.

/**
 * Regression test for TIKA-474: a file whose very large ID3 frame has been
 * truncated before the end of the ID3 tags must parse without errors, even
 * if not all of its content can be recovered.
 * In the fixture used here the ID3 frame carries JPEG data, and the file is
 * truncated before the end of that JPEG section.
 */
@Test
public void testTIKA474() throws Exception {
    final String resource = "/test-documents/testMP3truncated.mp3";

    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();
    // No explicit MP3 parser: the format is expected to be auto-detected
    Parser autoDetect = new AutoDetectParser();

    try (InputStream in = Mp3ParserTest.class.getResourceAsStream(resource)) {
        ParseContext context = new ParseContext();
        autoDetect.parse(in, body, meta, context);
    }

    // The headers at the start of the file must still be readable
    assertEquals("audio/mpeg", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("Girl you have no faith in medicine", meta.get(TikaCoreProperties.TITLE));
    assertEquals("The White Stripes", meta.get(TikaCoreProperties.CREATOR));
    assertEquals("The White Stripes", meta.get(Metadata.AUTHOR));

    // The same tag values must also appear in the extracted text
    String text = body.toString();
    assertContains("Girl you have no faith in medicine", text);
    assertContains("The White Stripes", text);
    assertContains("Elephant", text);
    assertContains("2003", text);

    // With no audio frames in the file, none of these can be determined
    assertEquals(null, meta.get("version"));
    assertEquals(null, meta.get("samplerate"));
    assertEquals(null, meta.get("channels"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 72 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class Mp3ParserTest method testMp3ParsingID3v1v2.

/**
 * When a file carries both ID3v2 and ID3v1 tags, the details from
 * ID3v2 are the ones that should be preferred.
 */
@Test
public void testMp3ParsingID3v1v2() throws Exception {
    final String resource = "/test-documents/testMP3id3v1_v2.mp3";

    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();
    // No explicit MP3 parser: the format is expected to be auto-detected
    Parser autoDetect = new AutoDetectParser();

    try (InputStream in = Mp3ParserTest.class.getResourceAsStream(resource)) {
        ParseContext context = new ParseContext();
        autoDetect.parse(in, body, meta, context);
    }

    // Tag values exposed as metadata
    assertEquals("audio/mpeg", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("Test Title", meta.get(TikaCoreProperties.TITLE));
    assertEquals("Test Artist", meta.get(TikaCoreProperties.CREATOR));
    assertEquals("Test Artist", meta.get(Metadata.AUTHOR));

    // Tag values exposed in the extracted text
    String text = body.toString();
    assertContains("Test Title", text);
    assertContains("Test Artist", text);
    assertContains("Test Album", text);
    assertContains("2008", text);
    assertContains("Test Comment", text);
    assertContains("Rock", text);

    // Audio properties
    assertEquals("MPEG 3 Layer III Version 1", meta.get("version"));
    assertEquals("44100", meta.get("samplerate"));
    assertEquals("1", meta.get("channels"));
    // Duration is verified by the shared checkDuration helper (defined outside this view)
    checkDuration(meta, 2);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 73 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class Mp3ParserTest method testTIKA1589_noId3ReturnsDurationCorrectly.

/**
 * Regression test for TIKA-1589: parses the {@code testMP3noid3.mp3}
 * fixture and checks the duration reported under {@code XMPDM.DURATION}.
 */
@Test
public void testTIKA1589_noId3ReturnsDurationCorrectly() throws Exception {
    final String resource = "/test-documents/testMP3noid3.mp3";

    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();
    // No explicit MP3 parser: the format is expected to be auto-detected
    Parser autoDetect = new AutoDetectParser();

    try (InputStream in = Mp3ParserTest.class.getResourceAsStream(resource)) {
        ParseContext context = new ParseContext();
        autoDetect.parse(in, body, meta, context);
    }

    // The duration is compared as the exact string stored in the metadata
    assertEquals("2455.510986328125", meta.get(XMPDM.DURATION));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 74 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class Mp3ParserTest method testMp3ParsingID3v1.

/**
 * With only ID3v1 tags present, some information should still be
 * extracted from the file.
 */
@Test
public void testMp3ParsingID3v1() throws Exception {
    final String resource = "/test-documents/testMP3id3v1.mp3";

    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();
    // No explicit MP3 parser: the format is expected to be auto-detected
    Parser autoDetect = new AutoDetectParser();

    try (InputStream in = Mp3ParserTest.class.getResourceAsStream(resource)) {
        ParseContext context = new ParseContext();
        autoDetect.parse(in, body, meta, context);
    }

    // Tag values exposed as metadata
    assertEquals("audio/mpeg", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("Test Title", meta.get(TikaCoreProperties.TITLE));
    assertEquals("Test Artist", meta.get(TikaCoreProperties.CREATOR));
    assertEquals("Test Artist", meta.get(Metadata.AUTHOR));

    // Tag values exposed in the extracted text
    String text = body.toString();
    assertContains("Test Title", text);
    assertContains("Test Artist", text);
    assertContains("Test Album", text);
    assertContains("2008", text);
    assertContains("Test Comment", text);
    assertContains("Rock", text);

    // Audio properties
    assertEquals("MPEG 3 Layer III Version 1", meta.get("version"));
    assertEquals("44100", meta.get("samplerate"));
    assertEquals("1", meta.get("channels"));
    // Duration is verified by the shared checkDuration helper (defined outside this view)
    checkDuration(meta, 2);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 75 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class Mp3ParserTest method testTIKA424.

/**
 * Checks the complicated set of ID3v2.4 tags in the MP3 file attached
 * to TIKA-424.
 * That file cannot be distributed with Tika, so unless it has been
 * downloaded and placed on the test classpath this test does nothing.
 */
@Test
public void testTIKA424() throws Exception {
    final String resource = "/test-documents/test2.mp3";

    Metadata meta = new Metadata();
    ContentHandler body = new BodyContentHandler();
    // No explicit MP3 parser: the format is expected to be auto-detected
    Parser autoDetect = new AutoDetectParser();

    try (InputStream in = Mp3ParserTest.class.getResourceAsStream(resource)) {
        if (in == null) {
            // Fixture not available: leave without asserting anything
            return;
        }
        ParseContext context = new ParseContext();
        autoDetect.parse(in, body, meta, context);
    }

    // Tag values exposed as metadata
    assertEquals("audio/mpeg", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("Plus loin vers l'ouest", meta.get(TikaCoreProperties.TITLE));
    assertEquals("Merzhin", meta.get(TikaCoreProperties.CREATOR));
    assertEquals("Merzhin", meta.get(Metadata.AUTHOR));

    // The title must also appear in the extracted text
    String text = body.toString();
    assertContains("Plus loin vers l'ouest", text);

    // Audio properties
    assertEquals("MPEG 3 Layer III Version 1", meta.get("version"));
    assertEquals("44100", meta.get("samplerate"));
    assertEquals("2", meta.get("channels"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Aggregations

- BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 261
- Metadata (org.apache.tika.metadata.Metadata): 252
- Test (org.junit.Test): 213
- ContentHandler (org.xml.sax.ContentHandler): 206
- InputStream (java.io.InputStream): 194
- ParseContext (org.apache.tika.parser.ParseContext): 176
- TikaTest (org.apache.tika.TikaTest): 117
- AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 92
- Parser (org.apache.tika.parser.Parser): 84
- ByteArrayInputStream (java.io.ByteArrayInputStream): 66
- TikaInputStream (org.apache.tika.io.TikaInputStream): 66
- TikaException (org.apache.tika.exception.TikaException): 25
- ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 24
- WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 24
- IOException (java.io.IOException): 23
- EmptyParser (org.apache.tika.parser.EmptyParser): 15
- OfficeParser (org.apache.tika.parser.microsoft.OfficeParser): 15
- SAXException (org.xml.sax.SAXException): 15
- MediaType (org.apache.tika.mime.MediaType): 11
- XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 10