Search in sources:

Example 46 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class Mp3ParserTest method testTIKA424.

/**
 * Checks the complicated set of ID3v2.4 tags on the mp3 file attached to
 * TIKA-424.
 * <p>
 * That file cannot be distributed with Tika, so unless it has been
 * downloaded into the test documents this test returns early and
 * verifies nothing.
 */
@Test
public void testTIKA424() throws Exception {
    final String expectedTitle = "Plus loin vers l'ouest";
    final String expectedArtist = "Merzhin";

    // No explicit mp3 parser here: the format should be auto-detected
    Parser autoDetect = new AutoDetectParser();
    ContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try (InputStream mp3 = Mp3ParserTest.class.getResourceAsStream("/test-documents/test2.mp3")) {
        if (mp3 == null) {
            // Optional file is not present locally, so there is nothing to check
            return;
        }
        ParseContext context = new ParseContext();
        autoDetect.parse(mp3, body, metadata, context);
    }

    // Detected type and tag-derived metadata
    assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals(expectedTitle, metadata.get(TikaCoreProperties.TITLE));
    assertEquals(expectedArtist, metadata.get(TikaCoreProperties.CREATOR));
    assertEquals(expectedArtist, metadata.get(Metadata.AUTHOR));

    // The title must also appear in the extracted text
    String extractedText = body.toString();
    assertContains(expectedTitle, extractedText);

    // Audio stream properties
    assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
    assertEquals("44100", metadata.get("samplerate"));
    assertEquals("2", metadata.get("channels"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 47 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class Mp3ParserTest method testMp3ParsingLyrics.

/**
 * Verifies that an mp3 file carrying both lyrics and ID3v2 tags has
 * both of them extracted correctly.
 */
@Test
public void testMp3ParsingLyrics() throws Exception {
    final String expectedTitle = "Test Title";
    final String expectedArtist = "Test Artist";

    // No explicit mp3 parser here: the format should be auto-detected
    Parser autoDetect = new AutoDetectParser();
    ContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try (InputStream mp3 = Mp3ParserTest.class.getResourceAsStream("/test-documents/testMP3lyrics.mp3")) {
        ParseContext context = new ParseContext();
        autoDetect.parse(mp3, body, metadata, context);
    }

    // Detected type and tag-derived metadata
    assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals(expectedTitle, metadata.get(TikaCoreProperties.TITLE));
    assertEquals(expectedArtist, metadata.get(TikaCoreProperties.CREATOR));
    assertEquals(expectedArtist, metadata.get(Metadata.AUTHOR));

    // Every tag value is expected to show up in the extracted text
    String extractedText = body.toString();
    assertContains(expectedTitle, extractedText);
    assertContains(expectedArtist, extractedText);
    assertContains("Test Album", extractedText);
    assertContains("2008", extractedText);
    assertContains("Test Comment", extractedText);
    assertContains("Rock", extractedText);

    // Audio stream properties
    assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
    assertEquals("44100", metadata.get("samplerate"));
    assertEquals("2", metadata.get("channels"));
    checkDuration(metadata, 1);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 48 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class ODFParserTest method testODTFooter.

/**
 * Parses testFooter.odt with an auto-detecting parser and checks that the
 * body text of both pages and the footer text are present in the
 * extracted content.
 */
@Test
public void testODTFooter() throws Exception {
    ContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try (InputStream odt = ODFParserTest.class.getResourceAsStream("/test-documents/testFooter.odt")) {
        AutoDetectParser autoDetect = new AutoDetectParser();
        autoDetect.parse(odt, body, metadata);

        String extractedText = body.toString();
        assertContains("Here is some text...", extractedText);
        assertContains("Here is some text on page 2", extractedText);
        assertContains("Here is footer text", extractedText);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 49 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class PDFParserTest method testAnnotations.

/**
 * Checks extraction of annotation text from testAnnotations.pdf in three
 * configurations:
 * <ol>
 *   <li>auto-detecting parser with default settings: the annotation
 *       comment is present in the text;</li>
 *   <li>a {@code PDFParser} whose own config has annotation text
 *       disabled: the comment is absent;</li>
 *   <li>the auto-detecting parser with a {@code PDFParserConfig} passed
 *       through the {@code ParseContext}: the comment is absent.</li>
 * </ol>
 * Finally (TIKA-738) it checks that the XML output has as many closing
 * {@code </p>} tags as opening {@code <p>} tags.
 */
@Test
public void testAnnotations() throws Exception {
    final String pdfResource = "/test-documents/testAnnotations.pdf";
    // Runs of whitespace, including the non-breaking space (U+00A0), which
    // \s does not match by default. The escape is spelled out so that the
    // character cannot be confused with, or silently turned into, a plain space.
    final String whitespaceRun = "[\\s\\u00a0]+";

    // Should auto-detect!
    Parser parser = new AutoDetectParser();
    String content;
    try (InputStream stream = getResourceAsStream(pdfResource)) {
        content = getText(stream, parser);
    }
    // Collapse whitespace so the phrase checks do not depend on layout
    content = content.replaceAll(whitespaceRun, " ");
    assertContains("Here is some text", content);
    assertContains("Here is a comment", content);

    // Test w/ annotation text disabled:
    PDFParser pdfParser = new PDFParser();
    pdfParser.getPDFParserConfig().setExtractAnnotationText(false);
    try (InputStream stream = getResourceAsStream(pdfResource)) {
        content = getText(stream, pdfParser);
    }
    content = content.replaceAll(whitespaceRun, " ");
    assertContains("Here is some text", content);
    assertEquals(-1, content.indexOf("Here is a comment"));

    // annotation text disabled through parsecontext
    ParseContext context = new ParseContext();
    PDFParserConfig config = new PDFParserConfig();
    config.setExtractAnnotationText(false);
    context.set(PDFParserConfig.class, config);
    try (InputStream stream = getResourceAsStream(pdfResource)) {
        content = getText(stream, parser, context);
    }
    content = content.replaceAll(whitespaceRun, " ");
    assertContains("Here is some text", content);
    assertEquals(-1, content.indexOf("Here is a comment"));

    // TIKA-738: make sure no extra </p> tags
    String xml = getXML("testAnnotations.pdf").xml;
    assertEquals(substringCount("<p>", xml), substringCount("</p>", xml));
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 50 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class PDFParserTest method testEmbeddedFileMarkup.

/**
 * TIKA-1427: checks the XHTML markup emitted for files embedded in a PDF:
 * a regular attachment, an inline image, and a document embedded inside
 * an annotation.
 */
@Test
public void testEmbeddedFileMarkup() throws Exception {
    // Ask for every inline image, not only the unique ones
    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);
    pdfConfig.setExtractUniqueInlineImagesOnly(false);

    // Register an auto-detecting parser and the PDF config on the parse context
    Parser autoDetect = new AutoDetectParser();
    ParseContext context = new ParseContext();
    context.set(Parser.class, autoDetect);
    context.set(PDFParserConfig.class, pdfConfig);

    XMLResult result = getXML("testPDF_childAttachments.pdf", context);
    String childAttachmentsXml = result.xml;
    //regular attachment
    assertContains("<div source=\"attachment\" class=\"embedded\" id=\"Unit10.doc\" />", childAttachmentsXml);
    //inline image
    assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", childAttachmentsXml);

    //doc embedded inside an annotation (parsed without the custom context)
    result = getXML("testPDFFileEmbInAnnotation.pdf");
    assertContains("<div source=\"annotation\" class=\"embedded\" id=\"Excel.xlsx\" />", result.xml);
}
Also used : ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 167; Metadata (org.apache.tika.metadata.Metadata): 139; Test (org.junit.Test): 122; InputStream (java.io.InputStream): 117; Parser (org.apache.tika.parser.Parser): 112; ParseContext (org.apache.tika.parser.ParseContext): 104; BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 97; ContentHandler (org.xml.sax.ContentHandler): 91; TikaTest (org.apache.tika.TikaTest): 82; TikaInputStream (org.apache.tika.io.TikaInputStream): 63; ByteArrayInputStream (java.io.ByteArrayInputStream): 34; CompositeParser (org.apache.tika.parser.CompositeParser): 28; TikaConfig (org.apache.tika.config.TikaConfig): 18; ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 17; WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 17; TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser): 15; TikaException (org.apache.tika.exception.TikaException): 13; EmptyParser (org.apache.tika.parser.EmptyParser): 13; XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 13; DefaultHandler (org.xml.sax.helpers.DefaultHandler): 12