Search in sources :

Example 16 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class TikaTest method getXML.

/**
 * Convenience overload that builds the {@link Metadata} for the caller.
 * <p>
 * A fresh metadata object is created, its {@code RESOURCE_NAME_KEY} entry is
 * set to {@code filePath}, and the work is handed to the three-argument
 * {@code getXML(filePath, parser, metadata)} overload.
 *
 * @param filePath path of the test document; also stored as the resource name
 * @param parser   parser to run against the document
 * @return whatever the three-argument overload returns
 * @throws Exception propagated unchanged from the three-argument overload
 */
protected XMLResult getXML(String filePath, Parser parser) throws Exception {
    final Metadata seeded = new Metadata();
    // Record the file name under the resource-name key before delegating.
    seeded.set(Metadata.RESOURCE_NAME_KEY, filePath);
    final XMLResult result = getXML(filePath, parser, seeded);
    return result;
}
Also used : Metadata(org.apache.tika.metadata.Metadata)

Example 17 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class TestParsers method testEXCELExtraction.

/**
 * Parses the sample Excel workbook twice: once through
 * {@code tika.parseToString} to check the extracted text, and once through
 * the parser returned by {@code tika.getParser()} to check the title
 * metadata.
 */
@Test
public void testEXCELExtraction() throws Exception {
    final String expected = "Numbers and their Squares";
    File workbook = getResourceAsFile("/test-documents/testEXCEL.xls");

    // Pass 1: plain-text extraction must contain the expected heading.
    String extractedText = tika.parseToString(workbook);
    boolean headingFound = extractedText.contains(expected);
    assertTrue("Text does not contain '" + expected + "'", headingFound);

    // Pass 2: run the parser directly; the content is discarded by the
    // no-op DefaultHandler, only the collected metadata is inspected.
    Metadata collected = new Metadata();
    Parser excelCapableParser = tika.getParser();
    try (InputStream in = new FileInputStream(workbook)) {
        ParseContext context = new ParseContext();
        excelCapableParser.parse(in, new DefaultHandler(), collected, context);
    }
    String title = collected.get(TikaCoreProperties.TITLE);
    assertEquals("Simple Excel document", title);
}
Also used : FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) File(java.io.File) FileInputStream(java.io.FileInputStream) Parser(org.apache.tika.parser.Parser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Example 18 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class TikaDetectorConfigTest method testPSTDetectionWithoutZipDetector.

/**
 * TIKA-1708 - If the Zip detector is disabled, either explicitly,
 *  or via giving a list of detectors that it isn't part of, ensure
 *  that detection of PST files still works.
 * <p>
 * The PST test stream is opened in a try-with-resources block so that it
 * is closed even when one of the detection assertions fails.
 */
@Test
public void testPSTDetectionWithoutZipDetector() throws Exception {
    // Check the one with an exclude
    TikaConfig configWX = getConfig("TIKA-1708-detector-default.xml");
    assertNotNull(configWX.getParser());
    assertNotNull(configWX.getDetector());
    CompositeDetector detectorWX = (CompositeDetector) configWX.getDetector();
    // Check it has the POIFS one, but not the zip one
    assertDetectors(detectorWX, true, false);

    // Check the one with an explicit list
    TikaConfig configCL = getConfig("TIKA-1708-detector-composite.xml");
    assertNotNull(configCL.getParser());
    assertNotNull(configCL.getDetector());
    CompositeDetector detectorCL = (CompositeDetector) configCL.getDetector();
    assertEquals(2, detectorCL.getDetectors().size());
    // Check it also has the POIFS one, but not the zip one
    assertDetectors(detectorCL, true, false);

    // Check that both configs have a media type registry with entries
    int typeCountWX = configWX.getMediaTypeRegistry().getTypes().size();
    int typeCountCL = configCL.getMediaTypeRegistry().getTypes().size();
    assertTrue("Not enough mime types: " + typeCountWX, typeCountWX > 100);
    assertTrue("Not enough mime types: " + typeCountCL, typeCountCL > 100);

    // Now check they detect PST files correctly. The same stream is handed
    // to both detectors, exactly as before; it is now closed afterwards.
    try (TikaInputStream stream = TikaInputStream.get(getResourceAsFile("/test-documents/testPST.pst"))) {
        assertEquals(OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE, detectorWX.detect(stream, new Metadata()));
        assertEquals(OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE, detectorCL.detect(stream, new Metadata()));
    }
}
Also used : CompositeDetector(org.apache.tika.detect.CompositeDetector) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) Test(org.junit.Test)

Example 19 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class TikaEncodingDetectorTest method testConfigurabilityOfUserSpecified.

/**
 * Loads the TIKA-2273 encoding-detector configuration and checks that every
 * encoding-detecting parser found under the resulting
 * {@link AutoDetectParser} uses a {@link CompositeEncodingDetector} with
 * exactly two children, none of whose class names contain {@code "cu4j"}.
 * Finally confirms that parsing {@code english.cp500.txt} fails with a
 * "Failed to detect" {@link TikaException} under this configuration.
 */
@Test
public void testConfigurabilityOfUserSpecified() throws Exception {
    TikaConfig tikaConfig;
    // Close the config stream once TikaConfig has been built from it.
    // Fully qualified to avoid depending on an import outside this block.
    try (java.io.InputStream configStream = getResourceAsStream(
            "/org/apache/tika/config/TIKA-2273-encoding-detector-outside-static-init.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser p = new AutoDetectParser(tikaConfig);
    //make sure that all static and non-static parsers are using the same encoding detector!
    List<Parser> parsers = new ArrayList<>();
    findEncodingDetectionParsers(p, parsers);
    assertEquals(3, parsers.size());
    for (Parser encodingDetectingParser : parsers) {
        EncodingDetector encodingDetector = ((AbstractEncodingDetectorParser) encodingDetectingParser).getEncodingDetector();
        assertTrue(encodingDetector instanceof CompositeEncodingDetector);
        CompositeEncodingDetector composite = (CompositeEncodingDetector) encodingDetector;
        assertEquals(2, composite.getDetectors().size());
        for (EncodingDetector child : composite.getDetectors()) {
            assertNotContained("cu4j", child.getClass().getCanonicalName());
        }
    }
    //also just make sure this is still true
    try {
        // The result is irrelevant; the call is expected to throw.
        getXML("english.cp500.txt", p);
        fail("can't detect w/out ICU");
    } catch (TikaException e) {
        assertContains("Failed to detect", e.getMessage());
    }
}
Also used : Icu4jEncodingDetector(org.apache.tika.parser.txt.Icu4jEncodingDetector) NonDetectingEncodingDetector(org.apache.tika.detect.NonDetectingEncodingDetector) UniversalEncodingDetector(org.apache.tika.parser.txt.UniversalEncodingDetector) CompositeEncodingDetector(org.apache.tika.detect.CompositeEncodingDetector) EncodingDetector(org.apache.tika.detect.EncodingDetector) HtmlEncodingDetector(org.apache.tika.parser.html.HtmlEncodingDetector) CompositeEncodingDetector(org.apache.tika.detect.CompositeEncodingDetector) TikaException(org.apache.tika.exception.TikaException) ArrayList(java.util.ArrayList) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TXTParser(org.apache.tika.parser.txt.TXTParser) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) Test(org.junit.Test)

Example 20 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class AutoDetectParserTest method testOggFlacAudio.

/**
 * Test to ensure that the Ogg Audio parsers (Vorbis, Opus, Flac etc)
 *  have been correctly included, and are available.
 * <p>
 * Each of the four sample files is parsed with an auto-detecting parser and
 * checked for its content type, artist/title metadata (via both the old
 * {@code Metadata} keys and {@code TikaCoreProperties}), selected XMPDM
 * values and the extracted body text.
 */
// NOTE(review): the suppression is presumably for the "old style"
// Metadata.AUTHOR / Metadata.TITLE keys used below - confirm.
@SuppressWarnings("deprecation")
@Test
public void testOggFlacAudio() throws Exception {
    // The four test files should all have similar test data;
    // mediaTypes[i] is the content type expected for testFiles[i]
    String[] testFiles = new String[] { "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga", "testOPUS.opus" };
    MediaType[] mediaTypes = new MediaType[] { MediaType.parse(OGG_VORBIS), MediaType.parse(FLAC_NATIVE), MediaType.parse(OGG_FLAC), MediaType.parse(OGG_OPUS) };

    // Check we can load the parsers, and they claim to do the right things
    VorbisParser vParser = new VorbisParser();
    assertNotNull("Parser not found for " + mediaTypes[0], vParser.getSupportedTypes(new ParseContext()));
    FlacParser fParser = new FlacParser();
    assertNotNull("Parser not found for " + mediaTypes[1], fParser.getSupportedTypes(new ParseContext()));
    assertNotNull("Parser not found for " + mediaTypes[2], fParser.getSupportedTypes(new ParseContext()));
    OpusParser oParser = new OpusParser();
    assertNotNull("Parser not found for " + mediaTypes[3], oParser.getSupportedTypes(new ParseContext()));

    // Check we found the parser
    CompositeParser parser = (CompositeParser) tika.getParser();
    for (MediaType mt : mediaTypes) {
        assertNotNull("Parser not found for " + mt, parser.getParsers().get(mt));
    }

    // Building the auto-detecting parser does not depend on the file being
    // parsed, so construct it once rather than once per iteration.
    AutoDetectParser autoDetectParser = new AutoDetectParser(tika);

    // Have each file parsed, and check
    for (int i = 0; i < testFiles.length; i++) {
        String file = testFiles[i];
        try (InputStream input = AutoDetectParserTest.class.getResourceAsStream("/test-documents/" + file)) {
            assertNotNull("Could not find test file " + file, input);

            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            autoDetectParser.parse(input, handler, metadata);
            assertEquals("Incorrect content type for " + file, mediaTypes[i].toString(), metadata.get(Metadata.CONTENT_TYPE));

            // Check some of the common metadata
            // Old style metadata
            assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
            assertEquals("Test Title", metadata.get(Metadata.TITLE));
            // New style metadata
            assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));

            // Check some of the XMPDM metadata
            // (the album assertion is skipped for the .opus sample)
            if (!file.endsWith(".opus")) {
                assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
            }
            assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
            assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
            assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));

            // Check some of the text
            String content = handler.toString();
            assertTrue(content.contains("Test Title"));
            assertTrue(content.contains("Test Artist"));
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) VorbisParser(org.gagravarr.tika.VorbisParser) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) MediaType(org.apache.tika.mime.MediaType) FlacParser(org.gagravarr.tika.FlacParser) OpusParser(org.gagravarr.tika.OpusParser) Test(org.junit.Test)

Aggregations

Metadata (org.apache.tika.metadata.Metadata) 643, Test (org.junit.Test) 467, InputStream (java.io.InputStream) 318, ParseContext (org.apache.tika.parser.ParseContext) 281, BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 268, TikaTest (org.apache.tika.TikaTest) 257, ContentHandler (org.xml.sax.ContentHandler) 228, AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 151, ByteArrayInputStream (java.io.ByteArrayInputStream) 141, Parser (org.apache.tika.parser.Parser) 134, TikaInputStream (org.apache.tika.io.TikaInputStream) 131, IOException (java.io.IOException) 62, DefaultHandler (org.xml.sax.helpers.DefaultHandler) 59, TikaException (org.apache.tika.exception.TikaException) 46, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest) 36, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest) 36, StringWriter (java.io.StringWriter) 33, Tika (org.apache.tika.Tika) 28, FileInputStream (java.io.FileInputStream) 27, MediaType (org.apache.tika.mime.MediaType) 27