Search in sources :

Example 56 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class ExtractReader method generateListFromTextFile.

/**
 * Builds a single-element metadata list from a plain-text extract.
 *
 * <p>The whole reader is read into a string and stored under
 * {@code RecursiveParserWrapper.TIKA_CONTENT}. The original file name is stored
 * as the resource name so that the mime repository can attempt a detection;
 * no stream is handed to the detector, only the metadata.
 *
 * @param reader       source of the extracted text
 * @param fileSuffixes holder whose {@code originalFileName} is used as the resource name
 * @return a new list containing exactly one {@link Metadata} object
 * @throws IOException declared for failures while reading or detecting
 */
private List<Metadata> generateListFromTextFile(Reader reader, FileSuffixes fileSuffixes) throws IOException {
    String extractedText = IOUtils.toString(reader);

    Metadata extract = new Metadata();
    extract.set(RecursiveParserWrapper.TIKA_CONTENT, extractedText);

    // The file name's suffix is the only hint available for the mime type.
    // It may be wrong or absent, but it is better than nothing.
    extract.set(Metadata.RESOURCE_NAME_KEY, fileSuffixes.originalFileName);
    MediaType detected = tikaConfig.getMimeRepository().detect(null, extract);
    if (detected != null) {
        extract.set(Metadata.CONTENT_TYPE, detected.toString());
    }

    List<Metadata> metadataList = new ArrayList<>();
    metadataList.add(extract);
    return metadataList;
}
Also used : ArrayList(java.util.ArrayList) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType)

Example 57 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class TikaParserConfigTest method testParserExcludeFromDefault.

/**
 * Checks that the DefaultParser loaded from "TIKA-1558-blacklist.xml" does not
 * offer the Executable Parser, while a freshly built DefaultParser does.
 */
@Test
public void testParserExcludeFromDefault() throws Exception {
    TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
    assertNotNull(config.getParser());
    assertNotNull(config.getDetector());
    CompositeParser configuredComposite = (CompositeParser) config.getParser();

    MediaType peExeType = MediaType.application("x-msdownload");
    MediaType elfType = MediaType.application("x-elf");

    // The DefaultParser from the config is reached through its decorator
    ParserDecorator decorated =
            (ParserDecorator) configuredComposite.getParsers().get(MediaType.APPLICATION_XML);
    assertNotNull(decorated);
    DefaultParser configuredDefault = (DefaultParser) decorated.getWrappedParser();

    // A fresh "default" DefaultParser to compare against
    DefaultParser freshDefault = new DefaultParser(config.getMediaTypeRegistry());

    // The fresh one offers the executable types and holds an ExecutableParser
    assertContains(peExeType, freshDefault.getSupportedTypes(context));
    assertContains(elfType, freshDefault.getSupportedTypes(context));
    boolean executableFound = false;
    for (Parser candidate : freshDefault.getParsers().values()) {
        if (!(candidate instanceof ExecutableParser)) {
            continue;
        }
        executableFound = true;
        break;
    }
    assertTrue(executableFound);

    // The configured one offers neither the types nor the parser
    assertNotContained(peExeType, configuredDefault.getSupportedTypes(context));
    assertNotContained(elfType, configuredDefault.getSupportedTypes(context));
    for (Parser candidate : configuredDefault.getParsers().values()) {
        if (candidate instanceof ExecutableParser) {
            fail("Shouldn't have the Executable Parser from config");
        }
    }
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) MediaType(org.apache.tika.mime.MediaType) ExecutableParser(org.apache.tika.parser.executable.ExecutableParser) DefaultParser(org.apache.tika.parser.DefaultParser) Parser(org.apache.tika.parser.Parser) ExecutableParser(org.apache.tika.parser.executable.ExecutableParser) CompositeParser(org.apache.tika.parser.CompositeParser) XMLParser(org.apache.tika.parser.xml.XMLParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) Test(org.junit.Test)

Example 58 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class POIContainerExtractionTest method testPowerpointImages.

/**
 * Processes "pictures.ppt" and checks that both JPEG and PNG media types
 * were recorded by the tracking handler.
 */
@Test
public void testPowerpointImages() throws Exception {
    ContainerExtractor extractor = new ParserContainerExtractor();
    TrackingHandler handler = process("pictures.ppt", extractor, false);

    MediaType jpeg = new MediaType("image", "jpeg");
    MediaType png = new MediaType("image", "png");
    assertTrue(handler.mediaTypes.contains(jpeg));
    assertTrue(handler.mediaTypes.contains(png));
}
Also used : MediaType(org.apache.tika.mime.MediaType) ContainerExtractor(org.apache.tika.extractor.ContainerExtractor) ParserContainerExtractor(org.apache.tika.extractor.ParserContainerExtractor) ParserContainerExtractor(org.apache.tika.extractor.ParserContainerExtractor) Test(org.junit.Test)

Example 59 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class ExcelParserTest method testExcel95.

/**
 * Excel 5 and 95 are older formats, and only get basic support.
 *
 * <p>The test first runs detection on both sample files, then checks which
 * parser claims the detected type, and finally parses each file and checks
 * the extracted text and metadata.
 */
@Test
public void testExcel95() throws Exception {
    final String excel5Resource = "/test-documents/testEXCEL_5.xls";
    final String excel95Resource = "/test-documents/testEXCEL_95.xls";

    Detector detector = new DefaultDetector();
    AutoDetectParser parser = new AutoDetectParser();

    // Detection of Excel 5
    Metadata excel5Hints = new Metadata();
    excel5Hints.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
    try (InputStream input = ExcelParserTest.class.getResourceAsStream(excel5Resource)) {
        MediaType excel5Type = detector.detect(input, excel5Hints);
        assertEquals("application/vnd.ms-excel", excel5Type.toString());
    }

    // Detection of Excel 95; this type is reused for the parser checks below
    MediaType excel95Type;
    Metadata excel95Hints = new Metadata();
    excel95Hints.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
    try (InputStream input = ExcelParserTest.class.getResourceAsStream(excel95Resource)) {
        excel95Type = detector.detect(input, excel95Hints);
        assertEquals("application/vnd.ms-excel", excel95Type.toString());
    }

    // OfficeParser can handle it
    assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(excel95Type));
    // OOXMLParser won't handle it
    assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(excel95Type));

    // Parse the Excel 5 file
    Metadata excel5Metadata = new Metadata();
    try (InputStream input = ExcelParserTest.class.getResourceAsStream(excel5Resource)) {
        ContentHandler handler = new BodyContentHandler(-1);
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);
        parser.parse(input, handler, excel5Metadata, context);
        String text = handler.toString();

        // Sheet names
        assertContains("Feuil1", text);
        assertContains("Feuil3", text);
        // Text
        assertContains("Sample Excel", text);
        assertContains("Number", text);
        // Numbers
        assertContains("15", text);
        assertContains("225", text);
        // Metadata was also fetched
        assertEquals("Simple Excel document", excel5Metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", excel5Metadata.get(TikaCoreProperties.CREATOR));
    }

    // Parse the Excel 95 file
    Metadata excel95Metadata = new Metadata();
    try (InputStream input = ExcelParserTest.class.getResourceAsStream(excel95Resource)) {
        ContentHandler handler = new BodyContentHandler(-1);
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);
        parser.parse(input, handler, excel95Metadata, context);
        String text = handler.toString();

        // Sheet name
        assertContains("Foglio1", text);
        // Very boring file, no actual text or numbers!
        // Metadata was also fetched
        assertEquals(null, excel95Metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Marco Quaranta", excel95Metadata.get(Office.LAST_AUTHOR));
    }
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) OOXMLParser(org.apache.tika.parser.microsoft.ooxml.OOXMLParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 60 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class TikaCLI method displaySupportedTypes.

/**
 * Prints all the known media types, aliases and matching parser classes.
 *
 * <p>For every type in the parser's media type registry this writes the type
 * itself, one line per alias, the supertype (when there is one) and the class
 * name of the parser registered for the type (when there is one) to
 * {@code System.out}.
 */
private void displaySupportedTypes() {
    AutoDetectParser parser = new AutoDetectParser();
    MediaTypeRegistry registry = parser.getMediaTypeRegistry();
    Map<MediaType, Parser> parsers = parser.getParsers();
    for (MediaType type : registry.getTypes()) {
        System.out.println(type);
        for (MediaType alias : registry.getAliases(type)) {
            System.out.println("  alias:     " + alias);
        }
        MediaType supertype = registry.getSupertype(type);
        if (supertype != null) {
            System.out.println("  supertype: " + supertype);
        }
        Parser p = parsers.get(type);
        if (p != null) {
            if (p instanceof CompositeParser) {
                // Prefer the concrete parser inside the composite. The nested
                // lookup is a plain map get and may return null; in that case
                // keep reporting the composite rather than dereferencing null.
                Parser nested = ((CompositeParser) p).getParsers().get(type);
                if (nested != null) {
                    p = nested;
                }
            }
            System.out.println("  parser:    " + p.getClass().getName());
        }
    }
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) MediaTypeRegistry(org.apache.tika.mime.MediaTypeRegistry) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser) NetworkParser(org.apache.tika.parser.NetworkParser) ForkParser(org.apache.tika.fork.ForkParser)

Aggregations

MediaType (org.apache.tika.mime.MediaType): 95, Metadata (org.apache.tika.metadata.Metadata): 29, Test (org.junit.Test): 28, InputStream (java.io.InputStream): 26, IOException (java.io.IOException): 18, Parser (org.apache.tika.parser.Parser): 18, TikaInputStream (org.apache.tika.io.TikaInputStream): 17, ParseContext (org.apache.tika.parser.ParseContext): 17, TikaException (org.apache.tika.exception.TikaException): 14, AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 14, CompositeParser (org.apache.tika.parser.CompositeParser): 13, ContentHandler (org.xml.sax.ContentHandler): 13, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 12, Detector (org.apache.tika.detect.Detector): 11, TikaTest (org.apache.tika.TikaTest): 10, HashSet (java.util.HashSet): 8, ByteArrayInputStream (java.io.ByteArrayInputStream): 7, ArrayList (java.util.ArrayList): 7, TikaConfig (org.apache.tika.config.TikaConfig): 7, MediaTypeRegistry (org.apache.tika.mime.MediaTypeRegistry): 7