use of org.apache.tika.mime.MediaType in project tika by apache.
the class ExtractReader method generateListFromTextFile.
/**
 * Reads all text from {@code reader} and wraps it in a one-element metadata list.
 *
 * <p>The single {@link Metadata} entry carries the text under
 * {@code RecursiveParserWrapper.TIKA_CONTENT}, the original file name under
 * {@code Metadata.RESOURCE_NAME_KEY} and, when detection yields a non-null result,
 * the detected type under {@code Metadata.CONTENT_TYPE}.
 *
 * @param reader       source of the text; read in full via {@code IOUtils.toString}
 * @param fileSuffixes supplies {@code originalFileName}, used as the detection hint
 * @return a new list holding exactly one populated {@link Metadata}
 * @throws IOException if reading from {@code reader} fails
 */
private List<Metadata> generateListFromTextFile(Reader reader, FileSuffixes fileSuffixes) throws IOException {
    Metadata textMetadata = new Metadata();
    textMetadata.set(RecursiveParserWrapper.TIKA_CONTENT, IOUtils.toString(reader));

    // No stream is handed to the detector (null), so the only hint it gets is the
    // original file name. Its suffix may be wrong or absent, but it is the best
    // information available here.
    textMetadata.set(Metadata.RESOURCE_NAME_KEY, fileSuffixes.originalFileName);
    MediaType detectedType = tikaConfig.getMimeRepository().detect(null, textMetadata);
    if (detectedType != null) {
        textMetadata.set(Metadata.CONTENT_TYPE, detectedType.toString());
    }

    List<Metadata> result = new ArrayList<>();
    result.add(textMetadata);
    return result;
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class TikaParserConfigTest method testParserExcludeFromDefault.
/**
 * Checks that the DefaultParser obtained from the "TIKA-1558-blacklist.xml" config
 * does not offer the Executable Parser (nor its PE/ELF types), while a freshly
 * constructed DefaultParser does.
 */
@Test
public void testParserExcludeFromDefault() throws Exception {
    TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
    assertNotNull(config.getParser());
    assertNotNull(config.getDetector());

    CompositeParser configuredComposite = (CompositeParser) config.getParser();
    MediaType peExeType = MediaType.application("x-msdownload");
    MediaType elfType = MediaType.application("x-elf");

    // The config's DefaultParser is reached through the decorator registered for XML
    ParserDecorator decorated =
            (ParserDecorator) configuredComposite.getParsers().get(MediaType.APPLICATION_XML);
    assertNotNull(decorated);
    DefaultParser fromConfig = (DefaultParser) decorated.getWrappedParser();

    // Baseline: a DefaultParser built directly from the registry
    DefaultParser baseline = new DefaultParser(config.getMediaTypeRegistry());

    // The baseline advertises both executable types ...
    assertContains(peExeType, baseline.getSupportedTypes(context));
    assertContains(elfType, baseline.getSupportedTypes(context));

    // ... and has an ExecutableParser among its parsers
    boolean executableFound = false;
    for (Parser candidate : baseline.getParsers().values()) {
        executableFound = candidate instanceof ExecutableParser;
        if (executableFound) {
            break;
        }
    }
    assertTrue(executableFound);

    // The configured parser advertises neither type ...
    assertNotContained(peExeType, fromConfig.getSupportedTypes(context));
    assertNotContained(elfType, fromConfig.getSupportedTypes(context));

    // ... and must not hold an ExecutableParser at all
    for (Parser candidate : fromConfig.getParsers().values()) {
        if (candidate instanceof ExecutableParser) {
            fail("Shouldn't have the Executable Parser from config");
        }
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class POIContainerExtractionTest method testPowerpointImages.
/**
 * Processes "pictures.ppt" with a {@code ParserContainerExtractor} and checks that
 * both a JPEG and a PNG media type were recorded by the tracking handler.
 */
@Test
public void testPowerpointImages() throws Exception {
    ContainerExtractor containerExtractor = new ParserContainerExtractor();
    TrackingHandler tracked = process("pictures.ppt", containerExtractor, false);

    MediaType jpeg = new MediaType("image", "jpeg");
    MediaType png = new MediaType("image", "png");

    assertTrue(tracked.mediaTypes.contains(jpeg));
    assertTrue(tracked.mediaTypes.contains(png));
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class ExcelParserTest method testExcel95.
/**
 * Excel 5 and Excel 95 are legacy formats that only receive basic support.
 * This test covers type detection for both sample files, which parser claims
 * the detected type, and the text and metadata extracted from each file.
 */
@Test
public void testExcel95() throws Exception {
    Detector detector = new DefaultDetector();
    AutoDetectParser parser = new AutoDetectParser();

    // Detection: Excel 5
    Metadata excel5Hints = new Metadata();
    excel5Hints.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) {
        MediaType excel5Type = detector.detect(stream, excel5Hints);
        assertEquals("application/vnd.ms-excel", excel5Type.toString());
    }

    // Detection: Excel 95 (this result is reused for the parser checks below)
    Metadata excel95Hints = new Metadata();
    excel95Hints.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
    MediaType excel95Type;
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) {
        excel95Type = detector.detect(stream, excel95Hints);
        assertEquals("application/vnd.ms-excel", excel95Type.toString());
    }

    // The type is claimed by OfficeParser ...
    boolean officeSupports = (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(excel95Type);
    assertEquals(true, officeSupports);
    // ... but not by OOXMLParser
    boolean ooxmlSupports = (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(excel95Type);
    assertEquals(false, ooxmlSupports);

    // Parsing: Excel 5
    Metadata excel5Metadata = new Metadata();
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) {
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        ContentHandler bodyHandler = new BodyContentHandler(-1);
        parser.parse(stream, bodyHandler, excel5Metadata, parseContext);
        String text = bodyHandler.toString();

        // Sheet names
        assertContains("Feuil1", text);
        assertContains("Feuil3", text);
        // Cell text
        assertContains("Sample Excel", text);
        assertContains("Number", text);
        // Cell numbers
        assertContains("15", text);
        assertContains("225", text);
        // Document metadata is populated as well
        assertEquals("Simple Excel document", excel5Metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", excel5Metadata.get(TikaCoreProperties.CREATOR));
    }

    // Parsing: Excel 95
    Metadata excel95Metadata = new Metadata();
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) {
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        ContentHandler bodyHandler = new BodyContentHandler(-1);
        parser.parse(stream, bodyHandler, excel95Metadata, parseContext);
        String text = bodyHandler.toString();

        // Only the sheet name is checked: the file holds no real text or numbers
        assertContains("Foglio1", text);
        // Document metadata: no title, but a last author
        assertEquals(null, excel95Metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Marco Quaranta", excel95Metadata.get(Office.LAST_AUTHOR));
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class TikaCLI method displaySupportedTypes.
/**
 * Writes every media type known to the registry of a fresh {@code AutoDetectParser}
 * to standard output. Each type is followed by its aliases, its supertype (if any)
 * and the class name of the parser mapped to it (if any).
 */
private void displaySupportedTypes() {
    AutoDetectParser autoDetect = new AutoDetectParser();
    MediaTypeRegistry registry = autoDetect.getMediaTypeRegistry();
    Map<MediaType, Parser> parsersByType = autoDetect.getParsers();

    for (MediaType type : registry.getTypes()) {
        System.out.println(type);

        for (MediaType alias : registry.getAliases(type)) {
            System.out.println(" alias: " + alias);
        }

        MediaType parentType = registry.getSupertype(type);
        if (parentType != null) {
            System.out.println(" supertype: " + parentType);
        }

        Parser handler = parsersByType.get(type);
        if (handler == null) {
            // No parser mapped to this type: nothing more to print for it
            continue;
        }
        if (handler instanceof CompositeParser) {
            // Report the concrete parser behind the composite, not the composite itself.
            // NOTE(review): the nested lookup result is dereferenced below without a
            // null check; presumably a composite mapped to a type always resolves it - confirm.
            handler = ((CompositeParser) handler).getParsers().get(type);
        }
        System.out.println(" parser: " + handler.getClass().getName());
    }
}
Aggregations