use of org.apache.tika.mime.MediaType in project tika by apache.
the class AutoDetectParser method parse.
/**
 * Detects the media type of the given stream, records it in the metadata
 * under {@link Metadata#CONTENT_TYPE}, and then delegates the actual parsing
 * to the superclass.
 *
 * @param stream   the document stream to parse
 * @param handler  receiver of the parse events; may be {@code null}, in which
 *                 case no {@link SecureContentHandler} wrapper is created
 * @param metadata document metadata; updated with the detected content type
 * @param context  parse context; receives a default {@link Parser} and
 *                 {@link EmbeddedDocumentExtractor} if the caller set no extractor
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the delegated parse reports a SAX failure
 * @throws TikaException if the document cannot be parsed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);

        // Automatically detect the MIME type of the document
        MediaType type = detector.detect(tis, metadata);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());

        // TIKA-216: Zip bomb prevention
        SecureContentHandler sch = handler != null ? new SecureContentHandler(handler, tis) : null;

        // Pass self to handle embedded documents if
        // the caller hasn't specified one.
        if (context.get(EmbeddedDocumentExtractor.class) == null) {
            Parser p = context.get(Parser.class);
            if (p == null) {
                context.set(Parser.class, this);
            }
            context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context));
        }

        try {
            // Parse the document
            super.parse(tis, sch, metadata, context);
        } catch (SAXException e) {
            // Convert zip bomb exceptions to TikaExceptions.
            // sch is null when the caller passed no handler; guard the call so
            // the original SAXException is rethrown instead of being masked
            // by a NullPointerException.
            if (sch != null) {
                sch.throwIfCauseOf(e);
            }
            throw e;
        }
    } finally {
        // Release any temporary resources registered with tmp
        tmp.dispose();
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class CompositeParser method findDuplicateParsers.
/**
 * Scans every component parser and reports the media types that are
 * claimed by more than one of them. Handy when hunting for parser
 * definitions that conflict with each other.
 * <p>
 * Types are compared after normalization through the media type registry.
 * For each reported type the list starts with the first parser that
 * declared it, followed by every later parser that declared it too.
 *
 * @since Apache Tika 0.10
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-660">TIKA-660</a>
 * @param context parsing context
 * @return media types that are supported by at least two component parsers
 */
public Map<MediaType, List<Parser>> findDuplicateParsers(ParseContext context) {
    // First parser seen for each normalized type
    Map<MediaType, Parser> firstOwner = new HashMap<MediaType, Parser>();
    // Normalized types claimed more than once, with all their claimants
    Map<MediaType, List<Parser>> conflicts = new HashMap<MediaType, List<Parser>>();

    for (Parser candidate : parsers) {
        for (MediaType declared : candidate.getSupportedTypes(context)) {
            MediaType key = registry.normalize(declared);

            Parser original = firstOwner.get(key);
            if (original == null) {
                // Nobody has claimed this type yet
                firstOwner.put(key, candidate);
                continue;
            }

            List<Parser> claimants = conflicts.get(key);
            if (claimants == null) {
                // First conflict for this type: seed the list with the original owner
                claimants = new ArrayList<Parser>();
                claimants.add(original);
                conflicts.put(key, claimants);
            }
            claimants.add(candidate);
        }
    }
    return conflicts;
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class ExternalParsersConfigReader method readMimeTypes.
/**
 * Collects the media types listed under the given element.
 * Only direct child elements whose tag name equals {@code MIMETYPE_TAG}
 * are considered; the string value of each one is parsed into a
 * {@link MediaType}.
 *
 * @param mimeTypes the element whose children list the media types
 * @return the set of parsed media types (empty if there are no matching children)
 */
private static Set<MediaType> readMimeTypes(Element mimeTypes) {
    Set<MediaType> result = new HashSet<MediaType>();
    NodeList kids = mimeTypes.getChildNodes();
    for (int idx = 0; idx < kids.getLength(); idx++) {
        Node current = kids.item(idx);
        // Skip text, comments and anything else that is not an element
        if (current.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        Element entry = (Element) current;
        if (!entry.getTagName().equals(MIMETYPE_TAG)) {
            continue;
        }
        result.add(MediaType.parse(getString(entry)));
    }
    return result;
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class ExternalParsersFactory method attachExternalParsers.
/**
 * Intended to attach the given external parsers to the parser of the
 * supplied configuration. The attachment itself is not implemented yet:
 * at present the method only looks up the parser map when the configured
 * parser is a {@link CompositeParser}, and does nothing with it.
 *
 * @param parsers the external parsers to attach (currently unused)
 * @param config  the configuration whose parser should receive them
 */
public static void attachExternalParsers(List<ExternalParser> parsers, TikaConfig config) {
    Parser configured = config.getParser();
    if (!(configured instanceof CompositeParser)) {
        return;
    }
    CompositeParser composite = (CompositeParser) configured;
    // Fetched but not yet used; the remaining work is still to be done.
    Map<MediaType, Parser> byType = composite.getParsers();
    // TODO
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class ZipContainerDetector method detectZipFormat.
/**
 * Works out which zip-based format the given stream holds.
 * The OPC check runs first; if it yields nothing, the stream's file is
 * opened as a zip and probed in order for OpenDocument, iWork '13, iWork,
 * JAR, KMZ and IPA. Any {@link IOException} along the way is swallowed.
 *
 * @param tis the stream to inspect
 * @return the detected type, or {@code MediaType.APPLICATION_ZIP} when no
 *         specific format is recognised or an I/O error occurs
 */
private static MediaType detectZipFormat(TikaInputStream tis) {
    MediaType detected = null;
    try {
        // Try OPC first because opening a package will not
        // necessarily throw an exception for truncated files.
        detected = detectOPCBased(tis);
        if (detected == null) {
            // TODO: hasFile()?
            ZipFile archive = new ZipFile(tis.getFile());
            try {
                detected = detectOpenDocument(archive);
                if (detected == null) {
                    detected = detectIWork13(archive);
                }
                if (detected == null) {
                    detected = detectIWork(archive);
                }
                if (detected == null) {
                    detected = detectJar(archive);
                }
                if (detected == null) {
                    detected = detectKmz(archive);
                }
                if (detected == null) {
                    detected = detectIpa(archive);
                }
            } finally {
                // Close quietly: a failure here must not discard a detected type
                try {
                    archive.close();
                } catch (IOException closeFailure) {
                    // ignore
                }
            }
        }
    } catch (IOException e) {
        // ignore
    }
    if (detected != null) {
        return detected;
    }
    // Fallback: it's still a zip file, we just don't know what kind of one
    return MediaType.APPLICATION_ZIP;
}
Aggregations