use of org.apache.tika.detect.XmlRootExtractor in project tika by apache.
the class EncryptedPrescriptionDetector method detect.
/**
 * Detects whether the given stream holds an encrypted prescription document.
 * <p>
 * The stream is wrapped in a {@code LookaheadInputStream} with a limit of
 * 1024 bytes, decrypted with an RSA cipher initialised from
 * {@code Pharmacy.getKey()}, and the XML root element of the decrypted
 * content is extracted. The document is reported as
 * {@code application/x-prescription} only when that root element is
 * {@code prescription} in the {@code http://example.com/xpd} namespace.
 *
 * @param stream   document stream to inspect
 * @param metadata document metadata; not consulted by this detector
 * @return {@code application/x-prescription} for a recognised prescription,
 *         otherwise {@code MediaType.OCTET_STREAM}
 * @throws IOException if an I/O error propagates from the lookahead stream
 *                     or from root element extraction
 */
public MediaType detect(InputStream stream, Metadata metadata) throws IOException {
    final Key decryptionKey = Pharmacy.getKey();
    boolean isPrescription = false;

    try (InputStream peek = new LookaheadInputStream(stream, 1024)) {
        final Cipher rsa = Cipher.getInstance("RSA");
        rsa.init(Cipher.DECRYPT_MODE, decryptionKey);

        final InputStream plain = new CipherInputStream(peek, rsa);
        final QName root = new XmlRootExtractor().extractRootElement(plain);
        if (root != null) {
            isPrescription = "http://example.com/xpd".equals(root.getNamespaceURI())
                    && "prescription".equals(root.getLocalPart());
        }
    } catch (GeneralSecurityException ignored) {
        // unable to decrypt, fall through and report the generic type
    }

    if (isPrescription) {
        return MediaType.application("x-prescription");
    }
    return MediaType.OCTET_STREAM;
}
use of org.apache.tika.detect.XmlRootExtractor in project tika by apache.
the class MimeTypes method getMimeType.
/**
 * Finds the MIME type(s) that best match the given first few bytes of a
 * document stream.
 * <p>
 * Magic-byte matchers are tried first. Every magic that matches adds its
 * type to the result; the scan stops once a match with a positive priority
 * has been recorded and a magic with a lower priority is reached, so more
 * than one type is returned when several match at the same priority.
 * <p>
 * Matches named application/xml or text/html are then refined: the XML
 * root element is extracted from the data and the match is replaced by the
 * first XML-aware type that accepts that root element. An application/xml
 * match for which no root element can be extracted is downgraded to the
 * plain text type.
 * <p>
 * If no magic matches, the data is handed to a {@link TextDetector} and the
 * detected type is returned as a single-element list. Should that step
 * fail, the root type list (application/octet-stream) is returned.
 * <p>
 * The given byte array is expected to be at least {@link #getMinLength()}
 * long, or shorter only if the document stream itself is shorter.
 *
 * @param data first few bytes of a document stream
 * @return the matching MIME types; the root type list when the data is
 *         empty or nothing better can be determined
 * @throws IllegalArgumentException if {@code data} is {@code null}
 */
List<MimeType> getMimeType(byte[] data) {
    if (data == null) {
        throw new IllegalArgumentException("Data is missing");
    }
    if (data.length == 0) {
        // See https://issues.apache.org/jira/browse/TIKA-483
        return rootMimeTypeL;
    }

    // Step 1: collect every type whose magic bytes match the data
    List<MimeType> matches = new ArrayList<MimeType>(1);
    int bestPriority = -1;
    for (Magic candidate : magics) {
        boolean belowBest = bestPriority > 0 && candidate.getPriority() < bestPriority;
        if (belowBest) {
            break;
        }
        if (candidate.eval(data)) {
            matches.add(candidate.getType());
            bestPriority = candidate.getPriority();
        }
    }

    // Step 2: nothing matched, so assume plain text if no control bytes are found
    if (matches.isEmpty()) {
        try {
            TextDetector textDetector = new TextDetector(getMinLength());
            ByteArrayInputStream bytes = new ByteArrayInputStream(data);
            String detectedName = textDetector.detect(bytes, new Metadata()).toString();
            MimeType textType = forName(detectedName);
            return Collections.singletonList(textType);
        } catch (Exception e) {
            return rootMimeTypeL;
        }
    }

    // Step 3: refine XML/HTML matches using the document's root element
    for (int index = 0; index < matches.size(); index++) {
        String matchedName = matches.get(index).getName();
        boolean isXml = "application/xml".equals(matchedName);
        if (!isXml && !"text/html".equals(matchedName)) {
            continue;
        }

        QName root = new XmlRootExtractor().extractRootElement(data);
        if (root == null) {
            if (isXml) {
                // Downgrade from application/xml to text/plain since
                // the document seems not to be well-formed.
                matches.set(index, textMimeType);
            }
            continue;
        }

        String namespace = root.getNamespaceURI();
        String localName = root.getLocalPart();
        for (MimeType xmlType : xmls) {
            if (xmlType.matchesXML(namespace, localName)) {
                matches.set(index, xmlType);
                break;
            }
        }
    }
    return matches;
}
Aggregations