use of org.apache.tika.detect.Detector in project tika by apache.
the class EmbeddedDocumentUtil method getExtension.
/**
 * Works out a file extension for the supplied stream.
 *
 * <p>The content type already recorded in {@code metadata} is tried first. If it is
 * absent or cannot be resolved by the configured mime repository, the configured
 * detector is run on the stream; a successfully detected type is written back to
 * {@code metadata} under {@link Metadata#CONTENT_TYPE}.
 *
 * @param is       stream to run detection on when the metadata gives no usable type
 * @param metadata metadata consulted for a content type and updated after detection
 * @return the extension reported by the resolved mime type, or {@code ".bin"} when
 *         no mime type could be resolved
 */
public String getExtension(TikaInputStream is, Metadata metadata) {
    final String declaredType = metadata.get(Metadata.CONTENT_TYPE);
    final TikaConfig tikaConfig = getConfig();
    final MimeTypes registry = tikaConfig.getMimeRepository();

    // First choice: trust the content type that is already in the metadata.
    MimeType declared = null;
    if (declaredType != null) {
        try {
            declared = registry.forName(declaredType);
        } catch (MimeTypeException ignored) {
            // Unresolvable declared type: fall through to detection.
        }
    }
    if (declared != null) {
        return declared.getExtension();
    }

    // Second choice: run detection on the stream itself.
    final Detector detector = tikaConfig.getDetector();
    MimeType sniffed = null;
    try {
        final MediaType mediaType = detector.detect(is, metadata);
        sniffed = registry.forName(mediaType.toString());
        is.reset();
    } catch (IOException | MimeTypeException ignored) {
        // Best effort: whatever was resolved before the failure (possibly nothing) is used.
    }
    if (sniffed == null) {
        return ".bin";
    }

    // Set or correct the mime type now that detection has succeeded.
    metadata.set(Metadata.CONTENT_TYPE, sniffed.toString());
    return sniffed.getExtension();
}
use of org.apache.tika.detect.Detector in project tika by apache.
the class MyFirstTika method parseUsingComponents.
/**
 * Walks through the individual Tika components (mime registry, detector, language
 * detector, parser) for one file, printing what each step finds, and returns the
 * text produced by the parser.
 *
 * <p>Every stream opened here is closed before the method returns, including when
 * detection or parsing throws.
 *
 * @param filename   path of the file to examine
 * @param tikaConfig configuration supplying the mime registry, detector and parser
 * @param metadata   metadata to populate; the resource name and content type are set here
 * @return the string form of the body content handler after parsing
 * @throws Exception if the file cannot be read, detected or parsed
 */
public static String parseUsingComponents(String filename, TikaConfig tikaConfig, Metadata metadata) throws Exception {
    MimeTypes mimeRegistry = tikaConfig.getMimeRepository();
    System.out.println("Examining: [" + filename + "]");
    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    System.out.println("The MIME type (based on filename) is: [" + mimeRegistry.detect(null, metadata) + "]");

    // A dedicated stream for the registry-based detection, closed as soon as it is done.
    try (InputStream magicStream = TikaInputStream.get(new File(filename))) {
        System.out.println("The MIME type (based on MAGIC) is: [" + mimeRegistry.detect(magicStream, metadata) + "]");
    }

    // A fresh stream shared by the Detector call and the parser.
    try (InputStream stream = TikaInputStream.get(new File(filename))) {
        Detector detector = tikaConfig.getDetector();
        System.out.println("The MIME type (based on the Detector interface) is: [" + detector.detect(stream, metadata) + "]");

        LanguageDetector langDetector = new OptimaizeLangDetector().loadModels();
        LanguageResult lang = langDetector.detect(FileUtils.readFileToString(new File(filename), UTF_8));
        System.out.println("The language of this content is: [" + lang.getLanguage() + "]");

        // Get a non-detecting parser that handles all the types it can
        Parser parser = tikaConfig.getParser();
        // Tell it what we think the content is
        MediaType type = detector.detect(stream, metadata);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        // Have the file parsed to get the content and metadata
        ContentHandler handler = new BodyContentHandler();
        parser.parse(stream, handler, metadata, new ParseContext());
        return handler.toString();
    }
}
use of org.apache.tika.detect.Detector in project tutorials by eugenp.
the class TikaAnalysis method detectDocTypeUsingDetector.
/**
 * Detects the media type of a stream with a {@link DefaultDetector} and empty metadata.
 *
 * @param stream the content to inspect
 * @return the string form of the detected media type
 * @throws IOException if the detector fails to read the stream
 */
public static String detectDocTypeUsingDetector(InputStream stream) throws IOException {
    final Detector defaultDetector = new DefaultDetector();
    // No hints are supplied, so only the stream is available to the detector.
    return defaultDetector.detect(stream, new Metadata()).toString();
}
use of org.apache.tika.detect.Detector in project uPortal by Jasig.
the class JaxbPortalDataHandlerService method getMediaType.
/**
 * Detects the media type of the buffered stream, using the file name as a hint.
 *
 * <p>The buffered stream is reset in every case before this method returns, so the
 * caller can read it again from its mark (presumably set by the caller beforehand;
 * verify at the call site).
 *
 * @param inputStream the content to inspect; wrapped so the Tika stream can be closed
 *                    independently of it
 * @param fileName    resource name passed to the detector via the metadata
 * @return the detected media type, or {@code null} when detection fails with an
 *         {@link IOException}
 * @throws IOException if resetting {@code inputStream} fails
 */
private MediaType getMediaType(BufferedInputStream inputStream, String fileName) throws IOException {
    final TikaInputStream detectionStream = TikaInputStream.get(new CloseShieldInputStream(inputStream));
    try {
        final Detector mediaTypeDetector = new DefaultDetector();
        final Metadata hints = new Metadata();
        hints.set(Metadata.RESOURCE_NAME_KEY, fileName);

        final MediaType detectedType = mediaTypeDetector.detect(detectionStream, hints);
        logger.debug("Determined '{}' for '{}'", detectedType, fileName);
        return detectedType;
    } catch (IOException e) {
        logger.warn("Failed to determine media type for '" + fileName + "' assuming XML", e);
        return null;
    } finally {
        IOUtils.closeQuietly(detectionStream);
        // Rewind the buffered stream to undo whatever the detector consumed
        inputStream.reset();
    }
}
use of org.apache.tika.detect.Detector in project ddf by codice.
the class MimeTypeMapperImpl method guessMimeType.
/**
 * Guesses a mime type for the given content and file extension.
 *
 * <p>When no extension is supplied (null or empty), the stream is copied to a
 * temporary buffer and a {@link DefaultDetector} is used to derive an extension
 * from the detected media type. The sorted resolvers are then asked in turn for a
 * mime type for that extension; for the XML extension a resolver is only consulted
 * when it has a schema equal to the document's root namespace.
 *
 * @param is            the content; read when no extension is given or when the
 *                      extension is the XML extension
 * @param fileExtension the file extension, may be null or empty
 * @return the first non-empty mime type returned by a resolver, or {@code null} if
 *         none matched
 * @throws MimeTypeResolutionException if a resolver throws while being queried
 */
@Override
public String guessMimeType(InputStream is, String fileExtension) throws MimeTypeResolutionException {
    LOGGER.trace(ENTERING_STR, "guessMimeType()");
    String mimeType = null;
    LOGGER.debug("Looping through{} MimeTypeResolvers", mimeTypeResolvers.size());
    // This is to force the TikaMimeTypeResolver to be called
    // after the CustomMimeTypeResolvers to prevent Tika default mapping
    // from being used when a CustomMimeTypeResolver may be more appropriate.
    List<MimeTypeResolver> sortedResolvers = sortResolvers(mimeTypeResolvers);
    if (StringUtils.isEmpty(fileExtension)) {
        try (TemporaryFileBackedOutputStream tfbos = new TemporaryFileBackedOutputStream()) {
            IOUtils.copy(is, tfbos);
            try (InputStream inputStream = tfbos.asByteSource().openStream()) {
                Detector detector = new DefaultDetector();
                MediaType mediaType = detector.detect(inputStream, new Metadata());
                fileExtension = getFileExtensionForMimeType(mediaType.toString()).replace(".", "");
            } finally {
                // The original stream has been consumed by the copy; continue with a fresh view.
                // NOTE(review): this stream is opened from tfbos, which is closed when the
                // enclosing try exits - confirm it stays readable afterwards.
                is = tfbos.asByteSource().openStream();
            }
        } catch (Exception e) {
            LOGGER.debug("Failed to guess mimeType for file without extension.", e);
        }
    }
    // Detection may have failed and left a null extension; treat it like an empty one
    // so the comparisons below cannot throw a NullPointerException.
    if (fileExtension == null) {
        fileExtension = "";
    }
    final boolean xmlExtension = fileExtension.equals(XML_FILE_EXTENSION);
    // If file has XML extension, then read root element namespace once so
    // each MimeTypeResolver does not have to open the stream and read the namespace
    String namespace = null;
    if (xmlExtension) {
        try {
            namespace = XML_UTILS.getRootNamespace(IOUtils.toString(is, java.nio.charset.StandardCharsets.UTF_8));
        } catch (IOException ioe) {
            LOGGER.debug("Could not read namespace from input stream.", ioe);
        }
        LOGGER.debug("namespace = {}", namespace);
    }
    // Once a mime type is found for the given file extension, exit the loop.
    for (MimeTypeResolver resolver : sortedResolvers) {
        LOGGER.debug(CALLING_RESOLVER_MSG, resolver.getName());
        try {
            // For XML, only a resolver whose schema equals the document's root
            // namespace may supply the mime type; other resolvers are skipped.
            if (xmlExtension) {
                if (namespace != null && resolver.hasSchema() && namespace.equals(resolver.getSchema())) {
                    mimeType = resolver.getMimeTypeForFileExtension(fileExtension);
                }
            } else {
                mimeType = resolver.getMimeTypeForFileExtension(fileExtension);
            }
        } catch (Exception e) {
            LOGGER.debug("Error resolving mime type for file extension: {}", fileExtension);
            throw new MimeTypeResolutionException(e);
        }
        if (StringUtils.isNotEmpty(mimeType)) {
            LOGGER.debug("mimeType [{}] retrieved from MimeTypeResolver: {}", mimeType, resolver.getName());
            break;
        }
    }
    LOGGER.debug(MIME_TYPE_FILE_EXT_MSG, mimeType, fileExtension);
    LOGGER.trace(EXITING_STR, "guessMimeType()");
    return mimeType;
}
Aggregations