use of org.apache.tika.mime.MediaType in project tika by apache.
the class MP4Parser method parse.
/**
 * Parses an MP4 / QuickTime container, copying what it finds in the file type,
 * movie header, first track and iTunes user-data boxes into {@code metadata},
 * and emitting any UTF-8 iTunes items as {@code <p>} elements.
 * <p>
 * The content type defaults to {@code application/mp4}, is refined from the
 * major brand when a file type box is present, and falls back to
 * {@code video/quicktime} when it is not. If the file has no movie box the
 * method returns after setting the content type, without emitting any XHTML.
 *
 * @param stream   the document stream; it is spooled through a
 *                 {@link TikaInputStream} so the container is read from a file
 * @param handler  receives the XHTML SAX events
 * @param metadata populated with the extracted properties
 * @param context  the parse context (not consulted by this method)
 * @throws IOException   declared for the stream / file access performed here
 * @throws SAXException  declared for the SAX events sent to {@code handler}
 * @throws TikaException declared by the signature; not thrown directly in this body
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // The MP4Parser library accepts either a File, or a byte array
    // As MP4 video files are typically large, always use a file to
    // avoid OOMs that may occur with in-memory buffering
    TemporaryResources tmp = new TemporaryResources();
    TikaInputStream tstream = TikaInputStream.get(stream, tmp);
    try (DataSource dataSource = new DirectFileReadDataSource(tstream.getFile())) {
        // isoFile is closed by this try-with-resources, so it is not also
        // registered with tmp (that would close it a second time on dispose)
        try (IsoFile isoFile = new IsoFile(dataSource)) {
            // Grab the file type box
            FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
            if (fileType != null) {
                // Identify the type from the major brand, defaulting to application/mp4
                MediaType type = MediaType.application("mp4");
                for (Map.Entry<MediaType, List<String>> e : typesMap.entrySet()) {
                    if (e.getValue().contains(fileType.getMajorBrand())) {
                        type = e.getKey();
                        break;
                    }
                }
                metadata.set(Metadata.CONTENT_TYPE, type.toString());
                if (type.getType().equals("audio")) {
                    metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
                }
            } else {
                // Some older QuickTime files lack the FileType
                metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
            }

            // Get the main MOOV box
            MovieBox moov = getOrNull(isoFile, MovieBox.class);
            if (moov == null) {
                // Bail out: nothing further can be extracted without it
                return;
            }

            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();

            // Pull out some information from the header box
            MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
            if (mHeader != null) {
                // Get the creation and modification dates
                metadata.set(Metadata.CREATION_DATE, mHeader.getCreationTime());
                metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime());
                // Get the duration
                double durationSeconds = ((double) mHeader.getDuration()) / mHeader.getTimescale();
                metadata.set(XMPDM.DURATION, DURATION_FORMAT.format(durationSeconds));
                // The timescale is normally the sampling rate
                metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) mHeader.getTimescale());
            }

            // Get some more information from the track header
            // TODO Decide how to handle multiple tracks
            List<TrackBox> tb = moov.getBoxes(TrackBox.class);
            if (!tb.isEmpty()) {
                TrackBox track = tb.get(0);

                // A truncated or malformed track may lack its header box;
                // skip these properties instead of failing with an NPE
                TrackHeaderBox header = track.getTrackHeaderBox();
                if (header != null) {
                    // Get the creation and modification dates
                    metadata.set(TikaCoreProperties.CREATED, header.getCreationTime());
                    metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime());
                    // Get the video width and height
                    metadata.set(Metadata.IMAGE_WIDTH, (int) header.getWidth());
                    metadata.set(Metadata.IMAGE_LENGTH, (int) header.getHeight());
                }

                // Get the sample information; the sample table may likewise be absent
                SampleTableBox samples = track.getSampleTableBox();
                SampleDescriptionBox sampleDesc = (samples == null) ? null : samples.getSampleDescriptionBox();
                if (sampleDesc != null) {
                    // Look for the first Audio Sample, if present
                    AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
                    if (sample != null) {
                        XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
                        //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize()); // TODO Num -> Type mapping
                        metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) sample.getSampleRate());
                        //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
                        //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
                    }
                }
            }

            // Get metadata from the User Data Box
            UserDataBox userData = getOrNull(moov, UserDataBox.class);
            if (userData != null) {
                MetaBox meta = getOrNull(userData, MetaBox.class);
                // Check for iTunes Metadata
                // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
                // http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
                AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class);
                if (apple != null) {
                    // Title
                    AppleNameBox title = getOrNull(apple, AppleNameBox.class);
                    addMetadata(TikaCoreProperties.TITLE, metadata, title);
                    // Artist (recorded both as the generic creator and the XMPDM artist)
                    AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
                    addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
                    addMetadata(XMPDM.ARTIST, metadata, artist);
                    // Album Artist
                    AppleArtist2Box artist2 = getOrNull(apple, AppleArtist2Box.class);
                    addMetadata(XMPDM.ALBUM_ARTIST, metadata, artist2);
                    // Album
                    AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
                    addMetadata(XMPDM.ALBUM, metadata, album);
                    // Composer
                    AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
                    addMetadata(XMPDM.COMPOSER, metadata, composer);
                    // Genre
                    AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class);
                    addMetadata(XMPDM.GENRE, metadata, genre);
                    // Year
                    AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class);
                    if (year != null) {
                        metadata.set(XMPDM.RELEASE_DATE, year.getValue());
                    }
                    // Track number
                    AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
                    if (trackNum != null) {
                        metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA());
                        //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO
                    }
                    // Disc number
                    AppleDiskNumberBox discNum = getOrNull(apple, AppleDiskNumberBox.class);
                    if (discNum != null) {
                        metadata.set(XMPDM.DISC_NUMBER, discNum.getA());
                    }
                    // Compilation
                    AppleCompilationBox compilation = getOrNull(apple, AppleCompilationBox.class);
                    if (compilation != null) {
                        metadata.set(XMPDM.COMPILATION, (int) compilation.getValue());
                    }
                    // Comment
                    AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
                    addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
                    // Encoder
                    AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
                    if (encoder != null) {
                        metadata.set(XMP.CREATOR_TOOL, encoder.getValue());
                    }
                    // As text: every UTF-8 iTunes item becomes a paragraph
                    for (Box box : apple.getBoxes()) {
                        if (box instanceof Utf8AppleDataBox) {
                            xhtml.element("p", ((Utf8AppleDataBox) box).getValue());
                        }
                    }
                }
                // TODO Check for other kinds too
            }

            // All done
            xhtml.endDocument();
        }
    } finally {
        // Always release the temporary resources created for the spooled file
        tmp.dispose();
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class OOXMLExtractorFactory method parse.
/**
 * Opens (or reuses) the OPC package behind {@code stream}, picks the matching
 * POI text extractor, wraps it in the corresponding Tika decorator and runs
 * metadata and XHTML extraction.
 * <p>
 * When the package type cannot be detected, or is listed in
 * {@code OOXMLParser.UNSUPPORTED_OOXML_TYPES}, parsing is delegated to
 * {@link EmptyParser}.
 *
 * @param stream      the document stream, ideally a {@link TikaInputStream}
 * @param baseHandler receives the XHTML SAX events
 * @param metadata    populated with the detected content type and document metadata
 * @param context     supplies the {@link Locale} and the {@link OfficeParserConfig}
 * @throws IOException   declared for the package / stream access performed here
 * @throws SAXException  declared for the SAX events sent to {@code baseHandler}
 * @throws TikaException if no usable extractor is found, or when a POI /
 *                       XmlBeans failure is wrapped
 */
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final Locale locale = context.get(Locale.class, Locale.getDefault());
    ExtractorFactory.setThreadPrefersEventExtractors(true);
    try {
        // Step 1: find an already-open package, open one from the backing
        // file, or fall back to reading the (close-shielded) stream
        final OPCPackage pkg;
        final TikaInputStream tis = TikaInputStream.cast(stream);
        if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
            pkg = (OPCPackage) tis.getOpenContainer();
        } else if (tis != null && tis.hasFile()) {
            pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
            tis.setOpenContainer(pkg);
        } else {
            final InputStream unclosable = new CloseShieldInputStream(stream);
            pkg = OPCPackage.open(unclosable);
        }

        // Step 2: make sure this is a type we handle
        final MediaType detected = ZipContainerDetector.detectOfficeOpenXML(pkg);
        final boolean supported = detected != null && !OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(detected);
        if (!supported) {
            // Not a supported type, delegate to Empty Parser
            EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
            return;
        }
        metadata.set(Metadata.CONTENT_TYPE, detected.toString());

        // Step 3: choose the POI extractor. The config has already been set by
        // OOXMLParser's call to configure(), so it is relied on to be non-null.
        final OfficeParserConfig config = context.get(OfficeParserConfig.class);
        POIXMLTextExtractor textExtractor = config.getUseSAXDocxExtractor() ? trySXWPF(pkg) : null;
        if (textExtractor == null && config.getUseSAXPptxExtractor()) {
            textExtractor = trySXSLF(pkg);
        }
        if (textExtractor == null) {
            textExtractor = ExtractorFactory.createExtractor(pkg);
        }

        // Step 4: wrap it in the matching Tika decorator. Event-based
        // extractors are matched first; the rest are matched on their document.
        final POIXMLDocument document = textExtractor.getDocument();
        final OOXMLExtractor decorated;
        if (textExtractor instanceof XSSFBEventBasedExcelExtractor) {
            decorated = new XSSFBExcelExtractorDecorator(context, textExtractor, locale);
        } else if (textExtractor instanceof XSSFEventBasedExcelExtractor) {
            decorated = new XSSFExcelExtractorDecorator(context, textExtractor, locale);
        } else if (textExtractor instanceof XWPFEventBasedWordExtractor) {
            decorated = new SXWPFWordExtractorDecorator(metadata, context, (XWPFEventBasedWordExtractor) textExtractor);
            metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
        } else if (textExtractor instanceof XSLFEventBasedPowerPointExtractor) {
            decorated = new SXSLFPowerPointExtractorDecorator(metadata, context, (XSLFEventBasedPowerPointExtractor) textExtractor);
            metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
        } else if (document == null) {
            throw new TikaException("Expecting UserModel based POI OOXML extractor with a document, but none found. " + "The extractor returned was a " + textExtractor);
        } else if (document instanceof XMLSlideShow) {
            decorated = new XSLFPowerPointExtractorDecorator(context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) textExtractor);
        } else if (document instanceof XWPFDocument) {
            decorated = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) textExtractor);
        } else {
            decorated = new POIXMLTextExtractorDecorator(context, textExtractor);
        }

        // Step 5: metadata first, so that it is accessible during parsing
        // if desired by the client (see TIKA-1109), then the text itself
        decorated.getMetadataExtractor().extract(metadata);
        decorated.getXHTML(baseHandler, metadata, context);
    } catch (IllegalArgumentException e) {
        final String message = e.getMessage();
        if (message != null && message.startsWith("No supported documents found")) {
            throw new TikaException("TIKA-418: RuntimeException while getting content" + " for thmx and xps file types", e);
        }
        throw new TikaException("Error creating OOXML extractor", e);
    } catch (OpenXML4JException | XmlException e) {
        // InvalidFormatException is an OpenXML4JException, so it is covered here too
        throw new TikaException("Error creating OOXML extractor", e);
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class CompressorParser method parse.
/**
 * Decompresses {@code stream} and hands the decompressed content to the
 * embedded document extractor.
 * <p>
 * The compressed format's media type is recorded in {@code metadata} unless it
 * is {@code application/octet-stream}. The embedded entry's resource name is
 * derived from the outer resource name by stripping (or, for {@code .tbz} /
 * {@code .tbz2}, replacing) the compression suffix.
 *
 * @param stream   the compressed stream; it is never closed by this method
 * @param handler  receives the XHTML SAX events
 * @param metadata metadata of the compressed document
 * @param context  may supply {@link CompressorParserOptions} and the embedded extractor
 * @throws IOException   declared for the stream access performed here
 * @throws SAXException  declared for the SAX events sent to {@code handler}
 * @throws TikaException if the stream cannot be opened as a compressed stream
 *                       (a {@link TikaMemoryLimitException} when the cause is
 *                       a {@link MemoryLimitException})
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // The caller's stream should not be closed, and the compressor factory
    // needs mark support, so buffer the shielded stream when it lacks it
    stream = stream.markSupported()
            ? new CloseShieldInputStream(stream)
            : new BufferedInputStream(new CloseShieldInputStream(stream));

    CompressorInputStream cis;
    try {
        // Default options: do not decompress concatenated streams
        CompressorParserOptions fallbackOptions = new CompressorParserOptions() {
            public boolean decompressConcatenated(Metadata metadata) {
                return false;
            }
        };
        CompressorParserOptions options = context.get(CompressorParserOptions.class, fallbackOptions);
        boolean concatenated = options.decompressConcatenated(metadata);
        cis = new CompressorStreamFactory(concatenated, memoryLimitInKb).createCompressorInputStream(stream);
    } catch (CompressorException e) {
        // instanceof is false for a null cause, so no separate null check is needed
        if (e.getCause() instanceof MemoryLimitException) {
            throw new TikaMemoryLimitException(e.getMessage());
        }
        throw new TikaException("Unable to uncompress document stream", e);
    }

    MediaType type = getMediaType(cis);
    if (!MediaType.OCTET_STREAM.equals(type)) {
        metadata.set(CONTENT_TYPE, type.toString());
    }

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try {
        Metadata entrydata = new Metadata();
        String outerName = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (outerName != null) {
            // { suffix to strip, text to append in its place }, checked in this order
            String[][] suffixRules = {
                {".tbz", ".tar"},
                {".tbz2", ".tar"},
                {".bz", ""},
                {".bz2", ""},
                {".xz", ""},
                {".zlib", ""},
                {".pack", ""},
            };
            String innerName = null;
            for (String[] rule : suffixRules) {
                if (outerName.endsWith(rule[0])) {
                    innerName = outerName.substring(0, outerName.length() - rule[0].length()) + rule[1];
                    break;
                }
            }
            if (innerName == null) {
                // No known suffix: let GzipUtils work it out, unless the name is empty
                innerName = outerName.length() > 0 ? GzipUtils.getUncompressedFilename(outerName) : outerName;
            }
            entrydata.set(Metadata.RESOURCE_NAME_KEY, innerName);
        }

        // Use the delegate parser to parse the compressed document
        EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        if (extractor.shouldParseEmbedded(entrydata)) {
            extractor.parseEmbedded(cis, xhtml, entrydata, true);
        }
    } finally {
        cis.close();
    }
    xhtml.endDocument();
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class ZipContainerDetector method detectOPCBased.
/**
 * Opens the file behind {@code stream} as an OPC package and tries, in order,
 * the Office Open XML, XPS and AutoCAD detectors on it.
 * <p>
 * The opened package is stored on the stream as its open container. Detection
 * is best-effort: any I/O, format or runtime failure yields {@code null}.
 *
 * @param stream the stream whose backing file is inspected
 * @return the first media type a detector reports, or {@code null} when none
 *         matches or the package cannot be opened
 */
private static MediaType detectOPCBased(TikaInputStream stream) {
    try {
        // Use POI to open and investigate it for us
        OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
        stream.setOpenContainer(pkg);

        // Is it an OOXML format?
        MediaType detected = detectOfficeOpenXML(pkg);
        if (detected == null) {
            // Is it XPS format?
            detected = detectXPSOPC(pkg);
        }
        if (detected == null) {
            // Is it an AutoCAD format?
            detected = detectAutoCADOPC(pkg);
        }
        // Still null here means we don't know what it is
        return detected;
    } catch (IOException | RuntimeException | InvalidFormatException ignored) {
        // Deliberately swallowed: an unreadable package just means "not detected"
        return null;
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class TXTParser method parse.
/**
 * Parses plain text: detects the character encoding, records the content type
 * (with charset) and encoding in {@code metadata}, and emits the whole text
 * inside a single {@code <p>} element.
 * <p>
 * An incoming content type already present in {@code metadata} is kept as the
 * base type when it parses; otherwise {@code text/plain} is used.
 *
 * @param stream   the text stream; it is shielded so this method does not close it
 * @param handler  receives the XHTML SAX events
 * @param metadata read for the incoming content type, then updated
 * @param context  used to obtain the encoding detector
 * @throws IOException   declared for reading the stream
 * @throws SAXException  declared for the SAX events sent to {@code handler}
 * @throws TikaException declared by the signature; not thrown directly in this body
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Automatically detect the character encoding
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        // Prefer an already-detected content type; it could be a subclass
        // of text/plain such as vcal, etc.
        String declaredMime = metadata.get(Metadata.CONTENT_TYPE);
        MediaType parsedMime = (declaredMime == null) ? null : MediaType.parse(declaredMime);
        MediaType baseType = (parsedMime == null) ? MediaType.TEXT_PLAIN : parsedMime;

        Charset charset = reader.getCharset();
        metadata.set(Metadata.CONTENT_TYPE, new MediaType(baseType, charset).toString());
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");
        // Stream the text through in fixed-size chunks
        char[] chunk = new char[4096];
        for (int read = reader.read(chunk); read != -1; read = reader.read(chunk)) {
            xhtml.characters(chunk, 0, read);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
Aggregations