use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class RarParser method parse.
/**
 * Parses a RAR archive and hands each non-directory entry to the embedded
 * document extractor obtained from the parse context.
 *
 * <p>The input is wrapped in a {@link TikaInputStream} bound to a local
 * {@link TemporaryResources} so that a {@code File} can be handed to the
 * {@code Archive} constructor. The archive is closed <em>inside</em> the
 * try-with-resources body: a try-with-resources statement closes its resources
 * before any attached {@code finally} block runs, so closing the archive in an
 * outer {@code finally} would leave it open on the file while {@code tmp}
 * (presumably the owner of any spooled temp file) is being cleaned up.
 *
 * @param stream   the RAR document to read
 * @param handler  SAX handler receiving the XHTML output
 * @param metadata document metadata, passed to the XHTML handler
 * @param context  parse context used to look up the embedded document extractor
 * @throws IOException   if reading the stream or closing the archive fails
 * @throws SAXException  if the content handler rejects an event
 * @throws TikaException if the archive is encrypted ({@code EncryptedDocumentException})
 *                       or the RAR library reports a {@code RarException}
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    try (TemporaryResources tmp = new TemporaryResources()) {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        Archive rar = new Archive(tis.getFile());
        try {
            if (rar.isEncrypted()) {
                throw new EncryptedDocumentException();
            }
            //Without this BodyContentHandler does not work
            xhtml.element("div", " ");
            FileHeader header = rar.nextFileHeader();
            // Stop early if the parsing thread has been interrupted.
            while (header != null && !Thread.currentThread().isInterrupted()) {
                if (!header.isDirectory()) {
                    try (InputStream subFile = rar.getInputStream(header)) {
                        // Fall back to the plain file name when the wide-character name is empty.
                        String entryName = "".equals(header.getFileNameW()) ? header.getFileNameString() : header.getFileNameW();
                        Metadata entrydata = PackageParser.handleEntryMetadata(entryName, header.getCTime(), header.getMTime(), header.getFullUnpackSize(), xhtml);
                        if (extractor.shouldParseEmbedded(entrydata)) {
                            extractor.parseEmbedded(subFile, handler, entrydata, true);
                        }
                    }
                }
                header = rar.nextFileHeader();
            }
        } finally {
            // Release the archive before tmp is closed by the enclosing try-with-resources.
            rar.close();
        }
    } catch (RarException e) {
        throw new TikaException("RarParser Exception", e);
    }
    xhtml.endDocument();
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class ParsingEmbeddedDocumentExtractor method parseEmbedded.
/**
 * Parses one embedded document with the delegating parser and forwards its
 * body content to {@code handler}.
 *
 * <p>When {@code outputHtml} is set, the content is wrapped in a
 * {@code <div class="package-entry">} and, if the metadata carries a non-empty
 * resource name, preceded by an {@code <h1>} holding that name.
 * An {@code EncryptedDocumentException} or any other {@code TikaException}
 * raised by the delegate is deliberately ignored: the entry's content is
 * skipped and the wrapper element is still closed.
 *
 * @param stream     the embedded document; shielded so the delegate cannot close it
 * @param handler    SAX handler receiving the output
 * @param metadata   metadata of the embedded document
 * @param outputHtml whether to emit the wrapping div and heading
 * @throws SAXException if the handler rejects an event
 * @throws IOException  if reading the stream fails
 */
public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
    if (outputHtml) {
        AttributesImpl entryAttributes = new AttributesImpl();
        entryAttributes.addAttribute("", "class", "class", "CDATA", "package-entry");
        handler.startElement(XHTML, "div", "div", entryAttributes);
    }

    String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
    boolean hasResourceName = resourceName != null && !resourceName.isEmpty();
    if (outputHtml && hasResourceName) {
        handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
        handler.characters(resourceName.toCharArray(), 0, resourceName.length());
        handler.endElement(XHTML, "h1", "h1");
    }

    // Use the delegate parser to parse this entry
    try (TemporaryResources tmp = new TemporaryResources()) {
        final TikaInputStream embedded = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
        // Carry over an already opened container so the delegate can reuse it.
        if (stream instanceof TikaInputStream) {
            final Object openContainer = ((TikaInputStream) stream).getOpenContainer();
            if (openContainer != null) {
                embedded.setOpenContainer(openContainer);
            }
        }
        final EmbeddedContentHandler bodyOnly = new EmbeddedContentHandler(new BodyContentHandler(handler));
        DELEGATING_PARSER.parse(embedded, bodyOnly, metadata, context);
    } catch (EncryptedDocumentException ede) {
        // TODO: can we log a warning that we lack the password?
        // For now, just skip the content
    } catch (TikaException e) {
        // TODO: can we log a warning somehow?
        // Could not parse the entry, just skip the content
    }

    if (outputHtml) {
        handler.endElement(XHTML, "div", "div");
    }
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class WebPParser method parse.
/**
 * Extracts WebP image metadata and emits an empty XHTML document.
 *
 * <p>The stream is wrapped in a {@link TikaInputStream} so that a {@code File}
 * can be passed to {@code ImageMetadataExtractor.parseWebP}; the local
 * {@link TemporaryResources} is disposed once extraction finishes, whether or
 * not it succeeded.
 *
 * @param stream   the WebP image to read
 * @param handler  SAX handler receiving the (empty) XHTML document
 * @param metadata metadata object populated by the extractor
 * @param context  parse context (not used by this method)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    TemporaryResources resources = new TemporaryResources();
    try {
        TikaInputStream fileBacked = TikaInputStream.get(stream, resources);
        ImageMetadataExtractor extractor = new ImageMetadataExtractor(metadata);
        extractor.parseWebP(fileBacked.getFile());
    } finally {
        resources.dispose();
    }

    // No textual content: just open and close the document.
    XHTMLContentHandler emptyDocument = new XHTMLContentHandler(handler, metadata);
    emptyDocument.startDocument();
    emptyDocument.endDocument();
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class MatParser method parse.
/**
 * Parses a Matlab .mat file: records header fields as metadata and writes one
 * paragraph per variable (plus a list of fields for structures) as XHTML.
 *
 * <p>The header description is split on commas and read positionally
 * (file type, platform, creation date). Each part is only consulted when it
 * actually exists and contains its label, so a short or unusual description
 * no longer triggers an index-out-of-bounds failure; the corresponding
 * metadata entry is simply left unset.
 *
 * @param stream   the .mat document to read
 * @param handler  SAX handler receiving the XHTML output
 * @param metadata metadata object to populate
 * @param context  parse context (not used by this method)
 * @throws SAXException  if the content handler rejects an event
 * @throws TikaException wrapping any {@code IOException} raised while parsing
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    //Set MIME type as Matlab
    metadata.set(Metadata.CONTENT_TYPE, MATLAB_MIME_TYPE);
    // Only create (and later dispose) temporary resources when the caller did not
    // already supply a TikaInputStream.
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
    try {
        // Use TIS so we can spool a temp file for parsing.
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        //Extract information from header file
        //input .mat file
        MatFileReader mfr = new MatFileReader(tis.getFile());
        //.mat header information
        MatFileHeader hdr = mfr.getMatFileHeader();
        // Example header: "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar 2 23:41:57 2014"
        // Break header information into its parts
        String[] parts = hdr.getDescription().split(",");
        String dateCreated = headerFieldValue(parts, 2, "Created on:");
        if (dateCreated != null) {
            metadata.set("createdOn", dateCreated);
        }
        String platform = headerFieldValue(parts, 1, "Platform:");
        if (platform != null) {
            metadata.set("platform", platform);
        }
        // split() can return an empty array (e.g. for ","), so check the length first.
        if (parts.length > 0 && parts[0].contains("MATLAB")) {
            metadata.set("fileType", parts[0]);
        }
        // Get endian indicator from header file
        // Retrieve endian bytes and convert to string
        String endianCode = new String(hdr.getEndianIndicator(), UTF_8);
        metadata.set("endian", endianCode);
        //Text output
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.newline();
        //Loop through each variable
        for (Map.Entry<String, MLArray> entry : mfr.getContent().entrySet()) {
            String varName = entry.getKey();
            MLArray varData = entry.getValue();
            xhtml.element("p", varName + ":" + String.valueOf(varData));
            // If the variable is a structure, extract variable info from structure
            if (varData.isStruct()) {
                MLStructure mlStructure = (MLStructure) mfr.getMLArray(varName);
                xhtml.startElement("ul");
                xhtml.newline();
                for (MLArray element : mlStructure.getAllFields()) {
                    xhtml.startElement("li");
                    xhtml.characters(String.valueOf(element));
                    // If there is an embedded structure, extract variable info.
                    if (element.isStruct()) {
                        xhtml.startElement("ul");
                        // Should this actually be a recursive call?
                        xhtml.element("li", element.contentToString());
                        xhtml.endElement("ul");
                    }
                    xhtml.endElement("li");
                }
                xhtml.endElement("ul");
            }
        }
        xhtml.endDocument();
    } catch (IOException e) {
        throw new TikaException("Error parsing Matlab file with MatParser", e);
    } finally {
        if (tmp != null) {
            tmp.dispose();
        }
    }
}

/**
 * Returns the trimmed text following the last occurrence of {@code label} in
 * {@code parts[index]}, or {@code null} when that part does not exist or does
 * not contain the label.
 */
private static String headerFieldValue(String[] parts, int index, String label) {
    if (index >= parts.length) {
        return null;
    }
    String part = parts[index];
    int labelStart = part.lastIndexOf(label);
    if (labelStart < 0) {
        return null;
    }
    return part.substring(labelStart + label.length()).trim();
}
use of org.apache.tika.io.TemporaryResources in project tika by apache.
the class MP4Parser method parse.
/**
 * Parses an MP4 / QuickTime container: sets the content type from the file
 * type box, copies movie, track and iTunes (Apple item list) information into
 * {@code metadata}, and writes the UTF-8 Apple data boxes as XHTML paragraphs.
 *
 * <p>If the file has no movie box the method returns after setting the content
 * type, without emitting any XHTML events. Track header and sample table boxes
 * are optional in practice, so both are checked for {@code null} before use.
 * The {@code IsoFile} and its data source are closed by the try-with-resources
 * statement; temporary resources are disposed afterwards in {@code finally}.
 *
 * @param stream   the MP4 document to read
 * @param handler  SAX handler receiving the XHTML output
 * @param metadata metadata object to populate
 * @param context  parse context (not used by this method)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // The MP4Parser library accepts either a File, or a byte array
    // As MP4 video files are typically large, always use a file to
    // avoid OOMs that may occur with in-memory buffering
    TemporaryResources tmp = new TemporaryResources();
    TikaInputStream tstream = TikaInputStream.get(stream, tmp);
    // isoFile is closed before dataSource; it is not additionally registered with
    // tmp, which would only close it a second time.
    try (DataSource dataSource = new DirectFileReadDataSource(tstream.getFile());
            IsoFile isoFile = new IsoFile(dataSource)) {
        // Grab the file type box
        FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
        if (fileType != null) {
            // Identify the type
            MediaType type = MediaType.application("mp4");
            for (Map.Entry<MediaType, List<String>> e : typesMap.entrySet()) {
                if (e.getValue().contains(fileType.getMajorBrand())) {
                    type = e.getKey();
                    break;
                }
            }
            metadata.set(Metadata.CONTENT_TYPE, type.toString());
            if (type.getType().equals("audio")) {
                metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
            }
        } else {
            // Some older QuickTime files lack the FileType
            metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
        }
        // Get the main MOOV box
        MovieBox moov = getOrNull(isoFile, MovieBox.class);
        if (moov == null) {
            // Bail out
            return;
        }
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        // Pull out some information from the header box
        MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
        if (mHeader != null) {
            // Get the creation and modification dates
            metadata.set(Metadata.CREATION_DATE, mHeader.getCreationTime());
            metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime());
            // Get the duration
            double durationSeconds = ((double) mHeader.getDuration()) / mHeader.getTimescale();
            metadata.set(XMPDM.DURATION, DURATION_FORMAT.format(durationSeconds));
            // The timescale is normally the sampling rate
            metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) mHeader.getTimescale());
        }
        // Get some more information from the track header
        // TODO Decide how to handle multiple tracks
        List<TrackBox> tb = moov.getBoxes(TrackBox.class);
        if (tb.size() > 0) {
            TrackBox track = tb.get(0);
            TrackHeaderBox header = track.getTrackHeaderBox();
            // A malformed track may lack its header box
            if (header != null) {
                // Get the creation and modification dates
                metadata.set(TikaCoreProperties.CREATED, header.getCreationTime());
                metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime());
                // Get the video width and height
                metadata.set(Metadata.IMAGE_WIDTH, (int) header.getWidth());
                metadata.set(Metadata.IMAGE_LENGTH, (int) header.getHeight());
            }
            // Get the sample information
            SampleTableBox samples = track.getSampleTableBox();
            // The sample table is likewise optional; skip the audio details without it
            SampleDescriptionBox sampleDesc = (samples == null) ? null : samples.getSampleDescriptionBox();
            if (sampleDesc != null) {
                // Look for the first Audio Sample, if present
                AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
                if (sample != null) {
                    XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
                    //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize()); // TODO Num -> Type mapping
                    metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) sample.getSampleRate());
                    //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
                    //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
                }
            }
        }
        // Get metadata from the User Data Box
        UserDataBox userData = getOrNull(moov, UserDataBox.class);
        if (userData != null) {
            MetaBox meta = getOrNull(userData, MetaBox.class);
            // Check for iTunes Metadata
            // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
            // http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
            AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class);
            if (apple != null) {
                // Title
                AppleNameBox title = getOrNull(apple, AppleNameBox.class);
                addMetadata(TikaCoreProperties.TITLE, metadata, title);
                // Artist
                AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
                addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
                addMetadata(XMPDM.ARTIST, metadata, artist);
                // Album Artist
                AppleArtist2Box artist2 = getOrNull(apple, AppleArtist2Box.class);
                addMetadata(XMPDM.ALBUM_ARTIST, metadata, artist2);
                // Album
                AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
                addMetadata(XMPDM.ALBUM, metadata, album);
                // Composer
                AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
                addMetadata(XMPDM.COMPOSER, metadata, composer);
                // Genre
                AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class);
                addMetadata(XMPDM.GENRE, metadata, genre);
                // Year
                AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class);
                if (year != null) {
                    metadata.set(XMPDM.RELEASE_DATE, year.getValue());
                }
                // Track number
                AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
                if (trackNum != null) {
                    metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA());
                    //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO
                }
                // Disc number
                AppleDiskNumberBox discNum = getOrNull(apple, AppleDiskNumberBox.class);
                if (discNum != null) {
                    metadata.set(XMPDM.DISC_NUMBER, discNum.getA());
                }
                // Compilation
                AppleCompilationBox compilation = getOrNull(apple, AppleCompilationBox.class);
                if (compilation != null) {
                    metadata.set(XMPDM.COMPILATION, (int) compilation.getValue());
                }
                // Comment
                AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
                addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
                // Encoder
                AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
                if (encoder != null) {
                    metadata.set(XMP.CREATOR_TOOL, encoder.getValue());
                }
                // As text
                for (Box box : apple.getBoxes()) {
                    if (box instanceof Utf8AppleDataBox) {
                        xhtml.element("p", ((Utf8AppleDataBox) box).getValue());
                    }
                }
            }
            // TODO Check for other kinds too
        }
        // All done
        xhtml.endDocument();
    } finally {
        tmp.dispose();
    }
}
Aggregations