Search in sources :

Example 76 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class CompositeParserTest method testMimeTypeAliases.

@Test
public void testMimeTypeAliases() throws Exception {
    MediaType bmpCanonical = MediaType.image("bmp");
    Map<String, String> bmpCanonicalMetadata = new HashMap<String, String>();
    bmpCanonicalMetadata.put("BMP", "True");
    bmpCanonicalMetadata.put("Canonical", "True");
    Parser bmpCanonicalParser = new DummyParser(new HashSet<MediaType>(Arrays.asList(bmpCanonical)), bmpCanonicalMetadata, null);
    MediaType bmpAlias = MediaType.image("x-ms-bmp");
    Map<String, String> bmpAliasMetadata = new HashMap<String, String>();
    bmpAliasMetadata.put("BMP", "True");
    bmpAliasMetadata.put("Alias", "True");
    Parser bmpAliasParser = new DummyParser(new HashSet<MediaType>(Arrays.asList(bmpAlias)), bmpAliasMetadata, null);
    TikaConfig config = TikaConfig.getDefaultConfig();
    CompositeParser canonical = new CompositeParser(config.getMediaTypeRegistry(), bmpCanonicalParser);
    CompositeParser alias = new CompositeParser(config.getMediaTypeRegistry(), bmpAliasParser);
    CompositeParser both = new CompositeParser(config.getMediaTypeRegistry(), bmpCanonicalParser, bmpAliasParser);
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata;
    // Canonical and Canonical
    metadata = new Metadata();
    metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
    canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
    assertEquals("True", metadata.get("BMP"));
    assertEquals("True", metadata.get("Canonical"));
    // Alias and Alias
    metadata = new Metadata();
    metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString());
    alias.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
    assertEquals("True", metadata.get("BMP"));
    assertEquals("True", metadata.get("Alias"));
    // Alias type and Canonical parser
    metadata = new Metadata();
    metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString());
    canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
    assertEquals("True", metadata.get("BMP"));
    assertEquals("True", metadata.get("Canonical"));
    // Canonical type and Alias parser
    metadata = new Metadata();
    metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
    alias.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
    assertEquals("True", metadata.get("BMP"));
    assertEquals("True", metadata.get("Alias"));
    // And when both are there, will go for the last one
    //  to be registered (which is the alias one)
    metadata = new Metadata();
    metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
    both.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
    assertEquals("True", metadata.get("BMP"));
    assertEquals("True", metadata.get("Alias"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaConfig(org.apache.tika.config.TikaConfig) HashMap(java.util.HashMap) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) MediaType(org.apache.tika.mime.MediaType) Test(org.junit.Test)

Example 77 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class MP4Parser method parse.

public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // The MP4Parser library accepts either a File, or a byte array
    // As MP4 video files are typically large, always use a file to
    //  avoid OOMs that may occur with in-memory buffering
    TemporaryResources tmp = new TemporaryResources();
    TikaInputStream tstream = TikaInputStream.get(stream, tmp);
    try (DataSource dataSource = new DirectFileReadDataSource(tstream.getFile())) {
        try (IsoFile isoFile = new IsoFile(dataSource)) {
            tmp.addResource(isoFile);
            // Grab the file type box
            FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
            if (fileType != null) {
                // Identify the type
                MediaType type = MediaType.application("mp4");
                for (Map.Entry<MediaType, List<String>> e : typesMap.entrySet()) {
                    if (e.getValue().contains(fileType.getMajorBrand())) {
                        type = e.getKey();
                        break;
                    }
                }
                metadata.set(Metadata.CONTENT_TYPE, type.toString());
                if (type.getType().equals("audio")) {
                    metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
                }
            } else {
                // Some older QuickTime files lack the FileType
                metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
            }
            // Get the main MOOV box
            MovieBox moov = getOrNull(isoFile, MovieBox.class);
            if (moov == null) {
                // Bail out
                return;
            }
            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();
            // Pull out some information from the header box
            MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
            if (mHeader != null) {
                // Get the creation and modification dates
                metadata.set(Metadata.CREATION_DATE, mHeader.getCreationTime());
                metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime());
                // Get the duration
                double durationSeconds = ((double) mHeader.getDuration()) / mHeader.getTimescale();
                metadata.set(XMPDM.DURATION, DURATION_FORMAT.format(durationSeconds));
                // The timescale is normally the sampling rate
                metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) mHeader.getTimescale());
            }
            // Get some more information from the track header
            // TODO Decide how to handle multiple tracks
            List<TrackBox> tb = moov.getBoxes(TrackBox.class);
            if (tb.size() > 0) {
                TrackBox track = tb.get(0);
                TrackHeaderBox header = track.getTrackHeaderBox();
                // Get the creation and modification dates
                metadata.set(TikaCoreProperties.CREATED, header.getCreationTime());
                metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime());
                // Get the video with and height
                metadata.set(Metadata.IMAGE_WIDTH, (int) header.getWidth());
                metadata.set(Metadata.IMAGE_LENGTH, (int) header.getHeight());
                // Get the sample information
                SampleTableBox samples = track.getSampleTableBox();
                SampleDescriptionBox sampleDesc = samples.getSampleDescriptionBox();
                if (sampleDesc != null) {
                    // Look for the first Audio Sample, if present
                    AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
                    if (sample != null) {
                        XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
                        //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize());    // TODO Num -> Type mapping
                        metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) sample.getSampleRate());
                    //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
                    //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
                    }
                }
            }
            // Get metadata from the User Data Box
            UserDataBox userData = getOrNull(moov, UserDataBox.class);
            if (userData != null) {
                MetaBox meta = getOrNull(userData, MetaBox.class);
                // Check for iTunes Metadata
                // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
                //  http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
                AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class);
                if (apple != null) {
                    // Title
                    AppleNameBox title = getOrNull(apple, AppleNameBox.class);
                    addMetadata(TikaCoreProperties.TITLE, metadata, title);
                    // Artist
                    AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
                    addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
                    addMetadata(XMPDM.ARTIST, metadata, artist);
                    // Album Artist
                    AppleArtist2Box artist2 = getOrNull(apple, AppleArtist2Box.class);
                    addMetadata(XMPDM.ALBUM_ARTIST, metadata, artist2);
                    // Album
                    AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
                    addMetadata(XMPDM.ALBUM, metadata, album);
                    // Composer
                    AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
                    addMetadata(XMPDM.COMPOSER, metadata, composer);
                    // Genre
                    AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class);
                    addMetadata(XMPDM.GENRE, metadata, genre);
                    // Year
                    AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class);
                    if (year != null) {
                        metadata.set(XMPDM.RELEASE_DATE, year.getValue());
                    }
                    // Track number
                    AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
                    if (trackNum != null) {
                        metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA());
                    //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO
                    }
                    // Disc number
                    AppleDiskNumberBox discNum = getOrNull(apple, AppleDiskNumberBox.class);
                    if (discNum != null) {
                        metadata.set(XMPDM.DISC_NUMBER, discNum.getA());
                    }
                    // Compilation
                    AppleCompilationBox compilation = getOrNull(apple, AppleCompilationBox.class);
                    if (compilation != null) {
                        metadata.set(XMPDM.COMPILATION, (int) compilation.getValue());
                    }
                    // Comment
                    AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
                    addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
                    // Encoder
                    AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
                    if (encoder != null) {
                        metadata.set(XMP.CREATOR_TOOL, encoder.getValue());
                    }
                    // As text
                    for (Box box : apple.getBoxes()) {
                        if (box instanceof Utf8AppleDataBox) {
                            xhtml.element("p", ((Utf8AppleDataBox) box).getValue());
                        }
                    }
                }
            // TODO Check for other kinds too
            }
            // All done
            xhtml.endDocument();
        }
    } finally {
        tmp.dispose();
    }
}
Also used : AudioSampleEntry(com.coremedia.iso.boxes.sampleentry.AudioSampleEntry) AppleAlbumBox(com.googlecode.mp4parser.boxes.apple.AppleAlbumBox) TikaInputStream(org.apache.tika.io.TikaInputStream) FileTypeBox(com.coremedia.iso.boxes.FileTypeBox) AppleTrackNumberBox(com.googlecode.mp4parser.boxes.apple.AppleTrackNumberBox) MetaBox(com.coremedia.iso.boxes.MetaBox) AppleCompilationBox(com.googlecode.mp4parser.boxes.apple.AppleCompilationBox) AppleArtist2Box(com.googlecode.mp4parser.boxes.apple.AppleArtist2Box) AppleRecordingYear2Box(com.googlecode.mp4parser.boxes.apple.AppleRecordingYear2Box) AppleGenreBox(com.googlecode.mp4parser.boxes.apple.AppleGenreBox) Utf8AppleDataBox(com.googlecode.mp4parser.boxes.apple.Utf8AppleDataBox) MediaType(org.apache.tika.mime.MediaType) List(java.util.List) SampleDescriptionBox(com.coremedia.iso.boxes.SampleDescriptionBox) TrackHeaderBox(com.coremedia.iso.boxes.TrackHeaderBox) IsoFile(com.coremedia.iso.IsoFile) AppleCommentBox(com.googlecode.mp4parser.boxes.apple.AppleCommentBox) UserDataBox(com.coremedia.iso.boxes.UserDataBox) MovieHeaderBox(com.coremedia.iso.boxes.MovieHeaderBox) TemporaryResources(org.apache.tika.io.TemporaryResources) AppleEncoderBox(com.googlecode.mp4parser.boxes.apple.AppleEncoderBox) AppleArtistBox(com.googlecode.mp4parser.boxes.apple.AppleArtistBox) AppleCompilationBox(com.googlecode.mp4parser.boxes.apple.AppleCompilationBox) UserDataBox(com.coremedia.iso.boxes.UserDataBox) MovieHeaderBox(com.coremedia.iso.boxes.MovieHeaderBox) AppleArtist2Box(com.googlecode.mp4parser.boxes.apple.AppleArtist2Box) AppleArtistBox(com.googlecode.mp4parser.boxes.apple.AppleArtistBox) AppleEncoderBox(com.googlecode.mp4parser.boxes.apple.AppleEncoderBox) AppleTrackNumberBox(com.googlecode.mp4parser.boxes.apple.AppleTrackNumberBox) AppleNameBox(com.googlecode.mp4parser.boxes.apple.AppleNameBox) SampleTableBox(com.coremedia.iso.boxes.SampleTableBox) TrackBox(com.coremedia.iso.boxes.TrackBox) AppleDiskNumberBox(com.googlecode.mp4parser.boxes.apple.AppleDiskNumberBox) AppleRecordingYear2Box(com.googlecode.mp4parser.boxes.apple.AppleRecordingYear2Box) AppleGenreBox(com.googlecode.mp4parser.boxes.apple.AppleGenreBox) MetaBox(com.coremedia.iso.boxes.MetaBox) MovieBox(com.coremedia.iso.boxes.MovieBox) Utf8AppleDataBox(com.googlecode.mp4parser.boxes.apple.Utf8AppleDataBox) AppleCommentBox(com.googlecode.mp4parser.boxes.apple.AppleCommentBox) Box(com.coremedia.iso.boxes.Box) SampleDescriptionBox(com.coremedia.iso.boxes.SampleDescriptionBox) FileTypeBox(com.coremedia.iso.boxes.FileTypeBox) AppleAlbumBox(com.googlecode.mp4parser.boxes.apple.AppleAlbumBox) TrackHeaderBox(com.coremedia.iso.boxes.TrackHeaderBox) AppleItemListBox(com.coremedia.iso.boxes.apple.AppleItemListBox) AppleTrackAuthorBox(com.googlecode.mp4parser.boxes.apple.AppleTrackAuthorBox) AppleDiskNumberBox(com.googlecode.mp4parser.boxes.apple.AppleDiskNumberBox) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) DataSource(com.googlecode.mp4parser.DataSource) SampleTableBox(com.coremedia.iso.boxes.SampleTableBox) TrackBox(com.coremedia.iso.boxes.TrackBox) AppleTrackAuthorBox(com.googlecode.mp4parser.boxes.apple.AppleTrackAuthorBox) MovieBox(com.coremedia.iso.boxes.MovieBox) AppleItemListBox(com.coremedia.iso.boxes.apple.AppleItemListBox) Map(java.util.Map) HashMap(java.util.HashMap) AppleNameBox(com.googlecode.mp4parser.boxes.apple.AppleNameBox)

Example 78 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class OOXMLExtractorFactory method parse.

public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    Locale locale = context.get(Locale.class, Locale.getDefault());
    ExtractorFactory.setThreadPrefersEventExtractors(true);
    try {
        OOXMLExtractor extractor;
        OPCPackage pkg;
        // Locate or Open the OPCPackage for the file
        TikaInputStream tis = TikaInputStream.cast(stream);
        if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
            pkg = (OPCPackage) tis.getOpenContainer();
        } else if (tis != null && tis.hasFile()) {
            pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
            tis.setOpenContainer(pkg);
        } else {
            InputStream shield = new CloseShieldInputStream(stream);
            pkg = OPCPackage.open(shield);
        }
        // Get the type, and ensure it's one we handle
        MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
        if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
            // Not a supported type, delegate to Empty Parser
            EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
            return;
        }
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        // Have the appropriate OOXML text extractor picked
        POIXMLTextExtractor poiExtractor = null;
        // This has already been set by OOXMLParser's call to configure()
        // We can rely on this being non-null.
        OfficeParserConfig config = context.get(OfficeParserConfig.class);
        if (config.getUseSAXDocxExtractor()) {
            poiExtractor = trySXWPF(pkg);
        }
        if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
            poiExtractor = trySXSLF(pkg);
        }
        if (poiExtractor == null) {
            poiExtractor = ExtractorFactory.createExtractor(pkg);
        }
        POIXMLDocument document = poiExtractor.getDocument();
        if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
            extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
        } else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
            extractor = new XSSFExcelExtractorDecorator(context, poiExtractor, locale);
        } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
            extractor = new SXWPFWordExtractorDecorator(metadata, context, (XWPFEventBasedWordExtractor) poiExtractor);
            metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
        } else if (poiExtractor instanceof XSLFEventBasedPowerPointExtractor) {
            extractor = new SXSLFPowerPointExtractorDecorator(metadata, context, (XSLFEventBasedPowerPointExtractor) poiExtractor);
            metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
        } else if (document == null) {
            throw new TikaException("Expecting UserModel based POI OOXML extractor with a document, but none found. " + "The extractor returned was a " + poiExtractor);
        } else if (document instanceof XMLSlideShow) {
            extractor = new XSLFPowerPointExtractorDecorator(context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor);
        } else if (document instanceof XWPFDocument) {
            extractor = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) poiExtractor);
        } else {
            extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
        }
        // Get the bulk of the metadata first, so that it's accessible during
        //  parsing if desired by the client (see TIKA-1109)
        extractor.getMetadataExtractor().extract(metadata);
        // Extract the text, along with any in-document metadata
        extractor.getXHTML(baseHandler, metadata, context);
    } catch (IllegalArgumentException e) {
        if (e.getMessage() != null && e.getMessage().startsWith("No supported documents found")) {
            throw new TikaException("TIKA-418: RuntimeException while getting content" + " for thmx and xps file types", e);
        } else {
            throw new TikaException("Error creating OOXML extractor", e);
        }
    } catch (InvalidFormatException e) {
        throw new TikaException("Error creating OOXML extractor", e);
    } catch (OpenXML4JException e) {
        throw new TikaException("Error creating OOXML extractor", e);
    } catch (XmlException e) {
        throw new TikaException("Error creating OOXML extractor", e);
    }
}
Also used : Locale(java.util.Locale) TikaInputStream(org.apache.tika.io.TikaInputStream) XWPFEventBasedWordExtractor(org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException) OpenXML4JException(org.apache.poi.openxml4j.exceptions.OpenXML4JException) XSSFEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) MediaType(org.apache.tika.mime.MediaType) XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument) XSLFEventBasedPowerPointExtractor(org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor) TikaException(org.apache.tika.exception.TikaException) XSSFBEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) POIXMLDocument(org.apache.poi.POIXMLDocument) POIXMLTextExtractor(org.apache.poi.POIXMLTextExtractor) XmlException(org.apache.xmlbeans.XmlException) XMLSlideShow(org.apache.poi.xslf.usermodel.XMLSlideShow) OPCPackage(org.apache.poi.openxml4j.opc.OPCPackage) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 79 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class CompressorParser method parse.

public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // should not be closed
    if (stream.markSupported()) {
        stream = new CloseShieldInputStream(stream);
    } else {
        // Ensure that the stream supports the mark feature
        stream = new BufferedInputStream(new CloseShieldInputStream(stream));
    }
    CompressorInputStream cis;
    try {
        CompressorParserOptions options = context.get(CompressorParserOptions.class, new CompressorParserOptions() {

            public boolean decompressConcatenated(Metadata metadata) {
                return false;
            }
        });
        CompressorStreamFactory factory = new CompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb);
        cis = factory.createCompressorInputStream(stream);
    } catch (CompressorException e) {
        if (e.getCause() != null && e.getCause() instanceof MemoryLimitException) {
            throw new TikaMemoryLimitException(e.getMessage());
        }
        throw new TikaException("Unable to uncompress document stream", e);
    }
    MediaType type = getMediaType(cis);
    if (!type.equals(MediaType.OCTET_STREAM)) {
        metadata.set(CONTENT_TYPE, type.toString());
    }
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try {
        Metadata entrydata = new Metadata();
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (name != null) {
            if (name.endsWith(".tbz")) {
                name = name.substring(0, name.length() - 4) + ".tar";
            } else if (name.endsWith(".tbz2")) {
                name = name.substring(0, name.length() - 5) + ".tar";
            } else if (name.endsWith(".bz")) {
                name = name.substring(0, name.length() - 3);
            } else if (name.endsWith(".bz2")) {
                name = name.substring(0, name.length() - 4);
            } else if (name.endsWith(".xz")) {
                name = name.substring(0, name.length() - 3);
            } else if (name.endsWith(".zlib")) {
                name = name.substring(0, name.length() - 5);
            } else if (name.endsWith(".pack")) {
                name = name.substring(0, name.length() - 5);
            } else if (name.length() > 0) {
                name = GzipUtils.getUncompressedFilename(name);
            }
            entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
        }
        // Use the delegate parser to parse the compressed document
        EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        if (extractor.shouldParseEmbedded(entrydata)) {
            extractor.parseEmbedded(cis, xhtml, entrydata, true);
        }
    } finally {
        cis.close();
    }
    xhtml.endDocument();
}
Also used : TikaException(org.apache.tika.exception.TikaException) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) Metadata(org.apache.tika.metadata.Metadata) CompressorStreamFactory(org.apache.commons.compress.compressors.CompressorStreamFactory) CompressorInputStream(org.apache.commons.compress.compressors.CompressorInputStream) SnappyCompressorInputStream(org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream) XZCompressorInputStream(org.apache.commons.compress.compressors.xz.XZCompressorInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) DeflateCompressorInputStream(org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream) LZMACompressorInputStream(org.apache.commons.compress.compressors.lzma.LZMACompressorInputStream) FramedSnappyCompressorInputStream(org.apache.commons.compress.compressors.snappy.FramedSnappyCompressorInputStream) ZCompressorInputStream(org.apache.commons.compress.compressors.z.ZCompressorInputStream) Pack200CompressorInputStream(org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) MemoryLimitException(org.apache.commons.compress.MemoryLimitException) TikaMemoryLimitException(org.apache.tika.exception.TikaMemoryLimitException) BufferedInputStream(java.io.BufferedInputStream) CompressorException(org.apache.commons.compress.compressors.CompressorException) TikaMemoryLimitException(org.apache.tika.exception.TikaMemoryLimitException) MediaType(org.apache.tika.mime.MediaType) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 80 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class ZipContainerDetector method detectOPCBased.

private static MediaType detectOPCBased(TikaInputStream stream) {
    try {
        //            if (zip.getEntry("_rels/.rels") != null
        //                  || zip.getEntry("[Content_Types].xml") != null) {
        // Use POI to open and investigate it for us
        OPCPackage pkg = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
        stream.setOpenContainer(pkg);
        // Is at an OOXML format?
        MediaType type = detectOfficeOpenXML(pkg);
        if (type != null)
            return type;
        // Is it XPS format?
        type = detectXPSOPC(pkg);
        if (type != null)
            return type;
        // Is it an AutoCAD format?
        type = detectAutoCADOPC(pkg);
        if (type != null)
            return type;
        // We don't know what it is, sorry
        return null;
    } catch (IOException e) {
        return null;
    } catch (RuntimeException e) {
        return null;
    } catch (InvalidFormatException e) {
        return null;
    }
}
Also used : MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException) OPCPackage(org.apache.poi.openxml4j.opc.OPCPackage) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException)

Aggregations

MediaType (org.apache.tika.mime.MediaType)95 Metadata (org.apache.tika.metadata.Metadata)29 Test (org.junit.Test)28 InputStream (java.io.InputStream)26 IOException (java.io.IOException)18 Parser (org.apache.tika.parser.Parser)18 TikaInputStream (org.apache.tika.io.TikaInputStream)17 ParseContext (org.apache.tika.parser.ParseContext)17 TikaException (org.apache.tika.exception.TikaException)14 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)14 CompositeParser (org.apache.tika.parser.CompositeParser)13 ContentHandler (org.xml.sax.ContentHandler)13 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)12 Detector (org.apache.tika.detect.Detector)11 TikaTest (org.apache.tika.TikaTest)10 HashSet (java.util.HashSet)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 ArrayList (java.util.ArrayList)7 TikaConfig (org.apache.tika.config.TikaConfig)7 MediaTypeRegistry (org.apache.tika.mime.MediaTypeRegistry)7