Search in sources :

Example 81 with TikaInputStream

Use of org.apache.tika.io.TikaInputStream in the Apache Tika project.

Class MP4ParserTest, method testMP4ParsingAudio:

/**
 * Test that we can extract information from a M4A MP4 Audio file.
 *
 * The file is parsed twice: first from a plain classpath stream, with the
 * resulting metadata and text checked, and then again through a
 * {@link TikaInputStream} on which {@code getFile()} has been called first.
 *
 * @throws Exception if the test resource cannot be read or parsing fails
 */
@Test
public void testMP4ParsingAudio() throws Exception {
    // Should auto-detect!
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = MP4ParserTest.class.getResourceAsStream("/test-documents/testMP4.m4a")) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    // Check core properties
    assertEquals("audio/mp4", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
    assertEquals("2012-01-28T18:39:18Z", metadata.get(TikaCoreProperties.CREATED));
    assertEquals("2012-01-28T18:39:18Z", metadata.get(Metadata.CREATION_DATE));
    assertEquals("2012-01-28T18:40:25Z", metadata.get(TikaCoreProperties.MODIFIED));
    assertEquals("2012-01-28T18:40:25Z", metadata.get(Metadata.DATE));
    // Check the textual contents
    String content = handler.toString();
    assertContains("Test Title", content);
    assertContains("Test Artist", content);
    assertContains("Test Album", content);
    assertContains("2008", content);
    assertContains("Test Comment", content);
    assertContains("Test Genre", content);
    // Check XMPDM-typed audio properties
    assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
    assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
    assertEquals("Test Composer", metadata.get(XMPDM.COMPOSER));
    assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
    assertEquals("Test Genre", metadata.get(XMPDM.GENRE));
    assertEquals("Test Comments", metadata.get(XMPDM.LOG_COMMENT.getName()));
    assertEquals("1", metadata.get(XMPDM.TRACK_NUMBER));
    assertEquals("Test Album Artist", metadata.get(XMPDM.ALBUM_ARTIST));
    assertEquals("6", metadata.get(XMPDM.DISC_NUMBER));
    assertEquals("0", metadata.get(XMPDM.COMPILATION));
    assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
    assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
    assertEquals("M4A", metadata.get(XMPDM.AUDIO_COMPRESSOR));
    assertEquals("0.07", metadata.get(XMPDM.DURATION));
    assertEquals("iTunes 10.5.3.3", metadata.get(XMP.CREATOR_TOOL));
    // Check again by file, rather than stream.
    // The stream is opened in try-with-resources so that it is closed even
    // when getFile() itself fails, not only when parse() fails.
    try (TikaInputStream tstream = TikaInputStream.get(MP4ParserTest.class.getResourceAsStream("/test-documents/testMP4.m4a"))) {
        // Ask for the backing file before parsing so the parser is handed a
        // stream that already has a file behind it
        tstream.getFile();
        parser.parse(tstream, handler, metadata, new ParseContext());
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TikaInputStream(org.apache.tika.io.TikaInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 82 with TikaInputStream

Use of org.apache.tika.io.TikaInputStream in the Apache Tika project.

Class MP4Parser, method parse:

/**
 * Extracts metadata and text from an MP4-family container.
 *
 * The stream is wrapped in a {@link TikaInputStream} and read through its
 * backing file. The content type is chosen from the file type box's major
 * brand via {@code typesMap}, falling back to {@code application/mp4}, or to
 * {@code video/quicktime} when there is no file type box at all. If the
 * movie box is missing the method returns after setting the content type,
 * without emitting any XHTML. Otherwise it records dates, duration and
 * sample rate from the movie header, dimensions and audio details from the
 * first track, and iTunes-style tags from the user data box; every UTF-8
 * Apple data box is also written out as a {@code <p>} element.
 *
 * @param stream   the document to parse
 * @param handler  receives the XHTML output
 * @param metadata populated with the properties found in the file
 * @param context  parse context (not read by this method)
 * @throws IOException   if the stream or its backing file cannot be read
 * @throws SAXException  if the handler rejects an XHTML event
 * @throws TikaException if the temporary resources cannot be disposed of
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // The MP4Parser library accepts either a File, or a byte array
    // As MP4 video files are typically large, always use a file to
    //  avoid OOMs that may occur with in-memory buffering
    TemporaryResources tmp = new TemporaryResources();
    TikaInputStream tstream = TikaInputStream.get(stream, tmp);
    try (DataSource dataSource = new DirectFileReadDataSource(tstream.getFile())) {
        try (IsoFile isoFile = new IsoFile(dataSource)) {
            tmp.addResource(isoFile);
            // Grab the file type box
            FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
            if (fileType != null) {
                // Identify the type: the first typesMap entry whose brand list
                // contains the major brand wins, otherwise generic MP4
                MediaType type = MediaType.application("mp4");
                for (Map.Entry<MediaType, List<String>> e : typesMap.entrySet()) {
                    if (e.getValue().contains(fileType.getMajorBrand())) {
                        type = e.getKey();
                        break;
                    }
                }
                metadata.set(Metadata.CONTENT_TYPE, type.toString());
                if (type.getType().equals("audio")) {
                    metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
                }
            } else {
                // Some older QuickTime files lack the FileType
                metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
            }
            // Get the main MOOV box
            MovieBox moov = getOrNull(isoFile, MovieBox.class);
            if (moov == null) {
                // Bail out
                return;
            }
            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();
            // Pull out some information from the header box
            MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
            if (mHeader != null) {
                // Get the creation and modification dates
                metadata.set(Metadata.CREATION_DATE, mHeader.getCreationTime());
                metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime());
                // Get the duration
                double durationSeconds = ((double) mHeader.getDuration()) / mHeader.getTimescale();
                metadata.set(XMPDM.DURATION, DURATION_FORMAT.format(durationSeconds));
                // The timescale is normally the sampling rate
                metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) mHeader.getTimescale());
            }
            // Get some more information from the track header
            // TODO Decide how to handle multiple tracks
            List<TrackBox> tb = moov.getBoxes(TrackBox.class);
            if (!tb.isEmpty()) {
                TrackBox track = tb.get(0);
                // A track is not guaranteed to carry a header box; skip the
                // header-derived properties rather than fail with an NPE
                TrackHeaderBox header = track.getTrackHeaderBox();
                if (header != null) {
                    // Get the creation and modification dates
                    metadata.set(TikaCoreProperties.CREATED, header.getCreationTime());
                    metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime());
                    // Get the video width and height
                    metadata.set(Metadata.IMAGE_WIDTH, (int) header.getWidth());
                    metadata.set(Metadata.IMAGE_LENGTH, (int) header.getHeight());
                }
                // Get the sample information; the sample table may be absent
                // too, in which case there is no sample description to read
                SampleTableBox samples = track.getSampleTableBox();
                SampleDescriptionBox sampleDesc = (samples != null) ? samples.getSampleDescriptionBox() : null;
                if (sampleDesc != null) {
                    // Look for the first Audio Sample, if present
                    AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
                    if (sample != null) {
                        XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
                        //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize());    // TODO Num -> Type mapping
                        // Overrides the timescale-based rate set from the movie header above
                        metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) sample.getSampleRate());
                    //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
                    //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
                    }
                }
            }
            // Get metadata from the User Data Box
            UserDataBox userData = getOrNull(moov, UserDataBox.class);
            if (userData != null) {
                MetaBox meta = getOrNull(userData, MetaBox.class);
                // Check for iTunes Metadata
                // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
                //  http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
                AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class);
                if (apple != null) {
                    // Title
                    AppleNameBox title = getOrNull(apple, AppleNameBox.class);
                    addMetadata(TikaCoreProperties.TITLE, metadata, title);
                    // Artist
                    AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
                    addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
                    addMetadata(XMPDM.ARTIST, metadata, artist);
                    // Album Artist
                    AppleArtist2Box artist2 = getOrNull(apple, AppleArtist2Box.class);
                    addMetadata(XMPDM.ALBUM_ARTIST, metadata, artist2);
                    // Album
                    AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
                    addMetadata(XMPDM.ALBUM, metadata, album);
                    // Composer
                    AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
                    addMetadata(XMPDM.COMPOSER, metadata, composer);
                    // Genre
                    AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class);
                    addMetadata(XMPDM.GENRE, metadata, genre);
                    // Year
                    AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class);
                    if (year != null) {
                        metadata.set(XMPDM.RELEASE_DATE, year.getValue());
                    }
                    // Track number
                    AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
                    if (trackNum != null) {
                        metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA());
                    //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO
                    }
                    // Disc number
                    AppleDiskNumberBox discNum = getOrNull(apple, AppleDiskNumberBox.class);
                    if (discNum != null) {
                        metadata.set(XMPDM.DISC_NUMBER, discNum.getA());
                    }
                    // Compilation
                    AppleCompilationBox compilation = getOrNull(apple, AppleCompilationBox.class);
                    if (compilation != null) {
                        metadata.set(XMPDM.COMPILATION, (int) compilation.getValue());
                    }
                    // Comment
                    AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
                    addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
                    // Encoder
                    AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
                    if (encoder != null) {
                        metadata.set(XMP.CREATOR_TOOL, encoder.getValue());
                    }
                    // As text: every UTF-8 Apple data box becomes a paragraph
                    for (Box box : apple.getBoxes()) {
                        if (box instanceof Utf8AppleDataBox) {
                            xhtml.element("p", ((Utf8AppleDataBox) box).getValue());
                        }
                    }
                }
            // TODO Check for other kinds too
            }
            // All done
            xhtml.endDocument();
        }
    } finally {
        // Releases whatever was registered with tmp, on every exit path
        tmp.dispose();
    }
}
Also used : AudioSampleEntry(com.coremedia.iso.boxes.sampleentry.AudioSampleEntry) AppleAlbumBox(com.googlecode.mp4parser.boxes.apple.AppleAlbumBox) TikaInputStream(org.apache.tika.io.TikaInputStream) FileTypeBox(com.coremedia.iso.boxes.FileTypeBox) AppleTrackNumberBox(com.googlecode.mp4parser.boxes.apple.AppleTrackNumberBox) MetaBox(com.coremedia.iso.boxes.MetaBox) AppleCompilationBox(com.googlecode.mp4parser.boxes.apple.AppleCompilationBox) AppleArtist2Box(com.googlecode.mp4parser.boxes.apple.AppleArtist2Box) AppleRecordingYear2Box(com.googlecode.mp4parser.boxes.apple.AppleRecordingYear2Box) AppleGenreBox(com.googlecode.mp4parser.boxes.apple.AppleGenreBox) Utf8AppleDataBox(com.googlecode.mp4parser.boxes.apple.Utf8AppleDataBox) MediaType(org.apache.tika.mime.MediaType) List(java.util.List) SampleDescriptionBox(com.coremedia.iso.boxes.SampleDescriptionBox) TrackHeaderBox(com.coremedia.iso.boxes.TrackHeaderBox) IsoFile(com.coremedia.iso.IsoFile) AppleCommentBox(com.googlecode.mp4parser.boxes.apple.AppleCommentBox) UserDataBox(com.coremedia.iso.boxes.UserDataBox) MovieHeaderBox(com.coremedia.iso.boxes.MovieHeaderBox) TemporaryResources(org.apache.tika.io.TemporaryResources) AppleEncoderBox(com.googlecode.mp4parser.boxes.apple.AppleEncoderBox) AppleArtistBox(com.googlecode.mp4parser.boxes.apple.AppleArtistBox) AppleCompilationBox(com.googlecode.mp4parser.boxes.apple.AppleCompilationBox) UserDataBox(com.coremedia.iso.boxes.UserDataBox) MovieHeaderBox(com.coremedia.iso.boxes.MovieHeaderBox) AppleArtist2Box(com.googlecode.mp4parser.boxes.apple.AppleArtist2Box) AppleArtistBox(com.googlecode.mp4parser.boxes.apple.AppleArtistBox) AppleEncoderBox(com.googlecode.mp4parser.boxes.apple.AppleEncoderBox) AppleTrackNumberBox(com.googlecode.mp4parser.boxes.apple.AppleTrackNumberBox) AppleNameBox(com.googlecode.mp4parser.boxes.apple.AppleNameBox) SampleTableBox(com.coremedia.iso.boxes.SampleTableBox) TrackBox(com.coremedia.iso.boxes.TrackBox) 
AppleDiskNumberBox(com.googlecode.mp4parser.boxes.apple.AppleDiskNumberBox) AppleRecordingYear2Box(com.googlecode.mp4parser.boxes.apple.AppleRecordingYear2Box) AppleGenreBox(com.googlecode.mp4parser.boxes.apple.AppleGenreBox) MetaBox(com.coremedia.iso.boxes.MetaBox) MovieBox(com.coremedia.iso.boxes.MovieBox) Utf8AppleDataBox(com.googlecode.mp4parser.boxes.apple.Utf8AppleDataBox) AppleCommentBox(com.googlecode.mp4parser.boxes.apple.AppleCommentBox) Box(com.coremedia.iso.boxes.Box) SampleDescriptionBox(com.coremedia.iso.boxes.SampleDescriptionBox) FileTypeBox(com.coremedia.iso.boxes.FileTypeBox) AppleAlbumBox(com.googlecode.mp4parser.boxes.apple.AppleAlbumBox) TrackHeaderBox(com.coremedia.iso.boxes.TrackHeaderBox) AppleItemListBox(com.coremedia.iso.boxes.apple.AppleItemListBox) AppleTrackAuthorBox(com.googlecode.mp4parser.boxes.apple.AppleTrackAuthorBox) AppleDiskNumberBox(com.googlecode.mp4parser.boxes.apple.AppleDiskNumberBox) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) DataSource(com.googlecode.mp4parser.DataSource) SampleTableBox(com.coremedia.iso.boxes.SampleTableBox) TrackBox(com.coremedia.iso.boxes.TrackBox) AppleTrackAuthorBox(com.googlecode.mp4parser.boxes.apple.AppleTrackAuthorBox) MovieBox(com.coremedia.iso.boxes.MovieBox) AppleItemListBox(com.coremedia.iso.boxes.apple.AppleItemListBox) Map(java.util.Map) HashMap(java.util.HashMap) AppleNameBox(com.googlecode.mp4parser.boxes.apple.AppleNameBox)

Example 83 with TikaInputStream

Use of org.apache.tika.io.TikaInputStream in the Apache Tika project.

Class OOXMLExtractorFactory, method parse:

/**
 * Opens the OOXML package behind {@code stream}, picks a matching POI text
 * extractor, wraps it in the corresponding Tika decorator and runs the
 * metadata and XHTML extraction.
 *
 * The package is reused from the stream's open container when one is
 * present, opened read-only from the backing file when the stream has one,
 * and otherwise opened from the stream itself behind a close shield.
 * Unsupported or undetectable package types are delegated to
 * {@code EmptyParser}.
 *
 * @param stream      the document to parse
 * @param baseHandler receives the XHTML output
 * @param metadata    populated with content type and document metadata
 * @param context     supplies the {@code Locale} (optional) and the
 *                    {@code OfficeParserConfig} (required)
 * @throws IOException   if the package cannot be read
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if no extractor can be created for the package
 */
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    Locale locale = context.get(Locale.class, Locale.getDefault());
    ExtractorFactory.setThreadPrefersEventExtractors(true);
    try {
        // Locate or Open the OPCPackage for the file
        final OPCPackage opcPackage;
        TikaInputStream tikaStream = TikaInputStream.cast(stream);
        if (tikaStream == null) {
            opcPackage = OPCPackage.open(new CloseShieldInputStream(stream));
        } else if (tikaStream.getOpenContainer() instanceof OPCPackage) {
            opcPackage = (OPCPackage) tikaStream.getOpenContainer();
        } else if (tikaStream.hasFile()) {
            opcPackage = OPCPackage.open(tikaStream.getFile().getPath(), PackageAccess.READ);
            tikaStream.setOpenContainer(opcPackage);
        } else {
            opcPackage = OPCPackage.open(new CloseShieldInputStream(stream));
        }

        // Get the type, and ensure it's one we handle
        MediaType detected = ZipContainerDetector.detectOfficeOpenXML(opcPackage);
        boolean handled = detected != null && !OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(detected);
        if (!handled) {
            // Not a supported type, delegate to Empty Parser
            EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
            return;
        }
        metadata.set(Metadata.CONTENT_TYPE, detected.toString());

        // Have the appropriate OOXML text extractor picked.
        // The config has already been set by OOXMLParser's call to configure(),
        // so we can rely on it being non-null.
        OfficeParserConfig officeConfig = context.get(OfficeParserConfig.class);
        POIXMLTextExtractor textExtractor = null;
        if (officeConfig.getUseSAXDocxExtractor()) {
            textExtractor = trySXWPF(opcPackage);
        }
        if (textExtractor == null && officeConfig.getUseSAXPptxExtractor()) {
            textExtractor = trySXSLF(opcPackage);
        }
        if (textExtractor == null) {
            textExtractor = ExtractorFactory.createExtractor(opcPackage);
        }

        // Wrap the POI extractor. The event-based checks come first and keep
        // their order, since the first matching branch wins.
        POIXMLDocument poiDocument = textExtractor.getDocument();
        final OOXMLExtractor decorated;
        if (textExtractor instanceof XSSFBEventBasedExcelExtractor) {
            decorated = new XSSFBExcelExtractorDecorator(context, textExtractor, locale);
        } else if (textExtractor instanceof XSSFEventBasedExcelExtractor) {
            decorated = new XSSFExcelExtractorDecorator(context, textExtractor, locale);
        } else if (textExtractor instanceof XWPFEventBasedWordExtractor) {
            decorated = new SXWPFWordExtractorDecorator(metadata, context, (XWPFEventBasedWordExtractor) textExtractor);
            metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
        } else if (textExtractor instanceof XSLFEventBasedPowerPointExtractor) {
            decorated = new SXSLFPowerPointExtractorDecorator(metadata, context, (XSLFEventBasedPowerPointExtractor) textExtractor);
            metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
        } else if (poiDocument == null) {
            throw new TikaException("Expecting UserModel based POI OOXML extractor with a document, but none found. " + "The extractor returned was a " + textExtractor);
        } else if (poiDocument instanceof XMLSlideShow) {
            decorated = new XSLFPowerPointExtractorDecorator(context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) textExtractor);
        } else if (poiDocument instanceof XWPFDocument) {
            decorated = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) textExtractor);
        } else {
            decorated = new POIXMLTextExtractorDecorator(context, textExtractor);
        }

        // Get the bulk of the metadata first, so that it's accessible during
        //  parsing if desired by the client (see TIKA-1109)
        decorated.getMetadataExtractor().extract(metadata);
        // Extract the text, along with any in-document metadata
        decorated.getXHTML(baseHandler, metadata, context);
    } catch (IllegalArgumentException e) {
        String detail = e.getMessage();
        if (detail == null || !detail.startsWith("No supported documents found")) {
            throw new TikaException("Error creating OOXML extractor", e);
        }
        throw new TikaException("TIKA-418: RuntimeException while getting content" + " for thmx and xps file types", e);
    } catch (OpenXML4JException | XmlException e) {
        // InvalidFormatException is a subclass of OpenXML4JException, so it is
        // handled here as well
        throw new TikaException("Error creating OOXML extractor", e);
    }
}
Also used : Locale(java.util.Locale) TikaInputStream(org.apache.tika.io.TikaInputStream) XWPFEventBasedWordExtractor(org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException) OpenXML4JException(org.apache.poi.openxml4j.exceptions.OpenXML4JException) XSSFEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) MediaType(org.apache.tika.mime.MediaType) XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument) XSLFEventBasedPowerPointExtractor(org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor) TikaException(org.apache.tika.exception.TikaException) XSSFBEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) POIXMLDocument(org.apache.poi.POIXMLDocument) POIXMLTextExtractor(org.apache.poi.POIXMLTextExtractor) XmlException(org.apache.xmlbeans.XmlException) XMLSlideShow(org.apache.poi.xslf.usermodel.XMLSlideShow) OPCPackage(org.apache.poi.openxml4j.opc.OPCPackage) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 84 with TikaInputStream

Use of org.apache.tika.io.TikaInputStream in the Apache Tika project.

Class BinaryDataHandler, method endPart:

/**
 * Finishes the current part. When the part has data, its input stream is
 * passed to the embedded document extractor together with a fresh
 * {@code Metadata} object, and the buffer is then reset to length zero.
 * A part without data is left untouched.
 *
 * @throws SAXException  if the embedded parse reports a SAX error
 * @throws TikaException if reading or closing the part's stream fails
 */
@Override
public void endPart() throws SAXException, TikaException {
    if (!hasData()) {
        // Nothing was collected for this part
        return;
    }
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
    Metadata partMetadata = new Metadata();
    try (TikaInputStream partStream = TikaInputStream.get(getInputStream())) {
        extractor.parseEmbedded(partStream, handler, partMetadata, false);
    } catch (IOException ioe) {
        throw new TikaException("error in finishing part", ioe);
    }
    // Only reached when the embedded parse completed without throwing
    buffer.setLength(0);
}
Also used : TikaException(org.apache.tika.exception.TikaException) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException)

Example 85 with TikaInputStream

Use of org.apache.tika.io.TikaInputStream in the Apache Tika project.

Class TesseractOCRParser, method parse:

/**
 * Runs the stream-based {@code parse} over an in-memory AWT image.
 *
 * The image is drawn into an RGB {@link BufferedImage}, written as a PNG to
 * a temporary file, and that file is then handed to the stream-based
 * {@code parse} overload through a {@link TikaInputStream}.
 *
 * @param image    the image to process; its width and height are queried
 *                 with a {@code null} observer
 *                 (NOTE(review): presumably the image must already be fully
 *                 loaded, since no observer waits for it - confirm with callers)
 * @param handler  receives the output of the stream-based parse
 * @param metadata passed through to the stream-based parse
 * @param context  passed through to the stream-based parse
 * @throws IOException   if the temporary PNG cannot be written or read
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if parsing fails or the temporary file cannot be disposed of
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
        // A freshly allocated BufferedImage carries none of the source pixels,
        // so the image has to be drawn into it before it is written out
        java.awt.Graphics2D graphics = bImage.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }
        File file = tmp.createTemporaryFile();
        // Close the writer before the file is read back so the PNG is complete
        // on disk and no write handle is still open on it
        try (FileOutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }
        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    } finally {
        // Runs after both streams are closed, so the temporary file is no
        // longer in use when it is removed
        tmp.dispose();
    }
}
Also used : FileOutputStream(java.io.FileOutputStream) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) File(java.io.File) BufferedImage(java.awt.image.BufferedImage)

Aggregations

TikaInputStream (org.apache.tika.io.TikaInputStream)100 Metadata (org.apache.tika.metadata.Metadata)40 TemporaryResources (org.apache.tika.io.TemporaryResources)28 IOException (java.io.IOException)27 TikaException (org.apache.tika.exception.TikaException)24 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)23 Test (org.junit.Test)20 InputStream (java.io.InputStream)19 File (java.io.File)15 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)15 ContentHandler (org.xml.sax.ContentHandler)14 TikaTest (org.apache.tika.TikaTest)13 MediaType (org.apache.tika.mime.MediaType)13 SAXException (org.xml.sax.SAXException)13 ParseContext (org.apache.tika.parser.ParseContext)12 ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor)8 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)6 NPOIFSFileSystem (org.apache.poi.poifs.filesystem.NPOIFSFileSystem)6 EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException)6 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)6