Search in sources :

Example 56 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class Word2006MLParser method parse.

/**
 * Parses the given stream as XML with a SAX parser and reports the result as
 * XHTML SAX events on {@code handler}.
 *
 * <p>The XHTML document is started before parsing and is always ended in the
 * {@code finally} block, whether or not parsing succeeded.
 *
 * @param stream   the document to parse; it is handed to the SAX parser wrapped
 *                 in a {@code CloseShieldInputStream}
 * @param handler  receiver of the XHTML SAX events
 * @param metadata document metadata, passed on to the XHTML handler and the
 *                 document handler
 * @param context  parse context; supplies the SAX parser
 * @throws IOException   declared by the interface; may propagate from the SAX parser
 * @throws SAXException  if starting or ending the XHTML document fails (those
 *                       calls sit outside the {@code catch})
 * @throws TikaException if the SAX parser reports a {@code SAXException}
 *                       ("XML parse error", with the original as cause)
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Set an OfficeParserConfig if the user hasn't specified one.
    configure(context);

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try {
        // Fetch the parser first, then build the stream wrapper and the handler
        // chain (innermost handler first) -- the same order in which the
        // original single expression was evaluated.
        final javax.xml.parsers.SAXParser saxParser = context.getSAXParser();
        final CloseShieldInputStream shielded = new CloseShieldInputStream(stream);
        final Word2006MLDocHandler docHandler = new Word2006MLDocHandler(xhtml, metadata, context);
        final EmbeddedContentHandler embedded = new EmbeddedContentHandler(docHandler);
        final OfflineContentHandler offline = new OfflineContentHandler(embedded);
        saxParser.parse(shielded, offline);
    } catch (SAXException saxFailure) {
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtml.endDocument();
    }
}
Also used : OfflineContentHandler(org.apache.tika.sax.OfflineContentHandler) TikaException(org.apache.tika.exception.TikaException) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) SAXException(org.xml.sax.SAXException)

Example 57 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class AbstractXML2003Parser method parse.

/**
 * Parses the given stream as XML with a SAX parser and reports the result as
 * XHTML SAX events on {@code handler}.
 *
 * <p>The content type is recorded on {@code metadata} first. The XHTML handler
 * is wrapped in a {@code TaggedContentHandler}; when the SAX parser fails, that
 * wrapper is asked to rethrow the exception before it is converted into a
 * {@code TikaException}. The XHTML document is always ended in the
 * {@code finally} block.
 *
 * @param stream   the document to parse; it is handed to the SAX parser wrapped
 *                 in a {@code CloseShieldInputStream}
 * @param handler  receiver of the XHTML SAX events
 * @param metadata document metadata; updated by {@code setContentType}
 * @param context  parse context; supplies the SAX parser
 * @throws IOException   declared by the interface; may propagate from the SAX parser
 * @throws SAXException  if starting or ending the XHTML document fails, or if
 *                       {@code tagged.throwIfCauseOf} rethrows
 * @throws TikaException if the SAX parser reports a {@code SAXException} that
 *                       the tagged handler does not rethrow ("XML parse error")
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    setContentType(metadata);

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    TaggedContentHandler tagged = new TaggedContentHandler(xhtml);
    try {
        // Fetch the parser first, then build the stream wrapper and the handler
        // chain (innermost handler first) -- the same order in which the
        // original single expression was evaluated.
        final javax.xml.parsers.SAXParser saxParser = context.getSAXParser();
        final CloseShieldInputStream shielded = new CloseShieldInputStream(stream);
        final EmbeddedContentHandler embedded = new EmbeddedContentHandler(getContentHandler(tagged, metadata, context));
        final OfflineContentHandler offline = new OfflineContentHandler(embedded);
        saxParser.parse(shielded, offline);
    } catch (SAXException saxFailure) {
        // Let the tagged handler rethrow first (presumably when the failure
        // came from the downstream handler -- confirm against TaggedContentHandler).
        tagged.throwIfCauseOf(saxFailure);
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtml.endDocument();
    }
}
Also used : OfflineContentHandler(org.apache.tika.sax.OfflineContentHandler) TikaException(org.apache.tika.exception.TikaException) TaggedContentHandler(org.apache.tika.sax.TaggedContentHandler) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) SAXException(org.xml.sax.SAXException)

Example 58 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class Mp3Parser method parse.

/**
 * Parses an MP3 stream: records ID3 tag values and audio properties on
 * {@code metadata} and writes a short XHTML summary (title, artist, album,
 * year, genre, duration, comments, lyrics) to {@code handler}.
 *
 * @param stream   the MP3 data to read
 * @param handler  receiver of the XHTML SAX events
 * @param metadata metadata object to populate; content type is set to
 *                 {@code audio/mpeg} and the audio compressor to {@code MP3}
 * @param context  parse context (not used by this method)
 * @throws IOException   declared by the interface; may propagate from reading the stream
 * @throws SAXException  if writing to the XHTML handler fails
 * @throws TikaException declared by the interface; may propagate from tag handling
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");
    metadata.set(XMPDM.AUDIO_COMPRESSOR, "MP3");
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    // Create handlers for the various kinds of ID3 tags
    ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, handler);
    // Process tags metadata if the file has supported tags
    if (audioAndTags.tags.length > 0) {
        CompositeTagHandler tag = new CompositeTagHandler(audioAndTags.tags);
        metadata.set(TikaCoreProperties.TITLE, tag.getTitle());
        metadata.set(TikaCoreProperties.CREATOR, tag.getArtist());
        metadata.set(XMPDM.ARTIST, tag.getArtist());
        metadata.set(XMPDM.ALBUM_ARTIST, tag.getAlbumArtist());
        metadata.set(XMPDM.COMPOSER, tag.getComposer());
        metadata.set(XMPDM.ALBUM, tag.getAlbum());
        metadata.set(XMPDM.COMPILATION, tag.getCompilation());
        metadata.set(XMPDM.RELEASE_DATE, tag.getYear());
        metadata.set(XMPDM.GENRE, tag.getGenre());
        // Each comment is rendered as "<language> - <description>\n<text>",
        // with absent parts (and their separators) left out.
        List<String> comments = new ArrayList<String>();
        for (ID3Comment comment : tag.getComments()) {
            // Method-local buffer: StringBuilder avoids StringBuffer's needless locking.
            StringBuilder cmt = new StringBuilder();
            if (comment.getLanguage() != null) {
                cmt.append(comment.getLanguage());
                cmt.append(" - ");
            }
            if (comment.getDescription() != null) {
                cmt.append(comment.getDescription());
                if (comment.getText() != null) {
                    cmt.append("\n");
                }
            }
            if (comment.getText() != null) {
                cmt.append(comment.getText());
            }
            // Render once and reuse for both the XHTML list and the metadata entry.
            String rendered = cmt.toString();
            comments.add(rendered);
            metadata.add(XMPDM.LOG_COMMENT.getName(), rendered);
        }
        xhtml.element("h1", tag.getTitle());
        xhtml.element("p", tag.getArtist());
        // ID3v1.1 Track addition
        // NOTE(review): if getAlbum() can return null, the text "null" is
        // appended here -- confirm against CompositeTagHandler.
        StringBuilder sb = new StringBuilder();
        sb.append(tag.getAlbum());
        if (tag.getTrackNumber() != null) {
            sb.append(", track ").append(tag.getTrackNumber());
            metadata.set(XMPDM.TRACK_NUMBER, tag.getTrackNumber());
        }
        if (tag.getDisc() != null) {
            sb.append(", disc ").append(tag.getDisc());
            metadata.set(XMPDM.DISC_NUMBER, tag.getDisc());
        }
        xhtml.element("p", sb.toString());
        xhtml.element("p", tag.getYear());
        xhtml.element("p", tag.getGenre());
        xhtml.element("p", String.valueOf(audioAndTags.duration));
        for (String comment : comments) {
            xhtml.element("p", comment);
        }
    }
    // The duration is only recorded as metadata when it is positive.
    if (audioAndTags.duration > 0) {
        metadata.set(XMPDM.DURATION, audioAndTags.duration);
    }
    if (audioAndTags.audio != null) {
        metadata.set("samplerate", String.valueOf(audioAndTags.audio.getSampleRate()));
        metadata.set("channels", String.valueOf(audioAndTags.audio.getChannels()));
        metadata.set("version", audioAndTags.audio.getVersion());
        metadata.set(XMPDM.AUDIO_SAMPLE_RATE, Integer.toString(audioAndTags.audio.getSampleRate()));
        // Map the channel count onto a channel-type label; other counts set nothing.
        if (audioAndTags.audio.getChannels() == 1) {
            metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono");
        } else if (audioAndTags.audio.getChannels() == 2) {
            metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo");
        } else if (audioAndTags.audio.getChannels() == 5) {
            metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "5.1");
        } else if (audioAndTags.audio.getChannels() == 7) {
            metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "7.1");
        }
    }
    if (audioAndTags.lyrics != null && audioAndTags.lyrics.hasLyrics()) {
        xhtml.startElement("p", "class", "lyrics");
        xhtml.characters(audioAndTags.lyrics.lyricsText);
        xhtml.endElement("p");
    }
    xhtml.endDocument();
}
Also used : ArrayList(java.util.ArrayList) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) ID3Comment(org.apache.tika.parser.mp3.ID3Tags.ID3Comment)

Example 59 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class MP4Parser method parse.

/**
 * Parses an MP4 / QuickTime container: detects the content type from the file
 * type box, records movie, track, audio and iTunes-style metadata on
 * {@code metadata}, and writes the textual iTunes values as XHTML paragraphs
 * to {@code handler}.
 *
 * <p>If the file has no movie box the method returns after setting the content
 * type, without emitting any XHTML. Only the first track is inspected. A track
 * that lacks a track header box or a sample table box is tolerated: the
 * corresponding metadata is simply not set.
 *
 * @param stream   the container data; it is accessed as a file via
 *                 {@code TikaInputStream}
 * @param handler  receiver of the XHTML SAX events
 * @param metadata metadata object to populate
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading the file fails
 * @throws SAXException  if writing to the XHTML handler fails
 * @throws TikaException declared by the interface; may propagate from resource handling
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // The MP4Parser library accepts either a File, or a byte array
    // As MP4 video files are typically large, always use a file to
    //  avoid OOMs that may occur with in-memory buffering
    TemporaryResources tmp = new TemporaryResources();
    TikaInputStream tstream = TikaInputStream.get(stream, tmp);
    try (DataSource dataSource = new DirectFileReadDataSource(tstream.getFile())) {
        try (IsoFile isoFile = new IsoFile(dataSource)) {
            // NOTE(review): isoFile is registered with tmp and is also managed by
            // this try-with-resources, so it appears to be closed twice -- confirm
            // that IsoFile.close() tolerates that.
            tmp.addResource(isoFile);
            // Grab the file type box
            FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
            if (fileType != null) {
                // Identify the type; fall back to application/mp4 for unknown brands
                MediaType type = MediaType.application("mp4");
                for (Map.Entry<MediaType, List<String>> e : typesMap.entrySet()) {
                    if (e.getValue().contains(fileType.getMajorBrand())) {
                        type = e.getKey();
                        break;
                    }
                }
                metadata.set(Metadata.CONTENT_TYPE, type.toString());
                if (type.getType().equals("audio")) {
                    metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
                }
            } else {
                // Some older QuickTime files lack the FileType
                metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
            }
            // Get the main MOOV box
            MovieBox moov = getOrNull(isoFile, MovieBox.class);
            if (moov == null) {
                // Bail out
                return;
            }
            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();
            // Pull out some information from the header box
            MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
            if (mHeader != null) {
                // Get the creation and modification dates
                metadata.set(Metadata.CREATION_DATE, mHeader.getCreationTime());
                metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime());
                // Get the duration
                double durationSeconds = ((double) mHeader.getDuration()) / mHeader.getTimescale();
                metadata.set(XMPDM.DURATION, DURATION_FORMAT.format(durationSeconds));
                // The timescale is normally the sampling rate
                metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) mHeader.getTimescale());
            }
            // Get some more information from the track header
            // TODO Decide how to handle multiple tracks
            List<TrackBox> tb = moov.getBoxes(TrackBox.class);
            if (!tb.isEmpty()) {
                TrackBox track = tb.get(0);
                // A malformed file may carry a track without a header box;
                // skip the header-derived values instead of failing with an NPE.
                TrackHeaderBox header = track.getTrackHeaderBox();
                if (header != null) {
                    // Get the creation and modification dates
                    metadata.set(TikaCoreProperties.CREATED, header.getCreationTime());
                    metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime());
                    // Get the video width and height
                    metadata.set(Metadata.IMAGE_WIDTH, (int) header.getWidth());
                    metadata.set(Metadata.IMAGE_LENGTH, (int) header.getHeight());
                }
                // Get the sample information; the sample table may be absent too
                SampleTableBox samples = track.getSampleTableBox();
                SampleDescriptionBox sampleDesc = (samples != null) ? samples.getSampleDescriptionBox() : null;
                if (sampleDesc != null) {
                    // Look for the first Audio Sample, if present
                    AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
                    if (sample != null) {
                        XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
                        //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize());    // TODO Num -> Type mapping
                        metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) sample.getSampleRate());
                        //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
                        //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
                    }
                }
            }
            // Get metadata from the User Data Box
            UserDataBox userData = getOrNull(moov, UserDataBox.class);
            if (userData != null) {
                // NOTE(review): meta may be null here; this relies on getOrNull
                // accepting a null container -- confirm against its definition.
                MetaBox meta = getOrNull(userData, MetaBox.class);
                // Check for iTunes Metadata
                // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
                //  http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
                AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class);
                if (apple != null) {
                    // Title
                    AppleNameBox title = getOrNull(apple, AppleNameBox.class);
                    addMetadata(TikaCoreProperties.TITLE, metadata, title);
                    // Artist
                    AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
                    addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
                    addMetadata(XMPDM.ARTIST, metadata, artist);
                    // Album Artist
                    AppleArtist2Box artist2 = getOrNull(apple, AppleArtist2Box.class);
                    addMetadata(XMPDM.ALBUM_ARTIST, metadata, artist2);
                    // Album
                    AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
                    addMetadata(XMPDM.ALBUM, metadata, album);
                    // Composer
                    AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
                    addMetadata(XMPDM.COMPOSER, metadata, composer);
                    // Genre
                    AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class);
                    addMetadata(XMPDM.GENRE, metadata, genre);
                    // Year
                    AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class);
                    if (year != null) {
                        metadata.set(XMPDM.RELEASE_DATE, year.getValue());
                    }
                    // Track number
                    AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
                    if (trackNum != null) {
                        metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA());
                        //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO
                    }
                    // Disc number
                    AppleDiskNumberBox discNum = getOrNull(apple, AppleDiskNumberBox.class);
                    if (discNum != null) {
                        metadata.set(XMPDM.DISC_NUMBER, discNum.getA());
                    }
                    // Compilation
                    AppleCompilationBox compilation = getOrNull(apple, AppleCompilationBox.class);
                    if (compilation != null) {
                        metadata.set(XMPDM.COMPILATION, (int) compilation.getValue());
                    }
                    // Comment
                    AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
                    addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
                    // Encoder
                    AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
                    if (encoder != null) {
                        metadata.set(XMP.CREATOR_TOOL, encoder.getValue());
                    }
                    // As text
                    for (Box box : apple.getBoxes()) {
                        if (box instanceof Utf8AppleDataBox) {
                            xhtml.element("p", ((Utf8AppleDataBox) box).getValue());
                        }
                    }
                }
                // TODO Check for other kinds too
            }
            // All done
            xhtml.endDocument();
        }
    } finally {
        tmp.dispose();
    }
}
Also used : AudioSampleEntry(com.coremedia.iso.boxes.sampleentry.AudioSampleEntry) AppleAlbumBox(com.googlecode.mp4parser.boxes.apple.AppleAlbumBox) TikaInputStream(org.apache.tika.io.TikaInputStream) FileTypeBox(com.coremedia.iso.boxes.FileTypeBox) AppleTrackNumberBox(com.googlecode.mp4parser.boxes.apple.AppleTrackNumberBox) MetaBox(com.coremedia.iso.boxes.MetaBox) AppleCompilationBox(com.googlecode.mp4parser.boxes.apple.AppleCompilationBox) AppleArtist2Box(com.googlecode.mp4parser.boxes.apple.AppleArtist2Box) AppleRecordingYear2Box(com.googlecode.mp4parser.boxes.apple.AppleRecordingYear2Box) AppleGenreBox(com.googlecode.mp4parser.boxes.apple.AppleGenreBox) Utf8AppleDataBox(com.googlecode.mp4parser.boxes.apple.Utf8AppleDataBox) MediaType(org.apache.tika.mime.MediaType) List(java.util.List) SampleDescriptionBox(com.coremedia.iso.boxes.SampleDescriptionBox) TrackHeaderBox(com.coremedia.iso.boxes.TrackHeaderBox) IsoFile(com.coremedia.iso.IsoFile) AppleCommentBox(com.googlecode.mp4parser.boxes.apple.AppleCommentBox) UserDataBox(com.coremedia.iso.boxes.UserDataBox) MovieHeaderBox(com.coremedia.iso.boxes.MovieHeaderBox) TemporaryResources(org.apache.tika.io.TemporaryResources) AppleEncoderBox(com.googlecode.mp4parser.boxes.apple.AppleEncoderBox) AppleArtistBox(com.googlecode.mp4parser.boxes.apple.AppleArtistBox) AppleCompilationBox(com.googlecode.mp4parser.boxes.apple.AppleCompilationBox) UserDataBox(com.coremedia.iso.boxes.UserDataBox) MovieHeaderBox(com.coremedia.iso.boxes.MovieHeaderBox) AppleArtist2Box(com.googlecode.mp4parser.boxes.apple.AppleArtist2Box) AppleArtistBox(com.googlecode.mp4parser.boxes.apple.AppleArtistBox) AppleEncoderBox(com.googlecode.mp4parser.boxes.apple.AppleEncoderBox) AppleTrackNumberBox(com.googlecode.mp4parser.boxes.apple.AppleTrackNumberBox) AppleNameBox(com.googlecode.mp4parser.boxes.apple.AppleNameBox) SampleTableBox(com.coremedia.iso.boxes.SampleTableBox) TrackBox(com.coremedia.iso.boxes.TrackBox) 
AppleDiskNumberBox(com.googlecode.mp4parser.boxes.apple.AppleDiskNumberBox) AppleRecordingYear2Box(com.googlecode.mp4parser.boxes.apple.AppleRecordingYear2Box) AppleGenreBox(com.googlecode.mp4parser.boxes.apple.AppleGenreBox) MetaBox(com.coremedia.iso.boxes.MetaBox) MovieBox(com.coremedia.iso.boxes.MovieBox) Utf8AppleDataBox(com.googlecode.mp4parser.boxes.apple.Utf8AppleDataBox) AppleCommentBox(com.googlecode.mp4parser.boxes.apple.AppleCommentBox) Box(com.coremedia.iso.boxes.Box) SampleDescriptionBox(com.coremedia.iso.boxes.SampleDescriptionBox) FileTypeBox(com.coremedia.iso.boxes.FileTypeBox) AppleAlbumBox(com.googlecode.mp4parser.boxes.apple.AppleAlbumBox) TrackHeaderBox(com.coremedia.iso.boxes.TrackHeaderBox) AppleItemListBox(com.coremedia.iso.boxes.apple.AppleItemListBox) AppleTrackAuthorBox(com.googlecode.mp4parser.boxes.apple.AppleTrackAuthorBox) AppleDiskNumberBox(com.googlecode.mp4parser.boxes.apple.AppleDiskNumberBox) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) DataSource(com.googlecode.mp4parser.DataSource) SampleTableBox(com.coremedia.iso.boxes.SampleTableBox) TrackBox(com.coremedia.iso.boxes.TrackBox) AppleTrackAuthorBox(com.googlecode.mp4parser.boxes.apple.AppleTrackAuthorBox) MovieBox(com.coremedia.iso.boxes.MovieBox) AppleItemListBox(com.coremedia.iso.boxes.apple.AppleItemListBox) Map(java.util.Map) HashMap(java.util.HashMap) AppleNameBox(com.googlecode.mp4parser.boxes.apple.AppleNameBox)

Example 60 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class NamedEntityParser method parse.

/**
 * Extracts the text of the input, runs every recogniser in {@code nerChain}
 * over it, adds each recognised name to {@code metadata}, and finally hands
 * the trimmed text to {@code extractOutput}.
 *
 * <p>The parser is initialised on first use. If it is not available after
 * initialisation the method returns without reading the stream.
 *
 * @param inputStream    the content to analyse; read directly as UTF-8 when the
 *                       metadata content type is {@code text/plain}, otherwise
 *                       converted to a reader by {@code secondaryParser}
 * @param contentHandler receiver of the XHTML SAX events
 * @param metadata       metadata object; names are added under keys built from
 *                       {@code MD_KEY_PREFIX} plus the entity type
 * @param parseContext   parse context, passed to {@code initialize}
 * @throws IOException   if reading the text fails
 * @throws SAXException  declared by the interface; may propagate from {@code extractOutput}
 * @throws TikaException declared by the interface; may propagate from initialisation or
 *                       the secondary parser
 */
public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
    if (!initialized) {
        initialize(parseContext);
    }
    if (!available) {
        return;
    }
    Reader reader = MediaType.TEXT_PLAIN.toString().equals(metadata.get(Metadata.CONTENT_TYPE)) ? new InputStreamReader(inputStream, StandardCharsets.UTF_8) : secondaryParser.parse(inputStream);
    String text;
    try {
        text = IOUtils.toString(reader);
    } finally {
        // Close in finally so the reader is released even when reading fails.
        IOUtils.closeQuietly(reader);
    }
    for (NERecogniser ner : nerChain) {
        Map<String, Set<String>> names = ner.recognise(text);
        if (names != null) {
            for (Map.Entry<String, Set<String>> entry : names.entrySet()) {
                // Skip entity types for which the recogniser supplied no name set.
                if (entry.getValue() != null) {
                    String mdKey = MD_KEY_PREFIX + entry.getKey();
                    for (String name : entry.getValue()) {
                        metadata.add(mdKey, name);
                    }
                }
            }
        }
    }
    XHTMLContentHandler xhtml = new XHTMLContentHandler(contentHandler, metadata);
    extractOutput(text.trim(), xhtml);
}
Also used : OpenNLPNERecogniser(org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser) RegexNERecogniser(org.apache.tika.parser.ner.regex.RegexNERecogniser) HashSet(java.util.HashSet) Set(java.util.Set) InputStreamReader(java.io.InputStreamReader) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) Map(java.util.Map)

Aggregations

XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)72 TikaException (org.apache.tika.exception.TikaException)26 TikaInputStream (org.apache.tika.io.TikaInputStream)22 TemporaryResources (org.apache.tika.io.TemporaryResources)14 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)13 IOException (java.io.IOException)12 SAXException (org.xml.sax.SAXException)9 File (java.io.File)6 EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor)6 Metadata (org.apache.tika.metadata.Metadata)6 BufferedInputStream (java.io.BufferedInputStream)5 InputStream (java.io.InputStream)5 EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 Charset (java.nio.charset.Charset)4 ArrayList (java.util.ArrayList)4 Map (java.util.Map)4 MediaType (org.apache.tika.mime.MediaType)4 OfflineContentHandler (org.apache.tika.sax.OfflineContentHandler)4 InputStreamReader (java.io.InputStreamReader)3