use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class Word2006MLParser method parse.
/**
 * Parses the given stream as XML with the SAX parser obtained from the parse context,
 * routing SAX events through an {@link OfflineContentHandler} and an
 * {@link EmbeddedContentHandler} into a {@link Word2006MLDocHandler} that writes to an
 * {@link XHTMLContentHandler} wrapped around the caller's handler.
 *
 * <p>The input stream is wrapped in a {@link CloseShieldInputStream} before it is handed
 * to the SAX parser. The XHTML document is always ended, even when parsing fails.
 *
 * @param stream   the document stream to parse
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata document metadata, passed to the XHTML handler and the document handler
 * @param context  parse context supplying the SAX parser
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if starting or ending the XHTML document fails
 * @throws TikaException if the SAX parser reports a {@link SAXException} while parsing
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Set OfficeParserConfig if the user hasn't specified one.
    configure(context);

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try {
        // Everything below is created inside the try so that endDocument() still runs
        // if any of these steps fails; the order mirrors a single nested call.
        final javax.xml.parsers.SAXParser saxParser = context.getSAXParser();
        final InputStream shielded = new CloseShieldInputStream(stream);
        final Word2006MLDocHandler docHandler = new Word2006MLDocHandler(xhtml, metadata, context);
        final EmbeddedContentHandler embedded = new EmbeddedContentHandler(docHandler);
        final OfflineContentHandler offline = new OfflineContentHandler(embedded);
        saxParser.parse(shielded, offline);
    } catch (SAXException parseFailure) {
        throw new TikaException("XML parse error", parseFailure);
    } finally {
        xhtml.endDocument();
    }
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class AbstractXML2003Parser method parse.
/**
 * Parses the given stream as XML with the SAX parser obtained from the parse context.
 * SAX events flow through an {@link OfflineContentHandler} and an
 * {@link EmbeddedContentHandler} into the handler returned by
 * {@code getContentHandler(...)}, which is given a {@link TaggedContentHandler} wrapped
 * around the XHTML output.
 *
 * <p>When the SAX parser fails, the tagged handler is first asked to rethrow the exception
 * if it was the cause; otherwise the failure is wrapped in a {@link TikaException}. The
 * XHTML document is always ended, even when parsing fails.
 *
 * @param stream   the document stream to parse
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata document metadata; its content type is set before parsing starts
 * @param context  parse context supplying the SAX parser
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the tagged handler rethrows, or the XHTML document cannot be
 *                       started or ended
 * @throws TikaException for any other SAX failure raised during parsing
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    setContentType(metadata);

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    final TaggedContentHandler tagged = new TaggedContentHandler(xhtml);
    try {
        // Built step by step inside the try, in the same order a single nested call
        // would evaluate them: parser first, then the stream, then the handler chain.
        final javax.xml.parsers.SAXParser saxParser = context.getSAXParser();
        final InputStream shielded = new CloseShieldInputStream(stream);
        final EmbeddedContentHandler embedded = new EmbeddedContentHandler(getContentHandler(tagged, metadata, context));
        final OfflineContentHandler offline = new OfflineContentHandler(embedded);
        saxParser.parse(shielded, offline);
    } catch (SAXException parseFailure) {
        // Give the tagged handler the chance to rethrow before wrapping.
        tagged.throwIfCauseOf(parseFailure);
        throw new TikaException("XML parse error", parseFailure);
    } finally {
        xhtml.endDocument();
    }
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class Mp3Parser method parse.
/**
 * Extracts tag metadata, audio details and lyrics from an MP3 stream.
 *
 * <p>The content type and audio compressor are always set. When the tag handlers report
 * at least one supported tag, the title, artist, album, composer, compilation, year,
 * genre, track/disc numbers and comments are copied into {@code metadata} and also
 * written as XHTML elements. Duration (when positive), audio frame details (when
 * present) and lyrics (when present) are handled afterwards.
 *
 * <p>The album/track/disc line is built only from the parts that are actually present,
 * so a missing album no longer puts the literal text "null" into the output.
 *
 * @param stream   the MP3 stream
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata metadata object to populate
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if writing to the handler fails
 * @throws TikaException if tag processing fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");
    metadata.set(XMPDM.AUDIO_COMPRESSOR, "MP3");
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    // Create handlers for the various kinds of ID3 tags
    ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, handler);
    // Process tags metadata if the file has supported tags
    if (audioAndTags.tags.length > 0) {
        CompositeTagHandler tag = new CompositeTagHandler(audioAndTags.tags);
        metadata.set(TikaCoreProperties.TITLE, tag.getTitle());
        metadata.set(TikaCoreProperties.CREATOR, tag.getArtist());
        metadata.set(XMPDM.ARTIST, tag.getArtist());
        metadata.set(XMPDM.ALBUM_ARTIST, tag.getAlbumArtist());
        metadata.set(XMPDM.COMPOSER, tag.getComposer());
        metadata.set(XMPDM.ALBUM, tag.getAlbum());
        metadata.set(XMPDM.COMPILATION, tag.getCompilation());
        metadata.set(XMPDM.RELEASE_DATE, tag.getYear());
        metadata.set(XMPDM.GENRE, tag.getGenre());
        // Render each comment as "<language> - <description>\n<text>", leaving out
        // whichever parts are missing.
        List<String> comments = new ArrayList<String>();
        for (ID3Comment comment : tag.getComments()) {
            // Local, single-threaded buffer: StringBuilder is sufficient.
            StringBuilder cmt = new StringBuilder();
            if (comment.getLanguage() != null) {
                cmt.append(comment.getLanguage());
                cmt.append(" - ");
            }
            if (comment.getDescription() != null) {
                cmt.append(comment.getDescription());
                if (comment.getText() != null) {
                    cmt.append("\n");
                }
            }
            if (comment.getText() != null) {
                cmt.append(comment.getText());
            }
            String commentText = cmt.toString();
            comments.add(commentText);
            metadata.add(XMPDM.LOG_COMMENT.getName(), commentText);
        }
        xhtml.element("h1", tag.getTitle());
        xhtml.element("p", tag.getArtist());
        // ID3v1.1 Track addition
        // Build "<album>, track <n>, disc <m>" from the parts that exist. Appending a
        // null album directly would produce the text "null".
        StringBuilder sb = new StringBuilder();
        if (tag.getAlbum() != null) {
            sb.append(tag.getAlbum());
        }
        if (tag.getTrackNumber() != null) {
            if (sb.length() > 0) {
                sb.append(", ");
            }
            sb.append("track ").append(tag.getTrackNumber());
            metadata.set(XMPDM.TRACK_NUMBER, tag.getTrackNumber());
        }
        if (tag.getDisc() != null) {
            if (sb.length() > 0) {
                sb.append(", ");
            }
            sb.append("disc ").append(tag.getDisc());
            metadata.set(XMPDM.DISC_NUMBER, tag.getDisc());
        }
        // Skip the paragraph entirely when there is no album, track or disc to show.
        if (sb.length() > 0) {
            xhtml.element("p", sb.toString());
        }
        xhtml.element("p", tag.getYear());
        xhtml.element("p", tag.getGenre());
        xhtml.element("p", String.valueOf(audioAndTags.duration));
        for (String comment : comments) {
            xhtml.element("p", comment);
        }
    }
    if (audioAndTags.duration > 0) {
        metadata.set(XMPDM.DURATION, audioAndTags.duration);
    }
    if (audioAndTags.audio != null) {
        metadata.set("samplerate", String.valueOf(audioAndTags.audio.getSampleRate()));
        metadata.set("channels", String.valueOf(audioAndTags.audio.getChannels()));
        metadata.set("version", audioAndTags.audio.getVersion());
        metadata.set(XMPDM.AUDIO_SAMPLE_RATE, Integer.toString(audioAndTags.audio.getSampleRate()));
        // Map the reported channel count onto a channel type; other counts leave the
        // property unset.
        if (audioAndTags.audio.getChannels() == 1) {
            metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono");
        } else if (audioAndTags.audio.getChannels() == 2) {
            metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo");
        } else if (audioAndTags.audio.getChannels() == 5) {
            metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "5.1");
        } else if (audioAndTags.audio.getChannels() == 7) {
            metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "7.1");
        }
    }
    if (audioAndTags.lyrics != null && audioAndTags.lyrics.hasLyrics()) {
        xhtml.startElement("p", "class", "lyrics");
        xhtml.characters(audioAndTags.lyrics.lyricsText);
        xhtml.endElement("p");
    }
    xhtml.endDocument();
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class MP4Parser method parse.
/**
 * Extracts metadata and iTunes-style text from an MP4 / QuickTime stream.
 *
 * <p>The stream is accessed as a file via {@link TikaInputStream#getFile()} and read with
 * {@link IsoFile}. The content type is derived from the file type box's major brand,
 * falling back to {@code video/quicktime} when no file type box is present. If there is
 * no movie box the method returns after setting the content type, without writing
 * anything to {@code handler}. Otherwise metadata is collected from the movie header,
 * the first track and the Apple item list in the user data box, and every
 * {@link Utf8AppleDataBox} value is written as an XHTML paragraph.
 *
 * <p>A first track without a track header box or without a sample table box is
 * tolerated: the corresponding metadata is simply not set.
 *
 * @param stream   the MP4 stream
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata metadata object to populate
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading the stream or the backing file fails
 * @throws SAXException  if writing to the handler fails
 * @throws TikaException if temporary resources cannot be handled
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // The MP4Parser library accepts either a File, or a byte array
    // As MP4 video files are typically large, always use a file to
    // avoid OOMs that may occur with in-memory buffering
    TemporaryResources tmp = new TemporaryResources();
    TikaInputStream tstream = TikaInputStream.get(stream, tmp);
    try (DataSource dataSource = new DirectFileReadDataSource(tstream.getFile())) {
        try (IsoFile isoFile = new IsoFile(dataSource)) {
            tmp.addResource(isoFile);
            // Grab the file type box
            FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
            if (fileType != null) {
                // Identify the type
                MediaType type = MediaType.application("mp4");
                for (Map.Entry<MediaType, List<String>> e : typesMap.entrySet()) {
                    if (e.getValue().contains(fileType.getMajorBrand())) {
                        type = e.getKey();
                        break;
                    }
                }
                metadata.set(Metadata.CONTENT_TYPE, type.toString());
                if (type.getType().equals("audio")) {
                    metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
                }
            } else {
                // Some older QuickTime files lack the FileType
                metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
            }
            // Get the main MOOV box
            MovieBox moov = getOrNull(isoFile, MovieBox.class);
            if (moov == null) {
                // Bail out
                return;
            }
            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();
            // Pull out some information from the header box
            MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
            if (mHeader != null) {
                // Get the creation and modification dates
                metadata.set(Metadata.CREATION_DATE, mHeader.getCreationTime());
                metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime());
                // Get the duration
                double durationSeconds = ((double) mHeader.getDuration()) / mHeader.getTimescale();
                metadata.set(XMPDM.DURATION, DURATION_FORMAT.format(durationSeconds));
                // The timescale is normally the sampling rate
                metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) mHeader.getTimescale());
            }
            // Get some more information from the track header
            // TODO Decide how to handle multiple tracks
            List<TrackBox> tb = moov.getBoxes(TrackBox.class);
            if (tb.size() > 0) {
                TrackBox track = tb.get(0);
                TrackHeaderBox header = track.getTrackHeaderBox();
                // A malformed track may lack its header box; skip rather than fail.
                if (header != null) {
                    // Get the creation and modification dates
                    metadata.set(TikaCoreProperties.CREATED, header.getCreationTime());
                    metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime());
                    // Get the video width and height
                    metadata.set(Metadata.IMAGE_WIDTH, (int) header.getWidth());
                    metadata.set(Metadata.IMAGE_LENGTH, (int) header.getHeight());
                }
                // Get the sample information
                SampleTableBox samples = track.getSampleTableBox();
                // Likewise the sample table may be absent in a malformed track.
                SampleDescriptionBox sampleDesc = (samples == null) ? null : samples.getSampleDescriptionBox();
                if (sampleDesc != null) {
                    // Look for the first Audio Sample, if present
                    AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
                    if (sample != null) {
                        XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
                        //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize()); // TODO Num -> Type mapping
                        // Overrides the sample rate taken from the movie header timescale.
                        metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) sample.getSampleRate());
                        //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
                        //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
                    }
                }
            }
            // Get metadata from the User Data Box
            UserDataBox userData = getOrNull(moov, UserDataBox.class);
            if (userData != null) {
                MetaBox meta = getOrNull(userData, MetaBox.class);
                // Check for iTunes Metadata
                // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
                // http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
                // Do not look inside a missing meta box.
                AppleItemListBox apple = (meta == null) ? null : getOrNull(meta, AppleItemListBox.class);
                if (apple != null) {
                    // Title
                    AppleNameBox title = getOrNull(apple, AppleNameBox.class);
                    addMetadata(TikaCoreProperties.TITLE, metadata, title);
                    // Artist
                    AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
                    addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
                    addMetadata(XMPDM.ARTIST, metadata, artist);
                    // Album Artist
                    AppleArtist2Box artist2 = getOrNull(apple, AppleArtist2Box.class);
                    addMetadata(XMPDM.ALBUM_ARTIST, metadata, artist2);
                    // Album
                    AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
                    addMetadata(XMPDM.ALBUM, metadata, album);
                    // Composer
                    AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
                    addMetadata(XMPDM.COMPOSER, metadata, composer);
                    // Genre
                    AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class);
                    addMetadata(XMPDM.GENRE, metadata, genre);
                    // Year
                    AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class);
                    if (year != null) {
                        metadata.set(XMPDM.RELEASE_DATE, year.getValue());
                    }
                    // Track number
                    AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
                    if (trackNum != null) {
                        metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA());
                        //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO
                    }
                    // Disc number
                    AppleDiskNumberBox discNum = getOrNull(apple, AppleDiskNumberBox.class);
                    if (discNum != null) {
                        metadata.set(XMPDM.DISC_NUMBER, discNum.getA());
                    }
                    // Compilation
                    AppleCompilationBox compilation = getOrNull(apple, AppleCompilationBox.class);
                    if (compilation != null) {
                        metadata.set(XMPDM.COMPILATION, (int) compilation.getValue());
                    }
                    // Comment
                    AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
                    addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
                    // Encoder
                    AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
                    if (encoder != null) {
                        metadata.set(XMP.CREATOR_TOOL, encoder.getValue());
                    }
                    // As text
                    for (Box box : apple.getBoxes()) {
                        if (box instanceof Utf8AppleDataBox) {
                            xhtml.element("p", ((Utf8AppleDataBox) box).getValue());
                        }
                    }
                }
                // TODO Check for other kinds too
            }
            // All done
            xhtml.endDocument();
        }
    } finally {
        // Release the temporary resources registered above on every exit path,
        // including the early return.
        tmp.dispose();
    }
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class NamedEntityParser method parse.
/**
 * Runs every recogniser in the NER chain over the document text and records the
 * recognised names in {@code metadata}.
 *
 * <p>The parser is initialised on first use. If it is not available, the method returns
 * without reading the stream. Plain-text input (as declared by the content type in
 * {@code metadata}) is decoded as UTF-8; any other input is converted to text by the
 * secondary parser. Each recognised name is added under the key
 * {@code MD_KEY_PREFIX + <entity type>}. Finally the trimmed text is passed to
 * {@code extractOutput} together with an XHTML handler.
 *
 * @param inputStream    the document stream
 * @param contentHandler receiver of the generated XHTML SAX events
 * @param metadata       metadata to read the content type from and add names to
 * @param parseContext   parse context, used for initialisation
 * @throws IOException   if the text cannot be read
 * @throws SAXException  if writing the output fails
 * @throws TikaException if initialisation or text extraction fails
 */
public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
    if (!initialized) {
        initialize(parseContext);
    }
    if (!available) {
        return;
    }
    Reader reader = MediaType.TEXT_PLAIN.toString().equals(metadata.get(Metadata.CONTENT_TYPE)) ? new InputStreamReader(inputStream, StandardCharsets.UTF_8) : secondaryParser.parse(inputStream);
    String text;
    try {
        text = IOUtils.toString(reader);
    } finally {
        // Close the reader even when reading fails, so it does not leak.
        IOUtils.closeQuietly(reader);
    }
    for (NERecogniser ner : nerChain) {
        Map<String, Set<String>> names = ner.recognise(text);
        if (names != null) {
            for (Map.Entry<String, Set<String>> entry : names.entrySet()) {
                if (entry.getValue() != null) {
                    String mdKey = MD_KEY_PREFIX + entry.getKey();
                    for (String name : entry.getValue()) {
                        metadata.add(mdKey, name);
                    }
                }
            }
        }
    }
    XHTMLContentHandler xhtml = new XHTMLContentHandler(contentHandler, metadata);
    extractOutput(text.trim(), xhtml);
}
Aggregations