Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
Class ObjectRecognitionParser, method parse.
/**
 * Runs the configured recogniser over the stream, records the accepted objects in the
 * metadata and writes them to the handler as an XHTML ordered list.
 *
 * <p>If the recogniser reports that it is not available, a warning is logged and the method
 * returns without touching the metadata or the handler. Otherwise the recogniser class name
 * is stored under {@code MD_REC_IMPL_KEY}. The recognised objects are sorted with
 * {@code DESC_CONFIDENCE_SORTER}, and objects are accepted in that order while their
 * confidence is at least {@code minConfidence}, until {@code topN} objects have been accepted.
 * Every object examined that falls below the threshold is logged at WARN level and skipped.
 *
 * <p>Each accepted object is added to the metadata under {@code MD_KEY} as
 * {@code "label (confidence)"} and emitted as an {@code <li>} inside
 * {@code <ol id="objects">}. When the recogniser returns {@code null} or an empty list,
 * the metadata entry {@code "no.objects"} is set to {@code "true"} and nothing is written
 * to the handler.
 *
 * @param stream   the document stream handed to the recogniser
 * @param handler  the SAX handler that receives the XHTML output
 * @param metadata the metadata object to populate
 * @param context  the parse context handed to the recogniser
 * @throws IOException   propagated from the recogniser
 * @throws SAXException  propagated from the recogniser or from writing to the handler
 * @throws TikaException propagated from the recogniser
 */
@Override
public synchronized void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    if (!recogniser.isAvailable()) {
        LOG.warn("{} is not available for service", recogniser.getClass());
        return;
    }
    metadata.set(MD_REC_IMPL_KEY, recogniser.getClass().getName());
    long start = System.currentTimeMillis();
    List<RecognisedObject> objects = recogniser.recognise(stream, handler, metadata, context);
    LOG.debug("Found {} objects", objects != null ? objects.size() : 0);
    LOG.debug("Time taken {}ms", System.currentTimeMillis() - start);
    if (objects == null || objects.isEmpty()) {
        LOG.warn("NO objects");
        metadata.add("no.objects", Boolean.TRUE.toString());
        return;
    }

    Collections.sort(objects, DESC_CONFIDENCE_SORTER);
    int count = 0;
    List<RecognisedObject> acceptedObjects = new ArrayList<RecognisedObject>(topN);
    // first process all the MD objects
    for (RecognisedObject object : objects) {
        // A single threshold check: the previous duplicated nested check made the
        // warning branch below unreachable.
        if (object.getConfidence() >= minConfidence) {
            count++;
            LOG.debug("Add {}", object);
            String mdValue = String.format(Locale.ENGLISH, "%s (%.5f)", object.getLabel(), object.getConfidence());
            metadata.add(MD_KEY, mdValue);
            acceptedObjects.add(object);
            if (count >= topN) {
                // enough objects accepted; the remaining ones are not examined
                break;
            }
        } else {
            LOG.warn("Object {} confidence {} less than min {}", object, object.getConfidence(), minConfidence);
        }
    }

    // now the handler
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.startElement("ol", "id", "objects");
    for (RecognisedObject object : acceptedObjects) {
        // one list item per accepted object
        xhtml.startElement("li", "id", object.getId());
        String text = String.format(Locale.ENGLISH, " %s [%s](confidence = %f )", object.getLabel(), object.getLabelLang(), object.getConfidence());
        xhtml.characters(text);
        xhtml.endElement("li");
    }
    xhtml.endElement("ol");
    xhtml.endDocument();
}
Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
Class StringsParser, method parse.
/**
 * Records strings-related metadata for the input and streams the output of
 * {@code doStrings} into an XHTML document on the given handler.
 *
 * <p>The {@link StringsConfig} and {@link FileConfig} are looked up in the parse context,
 * falling back to {@code DEFAULT_STRINGS_CONFIG} and {@code DEFAULT_FILE_CONFIG}. If
 * {@code hasStrings(...)} returns {@code false} the method returns without writing any
 * metadata or content.
 *
 * <p>Metadata keys written: {@code strings:min-len}, {@code strings:encoding},
 * {@code strings:file_output} and, after the content has been produced,
 * {@code strings:length} (the value returned by {@code doStrings}).
 *
 * @param stream   the document stream; it is wrapped via {@code TikaInputStream.get} so that
 *                 a {@link File} can be obtained from it
 * @param handler  the SAX handler that receives the XHTML output
 * @param metadata the metadata object to populate
 * @param context  the parse context supplying optional configuration
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final StringsConfig stringsCfg = context.get(StringsConfig.class, DEFAULT_STRINGS_CONFIG);
    final FileConfig fileCfg = context.get(FileConfig.class, DEFAULT_FILE_CONFIG);

    final boolean stringsUsable = hasStrings(stringsCfg);
    if (!stringsUsable) {
        return;
    }

    // NOTE(review): the TikaInputStream obtained here is not closed in this method;
    // confirm whether resources created by getFile() are released elsewhere.
    final File inputFile = TikaInputStream.get(stream).getFile();

    // Metadata known before the content is produced
    metadata.set("strings:min-len", String.valueOf(stringsCfg.getMinLength()));
    metadata.set("strings:encoding", stringsCfg.toString());
    metadata.set("strings:file_output", doFile(inputFile, fileCfg));

    // Content
    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    final int producedLength = doStrings(inputFile, stringsCfg, xhtml);
    xhtml.endDocument();

    // Metadata that is only available once the content has been written
    metadata.set("strings:length", Integer.toString(producedLength));
}
Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
Class FLVParser, method parse.
/**
 * Parses an FLV stream: validates the file header, records basic metadata and copies the
 * entries of every metadata tag into the {@code Metadata} object. The XHTML document
 * written to the handler has no body content of its own.
 *
 * <p>Header validation fails with a {@link TikaException} when the signature check fails,
 * the version byte is not 1, the header length is not 9, or the first previous-block-size
 * field is not 0. After the header, tags are read until end of stream or until a
 * previous-block-size field does not equal the tag's body length plus 11.
 *
 * @param stream   the FLV input
 * @param handler  the SAX handler that receives the (empty) XHTML document
 * @param metadata receives the content type, {@code hasVideo}, {@code hasAudio} and every
 *                 non-null entry found in metadata tags
 * @param context  not used by this method
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final DataInputStream in = new DataInputStream(stream);

    // ---- file header ----
    if (!checkSignature(in)) {
        throw new TikaException("FLV signature not detected");
    }
    final int flvVersion = in.readUnsignedByte();
    if (flvVersion != 1) {
        // anything other than 1 suggests this is not an FLV file
        throw new TikaException("Unpexpected FLV version: " + flvVersion);
    }
    final int flags = in.readUnsignedByte();
    final long headerLength = readUInt32(in);
    if (headerLength != 9) {
        // only the 9-byte header layout is understood
        throw new TikaException("Unpexpected FLV header length: " + headerLength);
    }
    final long firstBackPointer = readUInt32(in);
    if (firstBackPointer != 0) {
        // the first previous-block-size must be 0
        throw new TikaException("Unpexpected FLV first previous block size: " + firstBackPointer);
    }

    final boolean videoPresent = (flags & MASK_VIDEO) != 0;
    final boolean audioPresent = (flags & MASK_AUDIO) != 0;
    metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
    metadata.set("hasVideo", Boolean.toString(videoPresent));
    metadata.set("hasAudio", Boolean.toString(audioPresent));

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    // ---- tag stream: a read() of -1 for the tag type means end of stream ----
    for (int tagType = in.read(); tagType != -1; tagType = in.read()) {
        final int bodyLength = readUInt24(in);
        // timestamp (value not needed)
        readUInt32(in);
        // stream id (value not needed)
        readUInt24(in);

        if (tagType != TYPE_METADATA) {
            // not a metadata tag: consume and discard its body
            for (int remaining = bodyLength; remaining > 0; remaining--) {
                in.readByte();
            }
        } else {
            // metadata tag: pull the whole body into memory (stops early at end of stream)
            final byte[] body = new byte[bodyLength];
            int filled = 0;
            while (filled < bodyLength) {
                final int n = stream.read(body, filled, bodyLength - filled);
                if (n == -1) {
                    break;
                }
                filled += n;
            }

            final DataInputStream amf = new DataInputStream(new ByteArrayInputStream(body));
            // two AMF values are read; only the second one is inspected
            readAMFData(amf, -1);
            final Object payload = readAMFData(amf, -1);

            if (payload instanceof Map) {
                // TODO if the same key occurs in several AMF blocks, earlier values are
                // currently overwritten by later ones
                final Map<String, Object> properties = (Map<String, Object>) payload;
                for (Entry<String, Object> property : properties.entrySet()) {
                    final Object value = property.getValue();
                    if (value != null) {
                        metadata.set(property.getKey(), value.toString());
                    }
                }
            }
        }

        // previous-block-size trailer; 11 presumably matches the per-tag header bytes
        // read above (1 + 3 + 4 + 3) -- confirm against the FLV specification
        final long backPointer = readUInt32(in);
        if (backPointer != bodyLength + 11) {
            // corrupt file, or a layout this parser does not understand
            break;
        }
    }

    xhtml.endDocument();
}
Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
Class XMLParser, method parse.
/**
 * Parses an XML document with the SAX parser from the parse context and wraps the output
 * in an XHTML document containing a single {@code <p>} element.
 *
 * <p>If the metadata has no content type yet, it is set to {@code application/xml}. The SAX
 * events are delivered to the handler returned by {@code getContentHandler(...)}, wrapped in
 * an {@code EmbeddedContentHandler} and an {@code OfflineContentHandler}. The input stream is
 * passed to the SAX parser inside a {@code CloseShieldInputStream}.
 *
 * <p>The closing {@code </p>} and the end of the document are emitted in a {@code finally}
 * block, so they are attempted even when parsing fails.
 *
 * @param stream   the XML input
 * @param handler  the SAX handler that receives the XHTML output
 * @param metadata the metadata object to populate
 * @param context  supplies the SAX parser
 * @throws TikaException if the SAX parser raises a {@link SAXException} that
 *                       {@code throwIfCauseOf} does not rethrow
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final boolean contentTypeMissing = metadata.get(Metadata.CONTENT_TYPE) == null;
    if (contentTypeMissing) {
        metadata.set(Metadata.CONTENT_TYPE, "application/xml");
    }

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.startElement("p");

    final TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                getContentHandler(taggedHandler, metadata, context))));
    } catch (SAXException saxFailure) {
        // presumably rethrows failures that originated in the wrapped handler unchanged;
        // confirm against TaggedContentHandler. Anything else is reported as a parse error.
        taggedHandler.throwIfCauseOf(saxFailure);
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
Use of org.apache.tika.sax.XHTMLContentHandler in the Apache Tika project.
Class AudioParser, method parse.
/**
 * Extracts audio format metadata using {@code javax.sound.sampled.AudioSystem} and emits
 * an empty XHTML document.
 *
 * <p>Metadata written when the corresponding value is available: the content type (for
 * AIFF/AIFC, AU/SND and WAVE files), {@code channels}, {@code samplerate},
 * {@code XMPDM.AUDIO_SAMPLE_RATE}, {@code bits}, {@code XMPDM.AUDIO_SAMPLE_TYPE} (for 8, 16
 * and 32 bit samples) and {@code encoding}. The property maps of the file format and of the
 * audio format are handed to {@code addMetadata}.
 *
 * <p>An {@link UnsupportedAudioFileException} is deliberately ignored; the empty XHTML
 * document is still written in that case.
 *
 * @param stream   the audio input; wrapped in a {@link BufferedInputStream} when it does
 *                 not support mark
 * @param handler  the SAX handler that receives the (empty) XHTML document
 * @param metadata the metadata object to populate
 * @param context  not used by this method
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // AudioSystem expects a stream that supports mark
    final InputStream markable = stream.markSupported() ? stream : new BufferedInputStream(stream);

    try {
        final AudioFileFormat container = AudioSystem.getAudioFileFormat(markable);

        // Map the container type to a content type; unrecognised types leave it unset.
        final Type kind = container.getType();
        String mimeType = null;
        if (kind == Type.AIFC || kind == Type.AIFF) {
            mimeType = "audio/x-aiff";
        } else if (kind == Type.AU || kind == Type.SND) {
            mimeType = "audio/basic";
        } else if (kind == Type.WAVE) {
            mimeType = "audio/x-wav";
        }
        if (mimeType != null) {
            metadata.set(Metadata.CONTENT_TYPE, mimeType);
        }

        final AudioFormat pcm = container.getFormat();

        final int channelCount = pcm.getChannels();
        if (channelCount != AudioSystem.NOT_SPECIFIED) {
            metadata.set("channels", String.valueOf(channelCount));
            // TODO: Use XMPDM.TRACKS? (see also frame rate in AudioFormat)
        }

        final float sampleRate = pcm.getSampleRate();
        if (sampleRate != AudioSystem.NOT_SPECIFIED) {
            metadata.set("samplerate", String.valueOf(sampleRate));
            // the XMPDM property gets the rate truncated to an int
            metadata.set(XMPDM.AUDIO_SAMPLE_RATE, Integer.toString((int) sampleRate));
        }

        final int sampleBits = pcm.getSampleSizeInBits();
        if (sampleBits != AudioSystem.NOT_SPECIFIED) {
            metadata.set("bits", String.valueOf(sampleBits));
            switch (sampleBits) {
                case 8:
                    metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "8Int");
                    break;
                case 16:
                    metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "16Int");
                    break;
                case 32:
                    metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "32Int");
                    break;
                default:
                    // other sample sizes have no sample-type value here
                    break;
            }
        }

        metadata.set("encoding", pcm.getEncoding().toString());

        // The Javadoc suggests properties such as "duration", "author", "title",
        // "copyright", "date" and "comment" may appear in these maps, although the
        // original author noted never finding any in practice.
        addMetadata(metadata, container.properties());
        addMetadata(metadata, pcm.properties());
    } catch (UnsupportedAudioFileException ignored) {
        // Intentionally ignored: there is no way to tell a corrupted document from an
        // unsupported format, so no audio metadata is recorded.
    }

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
Aggregations