Search in sources :

Example 6 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class ObjectRecognitionParser method parse.

/**
 * Runs the configured recogniser over the stream, records the accepted
 * objects in the metadata and writes them to the handler as an XHTML
 * ordered list.
 *
 * <p>If the recogniser reports that it is not available, a warning is logged
 * and the method returns without touching the metadata or the handler. If
 * the recogniser returns {@code null} or an empty list, the metadata key
 * {@code no.objects} is added with the value {@code "true"} and nothing is
 * written to the handler.
 *
 * <p>Objects are sorted with {@code DESC_CONFIDENCE_SORTER}; those whose
 * confidence is at least {@code minConfidence} are accepted until
 * {@code topN} of them have been collected. Objects below the threshold are
 * reported with a warning and skipped.
 *
 * @param stream   the document stream handed to the recogniser
 * @param handler  SAX handler that receives the XHTML output
 * @param metadata metadata that receives the recogniser class name and one
 *                 {@code MD_KEY} entry per accepted object
 * @param context  parse context passed through to the recogniser
 * @throws IOException   if thrown by the recogniser
 * @throws SAXException  if the recogniser or the handler reports a SAX failure
 * @throws TikaException if thrown by the recogniser
 */
@Override
public synchronized void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    if (!recogniser.isAvailable()) {
        LOG.warn("{} is not available for service", recogniser.getClass());
        return;
    }
    metadata.set(MD_REC_IMPL_KEY, recogniser.getClass().getName());
    long start = System.currentTimeMillis();
    List<RecognisedObject> objects = recogniser.recognise(stream, handler, metadata, context);
    LOG.debug("Found {} objects", objects != null ? objects.size() : 0);
    LOG.debug("Time taken {}ms", System.currentTimeMillis() - start);
    if (objects != null && !objects.isEmpty()) {
        Collections.sort(objects, DESC_CONFIDENCE_SORTER);
        int count = 0;
        List<RecognisedObject> acceptedObjects = new ArrayList<RecognisedObject>(topN);
        // first process all the MD objects
        for (RecognisedObject object : objects) {
            // A single threshold check: the previous duplicated nested check
            // made the warning branch below unreachable.
            if (object.getConfidence() >= minConfidence) {
                count++;
                LOG.debug("Add {}", object);
                String mdValue = String.format(Locale.ENGLISH, "%s (%.5f)", object.getLabel(), object.getConfidence());
                metadata.add(MD_KEY, mdValue);
                acceptedObjects.add(object);
                if (count >= topN) {
                    // enough objects collected; the rest are ignored
                    break;
                }
            } else {
                LOG.warn("Object {} confidence {} less than min {}", object, object.getConfidence(), minConfidence);
            }
        }
        // now the handler
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("ol", "id", "objects");
        for (RecognisedObject object : acceptedObjects) {
            // one <li> per accepted object, identified by the object's id
            xhtml.startElement("li", "id", object.getId());
            String text = String.format(Locale.ENGLISH, " %s [%s](confidence = %f )", object.getLabel(), object.getLabelLang(), object.getConfidence());
            xhtml.characters(text);
            xhtml.endElement("li");
        }
        xhtml.endElement("ol");
        xhtml.endDocument();
    } else {
        LOG.warn("NO objects");
        metadata.add("no.objects", Boolean.TRUE.toString());
    }
}
Also used : ArrayList(java.util.ArrayList) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 7 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class StringsParser method parse.

/**
 * Writes the result of {@code doStrings} for the input file to the handler
 * as an XHTML document and records the related {@code strings:*} metadata.
 *
 * <p>The strings and file configurations are looked up in the parse context,
 * falling back to the parser defaults. When {@code hasStrings} returns
 * {@code false} for the strings configuration the method returns immediately
 * without touching the metadata or the handler.
 *
 * @param stream   input stream; wrapped via {@code TikaInputStream.get} to obtain a file
 * @param handler  SAX handler receiving the XHTML output
 * @param metadata receives {@code strings:min-len}, {@code strings:encoding},
 *                 {@code strings:file_output} and {@code strings:length}
 * @param context  parse context that may carry {@code StringsConfig} / {@code FileConfig}
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final StringsConfig stringsSettings = context.get(StringsConfig.class, DEFAULT_STRINGS_CONFIG);
    final FileConfig fileSettings = context.get(FileConfig.class, DEFAULT_FILE_CONFIG);

    if (!hasStrings(stringsSettings)) {
        return;
    }

    final File sourceFile = TikaInputStream.get(stream).getFile();

    // Metadata known before any content is produced
    metadata.set("strings:min-len", "" + stringsSettings.getMinLength());
    metadata.set("strings:encoding", stringsSettings.toString());
    metadata.set("strings:file_output", doFile(sourceFile, fileSettings));

    // Content: doStrings writes into the open XHTML document
    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    final int byteCount = doStrings(sourceFile, stringsSettings, document);
    document.endDocument();

    // Metadata only known once the content has been produced
    metadata.set("strings:length", "" + byteCount);
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) File(java.io.File)

Example 8 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class FLVParser method parse.

/**
 * Reads an FLV stream, validates its header and copies values found in
 * metadata tags into {@code metadata}. The XHTML document sent to the
 * handler is opened and closed without any body content being written here.
 *
 * <p>Header validation fails with a {@link TikaException} when the signature
 * check fails, the version byte is not 1, the header length is not 9, or the
 * first "previous block size" field is not 0. Tag reading stops at end of
 * stream or as soon as a tag's trailing size field does not equal the tag's
 * body length plus 11.
 *
 * @param stream   FLV input
 * @param handler  SAX handler receiving the (empty) XHTML document
 * @param metadata receives the content type, {@code hasVideo}, {@code hasAudio}
 *                 and every non-null entry of each metadata tag
 * @param context  parse context (not used by this method)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final DataInputStream in = new DataInputStream(stream);
    if (!checkSignature(in)) {
        throw new TikaException("FLV signature not detected");
    }

    // --- fixed header ---
    final int flvVersion = in.readUnsignedByte();
    if (flvVersion != 1) {
        // anything else suggests this is not FLV
        throw new TikaException("Unpexpected FLV version: " + flvVersion);
    }
    final int flags = in.readUnsignedByte();
    final long headerLength = readUInt32(in);
    if (headerLength != 9) {
        // only the 9-byte header layout is understood
        throw new TikaException("Unpexpected FLV header length: " + headerLength);
    }
    final long firstPreviousSize = readUInt32(in);
    if (firstPreviousSize != 0) {
        // must be 0 before the first tag
        throw new TikaException("Unpexpected FLV first previous block size: " + firstPreviousSize);
    }

    metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
    metadata.set("hasVideo", Boolean.toString((flags & MASK_VIDEO) != 0));
    metadata.set("hasAudio", Boolean.toString((flags & MASK_AUDIO) != 0));

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    // --- tag stream: loop ends on EOF or on an inconsistent trailing size ---
    int tagType;
    while ((tagType = in.read()) != -1) {
        final int bodyLength = readUInt24(in);
        // timestamp (value not needed)
        readUInt32(in);
        // stream id (value not needed)
        readUInt24(in);

        if (tagType != TYPE_METADATA) {
            // not a metadata tag: consume its body byte by byte
            for (int skipped = 0; skipped < bodyLength; skipped++) {
                in.readByte();
            }
        } else {
            // metadata tag: buffer the body, stopping early if the stream ends
            final byte[] payload = new byte[bodyLength];
            int filled = 0;
            while (filled < bodyLength) {
                final int got = stream.read(payload, filled, bodyLength - filled);
                if (got == -1) {
                    break;
                }
                filled += got;
            }

            final DataInputStream amfInput = new DataInputStream(new ByteArrayInputStream(payload));
            // two AMF values are read; only the second one is inspected
            readAMFData(amfInput, -1);
            final Object amfValue = readAMFData(amfInput, -1);

            if (amfValue instanceof Map) {
                // TODO if there are multiple metadata values with same key (in
                // separate AMF blocks, we currently lose previous values)
                Map<String, Object> properties = (Map<String, Object>) amfValue;
                for (Entry<String, Object> property : properties.entrySet()) {
                    final Object value = property.getValue();
                    if (value != null) {
                        metadata.set(property.getKey(), value.toString());
                    }
                }
            }
        }

        // trailing "previous block size" must equal body length + 11
        final long trailingSize = readUInt32(in);
        if (trailingSize != bodyLength + 11) {
            // file was corrupt or we could not parse it...
            break;
        }
    }
    xhtml.endDocument();
}
Also used : TikaException(org.apache.tika.exception.TikaException) ByteArrayInputStream(java.io.ByteArrayInputStream) DataInputStream(java.io.DataInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) HashMap(java.util.HashMap) Map(java.util.Map)

Example 9 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class XMLParser method parse.

/**
 * Parses the stream with the SAX parser obtained from the parse context,
 * wrapping the output in an XHTML document containing a single {@code <p>}.
 *
 * <p>The content type is set to {@code application/xml} only when the
 * metadata does not already carry one. The stream is passed to the SAX
 * parser through a {@code CloseShieldInputStream}. A {@link SAXException}
 * from the parser is first offered to the tagged handler's
 * {@code throwIfCauseOf}, and otherwise rethrown wrapped in a
 * {@link TikaException}. The {@code <p>} element and the document are closed
 * in a {@code finally} block, so they are closed on failure as well.
 *
 * @param stream   XML input
 * @param handler  SAX handler receiving the XHTML output
 * @param metadata document metadata; may receive a default content type
 * @param context  parse context supplying the SAX parser
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    if (metadata.get(Metadata.CONTENT_TYPE) == null) {
        metadata.set(Metadata.CONTENT_TYPE, "application/xml");
    }

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.startElement("p");

    final TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
    try {
        // Build the handler chain from the inside out, then hand it to the parser.
        final ContentHandler extracting = getContentHandler(taggedHandler, metadata, context);
        final EmbeddedContentHandler embedded = new EmbeddedContentHandler(extracting);
        final OfflineContentHandler offline = new OfflineContentHandler(embedded);
        context.getSAXParser().parse(new CloseShieldInputStream(stream), offline);
    } catch (SAXException saxFailure) {
        // Let the tagged handler rethrow the failure if it recognises it as its own.
        taggedHandler.throwIfCauseOf(saxFailure);
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
Also used : OfflineContentHandler(org.apache.tika.sax.OfflineContentHandler) TikaException(org.apache.tika.exception.TikaException) TaggedContentHandler(org.apache.tika.sax.TaggedContentHandler) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) SAXException(org.xml.sax.SAXException)

Example 10 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

the class AudioParser method parse.

/**
 * Extracts audio format metadata via {@link AudioSystem} and emits an empty
 * XHTML document.
 *
 * <p>The stream is wrapped in a {@link BufferedInputStream} when it does not
 * support mark/reset. If {@link AudioSystem#getAudioFileFormat} throws
 * {@link UnsupportedAudioFileException} the exception is ignored; the empty
 * XHTML document is still written.
 *
 * @param stream   audio input
 * @param handler  SAX handler receiving the empty XHTML document
 * @param metadata receives content type, channels, sample rate, sample size,
 *                 encoding and the properties passed to {@code addMetadata}
 * @param context  parse context (not used by this method)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // AudioSystem expects the stream to support the mark feature
    final InputStream markable = stream.markSupported() ? stream : new BufferedInputStream(stream);

    try {
        final AudioFileFormat fileFormat = AudioSystem.getAudioFileFormat(markable);

        // Map the container type to a content type; unknown types leave it untouched.
        final Type fileType = fileFormat.getType();
        String contentType = null;
        if (fileType == Type.AIFC || fileType == Type.AIFF) {
            contentType = "audio/x-aiff";
        } else if (fileType == Type.AU || fileType == Type.SND) {
            contentType = "audio/basic";
        } else if (fileType == Type.WAVE) {
            contentType = "audio/x-wav";
        }
        if (contentType != null) {
            metadata.set(Metadata.CONTENT_TYPE, contentType);
        }

        final AudioFormat format = fileFormat.getFormat();

        final int channelCount = format.getChannels();
        if (channelCount != AudioSystem.NOT_SPECIFIED) {
            // TODO: Use XMPDM.TRACKS? (see also frame rate in AudioFormat)
            metadata.set("channels", String.valueOf(channelCount));
        }

        final float sampleRate = format.getSampleRate();
        if (sampleRate != AudioSystem.NOT_SPECIFIED) {
            metadata.set("samplerate", String.valueOf(sampleRate));
            metadata.set(XMPDM.AUDIO_SAMPLE_RATE, Integer.toString((int) sampleRate));
        }

        final int sampleBits = format.getSampleSizeInBits();
        if (sampleBits != AudioSystem.NOT_SPECIFIED) {
            metadata.set("bits", String.valueOf(sampleBits));
            // Only 8, 16 and 32 bit samples get an XMPDM sample type.
            switch (sampleBits) {
                case 8:
                    metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "8Int");
                    break;
                case 16:
                    metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "16Int");
                    break;
                case 32:
                    metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, "32Int");
                    break;
                default:
                    break;
            }
        }

        metadata.set("encoding", format.getEncoding().toString());

        // Javadoc suggests that some of the following properties might
        // be available, but I had no success in finding any:
        // "duration" Long playback duration of the file in microseconds
        // "author" String name of the author of this file
        // "title" String title of this file
        // "copyright" String copyright message
        // "date" Date date of the recording or release
        // "comment" String an arbitrary text
        addMetadata(metadata, fileFormat.properties());
        addMetadata(metadata, format.properties());
    } catch (UnsupportedAudioFileException ignored) {
        // There is no way to know whether this exception was
        // caused by the document being corrupted or by the format
        // just being unsupported. So we do nothing.
    }

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
Also used : UnsupportedAudioFileException(javax.sound.sampled.UnsupportedAudioFileException) MediaType(org.apache.tika.mime.MediaType) Type(javax.sound.sampled.AudioFileFormat.Type) BufferedInputStream(java.io.BufferedInputStream) AudioFormat(javax.sound.sampled.AudioFormat) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) AudioFileFormat(javax.sound.sampled.AudioFileFormat)

Aggregations

XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 72; TikaException (org.apache.tika.exception.TikaException): 26; TikaInputStream (org.apache.tika.io.TikaInputStream): 22; TemporaryResources (org.apache.tika.io.TemporaryResources): 14; CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream): 13; IOException (java.io.IOException): 12; SAXException (org.xml.sax.SAXException): 9; File (java.io.File): 6; EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor): 6; Metadata (org.apache.tika.metadata.Metadata): 6; BufferedInputStream (java.io.BufferedInputStream): 5; InputStream (java.io.InputStream): 5; EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler): 5; ByteArrayInputStream (java.io.ByteArrayInputStream): 4; Charset (java.nio.charset.Charset): 4; ArrayList (java.util.ArrayList): 4; Map (java.util.Map): 4; MediaType (org.apache.tika.mime.MediaType): 4; OfflineContentHandler (org.apache.tika.sax.OfflineContentHandler): 4; InputStreamReader (java.io.InputStreamReader): 3