Search in sources :

Example 16 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class TikaEncodingDetectorTest method testEncodingDetectorConfigurability.

/**
 * Checks that encoding detection honours the supplied {@code TikaConfig}.
 * <p>
 * The configuration is loaded from the resource
 * {@code TIKA-2273-no-icu4j-encoding-detector.xml}; judging by its name and the
 * failure messages below it omits the ICU4J-based detector (not visible from
 * this method -- confirm against the resource). With that configuration, both
 * the {@code AutoDetectParser} path and the {@code Tika} facade path are
 * expected to throw a {@code TikaException} whose message contains
 * "Failed to detect" when given the cp500-encoded sample text.
 */
@Test
public void testEncodingDetectorConfigurability() throws Exception {
    TikaConfig restrictedConfig = new TikaConfig(getResourceAsStream("/org/apache/tika/config/TIKA-2273-no-icu4j-encoding-detector.xml"));

    // Path 1: parsing through an AutoDetectParser built from the restricted config.
    AutoDetectParser parser = new AutoDetectParser(restrictedConfig);
    try {
        // The result is irrelevant; reaching the next line means detection wrongly succeeded.
        Metadata unusedMetadata = getXML("english.cp500.txt", parser).metadata;
        fail("can't detect w/out ICU");
    } catch (TikaException expected) {
        assertContains("Failed to detect", expected.getMessage());
    }

    // Path 2: the Tika facade built from the same config must fail the same way.
    Tika facade = new Tika(restrictedConfig);
    try {
        String unusedText = facade.parseToString(getResourceAsFile("/test-documents/english.cp500.txt"));
        fail("can't detect w/out ICU");
    } catch (TikaException expected) {
        assertContains("Failed to detect", expected.getMessage());
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Tika(org.apache.tika.Tika) Test(org.junit.Test)

Example 17 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class FLVParser method parse.

/**
 * Parses an FLV stream: validates the file header, records the content type and
 * the audio/video presence flags, then walks the tag stream and copies the
 * entries of any metadata tag into {@code metadata}.
 * <p>
 * No text content is emitted; the XHTML document written to {@code handler} is
 * started and ended without a body being written by this method.
 *
 * @param stream   the FLV data; it is wrapped, not closed, by this method
 * @param handler  receives the (empty) XHTML document events
 * @param metadata receives the content type, {@code hasVideo}/{@code hasAudio}
 *                 and every non-null entry found in metadata tags
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading from the stream fails
 * @throws SAXException  if the handler rejects the document events
 * @throws TikaException if the signature, version, header length or first
 *                       previous-block size is not what an FLV header should contain
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    DataInputStream datainput = new DataInputStream(stream);
    if (!checkSignature(datainput)) {
        throw new TikaException("FLV signature not detected");
    }
    // header
    int version = datainput.readUnsignedByte();
    if (version != 1) {
        // should be 1, perhaps this is not flv?
        throw new TikaException("Unexpected FLV version: " + version);
    }
    int typeFlags = datainput.readUnsignedByte();
    long len = readUInt32(datainput);
    if (len != 9) {
        // we only know about format with header of 9 bytes
        throw new TikaException("Unexpected FLV header length: " + len);
    }
    long sizePrev = readUInt32(datainput);
    if (sizePrev != 0) {
        // should be 0, perhaps this is not flv?
        throw new TikaException("Unexpected FLV first previous block size: " + sizePrev);
    }
    metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
    metadata.set("hasVideo", Boolean.toString((typeFlags & MASK_VIDEO) != 0));
    metadata.set("hasAudio", Boolean.toString((typeFlags & MASK_AUDIO) != 0));
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    // flv tag stream follows: each iteration consumes one tag header, its body,
    // and the trailing "previous block size" field.
    while (true) {
        int type = datainput.read();
        if (type == -1) {
            // EOF
            break;
        }
        // body length
        int datalen = readUInt24(datainput);
        // timestamp (value is not needed, only consumed)
        readUInt32(datainput);
        // streamid (value is not needed, only consumed)
        readUInt24(datainput);
        if (type == TYPE_METADATA) {
            // found metadata Tag, read content to buffer
            byte[] metaBytes = new byte[datalen];
            for (int readCount = 0; readCount < datalen; ) {
                // Read through the same wrapper as every other access so that all
                // reads stay consistent; a short read is retried until the body is
                // complete or the stream ends.
                int r = datainput.read(metaBytes, readCount, datalen - readCount);
                if (r != -1) {
                    readCount += r;
                } else {
                    // Stream ended early: the remainder of the buffer stays zeroed
                    // and whatever was read is still handed to the AMF reader.
                    break;
                }
            }
            ByteArrayInputStream is = new ByteArrayInputStream(metaBytes);
            DataInputStream dis = new DataInputStream(is);
            // Two AMF values are read in sequence and only the second is kept
            // (presumably the first is the event name and the second its payload --
            // confirm against readAMFData).
            Object data = null;
            for (int i = 0; i < 2; i++) {
                data = readAMFData(dis, -1);
            }
            if (data instanceof Map) {
                // TODO if there are multiple metadata values with same key (in
                // separate AMF blocks, we currently lose previous values)
                Map<String, Object> extractedMetadata = (Map<String, Object>) data;
                for (Entry<String, Object> entry : extractedMetadata.entrySet()) {
                    if (entry.getValue() == null) {
                        continue;
                    }
                    metadata.set(entry.getKey(), entry.getValue().toString());
                }
            }
        } else {
            // Tag was not metadata, skip over data we cannot handle
            for (int i = 0; i < datalen; i++) {
                datainput.readByte();
            }
        }
        // previous block size: expected to be the body length plus 11
        sizePrev = readUInt32(datainput);
        if (sizePrev != datalen + 11) {
            // file was corrupt or we could not parse it...
            break;
        }
    }
    xhtml.endDocument();
}
Also used : TikaException(org.apache.tika.exception.TikaException) ByteArrayInputStream(java.io.ByteArrayInputStream) DataInputStream(java.io.DataInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) HashMap(java.util.HashMap) Map(java.util.Map)

Example 18 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class XMLParser method parse.

/**
 * Parses an XML stream with the SAX parser obtained from {@code context} and
 * forwards the events, through the handler returned by
 * {@code getContentHandler}, into an XHTML document consisting of a single
 * {@code <p>} element.
 *
 * @param stream   the XML input; it is passed to the SAX parser wrapped in a
 *                 {@code CloseShieldInputStream}
 * @param handler  destination of the XHTML events
 * @param metadata receives {@code application/xml} as content type when no
 *                 content type is set yet
 * @param context  supplies the SAX parser
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if emitting the XHTML events fails, or if
 *                       {@code throwIfCauseOf} rethrows a parse-time SAX error
 * @throws TikaException if the SAX parser reports any other error
 *                       ("XML parse error", with the SAX error as cause)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Only fall back to the generic XML type when nothing has been recorded yet.
    if (metadata.get(Metadata.CONTENT_TYPE) == null) {
        metadata.set(Metadata.CONTENT_TYPE, "application/xml");
    }

    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.startElement("p");

    TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                getContentHandler(taggedHandler, metadata, context))));
    } catch (SAXException saxFailure) {
        // Give the tagged handler the first chance to rethrow the error
        // (presumably when it originated from the caller's handler -- confirm
        // against TaggedContentHandler); anything else is reported as a parse error.
        taggedHandler.throwIfCauseOf(saxFailure);
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        // The paragraph and the document are closed on success and on failure alike.
        document.endElement("p");
        document.endDocument();
    }
}
Also used : OfflineContentHandler(org.apache.tika.sax.OfflineContentHandler) TikaException(org.apache.tika.exception.TikaException) TaggedContentHandler(org.apache.tika.sax.TaggedContentHandler) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) SAXException(org.xml.sax.SAXException)

Example 19 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class EncryptedPrescriptionParser method parse.

/**
 * Decrypts {@code stream} with the key returned by {@code Pharmacy.getKey()}
 * and hands the decrypted stream to a fresh {@code PrescriptionParser}.
 *
 * @param stream   the encrypted prescription data
 * @param handler  passed through unchanged to the delegate parser
 * @param metadata passed through unchanged to the delegate parser
 * @param context  passed through unchanged to the delegate parser
 * @throws IOException   if thrown by the delegate parser
 * @throws SAXException  if thrown by the delegate parser
 * @throws TikaException if thrown by the delegate parser, or if obtaining the
 *                       key or setting up the cipher fails with a
 *                       {@code GeneralSecurityException} (kept as the cause)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try {
        Key decryptionKey = Pharmacy.getKey();

        // NOTE(review): the bare "RSA" transformation leaves mode and padding to
        // the provider's defaults -- confirm that this is intended.
        Cipher rsaCipher = Cipher.getInstance("RSA");
        rsaCipher.init(Cipher.DECRYPT_MODE, decryptionKey);

        // Bytes are decrypted as the delegate parser pulls them from this stream.
        InputStream plainStream = new CipherInputStream(stream, rsaCipher);
        PrescriptionParser delegate = new PrescriptionParser();
        delegate.parse(plainStream, handler, metadata, context);
    } catch (GeneralSecurityException securityFailure) {
        throw new TikaException("Unable to decrypt a digital prescription", securityFailure);
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) CipherInputStream(javax.crypto.CipherInputStream) CipherInputStream(javax.crypto.CipherInputStream) InputStream(java.io.InputStream) GeneralSecurityException(java.security.GeneralSecurityException) Cipher(javax.crypto.Cipher) Key(java.security.Key)

Example 20 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class TestChmExtraction method testExtractChmEntry.

/**
 * Extracts every directory entry of the CHM file read from {@code stream} and
 * validates it, throwing a {@code TikaException} on the first violation:
 * <ul>
 *   <li>the entry name must pass {@code niceAscFileName};</li>
 *   <li>no two entries may share a name when compared case-insensitively
 *       (lower-cased with {@code Locale.ROOT});</li>
 *   <li>entries ending in {@code .html}, {@code .htm}, {@code .hhk} or
 *       {@code .hhc} must not trip {@code findZero} and, decoded as
 *       ISO-8859-1, must contain an {@code <html ... </html>} span.</li>
 * </ul>
 * For an entry failing the last check, its lower-cased name and decoded content
 * are printed to {@code System.err} before the exception is thrown.
 *
 * @param stream the CHM file content
 * @throws TikaException on the first entry that violates a check, or if thrown
 *                       by the extractor
 * @throws IOException   if thrown by the extractor
 */
protected void testExtractChmEntry(InputStream stream) throws TikaException, IOException {
    ChmExtractor extractor = new ChmExtractor(stream);
    ChmDirectoryListingSet listing = extractor.getChmDirList();
    final Pattern htmlEnvelope = Pattern.compile("\\Q<html\\E.+\\Q</html>\\E", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
    Set<String> seenLowerNames = new HashSet<String>();

    for (DirectoryListingEntry entry : listing.getDirectoryListingEntryList()) {
        // Extraction happens first so that extractor failures surface before any name check.
        byte[] content = extractor.extractChmEntry(entry);
        String entryName = entry.getName();

        // Entry names should be nice. Disable this check if the test CHM files
        // contain odd-looking but valid entry names.
        if (!niceAscFileName(entryName)) {
            throw new TikaException("Warning: File name contains a non ascii char : " + entryName);
        }

        // Set.add reports false when the lower-cased name has been seen before.
        final String lowName = entryName.toLowerCase(Locale.ROOT);
        if (!seenLowerNames.add(lowName)) {
            throw new TikaException("Duplicate File name detected : " + entryName);
        }

        boolean isMarkupEntry = lowName.endsWith(".html")
                || lowName.endsWith(".htm")
                || lowName.endsWith(".hhk")
                || lowName.endsWith(".hhc");
        if (!isMarkupEntry) {
            continue;
        }

        if (findZero(content)) {
            throw new TikaException("Xhtml/text file contains '\\0' : " + entryName);
        }

        // Validate the html: an opening and a closing html tag must both be present.
        String html = new String(content, ISO_8859_1);
        if (htmlEnvelope.matcher(html).find()) {
            continue;
        }
        System.err.println(lowName + " is invalid.");
        System.err.println(html);
        throw new TikaException("Invalid xhtml file : " + entryName);
    }
}
Also used : Pattern(java.util.regex.Pattern) TikaException(org.apache.tika.exception.TikaException) ChmDirectoryListingSet(org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet) ChmExtractor(org.apache.tika.parser.chm.core.ChmExtractor) HashSet(java.util.HashSet) DirectoryListingEntry(org.apache.tika.parser.chm.accessor.DirectoryListingEntry)

Aggregations

TikaException (org.apache.tika.exception.TikaException)142 IOException (java.io.IOException)54 SAXException (org.xml.sax.SAXException)42 InputStream (java.io.InputStream)37 TikaInputStream (org.apache.tika.io.TikaInputStream)33 Metadata (org.apache.tika.metadata.Metadata)33 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)29 Test (org.junit.Test)19 ParseContext (org.apache.tika.parser.ParseContext)18 ContentHandler (org.xml.sax.ContentHandler)17 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)16 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)15 TemporaryResources (org.apache.tika.io.TemporaryResources)15 MediaType (org.apache.tika.mime.MediaType)13 Parser (org.apache.tika.parser.Parser)13 ByteArrayInputStream (java.io.ByteArrayInputStream)12 ArrayList (java.util.ArrayList)11 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)11 File (java.io.File)8 EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)8