Search in sources :

Example 61 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class ChmItspHeader method unmarshalUInt32.

/**
 * Decodes the four bytes at the current place as a little-endian unsigned
 * 32-bit integer, then reduces the remaining-data counter by four and
 * advances the current place by four.
 *
 * @param data       byte array to read from; passed to
 *                   {@code ChmAssert.assertByteArrayNotNull} first
 * @param dataLenght number of bytes the caller reports as available; must be
 *                   at least 4
 * @param dest       not read; overwritten with the decoded value (kept for
 *                   signature compatibility)
 * @return the decoded value in the range {@code 0 .. 0xFFFFFFFFL}
 * @throws TikaException if {@code dataLenght} is smaller than 4
 */
private long unmarshalUInt32(byte[] data, int dataLenght, long dest) throws TikaException {
    ChmAssert.assertByteArrayNotNull(data);
    if (4 > dataLenght) {
        throw new TikaException("4 > dataLenght");
    }
    int offset = this.getCurrentPlace();
    // Mask with a long literal so the shifts happen in 64-bit arithmetic.
    // With int arithmetic a top byte >= 0x80 yields a negative int that is
    // sign-extended when widened to long, which is wrong for an unsigned value.
    dest = (data[offset] & 0xffL)
            | (data[offset + 1] & 0xffL) << 8
            | (data[offset + 2] & 0xffL) << 16
            | (data[offset + 3] & 0xffL) << 24;
    setDataRemained(this.getDataRemained() - 4);
    this.setCurrentPlace(offset + 4);
    return dest;
}
Also used : TikaException(org.apache.tika.exception.TikaException)

Example 62 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class ChmItsfHeader method unmarshalUint64.

/**
 * Reads the next eight bytes from the current place in reversed order and
 * returns them interpreted as a two's-complement {@code long}.
 * The current place is advanced one byte at a time while reading, and the
 * remaining-data counter is reduced by eight afterwards.
 *
 * @param data byte array to read from
 * @param dest not read; present for signature compatibility
 * @return the value built from the eight reversed bytes
 * @throws TikaException if fewer than eight bytes are reported as remaining
 */
private long unmarshalUint64(byte[] data, long dest) throws TikaException {
    if (this.getDataRemained() < 8) {
        throw new TikaException("8 > this.getDataRemained()");
    }
    byte[] reversed = new byte[8];
    // Fill from the last slot towards the first so the byte order is flipped.
    for (int slot = reversed.length - 1; slot >= 0; slot--) {
        reversed[slot] = data[this.getCurrentPlace()];
        this.setCurrentPlace(this.getCurrentPlace() + 1);
    }
    long value = new BigInteger(reversed).longValue();
    this.setDataRemained(this.getDataRemained() - 8);
    return value;
}
Also used : TikaException(org.apache.tika.exception.TikaException) BigInteger(java.math.BigInteger)

Example 63 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class ChmItsfHeader method parse.

/**
 * Fills {@code chmItsfHeader} from the raw header bytes in {@code data}.
 * Fields are unmarshalled in a fixed order (signature, version, header
 * length, unknown_000c, last modified, language id, two UUIDs, then four
 * 64-bit values), after which the signature, version and header length are
 * validated and the data offset is set to directory offset plus directory
 * length.
 *
 * @param data          raw header bytes; the length must lie between
 *                      {@code CHM_ITSF_V2_LEN} and {@code CHM_ITSF_V3_LEN}
 * @param chmItsfHeader header instance that receives the parsed values
 * @throws TikaException if the length, signature, version or header length
 *                       is not acceptable, or (version 3 only) the
 *                       remaining-data counter is negative
 */
// @Override
public void parse(byte[] data, ChmItsfHeader chmItsfHeader) throws TikaException {
    boolean knownLength = data.length >= ChmConstants.CHM_ITSF_V2_LEN
            && data.length <= ChmConstants.CHM_ITSF_V3_LEN;
    if (!knownLength) {
        throw new TikaException("we only know how to deal with the 0x58 and 0x60 byte structures");
    }

    // Each unmarshal call consumes bytes from the header's cursor, so the
    // order of the following statements must not change.
    chmItsfHeader.setDataRemained(data.length);
    chmItsfHeader.unmarshalCharArray(data, chmItsfHeader, ChmConstants.CHM_SIGNATURE_LEN);
    chmItsfHeader.setVersion(
            chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getVersion()));
    chmItsfHeader.setHeaderLen(
            chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getHeaderLen()));
    chmItsfHeader.setUnknown_000c(
            chmItsfHeader.unmarshalInt32(data, chmItsfHeader.getUnknown_000c()));
    chmItsfHeader.setLastModified(
            chmItsfHeader.unmarshalUInt32(data, chmItsfHeader.getLastModified()));
    chmItsfHeader.setLangId(
            chmItsfHeader.unmarshalUInt32(data, chmItsfHeader.getLangId()));
    chmItsfHeader.setDir_uuid(
            chmItsfHeader.unmarshalUuid(data, chmItsfHeader.getDir_uuid(), 16));
    chmItsfHeader.setStream_uuid(
            chmItsfHeader.unmarshalUuid(data, chmItsfHeader.getStream_uuid(), 16));
    chmItsfHeader.setUnknownOffset(
            chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getUnknownOffset()));
    chmItsfHeader.setUnknownLen(
            chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getUnknownLen()));
    chmItsfHeader.setDirOffset(
            chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getDirOffset()));
    chmItsfHeader.setDirLen(
            chmItsfHeader.unmarshalUint64(data, chmItsfHeader.getDirLen()));

    if (!new String(chmItsfHeader.getSignature(), UTF_8).equals(ChmConstants.ITSF)) {
        throw new TikaException("seems not valid file");
    }

    // The declared header length must be at least the size expected for the version.
    if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_2) {
        if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V2_LEN) {
            throw new TikaException("something wrong with header");
        }
    } else if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) {
        if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V3_LEN) {
            throw new TikaException("unknown v3 header lenght");
        }
    } else {
        throw new ChmParsingException("unsupported chm format");
    }

    // Both versions derive the data offset the same way; version 3 additionally
    // refuses to continue when the remaining-data counter has gone negative.
    if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3
            && chmItsfHeader.getDataRemained() < 0) {
        throw new TikaException("cannot set data offset, no data remained");
    }
    chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset() + chmItsfHeader.getDirLen());
}
Also used : ChmParsingException(org.apache.tika.parser.chm.exception.ChmParsingException) TikaException(org.apache.tika.exception.TikaException)

Example 64 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class JsonMetadataTest method testDeserializationException.

/**
 * Verifies that {@code JsonMetadata.fromJson} throws a {@code TikaException}
 * when given malformed JSON.
 */
@Test
public void testDeserializationException() {
    // Malformed on purpose: the value 500,000 is not quoted.
    String malformedJson = "{\"k1\":[\"v1\",\"v2\"],\"k3\":\"v3\",\"k4\":500,000}";
    boolean threw;
    try {
        JsonMetadata.fromJson(new StringReader(malformedJson));
        threw = false;
    } catch (TikaException expected) {
        threw = true;
    }
    assertTrue(threw);
}
Also used : TikaException(org.apache.tika.exception.TikaException) Metadata(org.apache.tika.metadata.Metadata) StringReader(java.io.StringReader) Test(org.junit.Test)

Example 65 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class ExtractReader method loadExtract.

/**
 * Loads the metadata list stored in an extract file.
 * <p>
 * The file must be a regular file, have a recognised txt/json suffix, be
 * non-empty and satisfy the configured minimum/maximum length (a limit is
 * applied only when it is greater than {@code IGNORE_LENGTH}). Content is
 * optionally decompressed according to the compression suffix and decoded
 * as UTF-8. For JSON extracts the list is then reduced according to
 * {@code alterMetadataList}: either only the first entry is kept, or the
 * content of all entries is concatenated (space separated) into the first
 * entry and the rest are dropped.
 * <p>
 * All streams opened here are closed before returning, including on the
 * early-return and error paths.
 *
 * @param extractFile path of the extract file
 * @return the loaded metadata list, or {@code null} when the compression
 *         suffix is not one this method can handle
 * @throws ExtractReaderException if the file is missing, has an
 *         unrecognised suffix, is empty, too short or too long, cannot be
 *         read, or cannot be parsed
 */
public List<Metadata> loadExtract(Path extractFile) throws ExtractReaderException {
    if (extractFile == null || !Files.isRegularFile(extractFile)) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.NO_EXTRACT_FILE);
    }
    FileSuffixes fileSuffixes = parseSuffixes(extractFile.getFileName().toString());
    if (fileSuffixes.txtOrJson == null) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.INCORRECT_EXTRACT_FILE_SUFFIX);
    }

    long length;
    try {
        length = Files.size(extractFile);
    } catch (IOException e) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
    }
    if (length == 0L) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE);
    }
    if (minExtractLength > IGNORE_LENGTH && length < minExtractLength) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_SHORT);
    }
    if (maxExtractLength > IGNORE_LENGTH && length > maxExtractLength) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_LONG);
    }

    List<Metadata> metadataList = null;
    InputStream is = null;
    Reader reader = null;
    // A single try/finally spans both opening and parsing so that the raw
    // stream is released even when a decompressor constructor throws or the
    // compression type is unsupported.
    try {
        is = Files.newInputStream(extractFile);
        if (fileSuffixes.compression != null) {
            if (fileSuffixes.compression.equals("bz2")) {
                is = new BZip2CompressorInputStream(is);
            } else if (fileSuffixes.compression.equals("gz") || fileSuffixes.compression.equals("gzip")) {
                is = new GzipCompressorInputStream(is);
            } else if (fileSuffixes.compression.equals("zip")) {
                // NOTE(review): ZCompressorInputStream lives in the commons-compress
                // "z" package, which looks like the Unix compress (.Z) codec rather
                // than zip archives - confirm this mapping is intended.
                is = new ZCompressorInputStream(is);
            } else {
                LOG.warn("Can't yet process compression of type: {}", fileSuffixes.compression);
                // Still null at this point; the finally block closes the stream.
                return metadataList;
            }
        }
        reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));

        if (fileSuffixes.txtOrJson.equals("json")) {
            metadataList = JsonMetadataList.fromJson(reader);
            if (alterMetadataList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && metadataList.size() > 1) {
                while (metadataList.size() > 1) {
                    metadataList.remove(metadataList.size() - 1);
                }
            } else if (alterMetadataList.equals(ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST)
                    && metadataList.size() > 1) {
                // Gather the content of every entry (container included) into
                // the first entry, then drop the others.
                StringBuilder sb = new StringBuilder();
                Metadata containerMetadata = metadataList.get(0);
                for (Metadata m : metadataList) {
                    String c = m.get(RecursiveParserWrapper.TIKA_CONTENT);
                    if (c != null) {
                        sb.append(c);
                        sb.append(" ");
                    }
                }
                containerMetadata.set(RecursiveParserWrapper.TIKA_CONTENT, sb.toString());
                while (metadataList.size() > 1) {
                    metadataList.remove(metadataList.size() - 1);
                }
            }
        } else {
            metadataList = generateListFromTextFile(reader, fileSuffixes);
        }
    } catch (IOException e) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
    } catch (TikaException e) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_PARSE_EXCEPTION);
    } finally {
        // reader may still be null if wrapping failed, so close the stream too.
        IOUtils.closeQuietly(reader);
        IOUtils.closeQuietly(is);
    }
    return metadataList;
}
Also used : GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) TikaException(org.apache.tika.exception.TikaException) InputStreamReader(java.io.InputStreamReader) ZCompressorInputStream(org.apache.commons.compress.compressors.z.ZCompressorInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) IOException(java.io.IOException) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) BufferedReader(java.io.BufferedReader) ZCompressorInputStream(org.apache.commons.compress.compressors.z.ZCompressorInputStream)

Aggregations

TikaException (org.apache.tika.exception.TikaException)144 IOException (java.io.IOException)56 SAXException (org.xml.sax.SAXException)44 InputStream (java.io.InputStream)37 Metadata (org.apache.tika.metadata.Metadata)35 TikaInputStream (org.apache.tika.io.TikaInputStream)33 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)29 ParseContext (org.apache.tika.parser.ParseContext)19 Test (org.junit.Test)19 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)17 ContentHandler (org.xml.sax.ContentHandler)17 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)15 TemporaryResources (org.apache.tika.io.TemporaryResources)15 MediaType (org.apache.tika.mime.MediaType)14 Parser (org.apache.tika.parser.Parser)14 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)13 ByteArrayInputStream (java.io.ByteArrayInputStream)12 ArrayList (java.util.ArrayList)11 File (java.io.File)8 EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)8