Search in sources :

Example 1 with ZCompressorInputStream

Use of org.apache.commons.compress.compressors.z.ZCompressorInputStream in the Apache Tika project.

From the class ExtractReader, method loadExtract.

/**
 * Loads the metadata list stored in an extract file.
 *
 * <p>The file must be a regular file whose name yields a non-null
 * {@code txtOrJson} suffix, and whose size is non-zero and within the
 * configured {@code minExtractLength}/{@code maxExtractLength} bounds (a bound
 * is only enforced when it is greater than {@code IGNORE_LENGTH}). A
 * {@code bz2}, {@code gz}/{@code gzip} or {@code zip} compression suffix causes
 * the stream to be wrapped in the matching decompressor before it is decoded
 * as UTF-8.
 *
 * <p>For {@code json} extracts with more than one entry, the list is reduced
 * to a single entry when {@code alterMetadataList} is {@code FIRST_ONLY}
 * (trailing entries dropped) or {@code CONCATENATE_CONTENT_INTO_FIRST} (the
 * content of every entry is joined, space-terminated, into the first entry).
 *
 * @param extractFile the extract file to read
 * @return the metadata list, or {@code null} when the file uses a compression
 *         suffix this reader does not handle
 * @throws ExtractReaderException if the file is missing, has an unrecognised
 *         suffix, is empty, is outside the configured length bounds, cannot be
 *         read, or cannot be parsed
 */
public List<Metadata> loadExtract(Path extractFile) throws ExtractReaderException {
    if (extractFile == null || !Files.isRegularFile(extractFile)) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.NO_EXTRACT_FILE);
    }
    FileSuffixes fileSuffixes = parseSuffixes(extractFile.getFileName().toString());
    if (fileSuffixes.txtOrJson == null) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.INCORRECT_EXTRACT_FILE_SUFFIX);
    }
    long length;
    try {
        length = Files.size(extractFile);
    } catch (IOException e) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
    }
    if (length == 0L) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE);
    }
    if (minExtractLength > IGNORE_LENGTH && length < minExtractLength) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_SHORT);
    }
    if (maxExtractLength > IGNORE_LENGTH && length > maxExtractLength) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_LONG);
    }
    List<Metadata> metadataList = null;
    Reader reader = null;
    InputStream is = null;
    try {
        is = Files.newInputStream(extractFile);
        if (fileSuffixes.compression != null) {
            if (fileSuffixes.compression.equals("bz2")) {
                is = new BZip2CompressorInputStream(is);
            } else if (fileSuffixes.compression.equals("gz") || fileSuffixes.compression.equals("gzip")) {
                is = new GzipCompressorInputStream(is);
            } else if (fileSuffixes.compression.equals("zip")) {
                // NOTE(review): ZCompressorInputStream lives in the "compressors.z"
                // package; confirm that files carrying a "zip" suffix really are in
                // the format that class decodes.
                is = new ZCompressorInputStream(is);
            } else {
                LOG.warn("Can't yet process compression of type: {}", fileSuffixes.compression);
                // The raw stream is already open; release it before bailing out.
                IOUtils.closeQuietly(is);
                return metadataList;
            }
        }
        reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
    } catch (IOException e) {
        // A decompressor constructor may fail after the file stream was opened;
        // close whatever was opened so the handle is not leaked.
        IOUtils.closeQuietly(is);
        throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
    }
    try {
        if (fileSuffixes.txtOrJson.equals("json")) {
            metadataList = JsonMetadataList.fromJson(reader);
            if (alterMetadataList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && metadataList.size() > 1) {
                // Keep only the first entry.
                while (metadataList.size() > 1) {
                    metadataList.remove(metadataList.size() - 1);
                }
            } else if (alterMetadataList.equals(ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST)
                    && metadataList.size() > 1) {
                // Fold the content of every entry (including the first) into the
                // first entry, then drop the rest.
                StringBuilder sb = new StringBuilder();
                Metadata containerMetadata = metadataList.get(0);
                for (int i = 0; i < metadataList.size(); i++) {
                    Metadata m = metadataList.get(i);
                    String c = m.get(RecursiveParserWrapper.TIKA_CONTENT);
                    if (c != null) {
                        sb.append(c);
                        sb.append(" ");
                    }
                }
                containerMetadata.set(RecursiveParserWrapper.TIKA_CONTENT, sb.toString());
                while (metadataList.size() > 1) {
                    metadataList.remove(metadataList.size() - 1);
                }
            }
        } else {
            metadataList = generateListFromTextFile(reader, fileSuffixes);
        }
    } catch (IOException e) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
    } catch (TikaException e) {
        throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_PARSE_EXCEPTION);
    } finally {
        IOUtils.closeQuietly(reader);
        IOUtils.closeQuietly(is);
    }
    return metadataList;
}
Also used : GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) TikaException(org.apache.tika.exception.TikaException) InputStreamReader(java.io.InputStreamReader) ZCompressorInputStream(org.apache.commons.compress.compressors.z.ZCompressorInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) Reader(java.io.Reader) BufferedReader(java.io.BufferedReader) IOException(java.io.IOException)

Aggregations

BufferedReader (java.io.BufferedReader)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 InputStreamReader (java.io.InputStreamReader)1 Reader (java.io.Reader)1 BZip2CompressorInputStream (org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream)1 GzipCompressorInputStream (org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream)1 ZCompressorInputStream (org.apache.commons.compress.compressors.z.ZCompressorInputStream)1 TikaException (org.apache.tika.exception.TikaException)1 Metadata (org.apache.tika.metadata.Metadata)1