Example 51 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project druid by druid-io.

The class Utils, method openInputStream:

public static InputStream openInputStream(JobContext job, Path inputPath, final FileSystem fileSystem) throws IOException {
    if (!FileOutputFormat.getCompressOutput(job)) {
        // Output was written uncompressed; read the file as-is.
        return fileSystem.open(inputPath);
    } else {
        // Output was compressed: resolve the configured codec (gzip by default),
        // append its default extension (e.g. ".gz") to the path, and decompress.
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(job, GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
        inputPath = new Path(inputPath + codec.getDefaultExtension());
        return codec.createInputStream(fileSystem.open(inputPath));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec)
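
For context, here is a hedged usage sketch (the job setup and file path are illustrative, not from the Druid source; it assumes the Utils class above is on the classpath). Callers pass the extension-less path, since openInputStream appends the codec's default extension itself:

import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

public class OpenInputStreamDemo {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        FileSystem fs = FileSystem.get(job.getConfiguration());
        // Hypothetical reducer output; ".gz" is appended automatically
        // when compressed output is enabled on the job.
        Path part = new Path("/tmp/output/part-r-00000");
        try (InputStream in = Utils.openInputStream(job, part, fs)) {
            System.out.println("first byte: " + in.read());
        }
    }
}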

Example 52 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project DataX by alibaba.

The class UnstructuredStorageReaderUtil, method readFromStream:

public static void readFromStream(InputStream inputStream, String context, Configuration readerSliceConfig, RecordSender recordSender, TaskPluginCollector taskPluginCollector) {
    String compress = readerSliceConfig.getString(Key.COMPRESS, null);
    if (StringUtils.isBlank(compress)) {
        compress = null;
    }
    String encoding = readerSliceConfig.getString(Key.ENCODING, Constant.DEFAULT_ENCODING);
    // handle blank encoding (log before overwriting, so the warning shows the user's value)
    if (StringUtils.isBlank(encoding)) {
        LOG.warn(String.format("The configured encoding [%s] is blank; falling back to the default [%s]", encoding, Constant.DEFAULT_ENCODING));
        encoding = Constant.DEFAULT_ENCODING;
    }
    List<Configuration> column = readerSliceConfig.getListConfiguration(Key.COLUMN);
    // handle ["*"] -> [], null
    if (null != column && 1 == column.size() && "\"*\"".equals(column.get(0).toString())) {
        readerSliceConfig.set(Key.COLUMN, null);
        column = null;
    }
    BufferedReader reader = null;
    int bufferSize = readerSliceConfig.getInt(Key.BUFFER_SIZE, Constant.DEFAULT_BUFFER_SIZE);
    // compress logic
    try {
        if (null == compress) {
            reader = new BufferedReader(new InputStreamReader(inputStream, encoding), bufferSize);
        } else {
            // TODO compress
            if ("lzo_deflate".equalsIgnoreCase(compress)) {
                LzoInputStream lzoInputStream = new LzoInputStream(inputStream, new LzoDecompressor1x_safe());
                reader = new BufferedReader(new InputStreamReader(lzoInputStream, encoding));
            } else if ("lzo".equalsIgnoreCase(compress)) {
                LzoInputStream lzopInputStream = new ExpandLzopInputStream(inputStream);
                reader = new BufferedReader(new InputStreamReader(lzopInputStream, encoding));
            } else if ("gzip".equalsIgnoreCase(compress)) {
                CompressorInputStream compressorInputStream = new GzipCompressorInputStream(inputStream);
                reader = new BufferedReader(new InputStreamReader(compressorInputStream, encoding), bufferSize);
            } else if ("bzip2".equalsIgnoreCase(compress)) {
                CompressorInputStream compressorInputStream = new BZip2CompressorInputStream(inputStream);
                reader = new BufferedReader(new InputStreamReader(compressorInputStream, encoding), bufferSize);
            } else if ("hadoop-snappy".equalsIgnoreCase(compress)) {
                CompressionCodec snappyCodec = new SnappyCodec();
                InputStream snappyInputStream = snappyCodec.createInputStream(inputStream);
                reader = new BufferedReader(new InputStreamReader(snappyInputStream, encoding));
            } else if ("framing-snappy".equalsIgnoreCase(compress)) {
                InputStream snappyInputStream = new SnappyFramedInputStream(inputStream);
                reader = new BufferedReader(new InputStreamReader(snappyInputStream, encoding));
            } else /*else if ("xz".equalsIgnoreCase(compress)) {
					CompressorInputStream compressorInputStream = new XZCompressorInputStream(
							inputStream);
					reader = new BufferedReader(new InputStreamReader(
							compressorInputStream, encoding));
				} else if ("ar".equalsIgnoreCase(compress)) {
					ArArchiveInputStream arArchiveInputStream = new ArArchiveInputStream(
							inputStream);
					reader = new BufferedReader(new InputStreamReader(
							arArchiveInputStream, encoding));
				} else if ("arj".equalsIgnoreCase(compress)) {
					ArjArchiveInputStream arjArchiveInputStream = new ArjArchiveInputStream(
							inputStream);
					reader = new BufferedReader(new InputStreamReader(
							arjArchiveInputStream, encoding));
				} else if ("cpio".equalsIgnoreCase(compress)) {
					CpioArchiveInputStream cpioArchiveInputStream = new CpioArchiveInputStream(
							inputStream);
					reader = new BufferedReader(new InputStreamReader(
							cpioArchiveInputStream, encoding));
				} else if ("dump".equalsIgnoreCase(compress)) {
					DumpArchiveInputStream dumpArchiveInputStream = new DumpArchiveInputStream(
							inputStream);
					reader = new BufferedReader(new InputStreamReader(
							dumpArchiveInputStream, encoding));
				} else if ("jar".equalsIgnoreCase(compress)) {
					JarArchiveInputStream jarArchiveInputStream = new JarArchiveInputStream(
							inputStream);
					reader = new BufferedReader(new InputStreamReader(
							jarArchiveInputStream, encoding));
				} else if ("tar".equalsIgnoreCase(compress)) {
					TarArchiveInputStream tarArchiveInputStream = new TarArchiveInputStream(
							inputStream);
					reader = new BufferedReader(new InputStreamReader(
							tarArchiveInputStream, encoding));
				}*/
            if ("zip".equalsIgnoreCase(compress)) {
                ZipCycleInputStream zipCycleInputStream = new ZipCycleInputStream(inputStream);
                reader = new BufferedReader(new InputStreamReader(zipCycleInputStream, encoding), bufferSize);
            } else {
                throw DataXException.asDataXException(UnstructuredStorageReaderErrorCode.ILLEGAL_VALUE, String.format("仅支持 gzip, bzip2, zip, lzo, lzo_deflate, hadoop-snappy, framing-snappy" + "文件压缩格式 , 不支持您配置的文件压缩格式: [%s]", compress));
            }
        }
        UnstructuredStorageReaderUtil.doReadFromStream(reader, context, readerSliceConfig, recordSender, taskPluginCollector);
    } catch (UnsupportedEncodingException uee) {
        throw DataXException.asDataXException(UnstructuredStorageReaderErrorCode.OPEN_FILE_WITH_CHARSET_ERROR, String.format("Unsupported encoding: [%s]", encoding), uee);
    } catch (NullPointerException e) {
        throw DataXException.asDataXException(UnstructuredStorageReaderErrorCode.RUNTIME_EXCEPTION, "Runtime error, please contact the DataX team", e);
    } catch (IOException e) {
        throw DataXException.asDataXException(UnstructuredStorageReaderErrorCode.READ_FILE_IO_ERROR, String.format("Stream read error: [%s]", context), e);
    }
    } finally {
        IOUtils.closeQuietly(reader);
    }
}
Also used : GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) Configuration(com.alibaba.datax.common.util.Configuration) CompressorInputStream(org.apache.commons.compress.compressors.CompressorInputStream) SnappyFramedInputStream(io.airlift.compress.snappy.SnappyFramedInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) SnappyCodec(io.airlift.compress.snappy.SnappyCodec)
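
The if/else chain above hand-picks a decompressor per configured format name. As a point of comparison, here is a minimal self-contained sketch of the same idea using Commons Compress's CompressorStreamFactory, which detects gzip, bzip2, xz and similar formats from the stream's magic bytes (it does not cover lzo or the zip archive handling above; the helper name is hypothetical):

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;

public class AutoDecompressDemo {
    // Wraps the raw stream in a decompressor detected from its magic bytes;
    // falls back to reading it as plain text when no compressor matches.
    public static BufferedReader openReader(InputStream raw) {
        InputStream buffered = new BufferedInputStream(raw); // detection needs mark/reset
        InputStream decoded;
        try {
            decoded = new CompressorStreamFactory().createCompressorInputStream(buffered);
        } catch (CompressorException e) {
            decoded = buffered; // not compressed, or an unsupported format
        }
        return new BufferedReader(new InputStreamReader(decoded, StandardCharsets.UTF_8));
    }
}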

Example 53 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project hbase by apache.

The class TestCompressionTest, method nativeCodecTest:

/**
 * Verify CompressionTest.testCompression() on a native codec.
 */
private void nativeCodecTest(String codecName, String libName, String codecClassName) {
    if (isCompressionAvailable(codecClassName)) {
        try {
            if (libName != null) {
                System.loadLibrary(libName);
            }
            try {
                Configuration conf = new Configuration();
                CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(conf.getClassByName(codecClassName), conf);
                DataOutputBuffer compressedDataBuffer = new DataOutputBuffer();
                CompressionOutputStream deflateFilter = codec.createOutputStream(compressedDataBuffer);
                byte[] data = new byte[1024];
                DataOutputStream deflateOut = new DataOutputStream(new BufferedOutputStream(deflateFilter));
                deflateOut.write(data, 0, data.length);
                deflateOut.flush();
                deflateFilter.finish();
                // Codec class, codec nativelib and Hadoop nativelib with codec JNIs are present
                assertTrue(CompressionTest.testCompression(codecName));
            } catch (UnsatisfiedLinkError e) {
                // Hadoop nativelib does not have codec JNIs.
                // cannot assert the codec here because the current logic of
                // CompressionTest checks only classloading, not the codec
                // usage.
                LOG.debug("No JNI for codec '" + codecName + "' " + e.getMessage());
            } catch (Exception e) {
                LOG.error(codecName, e);
            }
        } catch (UnsatisfiedLinkError e) {
            // nativelib is not available
            LOG.debug("Native lib not available: " + codecName);
            assertFalse(CompressionTest.testCompression(codecName));
        }
    } else {
        // Compression Codec class is not available
        LOG.debug("Codec class not available: " + codecName);
        assertFalse(CompressionTest.testCompression(codecName));
    }
}
Also used : CompressionOutputStream(org.apache.hadoop.io.compress.CompressionOutputStream) Configuration(org.apache.hadoop.conf.Configuration) DataOutputStream(java.io.DataOutputStream) DataOutputBuffer(org.apache.hadoop.io.DataOutputBuffer) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) BufferedOutputStream(java.io.BufferedOutputStream) IOException(java.io.IOException)
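
As the comments above note, CompressionTest.testCompression only checks class loading, so the test drives the codec's streams itself. A fuller standalone check is a round trip through the codec, as in this minimal sketch using GzipCodec (which falls back to pure-Java java.util.zip when no native library is loaded):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class CodecRoundTripDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);

        byte[] original = "hello codec".getBytes(StandardCharsets.UTF_8);

        // Compress into memory.
        ByteArrayOutputStream compressed = new ByteArrayOutputStream();
        CompressionOutputStream out = codec.createOutputStream(compressed);
        out.write(original);
        out.finish();
        out.close();

        // Decompress and compare with the input.
        CompressionInputStream in =
            codec.createInputStream(new ByteArrayInputStream(compressed.toByteArray()));
        ByteArrayOutputStream restored = new ByteArrayOutputStream();
        IOUtils.copyBytes(in, restored, 4096, true);

        System.out.println(Arrays.equals(original, restored.toByteArray())); // true
    }
}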

Example 54 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project hive by apache.

The class Utilities, method getFileExtension:

/**
 * Based on compression option, output format, and configured output codec -
 * get extension for output file. Text files require an extension, whereas
 * others, like sequence files, do not.
 * <p>
 * The property <code>hive.output.file.extension</code> is used to determine
 * the extension - if set, it will override other logic for choosing an
 * extension.
 *
 * @param jc
 *          Job Configuration
 * @param isCompressed
 *          Whether the output file is compressed or not
 * @param hiveOutputFormat
 *          The output format, used to detect if the format is text
 * @return the required file extension (example: .gz)
 */
public static String getFileExtension(JobConf jc, boolean isCompressed, HiveOutputFormat<?, ?> hiveOutputFormat) {
    String extension = HiveConf.getVar(jc, HiveConf.ConfVars.OUTPUT_FILE_EXTENSION);
    if (!StringUtils.isEmpty(extension)) {
        return extension;
    }
    if ((hiveOutputFormat instanceof HiveIgnoreKeyTextOutputFormat) && isCompressed) {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jc, DefaultCodec.class);
        CompressionCodec codec = ReflectionUtil.newInstance(codecClass, jc);
        return codec.getDefaultExtension();
    }
    return StringUtils.EMPTY;
}
Also used : CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) HiveIgnoreKeyTextOutputFormat(org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat)
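
The extension ultimately comes from CompressionCodec.getDefaultExtension(). A minimal sketch of that last step in isolation (the codec selection here is illustrative, not Hive's):

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class DefaultExtensionDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        List<Class<? extends CompressionCodec>> codecs =
            Arrays.asList(GzipCodec.class, BZip2Codec.class, DefaultCodec.class);
        for (Class<? extends CompressionCodec> c : codecs) {
            CompressionCodec codec = ReflectionUtils.newInstance(c, conf);
            // Prints .gz, .bz2 and .deflate respectively.
            System.out.println(c.getSimpleName() + " -> " + codec.getDefaultExtension());
        }
    }
}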

Example 55 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project hive by apache.

The class Utilities, method createCompressedStream:

/**
 * Convert an output stream to a compressed output stream, based on the codec configured in the
 * Job Configuration. The caller specifies directly whether the file is compressed or not.
 *
 * @param jc
 *          Job Configuration
 * @param out
 *          Output Stream to be converted into compressed output stream
 * @param isCompressed
 *          whether the output stream needs to be compressed or not
 * @return compressed output stream
 */
public static OutputStream createCompressedStream(JobConf jc, OutputStream out, boolean isCompressed) throws IOException {
    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jc, DefaultCodec.class);
        CompressionCodec codec = ReflectionUtil.newInstance(codecClass, jc);
        return codec.createOutputStream(out);
    } else {
        return out;
    }
}
Also used : CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec)
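
A hedged usage sketch (the codec choice and file path are illustrative, and it assumes Hive's Utilities class above is on the classpath): set the compressor class on the JobConf, then write through the wrapped stream.

import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

public class CompressedStreamDemo {
    public static void main(String[] args) throws Exception {
        JobConf jc = new JobConf();
        // Choose gzip as the output codec (illustrative; DefaultCodec is the fallback).
        FileOutputFormat.setOutputCompressorClass(jc, GzipCodec.class);

        OutputStream raw = Files.newOutputStream(Paths.get("/tmp/demo.txt.gz")); // hypothetical path
        try (OutputStream out = Utilities.createCompressedStream(jc, raw, true)) {
            out.write("compressed via GzipCodec\n".getBytes(StandardCharsets.UTF_8));
        }
    }
}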

Aggregations

CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 111 uses
Path (org.apache.hadoop.fs.Path): 54 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 41 uses
Configuration (org.apache.hadoop.conf.Configuration): 38 uses
CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory): 37 uses
InputStream (java.io.InputStream): 18 uses
IOException (java.io.IOException): 17 uses
Test (org.junit.Test): 17 uses
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 15 uses
Text (org.apache.hadoop.io.Text): 14 uses
Configurable (org.apache.hadoop.conf.Configurable): 10 uses
GzipCodec (org.apache.hadoop.io.compress.GzipCodec): 10 uses
JobConf (org.apache.hadoop.mapred.JobConf): 10 uses
SequenceFile (org.apache.hadoop.io.SequenceFile): 9 uses
OutputStream (java.io.OutputStream): 8 uses
DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec): 8 uses
FileInputStream (java.io.FileInputStream): 7 uses
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 6 uses
ByteString (com.google.protobuf.ByteString): 5 uses
DataInputStream (java.io.DataInputStream): 5 uses