Example 46 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project presto by prestodb.

the class TestOrcBatchPageSourceMemoryTracking method createTestFile.

public static FileSplit createTestFile(String filePath, HiveOutputFormat<?, ?> outputFormat, Serializer serializer, String compressionCodec, List<TestColumn> testColumns, int numRows, int stripeRows) throws Exception {
    // filter out partition keys, which are not written to the file
    testColumns = ImmutableList.copyOf(filter(testColumns, not(TestColumn::isPartitionKey)));
    Properties tableProperties = new Properties();
    tableProperties.setProperty("columns", Joiner.on(',').join(transform(testColumns, TestColumn::getName)));
    tableProperties.setProperty("columns.types", Joiner.on(',').join(transform(testColumns, TestColumn::getType)));
    serializer.initialize(CONFIGURATION, tableProperties);
    JobConf jobConf = new JobConf();
    if (compressionCodec != null) {
        CompressionCodec codec = new CompressionCodecFactory(CONFIGURATION).getCodecByName(compressionCodec);
        jobConf.set(COMPRESS_CODEC, codec.getClass().getName());
        jobConf.set(COMPRESS_TYPE, SequenceFile.CompressionType.BLOCK.toString());
    }
    RecordWriter recordWriter = createRecordWriter(new Path(filePath), CONFIGURATION);
    try {
        SettableStructObjectInspector objectInspector = getStandardStructObjectInspector(
                ImmutableList.copyOf(transform(testColumns, TestColumn::getName)),
                ImmutableList.copyOf(transform(testColumns, TestColumn::getObjectInspector)));
        Object row = objectInspector.create();
        List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());
        for (int rowNumber = 0; rowNumber < numRows; rowNumber++) {
            for (int i = 0; i < testColumns.size(); i++) {
                Object writeValue = testColumns.get(i).getWriteValue();
                if (writeValue instanceof Slice) {
                    writeValue = ((Slice) writeValue).getBytes();
                }
                objectInspector.setStructFieldData(row, fields.get(i), writeValue);
            }
            Writable record = serializer.serialize(row, objectInspector);
            recordWriter.write(record);
            if (rowNumber % stripeRows == stripeRows - 1) {
                flushStripe(recordWriter);
            }
        }
    } finally {
        recordWriter.close(false);
    }
    Path path = new Path(filePath);
    path.getFileSystem(CONFIGURATION).setVerifyChecksum(true);
    File file = new File(filePath);
    return new FileSplit(path, 0, file.length(), new String[0]);
}
Also used : Path(org.apache.hadoop.fs.Path) Writable(org.apache.hadoop.io.Writable) Properties(java.util.Properties) FileSplit(org.apache.hadoop.mapred.FileSplit) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) Slice(io.airlift.slice.Slice) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) JobConf(org.apache.hadoop.mapred.JobConf) SequenceFile(org.apache.hadoop.io.SequenceFile) File(java.io.File) OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile)
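
For context, a minimal standalone sketch of the same compression knobs used in the test above: resolve a codec by name with CompressionCodecFactory and write a block-compressed SequenceFile. The output path, the "gzip" codec name, and the key/value types are illustrative assumptions, not values taken from the Presto test.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CompressedSequenceFileSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Resolve the codec by its short name, as the test above does ("gzip" is an assumed example).
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodecByName("gzip");
        // Hypothetical output location on the local file system.
        Path path = new Path("/tmp/example.seq");
        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(path),
                SequenceFile.Writer.keyClass(IntWritable.class),
                SequenceFile.Writer.valueClass(Text.class),
                // BLOCK compression, matching the COMPRESS_TYPE value set in the test
                SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, codec))) {
            for (int i = 0; i < 100; i++) {
                writer.append(new IntWritable(i), new Text("row-" + i));
            }
        }
    }
}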

Example 47 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project flink by apache.

the class HiveWriterFactory method createRecordWriter.

/**
 * Create a {@link RecordWriter} from path.
 */
public RecordWriter createRecordWriter(Path path) {
    try {
        checkInitialize();
        JobConf conf = new JobConf(confWrapper.conf());
        if (isCompressed) {
            String codecStr = conf.get(HiveConf.ConfVars.COMPRESSINTERMEDIATECODEC.varname);
            if (!StringUtils.isNullOrWhitespaceOnly(codecStr)) {
                // noinspection unchecked
                Class<? extends CompressionCodec> codec =
                        (Class<? extends CompressionCodec>) Class.forName(
                                codecStr, true, Thread.currentThread().getContextClassLoader());
                FileOutputFormat.setOutputCompressorClass(conf, codec);
            }
            String typeStr = conf.get(HiveConf.ConfVars.COMPRESSINTERMEDIATETYPE.varname);
            if (!StringUtils.isNullOrWhitespaceOnly(typeStr)) {
                SequenceFile.CompressionType style = SequenceFile.CompressionType.valueOf(typeStr);
                SequenceFileOutputFormat.setOutputCompressionType(conf, style);
            }
        }
        return hiveShim.getHiveRecordWriter(conf, hiveOutputFormatClz, recordSerDe.getSerializedClass(), isCompressed, tableProperties, path);
    } catch (Exception e) {
        throw new FlinkHiveException(e);
    }
}
Also used : SequenceFile(org.apache.hadoop.io.SequenceFile) FlinkHiveException(org.apache.flink.connectors.hive.FlinkHiveException) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) JobConf(org.apache.hadoop.mapred.JobConf) FlinkHiveException(org.apache.flink.connectors.hive.FlinkHiveException) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)
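
The two output-format calls in this factory can be exercised on their own. A hedged sketch, assuming GzipCodec and BLOCK compression as stand-ins for whatever the Hive intermediate-compression settings resolve to:

import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

public class CompressedOutputConfSketch {
    public static JobConf configureCompressedOutput(JobConf conf) {
        // Turn on output compression for the old mapred API.
        FileOutputFormat.setCompressOutput(conf, true);
        // Same call the Flink factory makes once it has resolved the codec class;
        // GzipCodec is an assumed stand-in for the configured intermediate codec.
        FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
        // Same call the factory makes for the compression type; BLOCK is an assumed value.
        SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
        return conf;
    }
}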

Example 48 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project flink by apache.

the class SequenceFileWriterFactory method getCompressionCodec.

private CompressionCodec getCompressionCodec(Configuration conf, String compressionCodecName) {
    checkNotNull(conf);
    checkNotNull(compressionCodecName);
    if (compressionCodecName.equals(NO_COMPRESSION)) {
        return null;
    }
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodecByName(compressionCodecName);
    if (codec == null) {
        throw new RuntimeException("Codec " + compressionCodecName + " not found.");
    }
    return codec;
}
Also used : CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec)
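
Once getCompressionCodec returns a codec, the usual next step is to wrap a raw stream with it. A small sketch of that usage, with a hypothetical path and an assumed codec name:

import java.nio.charset.StandardCharsets;
import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecLookupSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);

        // Lookup by name, as in getCompressionCodec above; "Gzip" is an assumed name.
        CompressionCodec byName = factory.getCodecByName("Gzip");

        // Lookup by file extension is the other common path (hypothetical file name).
        Path path = new Path("/tmp/data.gz");
        CompressionCodec byExtension = factory.getCodec(path);
        System.out.println("codec for " + path + ": "
                + (byExtension == null ? "none" : byExtension.getClass().getName()));

        // Wrap a raw stream so that everything written through it is compressed.
        FileSystem fs = path.getFileSystem(conf);
        try (OutputStream out = byName.createOutputStream(fs.create(path))) {
            out.write("hello compressed world".getBytes(StandardCharsets.UTF_8));
        }
    }
}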

Example 49 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project hbase by apache.

the class CellBlockBuilder method encodeCellsTo.

private void encodeCellsTo(OutputStream os, CellScanner cellScanner, Codec codec, CompressionCodec compressor) throws IOException {
    Compressor poolCompressor = null;
    try {
        if (compressor != null) {
            if (compressor instanceof Configurable) {
                ((Configurable) compressor).setConf(this.conf);
            }
            poolCompressor = CodecPool.getCompressor(compressor);
            os = compressor.createOutputStream(os, poolCompressor);
        }
        Codec.Encoder encoder = codec.getEncoder(os);
        while (cellScanner.advance()) {
            encoder.write(cellScanner.current());
        }
        encoder.flush();
    } catch (BufferOverflowException | IndexOutOfBoundsException e) {
        throw new DoNotRetryIOException(e);
    } finally {
        os.close();
        if (poolCompressor != null) {
            CodecPool.returnCompressor(poolCompressor);
        }
    }
}
Also used : Codec(org.apache.hadoop.hbase.codec.Codec) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) DoNotRetryIOException(org.apache.hadoop.hbase.DoNotRetryIOException) Compressor(org.apache.hadoop.io.compress.Compressor) Configurable(org.apache.hadoop.conf.Configurable) BufferOverflowException(java.nio.BufferOverflowException)
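
The decode side follows the mirror-image pattern with a pooled Decompressor. A sketch of that read path, assuming only a codec and an already-open input stream; class and method names here are illustrative, not HBase's actual decode implementation:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.Decompressor;

public final class CodecPoolReadSketch {
    /**
     * Wraps {@code in} in a decompressing stream backed by a pooled Decompressor
     * and drains it into a byte array.
     */
    public static byte[] readAll(CompressionCodec codec, InputStream in) throws IOException {
        Decompressor pooled = CodecPool.getDecompressor(codec);
        try (InputStream decompressed = codec.createInputStream(in, pooled)) {
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            byte[] buf = new byte[4096];
            int n;
            while ((n = decompressed.read(buf)) != -1) {
                out.write(buf, 0, n);
            }
            return out.toByteArray();
        } finally {
            // Return the Decompressor to the pool, mirroring returnCompressor above.
            CodecPool.returnDecompressor(pooled);
        }
    }
}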

Example 50 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project hbase by apache.

the class Compression method buildCodec.

/**
 * Load a codec implementation for an algorithm using the supplied configuration.
 * @param conf the configuration to use
 * @param algo the algorithm to implement
 */
private static CompressionCodec buildCodec(final Configuration conf, final Algorithm algo) {
    try {
        String codecClassName = conf.get(algo.confKey, algo.confDefault);
        if (codecClassName == null) {
            throw new RuntimeException("No codec configured for " + algo.confKey);
        }
        Class<?> codecClass = getClassLoaderForCodec().loadClass(codecClassName);
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, new Configuration(conf));
        LOG.info("Loaded codec {} for compression algorithm {}", codec.getClass().getCanonicalName(), algo.name());
        return codec;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    }
}
Also used : HBaseConfiguration(org.apache.hadoop.hbase.HBaseConfiguration) Configuration(org.apache.hadoop.conf.Configuration) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec)
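
The reflective loading step reduces to a few lines. A sketch assuming a hypothetical configuration key and default, not the actual Algorithm.confKey values used by HBase:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class ReflectiveCodecSketch {
    /**
     * Loads a codec implementation named in the configuration, using the same
     * reflective pattern as buildCodec. The key and default are illustrative.
     */
    public static CompressionCodec loadCodec(Configuration conf) {
        String className = conf.get(
                "example.compression.codec.class",                 // hypothetical key
                "org.apache.hadoop.io.compress.GzipCodec");        // hypothetical default
        try {
            Class<?> codecClass = conf.getClassByName(className);
            // ReflectionUtils injects the Configuration if the codec is Configurable.
            return (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Codec class not found: " + className, e);
        }
    }
}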

Aggregations

CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 111 usages
Path (org.apache.hadoop.fs.Path): 54 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 41 usages
Configuration (org.apache.hadoop.conf.Configuration): 38 usages
CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory): 37 usages
InputStream (java.io.InputStream): 18 usages
IOException (java.io.IOException): 17 usages
Test (org.junit.Test): 17 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 15 usages
Text (org.apache.hadoop.io.Text): 14 usages
Configurable (org.apache.hadoop.conf.Configurable): 10 usages
GzipCodec (org.apache.hadoop.io.compress.GzipCodec): 10 usages
JobConf (org.apache.hadoop.mapred.JobConf): 10 usages
SequenceFile (org.apache.hadoop.io.SequenceFile): 9 usages
OutputStream (java.io.OutputStream): 8 usages
DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec): 8 usages
FileInputStream (java.io.FileInputStream): 7 usages
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 6 usages
ByteString (com.google.protobuf.ByteString): 5 usages
DataInputStream (java.io.DataInputStream): 5 usages