Example 16 with CompressionCodecFactory

use of org.apache.hadoop.io.compress.CompressionCodecFactory in project storm by apache.

the class SequenceFileBolt method doPrepare.

@Override
public void doPrepare(Map conf, TopologyContext topologyContext, OutputCollector collector) throws IOException {
    LOG.info("Preparing Sequence File Bolt...");
    if (this.format == null)
        throw new IllegalStateException("SequenceFormat must be specified.");
    this.fs = FileSystem.get(URI.create(this.fsUrl), hdfsConfig);
    this.codecFactory = new CompressionCodecFactory(hdfsConfig);
}
Also used : CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory)
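
For orientation, here is a minimal sketch (not taken from the storm source) of how a CompressionCodecFactory prepared this way is typically used afterwards: the codec is resolved by its short name and used to wrap an HDFS output stream. The file system URI, output path, and codec name below are placeholder assumptions.

import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecLookupSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Placeholder URI and path; in the bolt these come from fsUrl and the file rotation policy.
        FileSystem fs = FileSystem.get(URI.create("hdfs://localhost:9000"), conf);
        CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
        // Resolve a codec by its short name ("gzip", "bzip2", ...).
        CompressionCodec codec = codecFactory.getCodecByName("gzip");
        Path out = new Path("/tmp/example" + codec.getDefaultExtension());
        try (OutputStream stream = codec.createOutputStream(fs.create(out))) {
            stream.write("hello".getBytes(StandardCharsets.UTF_8));
        }
    }
}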

Example 17 with CompressionCodecFactory

use of org.apache.hadoop.io.compress.CompressionCodecFactory in project presto by prestodb.

the class TestOrcPageSourceMemoryTracking method createTestFile.

public static FileSplit createTestFile(String filePath, HiveOutputFormat<?, ?> outputFormat, @SuppressWarnings("deprecation") SerDe serDe, String compressionCodec, List<TestColumn> testColumns, int numRows) throws Exception {
    // filter out partition keys, which are not written to the file
    testColumns = ImmutableList.copyOf(filter(testColumns, not(TestColumn::isPartitionKey)));
    Properties tableProperties = new Properties();
    tableProperties.setProperty("columns", Joiner.on(',').join(transform(testColumns, TestColumn::getName)));
    tableProperties.setProperty("columns.types", Joiner.on(',').join(transform(testColumns, TestColumn::getType)));
    serDe.initialize(CONFIGURATION, tableProperties);
    JobConf jobConf = new JobConf();
    if (compressionCodec != null) {
        CompressionCodec codec = new CompressionCodecFactory(CONFIGURATION).getCodecByName(compressionCodec);
        jobConf.set(COMPRESS_CODEC, codec.getClass().getName());
        jobConf.set(COMPRESS_TYPE, SequenceFile.CompressionType.BLOCK.toString());
    }
    RecordWriter recordWriter = createRecordWriter(new Path(filePath), CONFIGURATION);
    try {
        SettableStructObjectInspector objectInspector = getStandardStructObjectInspector(ImmutableList.copyOf(transform(testColumns, TestColumn::getName)), ImmutableList.copyOf(transform(testColumns, TestColumn::getObjectInspector)));
        Object row = objectInspector.create();
        List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());
        for (int rowNumber = 0; rowNumber < numRows; rowNumber++) {
            for (int i = 0; i < testColumns.size(); i++) {
                Object writeValue = testColumns.get(i).getWriteValue();
                if (writeValue instanceof Slice) {
                    writeValue = ((Slice) writeValue).getBytes();
                }
                objectInspector.setStructFieldData(row, fields.get(i), writeValue);
            }
            Writable record = serDe.serialize(row, objectInspector);
            recordWriter.write(record);
            if (rowNumber % STRIPE_ROWS == STRIPE_ROWS - 1) {
                flushStripe(recordWriter);
            }
        }
    } finally {
        recordWriter.close(false);
    }
    Path path = new Path(filePath);
    path.getFileSystem(CONFIGURATION).setVerifyChecksum(true);
    File file = new File(filePath);
    return new FileSplit(path, 0, file.length(), new String[0]);
}
Also used : Path(org.apache.hadoop.fs.Path) Writable(org.apache.hadoop.io.Writable) Properties(java.util.Properties) FileSplit(org.apache.hadoop.mapred.FileSplit) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) Slice(io.airlift.slice.Slice) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) JobConf(org.apache.hadoop.mapred.JobConf) SequenceFile(org.apache.hadoop.io.SequenceFile) File(java.io.File)
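
The getCodecByName(...) lookup used above maps a human-readable codec name to a concrete codec instance. A short, hypothetical sketch of the same idea applied to a new-API MapReduce job; the configure method and codecName parameter are illustrative, not part of the presto test.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CompressedOutputJobSketch {
    // Registers the named codec as the job's output compression codec.
    public static Job configure(Configuration conf, String codecName) throws IOException {
        Job job = Job.getInstance(conf);
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodecByName(codecName);
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, codec.getClass());
        return job;
    }
}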

Example 18 with CompressionCodecFactory

use of org.apache.hadoop.io.compress.CompressionCodecFactory in project hadoop by apache.

the class CompressionEmulationUtil method configureCompressionEmulation.

/**
   * Extracts compression/decompression related configuration parameters from 
   * the source configuration to the target configuration.
   */
static void configureCompressionEmulation(Configuration source, Configuration target) {
    // enable output compression
    target.setBoolean(FileOutputFormat.COMPRESS, source.getBoolean(FileOutputFormat.COMPRESS, false));
    // set the job output compression codec
    String jobOutputCompressionCodec = source.get(FileOutputFormat.COMPRESS_CODEC);
    if (jobOutputCompressionCodec != null) {
        target.set(FileOutputFormat.COMPRESS_CODEC, jobOutputCompressionCodec);
    }
    // set the job output compression type
    String jobOutputCompressionType = source.get(FileOutputFormat.COMPRESS_TYPE);
    if (jobOutputCompressionType != null) {
        target.set(FileOutputFormat.COMPRESS_TYPE, jobOutputCompressionType);
    }
    // enable map output compression
    target.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, source.getBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, false));
    // set the map output compression codecs
    String mapOutputCompressionCodec = source.get(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC);
    if (mapOutputCompressionCodec != null) {
        target.set(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, mapOutputCompressionCodec);
    }
    // enable input decompression
    //TODO replace with mapInputBytes and hdfsBytesRead
    Path[] inputs = org.apache.hadoop.mapred.FileInputFormat.getInputPaths(new JobConf(source));
    boolean needsCompressedInput = false;
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(source);
    for (Path input : inputs) {
        CompressionCodec codec = compressionCodecs.getCodec(input);
        if (codec != null) {
            needsCompressedInput = true;
        }
    }
    setInputCompressionEmulationEnabled(target, needsCompressedInput);
}
Also used : Path(org.apache.hadoop.fs.Path) CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) JobConf(org.apache.hadoop.mapred.JobConf)
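
The getCodec(path) check above is purely suffix driven. As a hedged illustration (not part of the gridmix source), the same check can be used on the read side to transparently decompress an input file:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class ReadMaybeCompressedSketch {
    // Opens path, decompressing when the filename suffix (.gz, .bz2, ...) maps to a registered codec.
    public static BufferedReader open(Configuration conf, Path path) throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        InputStream in = fs.open(path);
        if (codec != null) {
            in = codec.createInputStream(in);
        }
        return new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
    }
}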

Example 19 with CompressionCodecFactory

use of org.apache.hadoop.io.compress.CompressionCodecFactory in project hadoop by apache.

the class Anonymizer method createJsonGenerator.

// Creates a JSON generator
private JsonGenerator createJsonGenerator(Configuration conf, Path path) throws IOException {
    FileSystem outFS = path.getFileSystem(conf);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    OutputStream output;
    Compressor compressor = null;
    if (codec != null) {
        compressor = CodecPool.getCompressor(codec);
        output = codec.createOutputStream(outFS.create(path), compressor);
    } else {
        output = outFS.create(path);
    }
    JsonGenerator outGen = outFactory.createGenerator(output, JsonEncoding.UTF8);
    outGen.useDefaultPrettyPrinter();
    return outGen;
}
Also used : CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) FileSystem(org.apache.hadoop.fs.FileSystem) OutputStream(java.io.OutputStream) Compressor(org.apache.hadoop.io.compress.Compressor) JsonGenerator(com.fasterxml.jackson.core.JsonGenerator) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec)
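
CodecPool.getCompressor(...) hands out a pooled, possibly native compressor that must eventually be returned (the Anonymizer does this when it closes its streams). Below is a minimal sketch of the mirror-image read path; the class and method names are hypothetical, but the borrow-and-return discipline is the point.

import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class PooledReadSketch {
    // Counts the decompressed bytes of a (possibly compressed) file,
    // returning the borrowed Decompressor to the pool when done.
    public static long countBytes(Configuration conf, Path path) throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        Decompressor decompressor = null;
        InputStream in = fs.open(path);
        try {
            if (codec != null) {
                decompressor = CodecPool.getDecompressor(codec); // borrow from the pool
                in = codec.createInputStream(in, decompressor);
            }
            long total = 0;
            byte[] buf = new byte[8192];
            int n;
            while ((n = in.read(buf)) != -1) {
                total += n;
            }
            return total;
        } finally {
            in.close();
            if (decompressor != null) {
                CodecPool.returnDecompressor(decompressor); // always give it back
            }
        }
    }
}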

Example 20 with CompressionCodecFactory

use of org.apache.hadoop.io.compress.CompressionCodecFactory in project carbondata by apache.

the class FileFactory method getDataInputStream.

public static DataInputStream getDataInputStream(String path, FileType fileType, int bufferSize) throws IOException {
    path = path.replace("\\", "/");
    boolean gzip = path.endsWith(".gz");
    boolean bzip2 = path.endsWith(".bz2");
    InputStream stream;
    switch(fileType) {
        case LOCAL:
            path = getUpdatedFilePath(path, fileType);
            if (gzip) {
                stream = new GZIPInputStream(new FileInputStream(path));
            } else if (bzip2) {
                stream = new BZip2CompressorInputStream(new FileInputStream(path));
            } else {
                stream = new FileInputStream(path);
            }
            break;
        case HDFS:
        case ALLUXIO:
        case VIEWFS:
            Path pt = new Path(path);
            FileSystem fs = pt.getFileSystem(configuration);
            if (bufferSize == -1) {
                stream = fs.open(pt);
            } else {
                stream = fs.open(pt, bufferSize);
            }
            String codecName = null;
            if (gzip) {
                codecName = GzipCodec.class.getName();
            } else if (bzip2) {
                codecName = BZip2Codec.class.getName();
            }
            if (null != codecName) {
                CompressionCodecFactory ccf = new CompressionCodecFactory(configuration);
                CompressionCodec codec = ccf.getCodecByClassName(codecName);
                stream = codec.createInputStream(stream);
            }
            break;
        default:
            throw new UnsupportedOperationException("unsupported file system");
    }
    return new DataInputStream(new BufferedInputStream(stream));
}
Also used : Path(org.apache.hadoop.fs.Path) DataInputStream(java.io.DataInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) InputStream(java.io.InputStream) GzipCodec(org.apache.hadoop.io.compress.GzipCodec) CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) FileSystem(org.apache.hadoop.fs.FileSystem) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec)
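
As a design note, the explicit .gz/.bz2 branching above can usually be collapsed by letting the factory infer the codec from the file suffix. A hypothetical alternative sketch (class and method names are illustrative, not carbondata code):

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class SuffixDrivenInputSketch {
    // getCodec(pt) already maps ".gz" to GzipCodec and ".bz2" to BZip2Codec,
    // plus any codecs registered via io.compression.codecs.
    public static DataInputStream open(Configuration conf, String path) throws IOException {
        Path pt = new Path(path);
        FileSystem fs = pt.getFileSystem(conf);
        InputStream stream = fs.open(pt);
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(pt);
        if (codec != null) {
            stream = codec.createInputStream(stream);
        }
        return new DataInputStream(new BufferedInputStream(stream));
    }
}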

Aggregations

CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory): 22 usages
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 18 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 14 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 9 usages
Path (org.apache.hadoop.fs.Path): 9 usages
Configuration (org.apache.hadoop.conf.Configuration): 7 usages
IOException (java.io.IOException): 6 usages
DataInputStream (java.io.DataInputStream): 4 usages
Text (org.apache.hadoop.io.Text): 3 usages
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 3 usages
LineReader (org.apache.hadoop.util.LineReader): 3 usages
InputStream (java.io.InputStream): 2 usages
OutputStream (java.io.OutputStream): 2 usages
PcapReader (net.ripe.hadoop.pcap.PcapReader): 2 usages
CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream): 2 usages
JobConf (org.apache.hadoop.mapred.JobConf): 2 usages
RDFParserBuilder (org.apache.jena.riot.RDFParserBuilder): 2 usages
JsonGenerator (com.fasterxml.jackson.core.JsonGenerator): 1 usage
Slice (io.airlift.slice.Slice): 1 usage
BufferedInputStream (java.io.BufferedInputStream): 1 usage