Example 26 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project jena by apache.

From the class AbstractNodeOutputFormat, the method getRecordWriter:

@Override
public RecordWriter<NodeWritable, TValue> getRecordWriter(TaskAttemptContext context) throws IOException {
    Configuration config = context.getConfiguration();
    boolean isCompressed = getCompressOutput(context);
    CompressionCodec codec = null;
    // Build the output file path
    String extension = this.getFileExtension();
    if (isCompressed) {
        // Add compression extension if applicable
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(context, GzipCodec.class);
        codec = ReflectionUtils.newInstance(codecClass, config);
        extension += codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(context, extension);
    LOG.info("Writing output to file " + file);
    // Open the file appropriately and create a record writer for it
    FileSystem fs = file.getFileSystem(config);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return this.getRecordWriter(new OutputStreamWriter(fileOut), config);
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return this.getRecordWriter(new OutputStreamWriter(codec.createOutputStream(fileOut)), config);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), FileSystem (org.apache.hadoop.fs.FileSystem), OutputStreamWriter (java.io.OutputStreamWriter), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)
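
Both branches of the method above are driven by standard Hadoop job settings. A minimal driver-side sketch enabling compressed output; the job name and the BZip2 codec choice are illustrative, not taken from Jena:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

Job job = Job.getInstance(new Configuration(), "triples-export"); // illustrative job name
// getCompressOutput(context) above reads this flag
FileOutputFormat.setCompressOutput(job, true);
// getOutputCompressorClass(context, GzipCodec.class) above reads this setting,
// falling back to GzipCodec when no codec class has been set
FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);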

Example 27 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project jena by apache.

From the class AbstractNodeTupleOutputFormat, the method getRecordWriter:

@Override
public RecordWriter<TKey, T> getRecordWriter(TaskAttemptContext context) throws IOException {
    Configuration config = context.getConfiguration();
    boolean isCompressed = getCompressOutput(context);
    CompressionCodec codec = null;
    // Build the output file path
    String extension = this.getFileExtension();
    if (isCompressed) {
        // Add compression extension if applicable
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(context, GzipCodec.class);
        codec = ReflectionUtils.newInstance(codecClass, config);
        extension += codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(context, extension);
    LOG.info("Writing output to file " + file);
    // Open the file appropriately and create a record writer for it
    FileSystem fs = file.getFileSystem(config);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return this.getRecordWriter(new OutputStreamWriter(fileOut), config, file);
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return this.getRecordWriter(new OutputStreamWriter(codec.createOutputStream(fileOut)), config, file);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), FileSystem (org.apache.hadoop.fs.FileSystem), OutputStreamWriter (java.io.OutputStreamWriter), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)
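
The two Jena output formats are template methods: subclasses supply only the base file extension and the actual record writer, while the compression plumbing above is shared. A hypothetical subclass sketch; the hook signatures and generic parameters are inferred from the calls above, and ExampleTriplesOutputFormat and TripleWriter are illustrative names, not real Jena classes:

import java.io.Writer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.jena.graph.Triple;
import org.apache.jena.hadoop.rdf.types.TripleWritable;

public class ExampleTriplesOutputFormat<TKey> extends AbstractNodeTupleOutputFormat<TKey, Triple, TripleWritable> {

    @Override
    protected String getFileExtension() {
        return ".nt"; // base extension; the codec's ".gz" etc. is appended when compression is on
    }

    @Override
    protected RecordWriter<TKey, TripleWritable> getRecordWriter(Writer writer, Configuration config, Path outputPath) {
        return new TripleWriter<>(writer); // illustrative RecordWriter implementation
    }
}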

Example 28 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project flink by apache.

From the class SequenceFileWriter, the method open:

@Override
public void open(FileSystem fs, Path path) throws IOException {
    super.open(fs, path);
    if (keyClass == null) {
        throw new IllegalStateException("Key Class has not been initialized.");
    }
    if (valueClass == null) {
        throw new IllegalStateException("Value Class has not been initialized.");
    }
    CompressionCodec codec = null;
    Configuration conf = HadoopFileSystem.getHadoopConfiguration();
    // "None" disables compression; otherwise resolve the codec by name
    if (!compressionCodecName.equals("None")) {
        CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
        codec = codecFactory.getCodecByName(compressionCodecName);
        if (codec == null) {
            throw new RuntimeException("Codec " + compressionCodecName + " not found.");
        }
    }
    // the non-deprecated constructor syntax is only available in recent hadoop versions...
    writer = SequenceFile.createWriter(conf, getStream(), keyClass, valueClass, compressionType, codec);
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
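
The name-based lookup used above also works outside Flink, and pairs naturally with the option-based SequenceFile.createWriter overload that the code comment alludes to. A minimal sketch, assuming Hadoop 2.x or later; the path, key, and value classes are illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

Configuration conf = new Configuration();
// getCodecByName accepts short names such as "Gzip" as well as fully qualified class names
CompressionCodec codec = new CompressionCodecFactory(conf).getCodecByName("Gzip");
try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(new Path("/tmp/data.seq")), // illustrative path
        SequenceFile.Writer.keyClass(LongWritable.class),
        SequenceFile.Writer.valueClass(Text.class),
        SequenceFile.Writer.compression(CompressionType.BLOCK, codec))) {
    writer.append(new LongWritable(1L), new Text("hello"));
}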

Example 29 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project nifi by apache.

From the class GetHDFS, the method processBatchOfFiles:

protected void processBatchOfFiles(final List<Path> files, final ProcessContext context, final ProcessSession session) {
    // process the batch of files
    InputStream stream = null;
    CompressionCodec codec = null;
    Configuration conf = getConfiguration();
    FileSystem hdfs = getFileSystem();
    final boolean keepSourceFiles = context.getProperty(KEEP_SOURCE_FILE).asBoolean();
    final Double bufferSizeProp = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B);
    int bufferSize = bufferSizeProp != null ? bufferSizeProp.intValue() : conf.getInt(BUFFER_SIZE_KEY, BUFFER_SIZE_DEFAULT);
    final Path rootDir = new Path(context.getProperty(DIRECTORY).evaluateAttributeExpressions().getValue());
    final CompressionType compressionType = CompressionType.valueOf(context.getProperty(COMPRESSION_CODEC).toString());
    final boolean inferCompressionCodec = compressionType == CompressionType.AUTOMATIC;
    if (inferCompressionCodec || compressionType != CompressionType.NONE) {
        codec = getCompressionCodec(context, getConfiguration());
    }
    final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
    for (final Path file : files) {
        try {
            if (!getUserGroupInformation().doAs((PrivilegedExceptionAction<Boolean>) () -> hdfs.exists(file))) {
                // if file is no longer there then move on
                continue;
            }
            final String originalFilename = file.getName();
            final String relativePath = getPathDifference(rootDir, file);
            stream = getUserGroupInformation().doAs((PrivilegedExceptionAction<FSDataInputStream>) () -> hdfs.open(file, bufferSize));
            final String outputFilename;
            // Check if we should infer compression codec
            if (inferCompressionCodec) {
                codec = compressionCodecFactory.getCodec(file);
            }
            // Check if compression codec is defined (inferred or otherwise)
            if (codec != null) {
                stream = codec.createInputStream(stream);
                outputFilename = StringUtils.removeEnd(originalFilename, codec.getDefaultExtension());
            } else {
                outputFilename = originalFilename;
            }
            FlowFile flowFile = session.create();
            final StopWatch stopWatch = new StopWatch(true);
            flowFile = session.importFrom(stream, flowFile);
            stopWatch.stop();
            final String dataRate = stopWatch.calculateDataRate(flowFile.getSize());
            final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);
            flowFile = session.putAttribute(flowFile, CoreAttributes.PATH.key(), relativePath.isEmpty() ? "." : relativePath);
            flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), outputFilename);
            if (!keepSourceFiles && !getUserGroupInformation().doAs((PrivilegedExceptionAction<Boolean>) () -> hdfs.delete(file, false))) {
                getLogger().warn("Could not remove {} from HDFS. Not ingesting this file ...", new Object[] { file });
                session.remove(flowFile);
                continue;
            }
            session.getProvenanceReporter().receive(flowFile, file.toString());
            session.transfer(flowFile, REL_SUCCESS);
            getLogger().info("retrieved {} from HDFS {} in {} milliseconds at a rate of {}", new Object[] { flowFile, file, millis, dataRate });
            session.commit();
        } catch (final Throwable t) {
            getLogger().error("Error retrieving file {} from HDFS due to {}", new Object[] { file, t });
            session.rollback();
            context.yield();
        } finally {
            IOUtils.closeQuietly(stream);
            stream = null;
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), FlowFile (org.apache.nifi.flowfile.FlowFile), Configuration (org.apache.hadoop.conf.Configuration), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), InputStream (java.io.InputStream), PrivilegedExceptionAction (java.security.PrivilegedExceptionAction), StopWatch (org.apache.nifi.util.StopWatch), CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory), FileSystem (org.apache.hadoop.fs.FileSystem), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
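
When AUTOMATIC compression is selected above, the call compressionCodecFactory.getCodec(file) maps the file's extension to a registered codec. A minimal standalone sketch of the same inference; the path is illustrative:

import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

Configuration conf = new Configuration();
Path file = new Path("/data/events.log.gz"); // illustrative path
// getCodec matches on the file extension and returns null when none matches
CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
try (InputStream raw = file.getFileSystem(conf).open(file);
     InputStream in = codec != null ? codec.createInputStream(raw) : raw) {
    byte[] buffer = new byte[4096];
    int n = in.read(buffer); // decompressed bytes when a codec matched, raw bytes otherwise
}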

Example 30 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project apex-malhar by apache.

From the class AbstractFileOutputOperatorTest, the method checkSnappyFile:

private void checkSnappyFile(File file, List<Long> offsets, int startVal, int totalWindows, int totalRecords) throws IOException {
    FileInputStream fis;
    InputStream gss = null;
    Configuration conf = new Configuration();
    CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(SnappyCodec.class, conf);
    CompressionInputStream snappyIs = null;
    BufferedReader br = null;
    int numWindows = 0;
    try {
        fis = new FileInputStream(file);
        gss = fis;
        long startOffset = 0;
        for (long offset : offsets) {
            // Skip initial case in case file is not yet created
            if (offset == 0) {
                continue;
            }
            // Each offset delimits one independently compressed block;
            // bound the read so the codec only decompresses that block
            long limit = offset - startOffset;
            LimitInputStream lis = new LimitInputStream(gss, limit);
            snappyIs = codec.createInputStream(lis);
            br = new BufferedReader(new InputStreamReader(snappyIs));
            String eline = "" + (startVal + numWindows * 2);
            int count = 0;
            String line;
            while ((line = br.readLine()) != null) {
                Assert.assertEquals("File line", eline, line);
                ++count;
                if ((count % totalRecords) == 0) {
                    ++numWindows;
                    eline = "" + (startVal + numWindows * 2);
                }
            }
            startOffset = offset;
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (br != null) {
            br.close();
        } else {
            if (snappyIs != null) {
                snappyIs.close();
            } else if (gss != null) {
                gss.close();
            }
        }
    }
    Assert.assertEquals("Total", totalWindows, numWindows);
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), InputStreamReader (java.io.InputStreamReader), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), GZIPInputStream (java.util.zip.GZIPInputStream), LimitInputStream (com.google.common.io.LimitInputStream), CipherInputStream (javax.crypto.CipherInputStream), FileInputStream (java.io.FileInputStream), InputStream (java.io.InputStream), NoSuchAlgorithmException (java.security.NoSuchAlgorithmException), IOException (java.io.IOException), ConstraintViolationException (javax.validation.ConstraintViolationException), BufferedReader (java.io.BufferedReader), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), SnappyCodec (org.apache.hadoop.io.compress.SnappyCodec)
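
For completeness, files like the one this test reads can be produced with the same codec on the write side. A minimal sketch, assuming the native Snappy library is available to Hadoop; the file name and payload are illustrative:

import java.io.FileOutputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.util.ReflectionUtils;

Configuration conf = new Configuration();
// ReflectionUtils.newInstance injects the Configuration that Configurable codecs require
CompressionCodec codec = ReflectionUtils.newInstance(SnappyCodec.class, conf);
try (OutputStream out = codec.createOutputStream(new FileOutputStream("data.snappy"))) {
    out.write("42\n".getBytes(StandardCharsets.UTF_8));
}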

Aggregations

CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 110
Path (org.apache.hadoop.fs.Path): 53
FileSystem (org.apache.hadoop.fs.FileSystem): 41
Configuration (org.apache.hadoop.conf.Configuration): 37
CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory): 36
InputStream (java.io.InputStream): 17
Test (org.junit.Test): 17
IOException (java.io.IOException): 16
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 14
Text (org.apache.hadoop.io.Text): 14
Configurable (org.apache.hadoop.conf.Configurable): 10
GzipCodec (org.apache.hadoop.io.compress.GzipCodec): 10
JobConf (org.apache.hadoop.mapred.JobConf): 10
SequenceFile (org.apache.hadoop.io.SequenceFile): 9
OutputStream (java.io.OutputStream): 8
DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec): 8
FileInputStream (java.io.FileInputStream): 7
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 6
CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream): 6
ByteString (com.google.protobuf.ByteString): 5