Example 91 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project accumulo by apache.

the class CompressionTest method testThereCanBeOnlyOne.

@Test(timeout = 60 * 1000)
public void testThereCanBeOnlyOne() throws IOException, InterruptedException, ExecutionException {
    for (final Algorithm al : Algorithm.values()) {
        if (isSupported.get(al) != null && isSupported.get(al)) {
            // the first call to isSupported should return true
            Assert.assertTrue(al + " is not supported, but should be", al.isSupported());
            ExecutorService service = Executors.newFixedThreadPool(20);
            ArrayList<Callable<Boolean>> list = new ArrayList<>();
            ArrayList<Future<Boolean>> results = new ArrayList<>();
            // keep track of the system's identity hashcodes.
            final HashSet<Integer> testSet = new HashSet<>();
            for (int i = 0; i < 40; i++) {
                list.add(new Callable<Boolean>() {

                    @Override
                    public Boolean call() throws Exception {
                        CompressionCodec codec = al.getCodec();
                        Assert.assertNotNull(al + " resulted in a null codec", codec);
                        // add the identity hashcode to the set.
                        synchronized (testSet) {
                            testSet.add(System.identityHashCode(codec));
                        }
                        return true;
                    }
                });
            }
            results.addAll(service.invokeAll(list));
            // ensure that only one codec instance was created across all threads
            Assert.assertEquals(al + " created too many codecs", 1, testSet.size());
            service.shutdown();
            while (!service.awaitTermination(1, TimeUnit.SECONDS)) {
                // wait
            }
            for (Future<Boolean> result : results) {
                Assert.assertTrue(al + " resulted in a failed call to getCodec within the thread pool", result.get());
            }
        }
    }
}
Also used : ArrayList(java.util.ArrayList) Algorithm(org.apache.accumulo.core.file.rfile.bcfile.Compression.Algorithm) Callable(java.util.concurrent.Callable) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) HashSet(java.util.HashSet) Test(org.junit.Test)
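
The point of the test above is that Algorithm.getCodec() must return one shared codec instance per algorithm, no matter how many threads ask for it. A minimal sketch of that per-algorithm caching idea, using a hypothetical CodecCache helper (not part of Accumulo) built on ConcurrentHashMap.computeIfAbsent, might look like this:

import java.util.concurrent.ConcurrentHashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

// Hypothetical helper: keeps exactly one CompressionCodec instance per codec class,
// so every caller observes the same identity hash code, as the test requires.
public final class CodecCache {

    private static final ConcurrentHashMap<Class<? extends CompressionCodec>, CompressionCodec> CACHE = new ConcurrentHashMap<>();

    private CodecCache() {
    }

    public static CompressionCodec get(Class<? extends CompressionCodec> codecClass, Configuration conf) {
        // computeIfAbsent runs the factory at most once per key, even under concurrency,
        // which is the "there can be only one" property the test checks.
        return CACHE.computeIfAbsent(codecClass, c -> ReflectionUtils.newInstance(c, conf));
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        CompressionCodec a = get(GzipCodec.class, conf);
        CompressionCodec b = get(GzipCodec.class, conf);
        // prints true: both calls return the same instance
        System.out.println(System.identityHashCode(a) == System.identityHashCode(b));
    }
}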

Example 92 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project elephant-bird by twitter.

the class MultiInputFormat method determineFileFormat.

/**
 * Checks to see if the input records are stored as SerializedBlock.
 * The block format starts with {@link Protobufs#KNOWN_GOOD_POSITION_MARKER}.
 * Otherwise the input is assumed to be Base64 encoded lines.
 */
private static Format determineFileFormat(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path file = fileSplit.getPath();
    /* we could have an optional configuration that maps a regex on a
     * file name to a format. E.g. ".*-block.lzo" to LZO_BLOCK file.
     */
    // most of the cost is opening the file and
    // reading the first LZO block (about 256k of uncompressed data)
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
    if (codec == null) {
        throw new IOException("No codec for file " + file + " found");
    }
    InputStream in = file.getFileSystem(conf).open(file);
    InputStream lzoIn = null;
    // check if the file starts with magic bytes for Block storage format.
    try {
        lzoIn = codec.createInputStream(in);
        for (byte magic : Protobufs.KNOWN_GOOD_POSITION_MARKER) {
            int b = lzoIn.read();
            if (b < 0 || (byte) b != magic) {
                return Format.LZO_B64LINE;
            }
        }
    } finally {
        IOUtils.closeStream(lzoIn);
        IOUtils.closeStream(in);
    }
    // the check passed
    return Format.LZO_BLOCK;
}
Also used : Path(org.apache.hadoop.fs.Path) CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) InputStream(java.io.InputStream) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) IOException(java.io.IOException) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit)
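
The heart of determineFileFormat is the magic-byte comparison against the first decompressed bytes. That idea can be isolated into a small helper; the sketch below is illustrative (startsWithMarker is a hypothetical method, not part of elephant-bird), but it mirrors the read-and-compare loop above:

import java.io.IOException;
import java.io.InputStream;

// Hypothetical helper: returns true only if the stream begins with the given marker bytes.
// A negative read (end of stream) or any mismatching byte ends the check early,
// just like the loop over Protobufs.KNOWN_GOOD_POSITION_MARKER above.
private static boolean startsWithMarker(InputStream in, byte[] marker) throws IOException {
    for (byte expected : marker) {
        int b = in.read();
        if (b < 0 || (byte) b != expected) {
            return false;
        }
    }
    return true;
}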

Example 93 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project incubator-gobblin by apache.

the class HadoopFsHelper method getFileStream.

/**
 * Returns an {@link InputStream} to the specified file.
 * <p>
 * Note: It is the caller's responsibility to close the returned {@link InputStream}.
 * </p>
 *
 * @param path The path to the file to open.
 * @return An {@link InputStream} for the specified file.
 * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the specified file.
 */
@Override
public InputStream getFileStream(String path) throws FileBasedHelperException {
    try {
        Path p = new Path(path);
        InputStream in = this.getFileSystem().open(p);
        // Account for compressed files (e.g. gzip).
        // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
        CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf());
        CompressionCodec codec = factory.getCodec(p);
        return (codec == null) ? in : codec.createInputStream(in);
    } catch (IOException e) {
        throw new FileBasedHelperException("Cannot open file " + path + " due to " + e.getMessage(), e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) FileBasedHelperException(org.apache.gobblin.source.extractor.filebased.FileBasedHelperException) InputStream(java.io.InputStream) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) IOException(java.io.IOException)
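
Because the javadoc leaves closing the stream to the caller, a typical call site wraps the result in try-with-resources. The sketch below assumes a helper instance named fsHelper and an illustrative gzip path; neither is taken from the Gobblin sources:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import org.apache.gobblin.source.extractor.filebased.FileBasedHelperException;

// Sketch of a call site: the returned stream is already decompressed if a codec matched
// the file extension, and try-with-resources closes it on behalf of the caller.
try (InputStream in = fsHelper.getFileStream("/data/input/part-00000.gz");
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
    String line;
    while ((line = reader.readLine()) != null) {
        // process one decompressed line
    }
} catch (FileBasedHelperException | IOException e) {
    // handle failure to open or read the file
}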

Example 94 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project hazelcast by hazelcast.

the class JsonInputFormat method isSplitable.

@Override
protected boolean isSplitable(JobContext context, Path file) {
    boolean multiline = acceptMultilineJson(context.getConfiguration());
    final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    return ((null == codec) || (codec instanceof SplittableCompressionCodec)) && !multiline;
}
Also used : SplittableCompressionCodec(org.apache.hadoop.io.compress.SplittableCompressionCodec) CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) SplittableCompressionCodec(org.apache.hadoop.io.compress.SplittableCompressionCodec)
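
The rule encoded here is that a file is splittable only when it is uncompressed or compressed with a codec that supports splitting (for example bzip2). A standalone sketch of the same check, outside of JsonInputFormat:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;

// Sketch: splittable means either no codec matches the file extension (plain text)
// or the matching codec implements SplittableCompressionCodec (e.g. BZip2Codec).
static boolean isSplittable(Configuration conf, Path file) {
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
    return codec == null || codec instanceof SplittableCompressionCodec;
}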

Example 95 with CompressionCodec

use of org.apache.hadoop.io.compress.CompressionCodec in project Honu by jboulon.

the class CmdLineConverter method main.

/**
 * @param args
 * @throws ClassNotFoundException
 */
@SuppressWarnings("unchecked")
public static void main(String[] args) throws ClassNotFoundException {
    if (args.length != 3) {
        System.out.println("java org.honu.inputtools.converter.CmdLineConverter <dataType> <codec> <outputFile>");
        System.out.println("codec: NONE , for uncompressed seqFile");
        System.out.println("codec: org.apache.hadoop.io.compress.GzipCodec , for GZIP compressed seqFile");
        System.out.println("codec: org.apache.hadoop.io.compress.LzoCodec , for LZO compressed seqFile");
        System.exit(-1);
    }
    String dataType = args[0];
    String codecClass = args[1];
    String outpFileName = args[2];
    if (codecClass.equalsIgnoreCase("none")) {
        codecClass = null;
    }
    int lineCount = 0;
    Path newOutputPath = null;
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        newOutputPath = new Path(outpFileName);
        CompressionCodec codec = null;
        if (codecClass != null) {
            Class classDefinition = Class.forName(codecClass);
            codec = (CompressionCodec) ReflectionUtils.newInstance(classDefinition, conf);
        }
        FSDataOutputStream newOutputStr = fs.create(newOutputPath);
        SequenceFile.Writer seqFileWriter = null;
        if (codec != null) {
            seqFileWriter = SequenceFile.createWriter(conf, newOutputStr, ChukwaArchiveKey.class, ChunkImpl.class, SequenceFile.CompressionType.BLOCK, codec);
        } else {
            seqFileWriter = SequenceFile.createWriter(conf, newOutputStr, ChukwaArchiveKey.class, ChunkImpl.class, SequenceFile.CompressionType.NONE, codec);
        }
        String str = null;
        ChunkBuilder cb = null;
        do {
            str = in.readLine();
            if (str != null) {
                lineCount++;
                if (cb == null) {
                    cb = new ChunkBuilder();
                }
                cb.addRecord(str.getBytes());
                if (lineCount % 300 == 0) {
                    append(seqFileWriter, getChunk(cb, dataType));
                    cb = null;
                }
            }
        } while (str != null);
        if (cb != null) {
            append(seqFileWriter, getChunk(cb, dataType));
        }
        seqFileWriter.close();
        newOutputStr.close();
    } catch (Throwable e) {
        e.printStackTrace();
        System.exit(-1);
    }
    System.out.println(new java.util.Date() + ", CmdLineConverter [" + dataType + "] [" + newOutputPath + "], Total lineCount: " + lineCount);
    System.exit(0);
}
Also used : Path(org.apache.hadoop.fs.Path) ChukwaArchiveKey(org.apache.hadoop.chukwa.ChukwaArchiveKey) InputStreamReader(java.io.InputStreamReader) Configuration(org.apache.hadoop.conf.Configuration) SequenceFile(org.apache.hadoop.io.SequenceFile) ChunkImpl(org.apache.hadoop.chukwa.ChunkImpl) FileSystem(org.apache.hadoop.fs.FileSystem) ChunkBuilder(org.apache.hadoop.chukwa.ChunkBuilder) BufferedReader(java.io.BufferedReader) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream)
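
To check what CmdLineConverter produced, the sequence file can be read back with the same key and value classes it was written with. This is a rough sketch (the output file name is illustrative, and it uses the older SequenceFile.Reader constructor), not code from the Honu project:

import org.apache.hadoop.chukwa.ChukwaArchiveKey;
import org.apache.hadoop.chukwa.ChunkImpl;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;

// Sketch: iterate the ChukwaArchiveKey/ChunkImpl pairs written by the converter.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
Path seqFile = new Path("output.seq"); // illustrative output file name
SequenceFile.Reader reader = new SequenceFile.Reader(fs, seqFile, conf);
try {
    ChukwaArchiveKey key = new ChukwaArchiveKey();
    ChunkImpl chunk = ChunkImpl.getBlankChunk();
    while (reader.next(key, chunk)) {
        System.out.println(key + " -> " + chunk.getData().length + " bytes");
    }
} finally {
    reader.close();
}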

Aggregations

CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)111 Path (org.apache.hadoop.fs.Path)54 FileSystem (org.apache.hadoop.fs.FileSystem)41 Configuration (org.apache.hadoop.conf.Configuration)38 CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory)37 InputStream (java.io.InputStream)18 IOException (java.io.IOException)17 Test (org.junit.Test)17 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)15 Text (org.apache.hadoop.io.Text)14 Configurable (org.apache.hadoop.conf.Configurable)10 GzipCodec (org.apache.hadoop.io.compress.GzipCodec)10 JobConf (org.apache.hadoop.mapred.JobConf)10 SequenceFile (org.apache.hadoop.io.SequenceFile)9 OutputStream (java.io.OutputStream)8 DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec)8 FileInputStream (java.io.FileInputStream)7 FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)6 ByteString (com.google.protobuf.ByteString)5 DataInputStream (java.io.DataInputStream)5