
Example 1 with CompressionCodec

Uses org.apache.hadoop.io.compress.CompressionCodec in project druid (druid-io).

From the class Utils, method exists:

public static boolean exists(JobContext job, FileSystem fs, Path inputPath) throws IOException {
    if (!FileOutputFormat.getCompressOutput(job)) {
        // Uncompressed output: the file exists under its plain name.
        return fs.exists(inputPath);
    } else {
        // Compressed output: the writer appended the codec's default
        // extension (e.g. ".gz"), so probe for that name instead.
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(job, GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
        return fs.exists(new Path(inputPath.toString() + codec.getDefaultExtension()));
    }
}
Also used: Path (org.apache.hadoop.fs.Path), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
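
A hedged usage sketch of the helper above (the job setup and the path are illustrative, not taken from the Druid source); it pairs naturally with openInputStream from Example 2 below:

Job job = Job.getInstance(new Configuration());
Path inputPath = new Path("/tmp/druid/segment-descriptor.json"); // hypothetical path
FileSystem fs = inputPath.getFileSystem(job.getConfiguration());
if (Utils.exists(job, fs, inputPath)) {
    // exists() already probed for a possible codec extension (e.g. ".gz"),
    // and openInputStream() re-derives the same on-disk name.
    try (InputStream in = Utils.openInputStream(job, inputPath, fs)) {
        // ... read the (possibly decompressed) contents ...
    }
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), Job (org.apache.hadoop.mapreduce.Job), InputStream (java.io.InputStream)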

Example 2 with CompressionCodec

Uses org.apache.hadoop.io.compress.CompressionCodec in project druid (druid-io).

From the class Utils, method openInputStream:

public static InputStream openInputStream(JobContext job, Path inputPath, final FileSystem fileSystem) throws IOException {
    if (!FileOutputFormat.getCompressOutput(job)) {
        // Uncompressed output: open the file directly.
        return fileSystem.open(inputPath);
    } else {
        // Compressed output: re-derive the on-disk name by appending the
        // codec's default extension, then wrap the raw stream in a decompressor.
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(job, GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
        inputPath = new Path(inputPath.toString() + codec.getDefaultExtension());
        return codec.createInputStream(fileSystem.open(inputPath));
    }
}
Also used: Path (org.apache.hadoop.fs.Path), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
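
For context, a minimal write-side sketch, assuming a JobContext named job with compressed output enabled and a hypothetical output path. It shows why openInputStream can re-derive the on-disk name: the writer appends the same getDefaultExtension() that the reader appends:

Configuration conf = job.getConfiguration();
Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(job, GzipCodec.class);
CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
// e.g. "/tmp/output/descriptor.json.gz" when GzipCodec is in effect
Path outPath = new Path("/tmp/output/descriptor.json" + codec.getDefaultExtension());
FileSystem fs = outPath.getFileSystem(conf);
try (OutputStream out = codec.createOutputStream(fs.create(outPath))) {
    out.write("{}".getBytes(StandardCharsets.UTF_8));
}
Also used: OutputStream (java.io.OutputStream), StandardCharsets (java.nio.charset.StandardCharsets)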

Example 3 with CompressionCodec

Uses org.apache.hadoop.io.compress.CompressionCodec in project hadoop (apache).

From the class TestCombineTextInputFormat, method testGzip:

/**
   * Test using the gzip codec for reading
   */
@Test(timeout = 10000)
public void testGzip() throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, conf);
    localFs.delete(workDir, true);
    // Two gzipped inputs; gzip is not splittable, so CombineTextInputFormat
    // should pack both files into a single split.
    writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n");
    Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, workDir);
    CombineTextInputFormat format = new CombineTextInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    assertEquals("compressed splits == 1", 1, splits.size());
    List<Text> results = readSplit(format, splits.get(0), job);
    // 6 lines from part1 + 2 lines from part2 = 8 records in the one split.
    assertEquals("splits[0] length", 8, results.size());
    final String[] firstList = { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
    final String[] secondList = { "this is a test", "of gzip" };
    String first = results.get(0).toString();
    // The order of the files within the combined split is not guaranteed,
    // so accept either file's contents first.
    if (first.equals(firstList[0])) {
        testResults(results, firstList, secondList);
    } else if (first.equals(secondList[0])) {
        testResults(results, secondList, firstList);
    } else {
        fail("unexpected first token!");
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), GzipCodec (org.apache.hadoop.io.compress.GzipCodec), Text (org.apache.hadoop.io.Text), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Test (org.junit.Test)
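
The writeFile helper is defined elsewhere in the Hadoop test sources. A plausible minimal equivalent (a reconstruction offered as an assumption, not the verbatim helper) wraps the raw output stream in the codec's compressor when a codec is given:

private static void writeFile(FileSystem fs, Path name, CompressionCodec codec, String contents) throws IOException {
    // With a codec, bytes written here are compressed on the way to disk.
    OutputStream stm = (codec == null) ? fs.create(name) : codec.createOutputStream(fs.create(name));
    stm.write(contents.getBytes(StandardCharsets.UTF_8));
    stm.close();
}
Also used: OutputStream (java.io.OutputStream), StandardCharsets (java.nio.charset.StandardCharsets), IOException (java.io.IOException)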

Example 4 with CompressionCodec

Uses org.apache.hadoop.io.compress.CompressionCodec in project hadoop (apache).

From the class InputStriper, method splitFor:

/**
   * @param inputDir Pool used to resolve block locations.
   * @param bytes Target byte count.
   * @param nLocs Number of block locations per split.
   * @return A split covering files that satisfy the byte count, with locations
   *         weighted toward the hosts serving the dominating proportion of
   *         input bytes.
   */
CombineFileSplit splitFor(FilePool inputDir, long bytes, int nLocs) throws IOException {
    final ArrayList<Path> paths = new ArrayList<Path>();
    final ArrayList<Long> start = new ArrayList<Long>();
    final ArrayList<Long> length = new ArrayList<Long>();
    // Accumulated weight per host: each block adds the fraction of the
    // still-requested bytes that it covers for every host holding a replica.
    final HashMap<String, Double> sb = new HashMap<String, Double>();
    do {
        paths.add(current.getPath());
        start.add(currentStart);
        final long fromFile = Math.min(bytes, current.getLen() - currentStart);
        length.add(fromFile);
        for (BlockLocation loc : inputDir.locationsFor(current, currentStart, fromFile)) {
            // Weight each of the block's hosts by the block's share of the
            // bytes still being requested.
            final double tedium = loc.getLength() / (1.0 * bytes);
            for (String l : loc.getHosts()) {
                Double j = sb.get(l);
                if (null == j) {
                    sb.put(l, tedium);
                } else {
                    sb.put(l, j.doubleValue() + tedium);
                }
            }
        }
        currentStart += fromFile;
        bytes -= fromFile;
        // Switch to a new file if
        //  - the current file is uncompressed and completely used
        //  - the current file is compressed
        CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecs.getCodec(current.getPath());
        if (current.getLen() - currentStart == 0 || codec != null) {
            current = files.get(++idx % files.size());
            currentStart = 0;
        }
    } while (bytes > 0);
    final ArrayList<Entry<String, Double>> sort = new ArrayList<Entry<String, Double>>(sb.entrySet());
    Collections.sort(sort, hostRank);
    final String[] hosts = new String[Math.min(nLocs, sort.size())];
    for (int i = 0; i < nLocs && i < sort.size(); ++i) {
        hosts[i] = sort.get(i).getKey();
    }
    return new CombineFileSplit(paths.toArray(new Path[0]), toLongArray(start), toLongArray(length), hosts);
}
Also used: Path (org.apache.hadoop.fs.Path), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), CombineFileSplit (org.apache.hadoop.mapreduce.lib.input.CombineFileSplit), BlockLocation (org.apache.hadoop.fs.BlockLocation), Entry (java.util.Map.Entry), CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
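
The codec lookup inside the loop is purely extension-based: CompressionCodecFactory maps a path suffix to a registered codec and returns null for plain files. A standalone sketch, with hypothetical file names:

Configuration conf = new Configuration();
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
// getCodec matches on the file extension; these paths are illustrative.
CompressionCodec gz = factory.getCodec(new Path("data/part-00000.gz"));      // a GzipCodec instance
CompressionCodec plain = factory.getCodec(new Path("data/part-00001.txt"));  // null: not compressed
// splitFor treats a non-null codec as "consume the whole file, then advance",
// because a compressed stream cannot be entered mid-file.
Also used: Configuration (org.apache.hadoop.conf.Configuration)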

Example 5 with CompressionCodec

Uses org.apache.hadoop.io.compress.CompressionCodec in project hadoop (apache).

From the class CompressionEmulationUtil, method getPossiblyDecompressedInputStream:

/**
   * Returns an {@link InputStream} for a file that might be compressed.
   */
static InputStream getPossiblyDecompressedInputStream(Path file, Configuration conf, long offset) throws IOException {
    FileSystem fs = file.getFileSystem(conf);
    if (isCompressionEmulationEnabled(conf) && isInputCompressionEmulationEnabled(conf)) {
        CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecs.getCodec(file);
        if (codec != null) {
            Decompressor decompressor = CodecPool.getDecompressor(codec);
            if (decompressor != null) {
                CompressionInputStream in = codec.createInputStream(fs.open(file), decompressor);
                // TODO Seek doesn't work with a compressed input stream, so the
                //      requested offset is ignored here. Use SplittableCompressionCodec?
                return (InputStream) in;
            }
        }
    }
    FSDataInputStream in = fs.open(file);
    in.seek(offset);
    return (InputStream) in;
}
Also used: Decompressor (org.apache.hadoop.io.compress.Decompressor), CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), InputStream (java.io.InputStream), FileSystem (org.apache.hadoop.fs.FileSystem), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
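
One caveat worth noting: the decompressor borrowed from CodecPool above is never returned to the pool. A sketch of the usual borrow/return discipline (illustrative, not from the Gridmix source):

CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
Decompressor decompressor = CodecPool.getDecompressor(codec);
try (CompressionInputStream in = codec.createInputStream(fs.open(file), decompressor)) {
    // ... consume the decompressed stream ...
} finally {
    CodecPool.returnDecompressor(decompressor); // hand the instance back for reuse
}
Also used: CodecPool (org.apache.hadoop.io.compress.CodecPool)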

Aggregations

CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)111 Path (org.apache.hadoop.fs.Path)54 FileSystem (org.apache.hadoop.fs.FileSystem)41 Configuration (org.apache.hadoop.conf.Configuration)38 CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory)37 InputStream (java.io.InputStream)18 IOException (java.io.IOException)17 Test (org.junit.Test)17 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)15 Text (org.apache.hadoop.io.Text)14 Configurable (org.apache.hadoop.conf.Configurable)10 GzipCodec (org.apache.hadoop.io.compress.GzipCodec)10 JobConf (org.apache.hadoop.mapred.JobConf)10 SequenceFile (org.apache.hadoop.io.SequenceFile)9 OutputStream (java.io.OutputStream)8 DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec)8 FileInputStream (java.io.FileInputStream)7 FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)6 ByteString (com.google.protobuf.ByteString)5 DataInputStream (java.io.DataInputStream)5