use of org.apache.hadoop.io.compress.CompressionCodec in project druid by druid-io.
the class Utils, method exists().
public static boolean exists(JobContext job, FileSystem fs, Path inputPath) throws IOException {
  if (!FileOutputFormat.getCompressOutput(job)) {
    return fs.exists(inputPath);
  } else {
    Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(job, GzipCodec.class);
    CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
    return fs.exists(new Path(inputPath.toString() + codec.getDefaultExtension()));
  }
}
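The branch taken depends on whether compressed output is enabled for the job; when it is, the helper probes for the path with the codec's default extension appended (".gz" for GzipCodec). Below is a minimal sketch of how the settings that exists() reads are configured and how the probed path is derived; the class name and the output path are illustrative assumptions, not part of the Druid source.

// Illustrative sketch only; not part of the Druid source.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class ExistsProbeSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    // Enable compressed output so the codec branch of exists() is exercised.
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    Class<? extends CompressionCodec> codecClass =
        FileOutputFormat.getOutputCompressorClass(job, GzipCodec.class);
    CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());

    // The probed path is the logical output path plus the codec's default extension (".gz").
    Path logical = new Path("/tmp/output/part-00000");  // illustrative path
    Path probed = new Path(logical.toString() + codec.getDefaultExtension());
    FileSystem fs = FileSystem.getLocal(conf);
    System.out.println(probed + " exists: " + fs.exists(probed));
  }
}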
use of org.apache.hadoop.io.compress.CompressionCodec in project druid by druid-io.
the class Utils, method openInputStream().
public static InputStream openInputStream(JobContext job, Path inputPath, final FileSystem fileSystem) throws IOException {
  if (!FileOutputFormat.getCompressOutput(job)) {
    return fileSystem.open(inputPath);
  } else {
    Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(job, GzipCodec.class);
    CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
    inputPath = new Path(inputPath.toString() + codec.getDefaultExtension());
    return codec.createInputStream(fileSystem.open(inputPath));
  }
}
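For context, the write-side counterpart of this method wraps the raw output stream with codec.createOutputStream(), producing exactly the kind of file openInputStream() reads back. The following sketch shows that pattern; the class name and file path are illustrative assumptions and not taken from the Druid source.

// Illustrative sketch of the write side; not part of the Druid source.
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class CompressedWriteSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);

    // Append the codec's default extension, mirroring what openInputStream() expects to find.
    Path out = new Path("/tmp/example" + codec.getDefaultExtension());  // illustrative path
    try (OutputStream os = codec.createOutputStream(fs.create(out, true))) {
      os.write("hello compressed world\n".getBytes(StandardCharsets.UTF_8));
    }
  }
}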
use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.
the class TestCombineTextInputFormat, method testGzip().
/**
 * Test using the gzip codec for reading.
 */
@Test(timeout = 10000)
public void testGzip() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, conf);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, workDir);
  CombineTextInputFormat format = new CombineTextInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 1", 1, splits.size());
  List<Text> results = readSplit(format, splits.get(0), job);
  assertEquals("splits[0] length", 8, results.size());
  final String[] firstList = { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
  final String[] secondList = { "this is a test", "of gzip" };
  String first = results.get(0).toString();
  if (first.equals(firstList[0])) {
    testResults(results, firstList, secondList);
  } else if (first.equals(secondList[0])) {
    testResults(results, secondList, firstList);
  } else {
    fail("unexpected first token!");
  }
}
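The writeFile(...) helper used above is not shown in this excerpt. A guess at a minimal equivalent follows; the class name and exact signature are assumptions rather than the Hadoop test code, but the core pattern (wrap the created stream with the codec when one is supplied) matches how gzip test inputs are typically produced.

// Hypothetical helper sketch; not the actual Hadoop test utility.
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;

public class WriteFileSketch {
  // Writes 'contents' to 'path', compressing with 'codec' when one is supplied.
  static void writeFile(FileSystem fs, Path path, CompressionCodec codec, String contents) throws IOException {
    OutputStream stm = fs.create(path, true);
    if (codec != null) {
      stm = codec.createOutputStream(stm);
    }
    try {
      stm.write(contents.getBytes(StandardCharsets.UTF_8));
    } finally {
      stm.close();
    }
  }
}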
use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.
the class InputStriper, method splitFor().
/**
 * @param inputDir Pool used to resolve block locations.
 * @param bytes Target byte count
 * @param nLocs Number of block locations per split.
 * @return A set of files satisfying the byte count, with locations weighted
 *         to the dominating proportion of input bytes.
 */
CombineFileSplit splitFor(FilePool inputDir, long bytes, int nLocs) throws IOException {
  final ArrayList<Path> paths = new ArrayList<Path>();
  final ArrayList<Long> start = new ArrayList<Long>();
  final ArrayList<Long> length = new ArrayList<Long>();
  final HashMap<String, Double> sb = new HashMap<String, Double>();
  do {
    paths.add(current.getPath());
    start.add(currentStart);
    final long fromFile = Math.min(bytes, current.getLen() - currentStart);
    length.add(fromFile);
    for (BlockLocation loc : inputDir.locationsFor(current, currentStart, fromFile)) {
      final double tedium = loc.getLength() / (1.0 * bytes);
      for (String l : loc.getHosts()) {
        Double j = sb.get(l);
        if (null == j) {
          sb.put(l, tedium);
        } else {
          sb.put(l, j.doubleValue() + tedium);
        }
      }
    }
    currentStart += fromFile;
    bytes -= fromFile;
    // Switch to a new file if
    // - the current file is uncompressed and completely used
    // - the current file is compressed
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(current.getPath());
    if (current.getLen() - currentStart == 0 || codec != null) {
      current = files.get(++idx % files.size());
      currentStart = 0;
    }
  } while (bytes > 0);
  final ArrayList<Entry<String, Double>> sort = new ArrayList<Entry<String, Double>>(sb.entrySet());
  Collections.sort(sort, hostRank);
  final String[] hosts = new String[Math.min(nLocs, sort.size())];
  for (int i = 0; i < nLocs && i < sort.size(); ++i) {
    hosts[i] = sort.get(i).getKey();
  }
  return new CombineFileSplit(paths.toArray(new Path[0]), toLongArray(start), toLongArray(length), hosts);
}
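Note that a file is abandoned as soon as a codec matches its extension, since a generically compressed file (e.g. gzip) cannot be split mid-stream. The toLongArray(...) helper referenced in the return statement is not shown in this excerpt; a plausible implementation, given how it is used, simply unboxes the list. The sketch below is an assumption, not necessarily the Hadoop source.

// Hypothetical sketch of the unshown helper.
import java.util.List;

public class ToLongArraySketch {
  static long[] toLongArray(final List<Long> sizes) {
    final long[] ret = new long[sizes.size()];
    for (int i = 0; i < ret.length; ++i) {
      ret[i] = sizes.get(i);
    }
    return ret;
  }
}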
use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.
the class CompressionEmulationUtil, method getPossiblyDecompressedInputStream().
/**
 * Returns an {@link InputStream} for a file that might be compressed.
 */
static InputStream getPossiblyDecompressedInputStream(Path file, Configuration conf, long offset) throws IOException {
  FileSystem fs = file.getFileSystem(conf);
  if (isCompressionEmulationEnabled(conf) && isInputCompressionEmulationEnabled(conf)) {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec != null) {
      Decompressor decompressor = CodecPool.getDecompressor(codec);
      if (decompressor != null) {
        CompressionInputStream in = codec.createInputStream(fs.open(file), decompressor);
        // Use SplittableCompressionCodec?
        return (InputStream) in;
      }
    }
  }
  FSDataInputStream in = fs.open(file);
  in.seek(offset);
  return (InputStream) in;
}
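The decompressor here comes from CodecPool, and nothing in this method hands it back, so a caller that manages the stream itself would normally return the pooled Decompressor once reading is finished. The sketch below shows one such read-side pattern using the same CodecPool and CompressionCodecFactory APIs; the class name and file path are illustrative assumptions, and it reads a standalone file rather than going through this utility.

// Illustrative read-and-return-to-pool sketch; not part of the Hadoop source.
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class DecompressedReadSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path("/tmp/example.gz");  // illustrative path with a known codec extension
    FileSystem fs = file.getFileSystem(conf);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    try (InputStream in = codec.createInputStream(fs.open(file), decompressor);
         BufferedReader reader =
             new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
      String line;
      while ((line = reader.readLine()) != null) {
        System.out.println(line);
      }
    } finally {
      // Hand the pooled decompressor back when the stream is no longer needed.
      CodecPool.returnDecompressor(decompressor);
    }
  }
}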