use of org.apache.hadoop.io.compress.CompressionCodec in project accumulo by apache.
the class CompressionTest method testThereCanBeOnlyOne.
@Test(timeout = 60 * 1000)
public void testThereCanBeOnlyOne() throws IOException, InterruptedException, ExecutionException {
  for (final Algorithm al : Algorithm.values()) {
    if (isSupported.get(al) != null && isSupported.get(al)) {
      // first call to isSupported should be true
      Assert.assertTrue(al + " is not supported, but should be", al.isSupported());
      ExecutorService service = Executors.newFixedThreadPool(20);
      ArrayList<Callable<Boolean>> list = new ArrayList<>();
      ArrayList<Future<Boolean>> results = new ArrayList<>();
      // keep track of the codecs' identity hashcodes.
      final HashSet<Integer> testSet = new HashSet<>();
      for (int i = 0; i < 40; i++) {
        list.add(new Callable<Boolean>() {
          @Override
          public Boolean call() throws Exception {
            CompressionCodec codec = al.getCodec();
            Assert.assertNotNull(al + " resulted in a non-null codec", codec);
            // add the identity hashcode to the set.
            synchronized (testSet) {
              testSet.add(System.identityHashCode(codec));
            }
            return true;
          }
        });
      }
      results.addAll(service.invokeAll(list));
      // ensure that only one codec instance was handed out across all 40 calls
      Assert.assertEquals(al + " created too many codecs", 1, testSet.size());
      service.shutdown();
      while (!service.awaitTermination(1, TimeUnit.SECONDS)) {
        // wait
      }
      for (Future<Boolean> result : results) {
        Assert.assertTrue(al + " resulted in a failed call to getCodec within the thread pool", result.get());
      }
    }
  }
}
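The test above only verifies that Algorithm.getCodec() hands out one shared CompressionCodec instance per algorithm. For context, here is a minimal hedged sketch of how such a codec round-trips a byte array; it is not part of the Accumulo test, and Algorithm.GZ is assumed to be one of the Compression.Algorithm constants.
// Hedged sketch: compress and decompress a byte[] through the shared codec.
static byte[] roundTrip(byte[] original) throws IOException {
  CompressionCodec codec = Algorithm.GZ.getCodec(); // assumed enum constant
  // compress into an in-memory buffer
  ByteArrayOutputStream compressed = new ByteArrayOutputStream();
  try (OutputStream out = codec.createOutputStream(compressed)) {
    out.write(original);
  }
  // decompress and collect the bytes back
  ByteArrayOutputStream decompressed = new ByteArrayOutputStream();
  try (InputStream in = codec.createInputStream(new ByteArrayInputStream(compressed.toByteArray()))) {
    byte[] buf = new byte[4096];
    int n;
    while ((n = in.read(buf)) > 0) {
      decompressed.write(buf, 0, n);
    }
  }
  return decompressed.toByteArray(); // should equal original
}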
use of org.apache.hadoop.io.compress.CompressionCodec in project elephant-bird by twitter.
the class MultiInputFormat method determineFileFormat.
/**
* Checks to see if the input records are stored as SerializedBlock.
* The block format starts with {@link Protobufs#KNOWN_GOOD_POSITION_MARKER}.
* Otherwise the input is assumed to be Base64 encoded lines.
*/
private static Format determineFileFormat(InputSplit split, Configuration conf) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  Path file = fileSplit.getPath();
  /* we could have an optional configuration that maps a regex on a
   * file name to a format. E.g. ".*-block.lzo" to LZO_BLOCK file.
   */
  // most of the cost is opening the file and
  // reading the first lzo block (about 256k of uncompressed data)
  CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
  if (codec == null) {
    throw new IOException("No codec for file " + file + " found");
  }
  InputStream in = file.getFileSystem(conf).open(file);
  InputStream lzoIn = null;
  // check if the file starts with the magic bytes for the Block storage format.
  try {
    lzoIn = codec.createInputStream(in);
    for (byte magic : Protobufs.KNOWN_GOOD_POSITION_MARKER) {
      int b = lzoIn.read();
      if (b < 0 || (byte) b != magic) {
        return Format.LZO_B64LINE;
      }
    }
  } finally {
    IOUtils.closeStream(lzoIn);
    IOUtils.closeStream(in);
  }
  // the check passed: the file starts with the block-format marker
  return Format.LZO_BLOCK;
}
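The magic-byte probe above generalizes to any "does this stream start with these bytes?" check. A minimal hedged sketch of that check pulled out as a standalone helper (the helper name is made up; elephant-bird does not ship it):
// Returns true if the next marker.length bytes of the stream equal marker.
// Consumes the bytes it reads, so callers should re-open or reset the stream afterwards.
static boolean startsWith(InputStream in, byte[] marker) throws IOException {
  for (byte expected : marker) {
    int b = in.read();
    if (b < 0 || (byte) b != expected) {
      return false;
    }
  }
  return true;
}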
use of org.apache.hadoop.io.compress.CompressionCodec in project incubator-gobblin by apache.
the class HadoopFsHelper method getFileStream.
/**
* Returns an {@link InputStream} to the specified file.
* <p>
* Note: It is the caller's responsibility to close the returned {@link InputStream}.
* </p>
*
* @param path The path to the file to open.
* @return An {@link InputStream} for the specified file.
* @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the specified file.
*/
@Override
public InputStream getFileStream(String path) throws FileBasedHelperException {
  try {
    Path p = new Path(path);
    InputStream in = this.getFileSystem().open(p);
    // Account for compressed files (e.g. gzip).
    // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
    CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf());
    CompressionCodec codec = factory.getCodec(p);
    return (codec == null) ? in : codec.createInputStream(in);
  } catch (IOException e) {
    throw new FileBasedHelperException("Cannot open file " + path + " due to " + e.getMessage(), e);
  }
}
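Since the javadoc makes closing the caller's responsibility, a typical call site wraps the returned stream in try-with-resources. A hedged sketch (the fsHelper variable and the path are hypothetical; the .gz extension is what makes the factory pick GzipCodec, so the stream already yields decompressed bytes):
try (InputStream in = fsHelper.getFileStream("/data/input/part-00000.gz");
     BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
  String line;
  while ((line = reader.readLine()) != null) {
    // process each decompressed line
  }
}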
use of org.apache.hadoop.io.compress.CompressionCodec in project hazelcast by hazelcast.
the class JsonInputFormat method isSplitable.
@Override
protected boolean isSplitable(JobContext context, Path file) {
  boolean multiline = acceptMultilineJson(context.getConfiguration());
  final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
  return ((null == codec) || (codec instanceof SplittableCompressionCodec)) && !multiline;
}
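In practice this means splitting survives only for uncompressed input or codecs that implement SplittableCompressionCodec (such as bzip2), and only in single-line JSON mode; gzip input always ends up as one split per file. A small hedged sketch of the same codec check in isolation (the paths are made up):
Configuration conf = new Configuration();
CompressionCodecFactory factory = new CompressionCodecFactory(conf);

CompressionCodec bzip2 = factory.getCodec(new Path("/data/events.json.bz2")); // BZip2Codec
CompressionCodec gzip = factory.getCodec(new Path("/data/events.json.gz"));   // GzipCodec
CompressionCodec none = factory.getCodec(new Path("/data/events.json"));      // null: no matching extension

boolean bzip2Splittable = bzip2 instanceof SplittableCompressionCodec; // true
boolean gzipSplittable = gzip instanceof SplittableCompressionCodec;   // false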
use of org.apache.hadoop.io.compress.CompressionCodec in project Honu by jboulon.
the class CmdLineConverter method main.
/**
* @param args
* @throws ClassNotFoundException
*/
@SuppressWarnings("unchecked")
public static void main(String[] args) throws ClassNotFoundException {
  if (args.length != 3) {
    System.out.println("java org.honu.inputtools.converter.CmdLineConverter <dataType> <codec> <outputFile>");
    System.out.println("codec: NONE , for uncompressed seqFile");
    System.out.println("codec: org.apache.hadoop.io.compress.GzipCodec , for GZIP compressed seqFile");
    System.out.println("codec: org.apache.hadoop.io.compress.LzoCodec , for LZO compressed seqFile");
    System.exit(-1);
  }
  String dataType = args[0];
  String codecClass = args[1];
  String outpFileName = args[2];
  if (codecClass.equalsIgnoreCase("none")) {
    codecClass = null;
  }
  int lineCount = 0;
  Path newOutputPath = null;
  try {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    newOutputPath = new Path(outpFileName);
    CompressionCodec codec = null;
    if (codecClass != null) {
      // instantiate the requested codec reflectively
      Class classDefinition = Class.forName(codecClass);
      codec = (CompressionCodec) ReflectionUtils.newInstance(classDefinition, conf);
    }
    FSDataOutputStream newOutputStr = fs.create(newOutputPath);
    SequenceFile.Writer seqFileWriter = null;
    if (codec != null) {
      seqFileWriter = SequenceFile.createWriter(conf, newOutputStr, ChukwaArchiveKey.class, ChunkImpl.class, SequenceFile.CompressionType.BLOCK, codec);
    } else {
      seqFileWriter = SequenceFile.createWriter(conf, newOutputStr, ChukwaArchiveKey.class, ChunkImpl.class, SequenceFile.CompressionType.NONE, codec);
    }
    String str = null;
    ChunkBuilder cb = null;
    do {
      str = in.readLine();
      if (str != null) {
        lineCount++;
        if (cb == null) {
          cb = new ChunkBuilder();
        }
        cb.addRecord(str.getBytes());
        // flush a chunk to the sequence file every 300 input lines
        if (lineCount % 300 == 0) {
          append(seqFileWriter, getChunk(cb, dataType));
          cb = null;
        }
      }
    } while (str != null);
    // write out any remaining partial chunk
    if (cb != null) {
      append(seqFileWriter, getChunk(cb, dataType));
    }
    seqFileWriter.close();
    newOutputStr.close();
  } catch (Throwable e) {
    e.printStackTrace();
    System.exit(-1);
  }
  System.out.println(new java.util.Date() + ", CmdLineConverter [" + dataType + "] [" + newOutputPath + "], Total lineCount: " + lineCount);
  System.exit(0);
}
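To sanity-check the output, the SequenceFile written above can be read back with the same key/value classes. A hedged sketch (the Chukwa helper names getBlankChunk, getDataType and getData are assumed rather than taken from this snippet, and output.seq is a made-up file name):
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output.seq"), conf);
try {
  ChukwaArchiveKey key = new ChukwaArchiveKey();
  ChunkImpl chunk = ChunkImpl.getBlankChunk(); // assumed Chukwa helper
  while (reader.next(key, chunk)) {
    // print one line per stored chunk
    System.out.println(chunk.getDataType() + ": " + chunk.getData().length + " bytes");
  }
} finally {
  reader.close();
}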