use of org.apache.hadoop.io.compress.Compressor in project hadoop by apache.
the class TestZStandardCompressorDecompressor method testCompressingWithOneByteOutputBuffer.
@Test
public void testCompressingWithOneByteOutputBuffer() throws Exception {
int uncompressedSize = (int) FileUtils.sizeOf(uncompressedFile);
byte[] bytes = FileUtils.readFileToByteArray(uncompressedFile);
assertEquals(uncompressedSize, bytes.length);
Configuration conf = new Configuration();
ZStandardCodec codec = new ZStandardCodec();
codec.setConf(conf);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
Compressor compressor = new ZStandardCompressor(3, IO_FILE_BUFFER_SIZE_DEFAULT, 1);
CompressionOutputStream outputStream = codec.createOutputStream(baos, compressor);
for (byte aByte : bytes) {
outputStream.write(aByte);
}
outputStream.finish();
outputStream.close();
assertEquals(uncompressedSize, compressor.getBytesRead());
assertTrue(compressor.finished());
// just make sure we can decompress the file
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
Decompressor decompressor = codec.createDecompressor();
CompressionInputStream inputStream = codec.createInputStream(bais, decompressor);
byte[] buffer = new byte[100];
int n = buffer.length;
while ((n = inputStream.read(buffer, 0, n)) != -1) {
byteArrayOutputStream.write(buffer, 0, n);
}
assertArrayEquals(bytes, byteArrayOutputStream.toByteArray());
}
use of org.apache.hadoop.io.compress.Compressor in project hadoop by apache.
the class Anonymizer method createJsonGenerator.
// Creates a JSON generator
private JsonGenerator createJsonGenerator(Configuration conf, Path path) throws IOException {
FileSystem outFS = path.getFileSystem(conf);
CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
OutputStream output;
Compressor compressor = null;
if (codec != null) {
compressor = CodecPool.getCompressor(codec);
output = codec.createOutputStream(outFS.create(path), compressor);
} else {
output = outFS.create(path);
}
JsonGenerator outGen = outFactory.createGenerator(output, JsonEncoding.UTF8);
outGen.useDefaultPrettyPrinter();
return outGen;
}
use of org.apache.hadoop.io.compress.Compressor in project mongo-hadoop by mongodb.
the class BSONSplitter method run.
/**
* When run as a Tool, BSONSplitter can be used to pre-split and compress
* BSON files. This can be especially useful before uploading large BSON
* files to HDFS to save time. The compressed splits are written to the
* given output path or to the directory containing the input file, if
* the output path is unspecified. A ".splits" file is not generated, since
* each output file is expected to be its own split.
*
* @param args command-line arguments. Run with zero arguments to see usage.
* @return exit status
* @throws Exception
*/
@Override
public int run(final String[] args) throws Exception {
if (args.length < 1) {
printUsage();
return 1;
}
// Parse command-line arguments.
Path filePath = new Path(args[0]);
String compressorName = null, outputDirectoryStr = null;
Path outputDirectory;
CompressionCodec codec;
Compressor compressor;
for (int i = 1; i < args.length; ++i) {
if ("-c".equals(args[i]) && args.length > i) {
compressorName = args[++i];
} else if ("-o".equals(args[i]) && args.length > i) {
outputDirectoryStr = args[++i];
} else {
// CHECKSTYLE:OFF
System.err.println("unrecognized option: " + args[i]);
// CHECKSTYLE:ON
printUsage();
return 1;
}
}
// Supply default values for unspecified arguments.
if (null == outputDirectoryStr) {
outputDirectory = filePath.getParent();
} else {
outputDirectory = new Path(outputDirectoryStr);
}
if (null == compressorName) {
codec = new DefaultCodec();
} else {
Class<?> codecClass = Class.forName(compressorName);
codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, getConf());
}
if (codec instanceof Configurable) {
((Configurable) codec).setConf(getConf());
}
// Do not write a .splits file so as not to confuse BSONSplitter.
// Each compressed file will be its own split.
MongoConfigUtil.setBSONWriteSplits(getConf(), false);
// Open the file.
FileSystem inputFS = FileSystem.get(filePath.toUri(), getConf());
FileSystem outputFS = FileSystem.get(outputDirectory.toUri(), getConf());
FSDataInputStream inputStream = inputFS.open(filePath);
// Use BSONSplitter to split the file.
Path splitFilePath = getSplitsFilePath(filePath, getConf());
try {
loadSplitsFromSplitFile(inputFS.getFileStatus(filePath), splitFilePath);
} catch (NoSplitFileException e) {
LOG.info("did not find .splits file in " + splitFilePath.toUri());
setInputPath(filePath);
readSplits();
}
List<BSONFileSplit> splits = getAllSplits();
LOG.info("compressing " + splits.size() + " splits.");
byte[] buf = new byte[1024 * 1024];
for (int i = 0; i < splits.size(); ++i) {
// e.g., hdfs:///user/hive/warehouse/mongo/OutputFile-42.bz2
Path splitOutputPath = new Path(outputDirectory, filePath.getName() + "-" + i + codec.getDefaultExtension());
// Compress the split into a new file.
compressor = CodecPool.getCompressor(codec);
CompressionOutputStream compressionOutputStream = null;
try {
compressionOutputStream = codec.createOutputStream(outputFS.create(splitOutputPath), compressor);
int totalBytes = 0, bytesRead = 0;
BSONFileSplit split = splits.get(i);
inputStream.seek(split.getStart());
LOG.info("writing " + splitOutputPath.toUri() + ".");
while (totalBytes < split.getLength() && bytesRead >= 0) {
bytesRead = inputStream.read(buf, 0, (int) Math.min(buf.length, split.getLength() - totalBytes));
if (bytesRead > 0) {
compressionOutputStream.write(buf, 0, bytesRead);
totalBytes += bytesRead;
}
}
} finally {
if (compressionOutputStream != null) {
compressionOutputStream.close();
}
CodecPool.returnCompressor(compressor);
}
}
LOG.info("done.");
return 0;
}
Aggregations