Use of org.apache.hadoop.io.compress.CompressionOutputStream in project hadoop by apache.
From the class TestZStandardCompressorDecompressor, method testCompressingWithOneByteOutputBuffer.
@Test
public void testCompressingWithOneByteOutputBuffer() throws Exception {
    int uncompressedSize = (int) FileUtils.sizeOf(uncompressedFile);
    byte[] bytes = FileUtils.readFileToByteArray(uncompressedFile);
    assertEquals(uncompressedSize, bytes.length);
    Configuration conf = new Configuration();
    ZStandardCodec codec = new ZStandardCodec();
    codec.setConf(conf);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    // Compressor with a deliberately tiny (one byte) output buffer to exercise buffer handling.
    Compressor compressor = new ZStandardCompressor(3, IO_FILE_BUFFER_SIZE_DEFAULT, 1);
    CompressionOutputStream outputStream = codec.createOutputStream(baos, compressor);
    for (byte aByte : bytes) {
        outputStream.write(aByte);
    }
    outputStream.finish();
    outputStream.close();
    assertEquals(uncompressedSize, compressor.getBytesRead());
    assertTrue(compressor.finished());
    // Just make sure we can decompress the file.
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    Decompressor decompressor = codec.createDecompressor();
    CompressionInputStream inputStream = codec.createInputStream(bais, decompressor);
    byte[] buffer = new byte[100];
    int n = buffer.length;
    while ((n = inputStream.read(buffer, 0, n)) != -1) {
        byteArrayOutputStream.write(buffer, 0, n);
    }
    assertArrayEquals(bytes, byteArrayOutputStream.toByteArray());
}
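Most of the examples on this page follow the same pattern: obtain a codec, wrap an OutputStream with createOutputStream(), write, then finish() and close(); decompression mirrors this with createInputStream(). A minimal, self-contained sketch of that round trip, using DefaultCodec so no native libraries are required (the class and method names below are illustrative, not taken from any of the projects on this page):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.DefaultCodec;

public class CodecRoundTrip {

    /** Compresses and then decompresses {@code input}, returning the restored bytes. */
    public static byte[] roundTrip(byte[] input) throws IOException {
        DefaultCodec codec = new DefaultCodec();
        codec.setConf(new Configuration());

        // Compress into an in-memory buffer.
        ByteArrayOutputStream compressed = new ByteArrayOutputStream();
        CompressionOutputStream cos = codec.createOutputStream(compressed);
        cos.write(input);
        cos.finish();
        cos.close();

        // Decompress and collect the result; copyBytes closes both streams.
        ByteArrayOutputStream restored = new ByteArrayOutputStream();
        CompressionInputStream cis =
                codec.createInputStream(new ByteArrayInputStream(compressed.toByteArray()));
        IOUtils.copyBytes(cis, restored, 4096, true);
        return restored.toByteArray();
    }
}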
Use of org.apache.hadoop.io.compress.CompressionOutputStream in project ignite by apache.
From the class HadoopSnappyTest, method checkSnappy.
/**
 * Internal check routine.
 *
 * @throws Throwable If failed.
 */
public static void checkSnappy() throws Throwable {
    try {
        byte[] expBytes = new byte[BYTE_SIZE];
        byte[] actualBytes = new byte[BYTE_SIZE];
        for (int i = 0; i < expBytes.length; i++)
            expBytes[i] = (byte) ThreadLocalRandom.current().nextInt(16);
        SnappyCodec codec = new SnappyCodec();
        codec.setConf(new Configuration());
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (CompressionOutputStream cos = codec.createOutputStream(baos)) {
            cos.write(expBytes);
            cos.flush();
        }
        try (CompressionInputStream cis = codec.createInputStream(new ByteArrayInputStream(baos.toByteArray()))) {
            int read = cis.read(actualBytes, 0, actualBytes.length);
            assert read == actualBytes.length;
        }
        assert Arrays.equals(expBytes, actualBytes);
    } catch (Throwable e) {
        System.out.println("Snappy check failed:");
        System.out.println("### NativeCodeLoader.isNativeCodeLoaded: " + NativeCodeLoader.isNativeCodeLoaded());
        System.out.println("### SnappyCompressor.isNativeCodeLoaded: " + SnappyCompressor.isNativeCodeLoaded());
        throw e;
    }
}
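The Snappy examples on this page require the Hadoop native library. A hedged sketch of guarding against its absence, using only the checks that appear in the test above and falling back to DefaultCodec (the helper class is illustrative; verify that these static methods exist in your Hadoop version):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.io.compress.snappy.SnappyCompressor;
import org.apache.hadoop.util.NativeCodeLoader;
import org.apache.hadoop.util.ReflectionUtils;

public class SnappyOrFallback {

    /** Returns a SnappyCodec when native Snappy is available, otherwise a DefaultCodec. */
    public static CompressionCodec pickCodec(Configuration conf) {
        Class<? extends CompressionCodec> codecClass =
                NativeCodeLoader.isNativeCodeLoaded() && SnappyCompressor.isNativeCodeLoaded()
                        ? SnappyCodec.class
                        : DefaultCodec.class;
        // ReflectionUtils also injects the Configuration into the codec instance.
        return ReflectionUtils.newInstance(codecClass, conf);
    }
}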
Use of org.apache.hadoop.io.compress.CompressionOutputStream in project mongo-hadoop by mongodb.
From the class BSONSplitter, method run.
/**
 * When run as a Tool, BSONSplitter can be used to pre-split and compress
 * BSON files. This can be especially useful before uploading large BSON
 * files to HDFS to save time. The compressed splits are written to the
 * given output path or to the directory containing the input file, if
 * the output path is unspecified. A ".splits" file is not generated, since
 * each output file is expected to be its own split.
 *
 * @param args command-line arguments. Run with zero arguments to see usage.
 * @return exit status
 * @throws Exception
 */
@Override
public int run(final String[] args) throws Exception {
    if (args.length < 1) {
        printUsage();
        return 1;
    }
    // Parse command-line arguments.
    Path filePath = new Path(args[0]);
    String compressorName = null, outputDirectoryStr = null;
    Path outputDirectory;
    CompressionCodec codec;
    Compressor compressor;
    for (int i = 1; i < args.length; ++i) {
        if ("-c".equals(args[i]) && args.length > i + 1) {
            compressorName = args[++i];
        } else if ("-o".equals(args[i]) && args.length > i + 1) {
            outputDirectoryStr = args[++i];
        } else {
            // CHECKSTYLE:OFF
            System.err.println("unrecognized option: " + args[i]);
            // CHECKSTYLE:ON
            printUsage();
            return 1;
        }
    }
    // Supply default values for unspecified arguments.
    if (null == outputDirectoryStr) {
        outputDirectory = filePath.getParent();
    } else {
        outputDirectory = new Path(outputDirectoryStr);
    }
    if (null == compressorName) {
        codec = new DefaultCodec();
    } else {
        Class<?> codecClass = Class.forName(compressorName);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, getConf());
    }
    if (codec instanceof Configurable) {
        ((Configurable) codec).setConf(getConf());
    }
    // Do not write a .splits file so as not to confuse BSONSplitter.
    // Each compressed file will be its own split.
    MongoConfigUtil.setBSONWriteSplits(getConf(), false);
    // Open the file.
    FileSystem inputFS = FileSystem.get(filePath.toUri(), getConf());
    FileSystem outputFS = FileSystem.get(outputDirectory.toUri(), getConf());
    FSDataInputStream inputStream = inputFS.open(filePath);
    // Use BSONSplitter to split the file.
    Path splitFilePath = getSplitsFilePath(filePath, getConf());
    try {
        loadSplitsFromSplitFile(inputFS.getFileStatus(filePath), splitFilePath);
    } catch (NoSplitFileException e) {
        LOG.info("did not find .splits file in " + splitFilePath.toUri());
        setInputPath(filePath);
        readSplits();
    }
    List<BSONFileSplit> splits = getAllSplits();
    LOG.info("compressing " + splits.size() + " splits.");
    byte[] buf = new byte[1024 * 1024];
    for (int i = 0; i < splits.size(); ++i) {
        // e.g., hdfs:///user/hive/warehouse/mongo/OutputFile-42.bz2
        Path splitOutputPath = new Path(outputDirectory, filePath.getName() + "-" + i + codec.getDefaultExtension());
        // Compress the split into a new file.
        compressor = CodecPool.getCompressor(codec);
        CompressionOutputStream compressionOutputStream = null;
        try {
            compressionOutputStream = codec.createOutputStream(outputFS.create(splitOutputPath), compressor);
            int totalBytes = 0, bytesRead = 0;
            BSONFileSplit split = splits.get(i);
            inputStream.seek(split.getStart());
            LOG.info("writing " + splitOutputPath.toUri() + ".");
            while (totalBytes < split.getLength() && bytesRead >= 0) {
                bytesRead = inputStream.read(buf, 0, (int) Math.min(buf.length, split.getLength() - totalBytes));
                if (bytesRead > 0) {
                    compressionOutputStream.write(buf, 0, bytesRead);
                    totalBytes += bytesRead;
                }
            }
        } finally {
            if (compressionOutputStream != null) {
                compressionOutputStream.close();
            }
            CodecPool.returnCompressor(compressor);
        }
    }
    LOG.info("done.");
    return 0;
}
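The Javadoc above describes running BSONSplitter as a Hadoop Tool. A hedged sketch of invoking it through ToolRunner, with the argument layout taken from the parsing logic in run(); the BSONSplitter import, the paths, and the codec choice below are assumptions for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

import com.mongodb.hadoop.splitter.BSONSplitter; // package assumed; adjust to your mongo-hadoop version

public class SplitAndCompressBson {

    public static void main(String[] args) throws Exception {
        // Argument layout mirrors run() above:
        // <input BSON file> [-c <CompressionCodec class name>] [-o <output directory>]
        String[] toolArgs = {
            "hdfs:///data/dump.bson",                         // hypothetical input path
            "-c", "org.apache.hadoop.io.compress.BZip2Codec",
            "-o", "hdfs:///data/dump-splits"                  // hypothetical output directory
        };
        int exitCode = ToolRunner.run(new Configuration(), new BSONSplitter(), toolArgs);
        System.exit(exitCode);
    }
}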
Use of org.apache.hadoop.io.compress.CompressionOutputStream in project hbase by apache.
From the class DataBlockEncodingTool, method benchmarkAlgorithm.
/**
 * Benchmark the compression and decompression performance of a given
 * algorithm and print the results.
 * @param algorithm Compression algorithm.
 * @param name Name of the algorithm.
 * @param buffer Buffer to be compressed.
 * @param offset Position of the beginning of the data.
 * @param length Length of the data in the buffer.
 * @throws IOException
 */
public void benchmarkAlgorithm(Compression.Algorithm algorithm, String name, byte[] buffer, int offset, int length) throws IOException {
    System.out.println(name + ":");
    // Compress it several times and measure performance.
    List<Long> compressDurations = new ArrayList<>();
    ByteArrayOutputStream compressedStream = new ByteArrayOutputStream();
    CompressionOutputStream compressingStream = algorithm.createPlainCompressionStream(compressedStream, compressor);
    try {
        for (int itTime = 0; itTime < benchmarkNTimes; ++itTime) {
            final long startTime = System.nanoTime();
            // compressedStream must be reset before compressingStream.resetState(), since for GZ
            // resetState() writes a header to the output stream.
            compressedStream.reset();
            compressingStream.resetState();
            compressingStream.write(buffer, offset, length);
            compressingStream.flush();
            compressedStream.toByteArray();
            final long finishTime = System.nanoTime();
            // Add time record.
            if (itTime >= benchmarkNOmit) {
                compressDurations.add(finishTime - startTime);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(String.format("Benchmark, or encoding algorithm '%s' cause some stream problems", name), e);
    }
    compressingStream.close();
    printBenchmarkResult(length, compressDurations, Manipulation.COMPRESSION);
    byte[] compBuffer = compressedStream.toByteArray();
    // Uncompress it several times and measure performance.
    List<Long> durations = new ArrayList<>();
    for (int itTime = 0; itTime < benchmarkNTimes; ++itTime) {
        final long startTime = System.nanoTime();
        byte[] newBuf = new byte[length + 1];
        try {
            ByteArrayInputStream downStream = new ByteArrayInputStream(compBuffer, 0, compBuffer.length);
            InputStream decompressedStream = algorithm.createDecompressionStream(downStream, decompressor, 0);
            int destOffset = 0;
            int nextChunk;
            while ((nextChunk = decompressedStream.available()) > 0) {
                destOffset += decompressedStream.read(newBuf, destOffset, nextChunk);
            }
            decompressedStream.close();
        } catch (IOException e) {
            throw new RuntimeException(String.format("Decoding path in '%s' algorithm cause exception ", name), e);
        }
        final long finishTime = System.nanoTime();
        // Check correctness.
        if (0 != Bytes.compareTo(buffer, 0, length, newBuf, 0, length)) {
            int prefix = 0;
            for (; prefix < buffer.length && prefix < newBuf.length; ++prefix) {
                if (buffer[prefix] != newBuf[prefix]) {
                    break;
                }
            }
            throw new RuntimeException(String.format("Algorithm '%s' is corrupting the data", name));
        }
        // Add time record.
        if (itTime >= benchmarkNOmit) {
            durations.add(finishTime - startTime);
        }
    }
    printBenchmarkResult(length, durations, Manipulation.DECOMPRESSION);
    System.out.println();
}
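Note that the decompression loop above sizes each read from available(), which the streams used here happen to support but the general InputStream contract does not guarantee. A small sketch of the equivalent read-until-EOF loop, which relies only on read() returning -1 (the helper class is illustrative):

import java.io.IOException;
import java.io.InputStream;

public final class StreamDrain {

    /** Reads from the stream into {@code dest} until EOF or the array is full; returns bytes copied. */
    public static int drainTo(InputStream in, byte[] dest) throws IOException {
        int total = 0;
        int n;
        while (total < dest.length && (n = in.read(dest, total, dest.length - total)) != -1) {
            total += n;
        }
        return total;
    }
}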
Use of org.apache.hadoop.io.compress.CompressionOutputStream in project brisk by riptano.
From the class CompressionTests, method testSnappyCompression.
@Test
public void testSnappyCompression() throws IOException {
    SnappyCodec c = new SnappyCodec(new Configuration());
    byte[] inmsg = new byte[1024 * 1024 * 10];
    fillArray(inmsg);
    byte[] buffer = new byte[1024 * 1024];
    byte[] outmsg = new byte[1024 * 1024 * 16];
    for (int k = 0; k < 64; k++) {
        ByteArrayOutputStream bout = new ByteArrayOutputStream();
        CompressionOutputStream cout = c.createOutputStream(bout);
        cout.write(inmsg);
        cout.flush();
        ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray());
        CompressionInputStream cin = c.createInputStream(bin);
        int totaln = 0;
        while (cin.available() > 0) {
            int n = cin.read(buffer);
            if (n < 0) {
                break;
            }
            try {
                System.arraycopy(buffer, 0, outmsg, totaln, n);
            } catch (Throwable t) {
                System.err.println("n = " + n + " totaln " + totaln);
                throw new RuntimeException(t);
            }
            totaln += n;
        }
        assertEquals(inmsg.length, totaln);
        for (int i = 0; i < inmsg.length; i++) {
            assertEquals(inmsg[i], outmsg[i]);
        }
        assertEquals(new String(inmsg), new String(outmsg, 0, totaln));
    }
}