Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.
From the class TestDictionary, method testLongDictionary:
@Test
public void testLongDictionary() throws IOException {
  int COUNT = 1000;
  int COUNT2 = 2000;
  final FallbackValuesWriter<PlainLongDictionaryValuesWriter, PlainValuesWriter> cw =
      newPlainLongDictionaryValuesWriter(10000, 10000);
  // first page: COUNT values cycling through 50 distinct longs
  for (long i = 0; i < COUNT; i++) {
    cw.writeLong(i % 50);
  }
  BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
  assertEquals(50, cw.initialWriter.getDictionarySize());
  // second page: the same 50 distinct values, written in reverse order
  for (long i = COUNT2; i > 0; i--) {
    cw.writeLong(i % 50);
  }
  BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
  assertEquals(50, cw.initialWriter.getDictionarySize());
  // replay both pages through a dictionary reader and verify every value
  DictionaryValuesReader cr = initDicReader(cw, PrimitiveTypeName.INT64);
  cr.initFromPage(COUNT, bytes1.toInputStream());
  for (long i = 0; i < COUNT; i++) {
    long back = cr.readLong();
    assertEquals(i % 50, back);
  }
  cr.initFromPage(COUNT2, bytes2.toInputStream());
  for (long i = COUNT2; i > 0; i--) {
    long back = cr.readLong();
    assertEquals(i % 50, back);
  }
}
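The core BytesInput pattern the test relies on is wrap-and-replay: the writer's page bytes are captured as a BytesInput, then fed back to a reader via toInputStream(). A minimal sketch of that pattern, using only BytesInput.from, size(), and toInputStream() as seen above (the class name is just for illustration):

import java.io.IOException;
import java.io.InputStream;
import org.apache.parquet.bytes.BytesInput;

public class BytesInputRoundTrip {
  public static void main(String[] args) throws IOException {
    byte[] page = {1, 2, 3, 4, 5};
    // wrap an existing byte array without copying it
    BytesInput in = BytesInput.from(page);
    System.out.println("size = " + in.size()); // 5
    // replay the bytes as a stream, as the dictionary reader does above
    try (InputStream stream = in.toInputStream()) {
      int b;
      while ((b = stream.read()) >= 0) {
        System.out.print(b + " ");
      }
    }
  }
}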
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.
From the class TestDictionary, method testSecondPageFallBack:
@Test
public void testSecondPageFallBack() throws IOException {
  int COUNT = 1000;
  ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(1000, 10000);
  // first page: repeated values, so dictionary encoding pays off
  writeRepeated(COUNT, cw, "a");
  BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
  writeDistinct(COUNT, cw, "b");
  // all-distinct values make the dictionary inefficient, so the writer falls back
  BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN);
  writeRepeated(COUNT, cw, "a");
  // still plain because we fell back on the previous page
  BytesInput bytes3 = getBytesAndCheckEncoding(cw, PLAIN);
  // the first page is read with a dictionary reader, the later ones with a plain reader
  ValuesReader cr = initDicReader(cw, BINARY);
  checkRepeated(COUNT, bytes1, cr, "a");
  cr = new BinaryPlainValuesReader();
  checkDistinct(COUNT, bytes2, cr, "b");
  checkRepeated(COUNT, bytes3, cr, "a");
}
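Note that the test keeps bytes1 usable while the writer keeps writing. A BytesInput is normally a short-lived view over the writer's internal buffers, so a helper like getBytesAndCheckEncoding can only make that safe by materializing the view first. A minimal sketch of that materialization, assuming BytesInput.copy from the parquet-mr API, which reads a view once and backs it with its own array:

import java.io.IOException;
import org.apache.parquet.bytes.BytesInput;

public class MaterializeBytesInput {
  public static void main(String[] args) throws IOException {
    BytesInput view = BytesInput.from(new byte[] {10, 20, 30});
    // copy() consumes the view and returns a BytesInput backed by its own array,
    // so it stays valid even if the original buffers are later reused
    BytesInput stable = BytesInput.copy(view);
    System.out.println("copied size = " + stable.size());
    System.out.println("first byte = " + stable.toByteArray()[0]);
  }
}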
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.
From the class TestDirectCodecFactory, method test:
private void test(int size, CompressionCodecName codec, boolean useOnHeapCompression, Decompression decomp) {
  ByteBuffer rawBuf = null;
  ByteBuffer outBuf = null;
  ByteBufferAllocator allocator = null;
  try {
    allocator = new DirectByteBufferAllocator();
    final CodecFactory codecFactory = CodecFactory.createDirectCodecFactory(new Configuration(), allocator, pageSize);
    rawBuf = allocator.allocate(size);
    final byte[] rawArr = new byte[size];
    outBuf = allocator.allocate(size * 2);
    // fill the direct buffer and a matching on-heap array with the same random data
    final Random r = new Random();
    final byte[] random = new byte[1024];
    int pos = 0;
    while (pos < size) {
      r.nextBytes(random);
      rawBuf.put(random);
      System.arraycopy(random, 0, rawArr, pos, random.length);
      pos += random.length;
    }
    rawBuf.flip();
    final DirectCodecFactory.BytesCompressor c = codecFactory.getCompressor(codec);
    final CodecFactory.BytesDecompressor d = codecFactory.getDecompressor(codec);
    final BytesInput compressed;
    if (useOnHeapCompression) {
      compressed = c.compress(BytesInput.from(rawArr));
    } else {
      compressed = c.compress(BytesInput.from(rawBuf));
    }
    switch (decomp) {
      case OFF_HEAP: {
        // decompress from a direct buffer into a direct buffer
        final ByteBuffer buf = compressed.toByteBuffer();
        final ByteBuffer b = allocator.allocate(buf.capacity());
        try {
          b.put(buf);
          b.flip();
          d.decompress(b, (int) compressed.size(), outBuf, size);
          for (int i = 0; i < size; i++) {
            Assert.assertTrue("Data didn't match at " + i, outBuf.get(i) == rawBuf.get(i));
          }
        } finally {
          allocator.release(b);
        }
        break;
      }
      case OFF_HEAP_BYTES_INPUT: {
        // decompress a BytesInput wrapping a direct buffer
        final ByteBuffer buf = compressed.toByteBuffer();
        final ByteBuffer b = allocator.allocate(buf.limit());
        try {
          b.put(buf);
          b.flip();
          final BytesInput input = d.decompress(BytesInput.from(b), size);
          Assert.assertArrayEquals(String.format("While testing codec %s", codec), input.toByteArray(), rawArr);
        } finally {
          allocator.release(b);
        }
        break;
      }
      case ON_HEAP: {
        // decompress a BytesInput wrapping an on-heap array
        final byte[] buf = compressed.toByteArray();
        final BytesInput input = d.decompress(BytesInput.from(buf), size);
        Assert.assertArrayEquals(input.toByteArray(), rawArr);
        break;
      }
    }
  } catch (Exception e) {
    final String msg = String.format(
        "Failure while testing Codec: %s, OnHeapCompressionInput: %s, Decompression Mode: %s, Data Size: %d",
        codec.name(), useOnHeapCompression, decomp.name(), size);
    System.out.println(msg);
    throw new RuntimeException(msg, e);
  } finally {
    if (rawBuf != null) {
      allocator.release(rawBuf);
    }
    if (outBuf != null) {
      allocator.release(outBuf);
    }
  }
}
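Stripped of the three decompression modes, the round trip the test exercises is: compress a BytesInput, then decompress it back to the expected uncompressed size. A minimal sketch of the on-heap path under stated assumptions (SNAPPY support available on the classpath; the 64 KiB page size and class name are arbitrary choices here):

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.bytes.DirectByteBufferAllocator;
import org.apache.parquet.hadoop.CodecFactory;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecRoundTrip {
  public static void main(String[] args) throws Exception {
    byte[] raw = new byte[4096]; // all zeros, so any codec compresses well
    CodecFactory factory = CodecFactory.createDirectCodecFactory(
        new Configuration(), new DirectByteBufferAllocator(), 64 * 1024);
    CodecFactory.BytesCompressor c = factory.getCompressor(CompressionCodecName.SNAPPY);
    CodecFactory.BytesDecompressor d = factory.getDecompressor(CompressionCodecName.SNAPPY);
    BytesInput compressed = c.compress(BytesInput.from(raw));
    // a BytesInput is single-use: materialize the compressed bytes before re-reading,
    // and tell the decompressor the expected uncompressed size
    BytesInput restored = d.decompress(BytesInput.from(compressed.toByteArray()), raw.length);
    System.out.println("restored " + restored.size() + " of " + raw.length + " bytes");
  }
}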
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.
From the class ColumnWriterV2, method writePage:
/**
 * Writes the current data to a new page in the page store.
 * @param rowCount how many rows have been written so far
 */
public void writePage(long rowCount) {
  int pageRowCount = Ints.checkedCast(rowCount - rowsWrittenSoFar);
  this.rowsWrittenSoFar = rowCount;
  if (DEBUG)
    LOG.debug("write page");
  try {
    // TODO: rework this API. These must be called in this order: getBytes() before getEncoding()
    BytesInput bytes = dataColumn.getBytes();
    Encoding encoding = dataColumn.getEncoding();
    pageWriter.writePageV2(
        pageRowCount,
        Ints.checkedCast(statistics.getNumNulls()),
        valueCount,
        path.getMaxRepetitionLevel() == 0 ? BytesInput.empty() : repetitionLevelColumn.toBytes(),
        path.getMaxDefinitionLevel() == 0 ? BytesInput.empty() : definitionLevelColumn.toBytes(),
        encoding,
        bytes,
        statistics);
  } catch (IOException e) {
    throw new ParquetEncodingException("could not write page for " + path, e);
  }
  // the page has been flushed: reset all per-page state
  repetitionLevelColumn.reset();
  definitionLevelColumn.reset();
  dataColumn.reset();
  valueCount = 0;
  resetStatistics();
}
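One detail worth noting: when the column has no repetition or definition levels (max level 0), the method passes BytesInput.empty() rather than null, so downstream size accounting and concatenation still work. A minimal sketch of empty() and concat(), both part of the BytesInput API (the class name is just for illustration):

import java.io.IOException;
import org.apache.parquet.bytes.BytesInput;

public class EmptyAndConcat {
  public static void main(String[] args) throws IOException {
    BytesInput levels = BytesInput.empty(); // zero-length placeholder, never null
    BytesInput data = BytesInput.from(new byte[] {7, 8, 9});
    // concat lets a page be assembled from level bytes plus data bytes without copying
    BytesInput page = BytesInput.concat(levels, data);
    System.out.println("page size = " + page.size()); // 3: empty() contributes nothing
  }
}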