use of org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter in project parquet-mr by apache.
the class BenchmarkDeltaByteArray method benchmarkRandomStringsWithDeltaLengthByteArrayValuesWriter.
/**
 * Benchmarks one write/read round trip of {@code values} (declared elsewhere in the
 * class) through a {@code DeltaByteArrayWriter} and a {@code DeltaByteArrayReader},
 * then prints the input stream position reached after reading.
 *
 * @throws IOException declared for the writer/reader calls used below
 */
@BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 4)
@Test
public void benchmarkRandomStringsWithDeltaLengthByteArrayValuesWriter() throws IOException {
  // Both numeric constructor arguments are 64 KiB in this benchmark.
  final int sixtyFourKiB = 64 * 1024;
  DeltaByteArrayWriter deltaWriter =
      new DeltaByteArrayWriter(sixtyFourKiB, sixtyFourKiB, new DirectByteBufferAllocator());
  DeltaByteArrayReader deltaReader = new DeltaByteArrayReader();

  Utils.writeData(deltaWriter, values);
  ByteBufferInputStream encoded = deltaWriter.getBytes().toInputStream();

  // The decoded values are not inspected; only the round trip itself is being timed.
  Binary[] decoded = Utils.readData(deltaReader, encoded, values.length);

  System.out.println("size " + encoded.position());
}
use of org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter in project parquet-mr by apache.
the class TestCorruptDeltaByteArrays method testOldReassemblyWithoutCorruption.
/**
 * Writes two pages of ten strings each with a single writer (reset between pages) and
 * checks that each page decodes to the expected strings when read by its own reader.
 * The second reader is deliberately NOT linked to the first via {@code setPreviousReader}.
 *
 * @throws Exception if writing or reading a page fails
 */
@Test
public void testOldReassemblyWithoutCorruption() throws Exception {
  DeltaByteArrayWriter writer = getDeltaByteArrayWriter();

  for (int i = 0; i < 10; i += 1) {
    writer.writeBytes(Binary.fromString(str(i)));
  }
  ByteBuffer firstPageBytes = writer.getBytes().toByteBuffer();

  // sets previous to new byte[0]
  writer.reset();

  for (int i = 10; i < 20; i += 1) {
    writer.writeBytes(Binary.fromString(str(i)));
  }
  ByteBuffer secondPageBytes = writer.getBytes().toByteBuffer();

  DeltaByteArrayReader firstPageReader = new DeltaByteArrayReader();
  firstPageReader.initFromPage(10, ByteBufferInputStream.wrap(firstPageBytes));
  for (int i = 0; i < 10; i += 1) {
    // JUnit's contract is (expected, actual); keeping that order makes failure messages accurate.
    assertEquals(str(i), firstPageReader.readBytes().toStringUsingUTF8());
  }

  DeltaByteArrayReader secondPageReader = new DeltaByteArrayReader();
  secondPageReader.initFromPage(10, ByteBufferInputStream.wrap(secondPageBytes));
  for (int i = 10; i < 20; i += 1) {
    assertEquals(str(i), secondPageReader.readBytes().toStringUsingUTF8());
  }
}
use of org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter in project parquet-mr by apache.
the class TestCorruptDeltaByteArrays method testColumnReaderImplWithCorruptPage.
/**
 * Writes two ten-value pages into an in-memory page store, calling {@code corruptWriter}
 * on the delta writer before the second page, and checks that a {@code ColumnReaderImpl}
 * constructed with a parsed version of "parquet-mr" 1.6.0 still hands all twenty strings
 * to the converter in the order they were written.
 *
 * @throws Exception if writing or reading a page fails
 */
@Test
public void testColumnReaderImplWithCorruptPage() throws Exception {
  ColumnDescriptor column = new ColumnDescriptor(new String[] { "s" }, PrimitiveType.PrimitiveTypeName.BINARY, 0, 0);
  MemPageStore pageStore = new MemPageStore(0);
  PageWriter pageWriter = pageStore.getPageWriter(column);
  ParquetProperties props = ParquetProperties.builder().withDictionaryEncoding(false).build();

  // One run of ten zero levels, used below as both the repetition and definition level bytes.
  ValuesWriter levelWriter = props.newDefinitionLevelWriter(column);
  for (int n = 0; n < 10; n++) {
    levelWriter.writeInteger(0);
  }
  // Backed by a byte array because the same BytesInput is reused for every page.
  BytesInput levels = BytesInput.from(levelWriter.getBytes().toByteArray());

  DeltaByteArrayWriter deltaWriter = getDeltaByteArrayWriter();
  String lastOfFirstPage = null;
  List<String> expected = new ArrayList<String>();

  // First page: values 0..9.
  for (int n = 0; n < 10; n++) {
    String current = str(n);
    lastOfFirstPage = current;
    deltaWriter.writeBytes(Binary.fromString(current));
    expected.add(current);
  }
  BytesInput firstPage = BytesInput.concat(levels, levels, deltaWriter.getBytes());
  pageWriter.writePage(
      firstPage,
      10, /* number of values in the page */
      new BinaryStatistics(),
      levelWriter.getEncoding(),
      levelWriter.getEncoding(),
      deltaWriter.getEncoding());
  pageStore.addRowCount(10);

  // sets previous to new byte[0]
  deltaWriter.reset();
  corruptWriter(deltaWriter, lastOfFirstPage);

  // Second page: values 10..19, written after the writer was corrupted.
  for (int n = 10; n < 20; n++) {
    String current = str(n);
    deltaWriter.writeBytes(Binary.fromString(current));
    expected.add(current);
  }
  BytesInput secondPage = BytesInput.concat(levels, levels, deltaWriter.getBytes());
  pageWriter.writePage(
      secondPage,
      10, /* number of values in the page */
      new BinaryStatistics(),
      levelWriter.getEncoding(),
      levelWriter.getEncoding(),
      deltaWriter.getEncoding());
  pageStore.addRowCount(10);

  // Collect every binary value the column reader pushes to the converter.
  final List<String> collected = new ArrayList<String>();
  PrimitiveConverter collector = new PrimitiveConverter() {
    @Override
    public void addBinary(Binary value) {
      collected.add(value.toStringUsingUTF8());
    }
  };

  ParsedVersion writerVersion = new ParsedVersion("parquet-mr", "1.6.0", "abcd");
  ColumnReaderImpl columnReader =
      new ColumnReaderImpl(column, pageStore.getPageReader(column), collector, writerVersion);

  while (collected.size() < columnReader.getTotalValueCount()) {
    columnReader.writeCurrentValueToConverter();
    columnReader.consume();
  }

  Assert.assertEquals(expected, collected);
}
use of org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter in project parquet-mr by apache.
the class TestCorruptDeltaByteArrays method testReassemblyWithoutCorruption.
/**
 * Writes two pages of ten strings each with a single writer (reset between pages) and
 * checks that both pages decode to the expected strings when the second page's reader is
 * linked to the first via {@code setPreviousReader}.
 *
 * @throws Exception if writing or reading a page fails
 */
@Test
public void testReassemblyWithoutCorruption() throws Exception {
  DeltaByteArrayWriter writer = getDeltaByteArrayWriter();

  for (int i = 0; i < 10; i += 1) {
    writer.writeBytes(Binary.fromString(str(i)));
  }
  ByteBuffer firstPageBytes = writer.getBytes().toByteBuffer();

  // sets previous to new byte[0]
  writer.reset();

  for (int i = 10; i < 20; i += 1) {
    writer.writeBytes(Binary.fromString(str(i)));
  }
  ByteBuffer secondPageBytes = writer.getBytes().toByteBuffer();

  DeltaByteArrayReader firstPageReader = new DeltaByteArrayReader();
  firstPageReader.initFromPage(10, ByteBufferInputStream.wrap(firstPageBytes));
  for (int i = 0; i < 10; i += 1) {
    // JUnit's contract is (expected, actual); keeping that order makes failure messages accurate.
    assertEquals(str(i), firstPageReader.readBytes().toStringUsingUTF8());
  }

  DeltaByteArrayReader secondPageReader = new DeltaByteArrayReader();
  secondPageReader.initFromPage(10, ByteBufferInputStream.wrap(secondPageBytes));
  // Link the readers only after the first page has been fully consumed above.
  secondPageReader.setPreviousReader(firstPageReader);
  for (int i = 10; i < 20; i += 1) {
    assertEquals(str(i), secondPageReader.readBytes().toStringUsingUTF8());
  }
}
use of org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter in project parquet-mr by apache.
the class TestCorruptDeltaByteArrays method testReassemblyWithCorruptPage.
/**
 * Writes a normal first page, then calls {@code corruptWriter} on the writer before
 * producing the second page, and checks two things about that second page:
 * <ul>
 *   <li>a standalone reader throws {@link ArrayIndexOutOfBoundsException} on its first read;</li>
 *   <li>a reader linked to the first page's reader via {@code setPreviousReader} decodes
 *       the expected strings.</li>
 * </ul>
 *
 * @throws Exception if writing or reading a page fails unexpectedly
 */
@Test
public void testReassemblyWithCorruptPage() throws Exception {
  DeltaByteArrayWriter writer = getDeltaByteArrayWriter();

  String lastValue = null;
  for (int i = 0; i < 10; i += 1) {
    lastValue = str(i);
    writer.writeBytes(Binary.fromString(lastValue));
  }
  ByteBuffer firstPageBytes = writer.getBytes().toByteBuffer();

  // sets previous to new byte[0]
  writer.reset();
  corruptWriter(writer, lastValue);

  for (int i = 10; i < 20; i += 1) {
    writer.writeBytes(Binary.fromString(str(i)));
  }
  ByteBuffer corruptPageBytes = writer.getBytes().toByteBuffer();

  DeltaByteArrayReader firstPageReader = new DeltaByteArrayReader();
  firstPageReader.initFromPage(10, ByteBufferInputStream.wrap(firstPageBytes));
  for (int i = 0; i < 10; i += 1) {
    assertEquals(str(i), firstPageReader.readBytes().toStringUsingUTF8());
  }

  // Without a previous reader, the corrupt page cannot be decoded.
  DeltaByteArrayReader corruptPageReader = new DeltaByteArrayReader();
  corruptPageReader.initFromPage(10, ByteBufferInputStream.wrap(corruptPageBytes));
  try {
    corruptPageReader.readBytes();
    fail("Corrupt page did not throw an exception when read");
  } catch (ArrayIndexOutOfBoundsException e) {
    // expected, this is a corrupt page
  }

  // With the first page's reader linked in, the same bytes decode to the expected values.
  DeltaByteArrayReader secondPageReader = new DeltaByteArrayReader();
  secondPageReader.initFromPage(10, ByteBufferInputStream.wrap(corruptPageBytes));
  secondPageReader.setPreviousReader(firstPageReader);
  for (int i = 10; i < 20; i += 1) {
    // Same (expected, actual) order as the first-page assertions above.
    assertEquals(str(i), secondPageReader.readBytes().toStringUsingUTF8());
  }
}
Aggregations