Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache: class TestColumnChunkPageWriteStore, method test(). The test writes a single DataPageV2 through a ColumnChunkPageWriteStore and reads it back, asserting that every field of the page round-trips.
@Test
public void test() throws Exception {
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;
  {
    // write phase: emit a single v2 data page through the page write store
    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
    writer.start();
    writer.startBlock(rowCount);
    {
      ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator());
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics);
      store.flushToFileWriter(writer);
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }
  {
    // read phase: reopen the file and verify the page round-trips
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
    PageReadStore rowGroup = reader.readNextRowGroup();
    PageReader pageReader = rowGroup.getPageReader(col);
    DataPageV2 page = (DataPageV2) pageReader.readPage();
    assertEquals(rowCount, page.getRowCount());
    assertEquals(nullCount, page.getNullCount());
    assertEquals(valueCount, page.getValueCount());
    assertEquals(d, intValue(page.getDefinitionLevels()));
    assertEquals(r, intValue(page.getRepetitionLevels()));
    assertEquals(dataEncoding, page.getDataEncoding());
    assertEquals(v, intValue(page.getData()));
    assertEquals(statistics.toString(), page.getStatistics().toString());
    reader.close();
  }
}
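The assertions rely on an intValue helper defined elsewhere in the test class. A minimal sketch of what such a helper could look like, assuming BytesInput.fromInt stores its value little-endian (as Parquet does elsewhere) and using java.io.ByteArrayInputStream plus org.apache.parquet.bytes.BytesUtils; the project's actual helper may differ:

// Hypothetical helper: materialize the BytesInput and decode the
// 4-byte little-endian int that BytesInput.fromInt(v) wrote.
private static int intValue(BytesInput in) throws IOException {
  return BytesUtils.readIntLittleEndian(new ByteArrayInputStream(in.toByteArray()));
}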
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache: class DeltaBinaryPackingValuesWriterForIntegerTest, method shouldConsumePageDataInInitialization(). The test embeds the encoded values inside a larger page buffer and checks that initFromPage consumes exactly the value bytes.
@Test
public void shouldConsumePageDataInInitialization() throws IOException {
  int[] data = new int[2 * blockSize + 3];
  for (int i = 0; i < data.length; i++) {
    data[i] = i * 32;
  }
  writeData(data);
  reader = new DeltaBinaryPackingValuesReader();
  BytesInput bytes = writer.getBytes();
  byte[] valueContent = bytes.toByteArray();
  byte[] pageContent = new byte[valueContent.length * 10];
  int contentOffsetInPage = 33;
  System.arraycopy(valueContent, 0, pageContent, contentOffsetInPage, valueContent.length);
  // offset should be correct
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.wrap(pageContent));
  stream.skipFully(contentOffsetInPage);
  reader.initFromPage(100, stream);
  long offset = stream.position();
  assertEquals(valueContent.length + contentOffsetInPage, offset);
  // should be able to read data correctly
  for (int i : data) {
    assertEquals(i, reader.readInteger());
  }
}
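The snippet leans on the test class's writer field and its writeData fixture, neither of which is shown. A hedged sketch of that fixture, assuming writer is the delta-binary-packing ValuesWriter under test:

// Hypothetical fixture: push every value through the values writer
private void writeData(int[] data) {
  for (int value : data) {
    writer.writeInteger(value);
  }
}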
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache: class DeltaBinaryPackingValuesWriterForLongTest, method shouldReturnCorrectOffsetAfterInitialization(). Same scenario as the integer variant above, but for long values.
@Test
public void shouldReturnCorrectOffsetAfterInitialization() throws IOException {
  long[] data = new long[2 * blockSize + 3];
  for (int i = 0; i < data.length; i++) {
    data[i] = i * 32;
  }
  writeData(data);
  reader = new DeltaBinaryPackingValuesReader();
  BytesInput bytes = writer.getBytes();
  byte[] valueContent = bytes.toByteArray();
  byte[] pageContent = new byte[valueContent.length * 10];
  int contentOffsetInPage = 33;
  System.arraycopy(valueContent, 0, pageContent, contentOffsetInPage, valueContent.length);
  // offset should be correct
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.wrap(pageContent));
  stream.skipFully(contentOffsetInPage);
  reader.initFromPage(100, stream);
  long offset = stream.position();
  assertEquals(valueContent.length + contentOffsetInPage, offset);
  // should be able to read data correctly
  for (long i : data) {
    assertEquals(i, reader.readLong());
  }
}
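Both delta-packing tests hinge on ByteBufferInputStream.position() counting every byte consumed, including the bytes skipped before initFromPage. A self-contained illustration of that bookkeeping (the class name PositionDemo is made up for this example):

import java.nio.ByteBuffer;

import org.apache.parquet.bytes.ByteBufferInputStream;

public class PositionDemo {
  public static void main(String[] args) throws Exception {
    ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.wrap(new byte[64]));
    stream.skipFully(33);
    // position() reflects skipped bytes too, which is why the tests can
    // assert it equals contentOffsetInPage + valueContent.length
    System.out.println(stream.position()); // prints 33
  }
}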
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache: class ParquetFileReader, method readCompressedDictionary(). Reads a dictionary page's compressed bytes from the input stream and wraps them in a BytesInput without decompressing.
private DictionaryPage readCompressedDictionary(PageHeader pageHeader, SeekableInputStream fin) throws IOException {
  DictionaryPageHeader dictHeader = pageHeader.getDictionary_page_header();
  int uncompressedPageSize = pageHeader.getUncompressed_page_size();
  int compressedPageSize = pageHeader.getCompressed_page_size();
  byte[] dictPageBytes = new byte[compressedPageSize];
  fin.readFully(dictPageBytes);
  BytesInput bin = BytesInput.from(dictPageBytes);
  return new DictionaryPage(bin, uncompressedPageSize, dictHeader.getNum_values(), converter.getEncoding(dictHeader.getEncoding()));
}
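Note that readCompressedDictionary returns the dictionary page still compressed; a caller must inflate it before decoding. A hedged sketch of that step, assuming a BytesInputDecompressor from org.apache.parquet.compression.CompressionCodecFactory obtained for the column's codec (the reader's actual code may differ):

// Sketch: inflate a compressed dictionary page back to its declared size
static DictionaryPage decompress(DictionaryPage compressed, BytesInputDecompressor decompressor) throws IOException {
  BytesInput plain = decompressor.decompress(compressed.getBytes(), compressed.getUncompressedSize());
  return new DictionaryPage(plain, compressed.getDictionarySize(), compressed.getEncoding());
}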
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache: class TestDictionary, method testFirstPageFallBack(). Verifies that once the dictionary writer falls back to PLAIN on the first page, later pages stay PLAIN even when dictionary encoding would have paid off.
@Test
public void testFirstPageFallBack() throws IOException {
  int COUNT = 1000;
  ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(10000, 10000);
  writeDistinct(COUNT, cw, "a");
  // not efficient, so falls back
  BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN);
  writeRepeated(COUNT, cw, "b");
  // still plain because we fell back on the first page
  BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN);
  ValuesReader cr = new BinaryPlainValuesReader();
  checkDistinct(COUNT, bytes1, cr, "a");
  checkRepeated(COUNT, bytes2, cr, "b");
}
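The test depends on a getBytesAndCheckEncoding helper that is not shown. A hypothetical version, assuming JUnit's assertEquals and the standard ValuesWriter API (getBytes, getEncoding, reset); the project's actual helper may differ:

// Hypothetical helper: copy the page bytes, assert which encoding the
// writer settled on, then reset the writer for the next page.
private BytesInput getBytesAndCheckEncoding(ValuesWriter writer, Encoding encoding) throws IOException {
  BytesInput bytes = BytesInput.copy(writer.getBytes());
  assertEquals(encoding, writer.getEncoding());
  writer.reset();
  return bytes;
}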