Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache: class TestDataPageV1Checksums, method testWriteOnVerifyOff.
/**
 * Enable writing out page-level CRC checksums and disable verification in the read path, but
 * check that the written CRC checksums are correct. Tests that we successfully write out
 * correct CRC checksums even though read-path verification is disabled.
 */
@Test
public void testWriteOnVerifyOff() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);
  try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
    PageReadStore pageReadStore = reader.readNextRowGroup();

    DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
    assertCrcSetAndCorrect(colAPage1, colAPage1Bytes);
    assertCorrectContent(colAPage1.getBytes().toByteArray(), colAPage1Bytes);

    DataPageV1 colAPage2 = readNextPage(colADesc, pageReadStore);
    assertCrcSetAndCorrect(colAPage2, colAPage2Bytes);
    assertCorrectContent(colAPage2.getBytes().toByteArray(), colAPage2Bytes);

    DataPageV1 colBPage1 = readNextPage(colBDesc, pageReadStore);
    assertCrcSetAndCorrect(colBPage1, colBPage1Bytes);
    assertCorrectContent(colBPage1.getBytes().toByteArray(), colBPage1Bytes);

    DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
    assertCrcSetAndCorrect(colBPage2, colBPage2Bytes);
    assertCorrectContent(colBPage2.getBytes().toByteArray(), colBPage2Bytes);
  }
}
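The assertCrcSetAndCorrect helper is not shown in this snippet. Below is a minimal sketch of the kind of check it performs, assuming the page exposes the stored checksum via DataPage.getCrc() and that the checksum is a CRC32 computed over the compressed page bytes, as the Parquet checksum specification describes; it is not the test's actual helper.

  // Minimal sketch (not the actual test helper): recompute a CRC32 over the page's compressed
  // bytes and compare it with the checksum carried by the page. DataPage.getCrc() returning an
  // OptionalInt is an assumption about the reader API used here.
  static void assertCrcMatches(DataPageV1 page, byte[] compressedPageBytes) {
    java.util.zip.CRC32 crc = new java.util.zip.CRC32();
    crc.update(compressedPageBytes);
    assertTrue("page should carry a checksum", page.getCrc().isPresent());
    assertEquals("stored checksum should match recomputed CRC32",
        (int) crc.getValue(), page.getCrc().getAsInt());
  }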
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache: class TestDataPageV1Checksums, method testNestedWithNulls.
/**
 * Tests that we adhere to the checksum calculation specification, namely that the CRC is
 * calculated over the compressed concatenation of the repetition levels, the definition levels
 * and the actual data. This is done by generating sample data with a nested schema containing
 * nulls (which produces non-trivial repetition and definition levels).
 */
@Test
public void testNestedWithNulls() throws IOException {
  Configuration conf = new Configuration();
  // Write out sample file via the non-checksum code path, extract the raw bytes to calculate the
  // reference crc with
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
  Path refPath = writeNestedWithNullsSampleParquetFile(conf, false, CompressionCodecName.SNAPPY);
  try (ParquetFileReader refReader = getParquetFileReader(refPath, conf, Arrays.asList(colCIdDesc, colDValDesc))) {
    PageReadStore refPageReadStore = refReader.readNextRowGroup();
    byte[] colCIdPageBytes = readNextPage(colCIdDesc, refPageReadStore).getBytes().toByteArray();
    byte[] colDValPageBytes = readNextPage(colDValDesc, refPageReadStore).getBytes().toByteArray();

    // Write out sample file with checksums
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
    Path path = writeNestedWithNullsSampleParquetFile(conf, false, CompressionCodecName.SNAPPY);
    try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colCIdDesc, colDValDesc))) {
      PageReadStore pageReadStore = reader.readNextRowGroup();

      DataPageV1 colCIdPage = readNextPage(colCIdDesc, pageReadStore);
      assertCrcSetAndCorrect(colCIdPage, snappy(colCIdPageBytes));
      assertCorrectContent(colCIdPage.getBytes().toByteArray(), colCIdPageBytes);

      DataPageV1 colDValPage = readNextPage(colDValDesc, pageReadStore);
      assertCrcSetAndCorrect(colDValPage, snappy(colDValPageBytes));
      assertCorrectContent(colDValPage.getBytes().toByteArray(), colDValPageBytes);
    }
  }
}
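The snappy() helper used to build the expected checksum input is not shown either. A plausible one-liner is sketched below, assuming it simply Snappy-compresses the raw reference bytes with the snappy-java library, so the expected CRC is computed over the same compressed bytes that end up in the page; this is an assumption, not the test's actual code.

  // Plausible sketch of the snappy() helper: compress the raw reference bytes so that the
  // expected checksum covers the compressed page contents, as the spec requires.
  static byte[] snappy(byte[] raw) throws IOException {
    return org.xerial.snappy.Snappy.compress(raw);
  }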
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache: class TestParquetFileWriter, method testWriteReadDataPageV2.
@Test
public void testWriteReadDataPageV2() throws Exception {
  File testFile = temp.newFile();
  testFile.delete();
  Path path = new Path(testFile.toURI());
  Configuration configuration = new Configuration();

  ParquetFileWriter w = new ParquetFileWriter(configuration, SCHEMA, path);
  w.start();
  w.startBlock(14);

  BytesInput repLevels = BytesInput.fromInt(2);
  BytesInput defLevels = BytesInput.fromInt(1);
  BytesInput data = BytesInput.fromInt(3);
  BytesInput data2 = BytesInput.fromInt(10);

  org.apache.parquet.column.statistics.Statistics<?> statsC1P1 = createStatistics("s", "z", C1);
  org.apache.parquet.column.statistics.Statistics<?> statsC1P2 = createStatistics("b", "d", C1);

  w.startColumn(C1, 6, CODEC);
  long c1Starts = w.getPos();
  w.writeDataPageV2(4, 1, 3, repLevels, defLevels, PLAIN, data, 4, statsC1P1);
  w.writeDataPageV2(3, 0, 3, repLevels, defLevels, PLAIN, data, 4, statsC1P2);
  w.endColumn();
  long c1Ends = w.getPos();

  w.startColumn(C2, 5, CODEC);
  long c2Starts = w.getPos();
  w.writeDataPageV2(5, 2, 3, repLevels, defLevels, PLAIN, data2, 4, EMPTY_STATS);
  w.writeDataPageV2(2, 0, 2, repLevels, defLevels, PLAIN, data2, 4, EMPTY_STATS);
  w.endColumn();
  long c2Ends = w.getPos();

  w.endBlock();
  w.end(new HashMap<>());

  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
  assertEquals("footer: " + readFooter, 1, readFooter.getBlocks().size());
  assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
  assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
  assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());

  // check for stats
  org.apache.parquet.column.statistics.Statistics<?> expectedStats = createStatistics("b", "z", C1);
  TestUtils.assertStatsValuesEqual(expectedStats, readFooter.getBlocks().get(0).getColumns().get(0).getStatistics());

  HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
  expectedEncoding.add(PLAIN);
  assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());

  try (ParquetFileReader reader = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path,
      readFooter.getBlocks(), Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)))) {
    PageReadStore pages = reader.readNextRowGroup();
    assertEquals(14, pages.getRowCount());
    validateV2Page(SCHEMA, pages, PATH1, 3, 4, 1, repLevels.toByteArray(), defLevels.toByteArray(), data.toByteArray(), 12);
    validateV2Page(SCHEMA, pages, PATH1, 3, 3, 0, repLevels.toByteArray(), defLevels.toByteArray(), data.toByteArray(), 12);
    validateV2Page(SCHEMA, pages, PATH2, 3, 5, 2, repLevels.toByteArray(), defLevels.toByteArray(), data2.toByteArray(), 12);
    validateV2Page(SCHEMA, pages, PATH2, 2, 2, 0, repLevels.toByteArray(), defLevels.toByteArray(), data2.toByteArray(), 12);
    assertNull(reader.readNextRowGroup());
  }
}
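The validateV2Page helper is also left out of this snippet. The sketch below shows what it plausibly asserts, assuming the argument order visible in the calls above (value count, row count, null count, then level bytes, data bytes and uncompressed size) and the standard DataPageV2 accessors; it is not the test's actual implementation.

  // Hypothetical sketch of validateV2Page: read the next V2 page for the column and compare its
  // counts, level bytes, data bytes and uncompressed size against the expected values.
  void validateV2Page(MessageType schema, PageReadStore pages, String[] path, int values, int rows,
                      int nullCount, byte[] repetition, byte[] definition, byte[] data,
                      int uncompressedSize) throws IOException {
    PageReader pageReader = pages.getPageReader(schema.getColumnDescription(path));
    DataPageV2 page = (DataPageV2) pageReader.readPage();
    assertEquals(values, page.getValueCount());
    assertEquals(rows, page.getRowCount());
    assertEquals(nullCount, page.getNullCount());
    assertEquals(uncompressedSize, page.getUncompressedSize());
    assertArrayEquals(repetition, page.getRepetitionLevels().toByteArray());
    assertArrayEquals(definition, page.getDefinitionLevels().toByteArray());
    assertArrayEquals(data, page.getData().toByteArray());
  }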
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache: class TestParquetFileWriter, method testAlignmentWithPadding.
@Test
public void testAlignmentWithPadding() throws Exception {
  File testFile = temp.newFile();
  Path path = new Path(testFile.toURI());

  Configuration conf = new Configuration();
  // Disable writing out checksums as hardcoded byte offsets in assertions below expect it
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);

  // uses the test constructor
  ParquetFileWriter w = new ParquetFileWriter(conf, SCHEMA, path, 120, 60);
  w.start();

  w.startBlock(3);
  w.startColumn(C1, 5, CODEC);
  long c1Starts = w.getPos();
  w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  long c1Ends = w.getPos();
  w.startColumn(C2, 6, CODEC);
  long c2Starts = w.getPos();
  w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  long c2Ends = w.getPos();
  w.endBlock();

  // should be 109
  long firstRowGroupEnds = w.getPos();

  w.startBlock(4);
  w.startColumn(C1, 7, CODEC);
  w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(C2, 8, CODEC);
  w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  long secondRowGroupEnds = w.getPos();

  w.end(new HashMap<String, String>());

  FileSystem fs = path.getFileSystem(conf);
  long fileLen = fs.getFileStatus(path).getLen();
  FSDataInputStream data = fs.open(path);
  // the file tail is the 4-byte footer length followed by the 4-byte magic "PAR1"
  data.seek(fileLen - 8);
  long footerLen = BytesUtils.readIntLittleEndian(data);
  long startFooter = fileLen - footerLen - 8;
  assertEquals("Footer should start after second row group without padding", secondRowGroupEnds, startFooter);

  ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path);
  assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
  assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
  assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
  assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());

  HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
  expectedEncoding.add(PLAIN);
  expectedEncoding.add(BIT_PACKED);
  assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());

  // verify block starting positions with padding
  assertEquals("First row group should start after magic", 4, readFooter.getBlocks().get(0).getStartingPos());
  assertTrue("First row group should end before the block size (120)", firstRowGroupEnds < 120);
  assertEquals("Second row group should start at the block size", 120, readFooter.getBlocks().get(1).getStartingPos());

  {
    // read first block of col #1
    try (ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path,
        Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(SCHEMA.getColumnDescription(PATH1)))) {
      PageReadStore pages = r.readNextRowGroup();
      assertEquals(3, pages.getRowCount());
      validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
      validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
      assertNull(r.readNextRowGroup());
    }
  }
  {
    try (ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path,
        readFooter.getBlocks(), Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)))) {
      PageReadStore pages = r.readNextRowGroup();
      assertEquals(3, pages.getRowCount());
      validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
      validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
      validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
      validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
      validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));

      pages = r.readNextRowGroup();
      assertEquals(4, pages.getRowCount());
      validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
      validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));

      assertNull(r.readNextRowGroup());
    }
  }

  PrintFooter.main(new String[] { path.toString() });
}
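The alignment assertions fall out of the two extra arguments passed to the test constructor, which this test treats as a 120-byte block size used for alignment and a 60-byte maximum padding: the first row group ends around byte 109 (per the "should be 109" comment), the roughly 11 bytes left to the 120-byte boundary are within the padding cap, so the writer pads and the second row group starts exactly at 120. A small sketch of that decision, with the parameter meanings taken as an assumption:

  // Sketch of the padding decision implied by the assertions above (assumption: 120 is the
  // block size used for alignment and 60 is the maximum padding the writer will insert).
  static long nextRowGroupStart(long pos, long blockSize, long maxPadding) {
    long remaining = blockSize - (pos % blockSize); // bytes left to the next block boundary
    return remaining <= maxPadding ? pos + remaining : pos;
  }
  // nextRowGroupStart(109, 120, 60) == 120, matching the second row group's starting position.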
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache: class CompressionConverter, method processBlocks.
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta,
                          MessageType schema, String createdBy, CompressionCodecName codecName) throws IOException {
  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();
  while (store != null) {
    writer.startBlock(store.getRowCount());
    BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
    List<ColumnChunkMetaData> columnsInOrder = blockMetaData.getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream()
        .collect(Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);
      ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
      writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}
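A rough usage sketch follows, showing how processBlocks might be driven to transcode an existing file to a different codec. The construction details (the HadoopInputFile/HadoopOutputFile factories, the TransParquetFileReader and ParquetFileWriter constructor arguments, the ZSTD target codec and the temporary paths) are assumptions for illustration, not code taken from CompressionConverter itself; imports are omitted to match the surrounding snippets.

  // Hypothetical driver for processBlocks: rewrite every row group of an input file into a new
  // file using a different compression codec, preserving the schema and key-value metadata.
  Configuration conf = new Configuration();
  Path inPath = new Path("/tmp/input.parquet");   // hypothetical paths for illustration
  Path outPath = new Path("/tmp/output.parquet");

  ParquetMetadata meta = ParquetFileReader.readFooter(conf, inPath);
  MessageType schema = meta.getFileMetaData().getSchema();
  String createdBy = meta.getFileMetaData().getCreatedBy();

  CompressionConverter converter = new CompressionConverter();
  try (CompressionConverter.TransParquetFileReader reader = new CompressionConverter.TransParquetFileReader(
           HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    ParquetFileWriter writer = new ParquetFileWriter(
        HadoopOutputFile.fromPath(outPath, conf), schema, ParquetFileWriter.Mode.CREATE,
        ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.MAX_PADDING_SIZE_DEFAULT);
    writer.start();
    converter.processBlocks(reader, writer, meta, schema, createdBy, CompressionCodecName.ZSTD);
    writer.end(meta.getFileMetaData().getKeyValueMetaData());
  }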