Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
Class TestDataPageV1Checksums, method testWriteOffVerifyOff.
/**
 * Test that we do not write out checksums if the feature is turned off
 */
@Test
public void testWriteOffVerifyOff() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
    PageReadStore pageReadStore = reader.readNextRowGroup();

    assertCrcNotSet(readNextPage(colADesc, pageReadStore));
    assertCrcNotSet(readNextPage(colADesc, pageReadStore));
    assertCrcNotSet(readNextPage(colBDesc, pageReadStore));
    assertCrcNotSet(readNextPage(colBDesc, pageReadStore));
  }
}
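The snippet relies on two helpers from TestDataPageV1Checksums that are not shown here, readNextPage and assertCrcNotSet. A minimal sketch of what they might look like, assuming DataPageV1.getCrc() returns an OptionalInt (as in recent parquet-mr versions) and that pages are fetched through PageReadStore.getPageReader:

// Hypothetical helper sketches; the real implementations live in TestDataPageV1Checksums.

// Read the next page of the given column as a DataPageV1.
private static DataPageV1 readNextPage(ColumnDescriptor desc, PageReadStore pageReadStore) {
  return (DataPageV1) pageReadStore.getPageReader(desc).readPage();
}

// With PAGE_WRITE_CHECKSUM_ENABLED=false, no CRC should have been stored for the page.
private static void assertCrcNotSet(DataPageV1 page) {
  assertFalse("Expected no CRC to be stored for the page", page.getCrc().isPresent());
}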
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
Class TestDataPageV1Checksums, method testCorruptedPage.
/**
 * Test whether corruption in the page content is detected by checksum verification
 */
@Test
public void testCorruptedPage() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  InputFile inputFile = HadoopInputFile.fromPath(path, conf);
  try (SeekableInputStream inputStream = inputFile.newStream()) {
    int fileLen = (int) inputFile.getLength();
    byte[] fileBytes = new byte[fileLen];
    inputStream.readFully(fileBytes);
    inputStream.close();

    // There are 4 pages in total (2 per column), we corrupt the first page of the first column
    // and the second page of the second column. We do this by altering a byte roughly in the
    // middle of each page to be corrupted
    fileBytes[fileLen / 8]++;
    fileBytes[fileLen / 8 + ((fileLen / 4) * 3)]++;

    OutputFile outputFile = HadoopOutputFile.fromPath(path, conf);
    try (PositionOutputStream outputStream = outputFile.createOrOverwrite(1024 * 1024)) {
      outputStream.write(fileBytes);
      outputStream.close();

      // First we disable checksum verification, the corruption will go undetected as it is in the
      // data section of the page
      conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
      try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
        PageReadStore pageReadStore = reader.readNextRowGroup();

        DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
        assertFalse("Data in page was not corrupted",
            Arrays.equals(colAPage1.getBytes().toByteArray(), colAPage1Bytes));
        readNextPage(colADesc, pageReadStore);
        readNextPage(colBDesc, pageReadStore);
        DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
        assertFalse("Data in page was not corrupted",
            Arrays.equals(colBPage2.getBytes().toByteArray(), colBPage2Bytes));
      }

      // Now we enable checksum verification, the corruption should be detected
      conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
      try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
        // We expect an exception on the first encountered corrupt page (in readAllPages)
        assertVerificationFailed(reader);
      }
    }
  }
}
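The assertVerificationFailed helper is also not shown in the snippet. A minimal sketch of what it might look like, assuming the CRC mismatch surfaces as a ParquetDecodingException while the row group's pages are read:

// Hypothetical sketch; the real helper lives in TestDataPageV1Checksums.
private void assertVerificationFailed(ParquetFileReader reader) {
  try {
    // With PAGE_VERIFY_CHECKSUM_ENABLED=true the corruption should be detected
    // while the pages of the row group are read.
    reader.readNextRowGroup();
    fail("Expected checksum verification to fail on the corrupted page");
  } catch (Exception e) {
    assertTrue("Expected a decoding exception, got " + e, e instanceof ParquetDecodingException);
  }
}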
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
Class TestParquetFileWriter, method testAlignmentWithNoPaddingNeeded.
@Test
public void testAlignmentWithNoPaddingNeeded() throws Exception {
  File testFile = temp.newFile();

  Path path = new Path(testFile.toURI());
  Configuration conf = new Configuration();
  // Disable writing out checksums as hardcoded byte offsets in assertions below expect it
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);

  // uses the test constructor
  ParquetFileWriter w = new ParquetFileWriter(conf, SCHEMA, path, 100, 50);
  w.start();
  w.startBlock(3);
  w.startColumn(C1, 5, CODEC);
  long c1Starts = w.getPos();
  w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  long c1Ends = w.getPos();
  w.startColumn(C2, 6, CODEC);
  long c2Starts = w.getPos();
  w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  long c2Ends = w.getPos();
  w.endBlock();
  // should be 109
  long firstRowGroupEnds = w.getPos();

  w.startBlock(4);
  w.startColumn(C1, 7, CODEC);
  w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(C2, 8, CODEC);
  w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  long secondRowGroupEnds = w.getPos();
  w.end(new HashMap<String, String>());

  FileSystem fs = path.getFileSystem(conf);
  long fileLen = fs.getFileStatus(path).getLen();

  FSDataInputStream data = fs.open(path);
  // 4-byte offset + "PAR1"
  data.seek(fileLen - 8);
  long footerLen = BytesUtils.readIntLittleEndian(data);
  long startFooter = fileLen - footerLen - 8;

  assertEquals("Footer should start after second row group without padding", secondRowGroupEnds, startFooter);

  ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path);
  assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
  assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
  assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
  assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());

  HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
  expectedEncoding.add(PLAIN);
  expectedEncoding.add(BIT_PACKED);
  assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());
  // verify block starting positions (no padding should be needed)
  assertEquals("First row group should start after magic", 4, readFooter.getBlocks().get(0).getStartingPos());
  assertTrue("First row group should end after the block size (100)", firstRowGroupEnds > 100);
  assertEquals("Second row group should start after no padding", 109, readFooter.getBlocks().get(1).getStartingPos());
  {
    // read first block of col #1
    try (ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path,
        Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(SCHEMA.getColumnDescription(PATH1)))) {
      PageReadStore pages = r.readNextRowGroup();
      assertEquals(3, pages.getRowCount());
      validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
      validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
      assertNull(r.readNextRowGroup());
    }
  }

  {
    try (ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path,
        readFooter.getBlocks(), Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)))) {
      PageReadStore pages = r.readNextRowGroup();
      assertEquals(3, pages.getRowCount());
      validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
      validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
      validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
      validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
      validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));

      pages = r.readNextRowGroup();
      assertEquals(4, pages.getRowCount());
      validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
      validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));
      assertNull(r.readNextRowGroup());
    }
  }

  PrintFooter.main(new String[] { path.toString() });
}
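The test passes a row-group size of 100 bytes and a maximum padding of 50 bytes to the test constructor. The first row group ends at offset 109, so the gap to the next 100-byte boundary is 91 bytes, which is larger than the padding limit and therefore no padding is written. The following is an illustrative sketch of that decision rule, an assumption about how the writer's padding alignment behaves rather than the actual ParquetFileWriter internals:

// Illustrative sketch of the padding rule the test exercises (hypothetical, not the
// ParquetFileWriter internals): pad to the next row-group boundary only when the gap
// fits within the configured maximum padding.
static boolean padForNextRowGroup(long pos, long rowGroupSize, long maxPaddingSize) {
  long remaining = rowGroupSize - (pos % rowGroupSize);
  return remaining <= maxPaddingSize;
}

// With the test's values: padForNextRowGroup(109, 100, 50)
//   remaining = 100 - (109 % 100) = 91, and 91 > 50, so no padding is written
//   and the second row group starts right at offset 109, matching the assertions above.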
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
Class ColumnEncryptorTest, method compareOffsetIndexes.
private void compareOffsetIndexes(TransParquetFileReader inReader, TransParquetFileReader outReader,
    ParquetMetadata inMetaData, ParquetMetadata outMetaData) throws IOException {
  PageReadStore inStore = inReader.readNextRowGroup();
  PageReadStore outStore = outReader.readNextRowGroup();
  int blockIndex = 0;
  while (inStore != null && outStore != null) {
    List<ColumnChunkMetaData> inColumns = inMetaData.getBlocks().get(blockIndex).getColumns();
    List<ColumnChunkMetaData> outColumns = outMetaData.getBlocks().get(blockIndex).getColumns();
    assertEquals(inColumns.size(), outColumns.size());
    validateColumns(inReader, outReader, inColumns, outColumns);
    inStore = inReader.readNextRowGroup();
    outStore = outReader.readNextRowGroup();
    blockIndex++;
  }
  // Both readers must run out of row groups at the same time; if only one of them
  // still has data, the two files have a different number of row groups.
  if (inStore != null || outStore != null) {
    throw new IOException("Number of row groups are not equal");
  }
}
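The validateColumns helper called above is not shown. A hypothetical sketch of the per-column offset-index comparison it might perform, assuming TransParquetFileReader exposes ParquetFileReader.readOffsetIndex and that encryption preserves the page layout even though byte offsets change:

// Hypothetical sketch only; the real helper lives in ColumnEncryptorTest.
private void validateColumns(TransParquetFileReader inReader, TransParquetFileReader outReader,
    List<ColumnChunkMetaData> inColumns, List<ColumnChunkMetaData> outColumns) throws IOException {
  for (int i = 0; i < inColumns.size(); i++) {
    OffsetIndex inOffsetIndex = inReader.readOffsetIndex(inColumns.get(i));
    OffsetIndex outOffsetIndex = outReader.readOffsetIndex(outColumns.get(i));
    // Encryption changes byte offsets and compressed sizes, so compare the page layout
    // (page count and first row index per page) rather than raw offsets.
    assertEquals(inOffsetIndex.getPageCount(), outOffsetIndex.getPageCount());
    for (int pageId = 0; pageId < inOffsetIndex.getPageCount(); pageId++) {
      assertEquals(inOffsetIndex.getFirstRowIndex(pageId), outOffsetIndex.getFirstRowIndex(pageId));
    }
  }
}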
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
Class TestColumnChunkPageWriteStore, method test.
@Test
public void test() throws Exception {
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);

  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;
  statistics.incrementNumNulls(nullCount);
  statistics.setMinMaxFromBytes(new byte[] { 0, 1, 2 }, new byte[] { 0, 1, 2, 3 });
  long pageOffset;
  long pageSize;

  {
    OutputFileForTesting outputFile = new OutputFileForTesting(file, conf);
    ParquetFileWriter writer = new ParquetFileWriter(outputFile, schema, Mode.CREATE,
        ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.MAX_PADDING_SIZE_DEFAULT);
    writer.start();
    writer.startBlock(rowCount);
    pageOffset = outputFile.out().getPos();
    {
      ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema,
          new HeapByteBufferAllocator(), Integer.MAX_VALUE);
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels,
          dataEncoding, data, statistics);
      store.flushToFileWriter(writer);
      pageSize = outputFile.out().getPos() - pageOffset;
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }

  {
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
    PageReadStore rowGroup = reader.readNextRowGroup();
    PageReader pageReader = rowGroup.getPageReader(col);
    DataPageV2 page = (DataPageV2) pageReader.readPage();
    assertEquals(rowCount, page.getRowCount());
    assertEquals(nullCount, page.getNullCount());
    assertEquals(valueCount, page.getValueCount());
    assertEquals(d, intValue(page.getDefinitionLevels()));
    assertEquals(r, intValue(page.getRepetitionLevels()));
    assertEquals(dataEncoding, page.getDataEncoding());
    assertEquals(v, intValue(page.getData()));

    // Checking column/offset indexes for the one page
    ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0);
    ColumnIndex columnIndex = reader.readColumnIndex(column);
    assertArrayEquals(statistics.getMinBytes(), columnIndex.getMinValues().get(0).array());
    assertArrayEquals(statistics.getMaxBytes(), columnIndex.getMaxValues().get(0).array());
    assertEquals(statistics.getNumNulls(), columnIndex.getNullCounts().get(0).longValue());
    assertFalse(columnIndex.getNullPages().get(0));
    OffsetIndex offsetIndex = reader.readOffsetIndex(column);
    assertEquals(1, offsetIndex.getPageCount());
    assertEquals(pageSize, offsetIndex.getCompressedPageSize(0));
    assertEquals(0, offsetIndex.getFirstRowIndex(0));
    assertEquals(pageOffset, offsetIndex.getOffset(0));
    reader.close();
  }
}
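The intValue helper used in the assertions is not part of the snippet. Since BytesInput.fromInt stores the value as a little-endian int, a minimal sketch might read it back with BytesUtils; this is an assumption about the helper, not its actual implementation:

// Hypothetical sketch of the intValue helper used above.
private int intValue(BytesInput in) throws IOException {
  // BytesInput.fromInt(int) writes 4 little-endian bytes, so decode them the same way.
  return BytesUtils.readIntLittleEndian(in.toInputStream());
}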