Example 6 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From the class TestDataPageV1Checksums, method testWriteOffVerifyOff.

/**
 * Test that we do not write out checksums if the feature is turned off
 */
@Test
public void testWriteOffVerifyOff() throws IOException {
    Configuration conf = new Configuration();
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
    Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);
    try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
        PageReadStore pageReadStore = reader.readNextRowGroup();
        assertCrcNotSet(readNextPage(colADesc, pageReadStore));
        assertCrcNotSet(readNextPage(colADesc, pageReadStore));
        assertCrcNotSet(readNextPage(colBDesc, pageReadStore));
        assertCrcNotSet(readNextPage(colBDesc, pageReadStore));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) PageReadStore(org.apache.parquet.column.page.PageReadStore) Test(org.junit.Test)
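
A minimal sketch of the page-level read path these checksum tests drive, reusing the imports listed above plus ParquetReadOptions, HadoopReadOptions, ColumnDescriptor, DataPage, PageReader and HadoopInputFile. It iterates row groups and raw data pages through PageReadStore, with checksum verification requested via the same configuration key the test toggles; the helper name is illustrative and not part of the test, and it is assumed that HadoopReadOptions carries PAGE_VERIFY_CHECKSUM_ENABLED over from the Configuration.

private void dumpDataPages(Path path, Configuration conf) throws IOException {
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
    // Assumed: the builder picks the verify-checksum flag up from the Configuration
    ParquetReadOptions options = HadoopReadOptions.builder(conf).build();
    try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, conf), options)) {
        PageReadStore rowGroup;
        while ((rowGroup = reader.readNextRowGroup()) != null) {
            // Pull the raw data pages of every column in this row group
            for (ColumnDescriptor col : reader.getFooter().getFileMetaData().getSchema().getColumns()) {
                PageReader pageReader = rowGroup.getPageReader(col);
                DataPage page;
                while ((page = pageReader.readPage()) != null) {
                    System.out.println(col + ": page with " + page.getValueCount() + " values");
                }
            }
        }
    }
}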

Example 7 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From the class TestDataPageV1Checksums, method testCorruptedPage.

/**
 * Test whether corruption in the page content is detected by checksum verification
 */
@Test
public void testCorruptedPage() throws IOException {
    Configuration conf = new Configuration();
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
    Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);
    InputFile inputFile = HadoopInputFile.fromPath(path, conf);
    try (SeekableInputStream inputStream = inputFile.newStream()) {
        int fileLen = (int) inputFile.getLength();
        byte[] fileBytes = new byte[fileLen];
        inputStream.readFully(fileBytes);
        inputStream.close();
        // There are 4 pages in total (2 per column). We corrupt the first page of the first
        // column and the second page of the second column by altering a byte roughly in the
        // middle of each of those pages.
        fileBytes[fileLen / 8]++;
        fileBytes[fileLen / 8 + ((fileLen / 4) * 3)]++;
        OutputFile outputFile = HadoopOutputFile.fromPath(path, conf);
        try (PositionOutputStream outputStream = outputFile.createOrOverwrite(1024 * 1024)) {
            outputStream.write(fileBytes);
            outputStream.close();
            // First we disable checksum verification, the corruption will go undetected as it is in the
            // data section of the page
            conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
            try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
                PageReadStore pageReadStore = reader.readNextRowGroup();
                DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
                assertFalse("Data in page was not corrupted", Arrays.equals(colAPage1.getBytes().toByteArray(), colAPage1Bytes));
                readNextPage(colADesc, pageReadStore);
                readNextPage(colBDesc, pageReadStore);
                DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
                assertFalse("Data in page was not corrupted", Arrays.equals(colBPage2.getBytes().toByteArray(), colBPage2Bytes));
            }
            // Now we enable checksum verification, the corruption should be detected
            conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
            try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
                // We expect an exception on the first encountered corrupt page (in readAllPages)
                assertVerificationFailed(reader);
            }
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HadoopOutputFile(org.apache.parquet.hadoop.util.HadoopOutputFile) OutputFile(org.apache.parquet.io.OutputFile) Configuration(org.apache.hadoop.conf.Configuration) SeekableInputStream(org.apache.parquet.io.SeekableInputStream) PositionOutputStream(org.apache.parquet.io.PositionOutputStream) PageReadStore(org.apache.parquet.column.page.PageReadStore) DataPageV1(org.apache.parquet.column.page.DataPageV1) InputFile(org.apache.parquet.io.InputFile) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) Test(org.junit.Test)
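
A sketch of what the assertVerificationFailed(reader) helper called above might do; the real helper lives elsewhere in TestDataPageV1Checksums, so this body is an assumption, not the test's code. With PAGE_VERIFY_CHECKSUM_ENABLED set, the reader recomputes each page's CRC32 while reading the row group and is expected to throw a ParquetDecodingException (org.apache.parquet.io.ParquetDecodingException) on the first mismatch; fail and assertTrue come from org.junit.Assert.

private void assertVerificationFailed(ParquetFileReader reader) {
    try {
        // Corrupt pages are expected to be rejected while the row group's pages are read
        reader.readNextRowGroup();
        fail("Expected CRC checksum verification to fail on the corrupted page");
    } catch (Exception e) {
        assertTrue("Expected a ParquetDecodingException, got " + e.getClass().getSimpleName(),
            e instanceof ParquetDecodingException);
        assertTrue("Expected a checksum verification error message", e.getMessage().contains("CRC"));
    }
}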

Example 8 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From the class TestParquetFileWriter, method testAlignmentWithNoPaddingNeeded.

@Test
public void testAlignmentWithNoPaddingNeeded() throws Exception {
    File testFile = temp.newFile();
    Path path = new Path(testFile.toURI());
    Configuration conf = new Configuration();
    // Disable writing out checksums as hardcoded byte offsets in assertions below expect it
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
    // uses the test constructor
    ParquetFileWriter w = new ParquetFileWriter(conf, SCHEMA, path, 100, 50);
    w.start();
    w.startBlock(3);
    w.startColumn(C1, 5, CODEC);
    long c1Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c1Ends = w.getPos();
    w.startColumn(C2, 6, CODEC);
    long c2Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c2Ends = w.getPos();
    w.endBlock();
    // the first row group ends at offset 109
    long firstRowGroupEnds = w.getPos();
    w.startBlock(4);
    w.startColumn(C1, 7, CODEC);
    w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(C2, 8, CODEC);
    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    long secondRowGroupEnds = w.getPos();
    w.end(new HashMap<String, String>());
    FileSystem fs = path.getFileSystem(conf);
    long fileLen = fs.getFileStatus(path).getLen();
    FSDataInputStream data = fs.open(path);
    // the file ends with a 4-byte little-endian footer length followed by the "PAR1" magic
    data.seek(fileLen - 8);
    long footerLen = BytesUtils.readIntLittleEndian(data);
    long startFooter = fileLen - footerLen - 8;
    assertEquals("Footer should start after second row group without padding", secondRowGroupEnds, startFooter);
    ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path);
    assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
    assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
    assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
    assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
    HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
    expectedEncoding.add(PLAIN);
    expectedEncoding.add(BIT_PACKED);
    assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());
    // verify block starting positions without padding
    assertEquals("First row group should start after magic", 4, readFooter.getBlocks().get(0).getStartingPos());
    assertTrue("First row group should end past the first block boundary (100)", firstRowGroupEnds > 100);
    assertEquals("Second row group should start after no padding", 109, readFooter.getBlocks().get(1).getStartingPos());
    {
        // read first block of col #1
        try (ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path, Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(SCHEMA.getColumnDescription(PATH1)))) {
            PageReadStore pages = r.readNextRowGroup();
            assertEquals(3, pages.getRowCount());
            validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
            validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
            assertNull(r.readNextRowGroup());
        }
    }
    {
        try (ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path, readFooter.getBlocks(), Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)))) {
            PageReadStore pages = r.readNextRowGroup();
            assertEquals(3, pages.getRowCount());
            validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
            validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
            validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
            validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
            validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));
            pages = r.readNextRowGroup();
            assertEquals(4, pages.getRowCount());
            validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
            validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));
            assertNull(r.readNextRowGroup());
        }
    }
    PrintFooter.main(new String[] { path.toString() });
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Encoding(org.apache.parquet.column.Encoding) PageReadStore(org.apache.parquet.column.page.PageReadStore) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) File(java.io.File) Test(org.junit.Test)
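
The alignment arithmetic behind the "no padding" assertions, written out as a sketch: the test constructor above uses a 100-byte block size and a 50-byte maximum padding, the first row group ends at offset 109, and the assumed rule is that the writer only pads to the next block boundary when the gap fits within the padding cap. assertEquals is the usual org.junit.Assert import; the rule is an assumption about the alignment strategy, not a copy of it.

@Test
public void paddingDecisionSketch() {
    long blockSize = 100;          // row-group/block size passed to the test constructor above
    long maxPadding = 50;          // maximum padding passed to the test constructor above
    long firstRowGroupEnd = 109;   // the offset asserted above
    long remainingInBlock = blockSize - (firstRowGroupEnd % blockSize);  // 91 bytes to the next boundary
    boolean pad = remainingInBlock <= maxPadding;                        // 91 > 50, so no padding
    long secondRowGroupStart = pad ? firstRowGroupEnd + remainingInBlock : firstRowGroupEnd;
    assertEquals(109, secondRowGroupStart);  // matches "Second row group should start after no padding"
}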

Example 9 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From the class ColumnEncryptorTest, method compareOffsetIndexes.

private void compareOffsetIndexes(TransParquetFileReader inReader, TransParquetFileReader outReader, ParquetMetadata inMetaData, ParquetMetadata outMetaData) throws IOException {
    PageReadStore inStore = inReader.readNextRowGroup();
    PageReadStore outStore = outReader.readNextRowGroup();
    int blockIndex = 0;
    while (inStore != null && outStore != null) {
        List<ColumnChunkMetaData> inColumns = inMetaData.getBlocks().get(blockIndex).getColumns();
        List<ColumnChunkMetaData> outColumns = outMetaData.getBlocks().get(blockIndex).getColumns();
        assertEquals(inColumns.size(), outColumns.size());
        validateColumns(inReader, outReader, inColumns, outColumns);
        inStore = inReader.readNextRowGroup();
        outStore = outReader.readNextRowGroup();
        blockIndex++;
        if ((inStore == null) ^ (outStore == null)) {
            throw new IOException("Number of row groups are not equal");
        }
    }
}
Also used : ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) PageReadStore(org.apache.parquet.column.page.PageReadStore) IOException(java.io.IOException)
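
A hedged sketch of one check the validateColumns helper above could perform per column-chunk pair: comparing offset indexes, assuming TransParquetFileReader exposes ParquetFileReader's readOffsetIndex (OffsetIndex comes from org.apache.parquet.internal.column.columnindex). Only the page count and first-row indexes are compared, since absolute offsets and compressed sizes are expected to change once the column is re-encrypted; the method name and body are illustrative, not the test's actual code.

private void compareOffsetIndex(TransParquetFileReader inReader, TransParquetFileReader outReader,
        ColumnChunkMetaData inChunk, ColumnChunkMetaData outChunk) throws IOException {
    OffsetIndex inIndex = inReader.readOffsetIndex(inChunk);
    OffsetIndex outIndex = outReader.readOffsetIndex(outChunk);
    assertEquals(inIndex.getPageCount(), outIndex.getPageCount());
    for (int i = 0; i < inIndex.getPageCount(); i++) {
        // Pages should still cover the same row ranges after rewriting the column
        assertEquals(inIndex.getFirstRowIndex(i), outIndex.getFirstRowIndex(i));
    }
}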

Example 10 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From the class TestColumnChunkPageWriteStore, method test.

@Test
public void test() throws Exception {
    Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
    Path root = file.getParent();
    FileSystem fs = file.getFileSystem(conf);
    if (fs.exists(root)) {
        fs.delete(root, true);
    }
    fs.mkdirs(root);
    MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
    ColumnDescriptor col = schema.getColumns().get(0);
    Encoding dataEncoding = PLAIN;
    int valueCount = 10;
    int d = 1;
    int r = 2;
    int v = 3;
    BytesInput definitionLevels = BytesInput.fromInt(d);
    BytesInput repetitionLevels = BytesInput.fromInt(r);
    Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
    BytesInput data = BytesInput.fromInt(v);
    int rowCount = 5;
    int nullCount = 1;
    statistics.incrementNumNulls(nullCount);
    statistics.setMinMaxFromBytes(new byte[] { 0, 1, 2 }, new byte[] { 0, 1, 2, 3 });
    long pageOffset;
    long pageSize;
    {
        OutputFileForTesting outputFile = new OutputFileForTesting(file, conf);
        ParquetFileWriter writer = new ParquetFileWriter(outputFile, schema, Mode.CREATE, ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.MAX_PADDING_SIZE_DEFAULT);
        writer.start();
        writer.startBlock(rowCount);
        pageOffset = outputFile.out().getPos();
        {
            ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator(), Integer.MAX_VALUE);
            PageWriter pageWriter = store.getPageWriter(col);
            pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics);
            store.flushToFileWriter(writer);
            pageSize = outputFile.out().getPos() - pageOffset;
        }
        writer.endBlock();
        writer.end(new HashMap<String, String>());
    }
    {
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
        ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
        PageReadStore rowGroup = reader.readNextRowGroup();
        PageReader pageReader = rowGroup.getPageReader(col);
        DataPageV2 page = (DataPageV2) pageReader.readPage();
        assertEquals(rowCount, page.getRowCount());
        assertEquals(nullCount, page.getNullCount());
        assertEquals(valueCount, page.getValueCount());
        assertEquals(d, intValue(page.getDefinitionLevels()));
        assertEquals(r, intValue(page.getRepetitionLevels()));
        assertEquals(dataEncoding, page.getDataEncoding());
        assertEquals(v, intValue(page.getData()));
        // Checking column/offset indexes for the one page
        ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0);
        ColumnIndex columnIndex = reader.readColumnIndex(column);
        assertArrayEquals(statistics.getMinBytes(), columnIndex.getMinValues().get(0).array());
        assertArrayEquals(statistics.getMaxBytes(), columnIndex.getMaxValues().get(0).array());
        assertEquals(statistics.getNumNulls(), columnIndex.getNullCounts().get(0).longValue());
        assertFalse(columnIndex.getNullPages().get(0));
        OffsetIndex offsetIndex = reader.readOffsetIndex(column);
        assertEquals(1, offsetIndex.getPageCount());
        assertEquals(pageSize, offsetIndex.getCompressedPageSize(0));
        assertEquals(0, offsetIndex.getFirstRowIndex(0));
        assertEquals(pageOffset, offsetIndex.getOffset(0));
        reader.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BytesInput(org.apache.parquet.bytes.BytesInput) HashMap(java.util.HashMap) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PageReader(org.apache.parquet.column.page.PageReader) Encoding(org.apache.parquet.column.Encoding) DataPageV2(org.apache.parquet.column.page.DataPageV2) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) HeapByteBufferAllocator(org.apache.parquet.bytes.HeapByteBufferAllocator) PageReadStore(org.apache.parquet.column.page.PageReadStore) FileSystem(org.apache.hadoop.fs.FileSystem) MessageType(org.apache.parquet.schema.MessageType) OffsetIndex(org.apache.parquet.internal.column.columnindex.OffsetIndex) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)
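
Putting the pieces from the aggregation list below together: a minimal sketch that materializes full records from each PageReadStore row group with the example Group object model (ColumnIOFactory, MessageColumnIO, GroupRecordConverter, RecordReader, Group/SimpleGroup). The path and Configuration are assumed to point at an existing Parquet file; this is a generic read sketch, not code from the test above.

try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
    MessageType schema = reader.getFooter().getFileMetaData().getSchema();
    MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
    PageReadStore rowGroup;
    while ((rowGroup = reader.readNextRowGroup()) != null) {
        // Assemble records from the raw pages using the example (Group) converter
        RecordReader<Group> recordReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));
        for (long i = 0; i < rowGroup.getRowCount(); i++) {
            Group record = recordReader.read();
            System.out.println(record);
        }
    }
}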

Aggregations

PageReadStore (org.apache.parquet.column.page.PageReadStore): 31 uses
Configuration (org.apache.hadoop.conf.Configuration): 22 uses
Path (org.apache.hadoop.fs.Path): 22 uses
IOException (java.io.IOException): 14 uses
MessageType (org.apache.parquet.schema.MessageType): 14 uses
Test (org.junit.Test): 13 uses
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 12 uses
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 10 uses
MessageColumnIO (org.apache.parquet.io.MessageColumnIO): 8 uses
SimpleGroup (org.apache.parquet.example.data.simple.SimpleGroup): 7 uses
GroupRecordConverter (org.apache.parquet.example.data.simple.convert.GroupRecordConverter): 7 uses
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 7 uses
ColumnIOFactory (org.apache.parquet.io.ColumnIOFactory): 7 uses
RecordReader (org.apache.parquet.io.RecordReader): 7 uses
DataPageV1 (org.apache.parquet.column.page.DataPageV1): 6 uses
Encoding (org.apache.parquet.column.Encoding): 5 uses
HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile): 5 uses
File (java.io.File): 4 uses
List (java.util.List): 4 uses
Vector (org.apache.ignite.ml.math.primitives.vector.Vector): 4 uses