Example 26 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From the class TestDataPageV1Checksums, method testWriteOnVerifyOff.

/**
 * Enable writing out page-level CRC checksums, disable verification in the read path, but check
 * that the CRC checksums are correct. Tests whether we successfully write out correct CRC
 * checksums without potentially failing on the read-path verification.
 */
@Test
public void testWriteOnVerifyOff() throws IOException {
    Configuration conf = new Configuration();
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
    Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);
    try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
        PageReadStore pageReadStore = reader.readNextRowGroup();
        DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
        assertCrcSetAndCorrect(colAPage1, colAPage1Bytes);
        assertCorrectContent(colAPage1.getBytes().toByteArray(), colAPage1Bytes);
        DataPageV1 colAPage2 = readNextPage(colADesc, pageReadStore);
        assertCrcSetAndCorrect(colAPage2, colAPage2Bytes);
        assertCorrectContent(colAPage2.getBytes().toByteArray(), colAPage2Bytes);
        DataPageV1 colBPage1 = readNextPage(colBDesc, pageReadStore);
        assertCrcSetAndCorrect(colBPage1, colBPage1Bytes);
        assertCorrectContent(colBPage1.getBytes().toByteArray(), colBPage1Bytes);
        DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
        assertCrcSetAndCorrect(colBPage2, colBPage2Bytes);
        assertCorrectContent(colBPage2.getBytes().toByteArray(), colBPage2Bytes);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) PageReadStore(org.apache.parquet.column.page.PageReadStore) DataPageV1(org.apache.parquet.column.page.DataPageV1) Test(org.junit.Test)
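
A note on configuration: besides the Hadoop Configuration keys used in this test, the same two switches can usually be flipped through the writer and read-options builders. The sketch below is only an illustration of that, not part of the test; it assumes the builder methods withPageWriteChecksumEnabled and usePageChecksumVerification found in recent parquet-mr releases, the example object model (ExampleParquetWriter, Group), and a schema, path and conf taken from the surrounding context, so verify the signatures against the version actually in use.

// Hedged sketch, not part of TestDataPageV1Checksums. Assumed available:
//   org.apache.parquet.hadoop.example.ExampleParquetWriter, org.apache.parquet.example.data.Group,
//   ParquetWriter.Builder#withPageWriteChecksumEnabled(boolean),
//   ParquetReadOptions.Builder#usePageChecksumVerification(boolean)
ParquetWriter<Group> writer = ExampleParquetWriter.builder(HadoopOutputFile.fromPath(path, conf))
    .withType(schema)
    // write page-level CRC checksums into every data page header
    .withPageWriteChecksumEnabled(true)
    .build();

ParquetReadOptions options = ParquetReadOptions.builder()
    // leave the checksums unverified on read, mirroring testWriteOnVerifyOff
    .usePageChecksumVerification(false)
    .build();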

Example 27 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From the class TestDataPageV1Checksums, method testNestedWithNulls.

/**
 * Tests that we adhere to the checksum calculation specification, namely that the CRC is
 * calculated over the compressed concatenation of the repetition levels, definition levels and
 * the actual data. This is done by generating sample data with a nested schema containing nulls
 * (producing non-trivial repetition and definition levels).
 */
@Test
public void testNestedWithNulls() throws IOException {
    Configuration conf = new Configuration();
    // Write out sample file via the non-checksum code path, extract the raw bytes to calculate the
    // reference crc with
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
    Path refPath = writeNestedWithNullsSampleParquetFile(conf, false, CompressionCodecName.SNAPPY);
    try (ParquetFileReader refReader = getParquetFileReader(refPath, conf, Arrays.asList(colCIdDesc, colDValDesc))) {
        PageReadStore refPageReadStore = refReader.readNextRowGroup();
        byte[] colCIdPageBytes = readNextPage(colCIdDesc, refPageReadStore).getBytes().toByteArray();
        byte[] colDValPageBytes = readNextPage(colDValDesc, refPageReadStore).getBytes().toByteArray();
        // Write out sample file with checksums
        conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
        conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
        Path path = writeNestedWithNullsSampleParquetFile(conf, false, CompressionCodecName.SNAPPY);
        try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colCIdDesc, colDValDesc))) {
            PageReadStore pageReadStore = reader.readNextRowGroup();
            DataPageV1 colCIdPage = readNextPage(colCIdDesc, pageReadStore);
            assertCrcSetAndCorrect(colCIdPage, snappy(colCIdPageBytes));
            assertCorrectContent(colCIdPage.getBytes().toByteArray(), colCIdPageBytes);
            DataPageV1 colDValPage = readNextPage(colDValDesc, pageReadStore);
            assertCrcSetAndCorrect(colDValPage, snappy(colDValPageBytes));
            assertCorrectContent(colDValPage.getBytes().toByteArray(), colDValPageBytes);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) PageReadStore(org.apache.parquet.column.page.PageReadStore) DataPageV1(org.apache.parquet.column.page.DataPageV1) Test(org.junit.Test)
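
The checksum rule described in the comment above can be reproduced with plain java.util.zip.CRC32: the CRC covers the page payload as written to disk, i.e. the compressed concatenation of repetition levels, definition levels and values. Below is a minimal sketch under that reading, assuming the three parts are already available as byte arrays and reusing the test's snappy(...) helper for the compression step (imports: java.io.ByteArrayOutputStream, java.io.IOException, java.util.zip.CRC32).

// Minimal sketch of the calculation the comment describes; snappy(...) is the test helper that
// compresses a byte[] the same way the writer does, and is assumed here rather than shown.
private long referenceCrc(byte[] repLevels, byte[] defLevels, byte[] values) throws IOException {
    ByteArrayOutputStream concat = new ByteArrayOutputStream();
    concat.write(repLevels);
    concat.write(defLevels);
    concat.write(values);
    CRC32 crc = new CRC32();
    // the checksum is computed over the compressed bytes, matching snappy(colCIdPageBytes) above
    crc.update(snappy(concat.toByteArray()));
    return crc.getValue();
}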

Example 28 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From the class TestParquetFileWriter, method testWriteReadDataPageV2.

@Test
public void testWriteReadDataPageV2() throws Exception {
    File testFile = temp.newFile();
    testFile.delete();
    Path path = new Path(testFile.toURI());
    Configuration configuration = new Configuration();
    ParquetFileWriter w = new ParquetFileWriter(configuration, SCHEMA, path);
    w.start();
    w.startBlock(14);
    BytesInput repLevels = BytesInput.fromInt(2);
    BytesInput defLevels = BytesInput.fromInt(1);
    BytesInput data = BytesInput.fromInt(3);
    BytesInput data2 = BytesInput.fromInt(10);
    org.apache.parquet.column.statistics.Statistics<?> statsC1P1 = createStatistics("s", "z", C1);
    org.apache.parquet.column.statistics.Statistics<?> statsC1P2 = createStatistics("b", "d", C1);
    w.startColumn(C1, 6, CODEC);
    long c1Starts = w.getPos();
    w.writeDataPageV2(4, 1, 3, repLevels, defLevels, PLAIN, data, 4, statsC1P1);
    w.writeDataPageV2(3, 0, 3, repLevels, defLevels, PLAIN, data, 4, statsC1P2);
    w.endColumn();
    long c1Ends = w.getPos();
    w.startColumn(C2, 5, CODEC);
    long c2Starts = w.getPos();
    w.writeDataPageV2(5, 2, 3, repLevels, defLevels, PLAIN, data2, 4, EMPTY_STATS);
    w.writeDataPageV2(2, 0, 2, repLevels, defLevels, PLAIN, data2, 4, EMPTY_STATS);
    w.endColumn();
    long c2Ends = w.getPos();
    w.endBlock();
    w.end(new HashMap<>());
    ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
    assertEquals("footer: " + readFooter, 1, readFooter.getBlocks().size());
    assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
    assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
    assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
    // check for stats
    org.apache.parquet.column.statistics.Statistics<?> expectedStats = createStatistics("b", "z", C1);
    TestUtils.assertStatsValuesEqual(expectedStats, readFooter.getBlocks().get(0).getColumns().get(0).getStatistics());
    HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
    expectedEncoding.add(PLAIN);
    assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());
    try (ParquetFileReader reader = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path, readFooter.getBlocks(), Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)))) {
        PageReadStore pages = reader.readNextRowGroup();
        assertEquals(14, pages.getRowCount());
        validateV2Page(SCHEMA, pages, PATH1, 3, 4, 1, repLevels.toByteArray(), defLevels.toByteArray(), data.toByteArray(), 12);
        validateV2Page(SCHEMA, pages, PATH1, 3, 3, 0, repLevels.toByteArray(), defLevels.toByteArray(), data.toByteArray(), 12);
        validateV2Page(SCHEMA, pages, PATH2, 3, 5, 2, repLevels.toByteArray(), defLevels.toByteArray(), data2.toByteArray(), 12);
        validateV2Page(SCHEMA, pages, PATH2, 2, 2, 0, repLevels.toByteArray(), defLevels.toByteArray(), data2.toByteArray(), 12);
        assertNull(reader.readNextRowGroup());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) BytesInput(org.apache.parquet.bytes.BytesInput) Encoding(org.apache.parquet.column.Encoding) PageReadStore(org.apache.parquet.column.page.PageReadStore) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) File(java.io.File) Test(org.junit.Test)
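
validateV2Page is a test helper whose body is not shown in this listing; the pages it checks can be pulled out of the PageReadStore directly via a PageReader. A minimal sketch of that access pattern follows (imports: org.apache.parquet.column.page.PageReader, DataPage, DataPageV2).

// Sketch only: iterate the pages of one column and print the v2 fields the test asserts on.
private void dumpV2Pages(PageReadStore pages, ColumnDescriptor column) {
    PageReader pageReader = pages.getPageReader(column);
    DataPage page;
    // readPage() returns null once the column chunk has no more pages
    while ((page = pageReader.readPage()) != null) {
        if (page instanceof DataPageV2) {
            DataPageV2 v2 = (DataPageV2) page;
            System.out.println("rows=" + v2.getRowCount()
                + " nulls=" + v2.getNullCount()
                + " values=" + v2.getValueCount()
                + " encoding=" + v2.getDataEncoding());
        }
    }
}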

Example 29 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From the class TestParquetFileWriter, method testAlignmentWithPadding.

@Test
public void testAlignmentWithPadding() throws Exception {
    File testFile = temp.newFile();
    Path path = new Path(testFile.toURI());
    Configuration conf = new Configuration();
    // Disable writing out checksums as hardcoded byte offsets in assertions below expect it
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
    // uses the test constructor
    ParquetFileWriter w = new ParquetFileWriter(conf, SCHEMA, path, 120, 60);
    w.start();
    w.startBlock(3);
    w.startColumn(C1, 5, CODEC);
    long c1Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c1Ends = w.getPos();
    w.startColumn(C2, 6, CODEC);
    long c2Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c2Ends = w.getPos();
    w.endBlock();
    // should be 109
    long firstRowGroupEnds = w.getPos();
    w.startBlock(4);
    w.startColumn(C1, 7, CODEC);
    w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(C2, 8, CODEC);
    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    long secondRowGroupEnds = w.getPos();
    w.end(new HashMap<String, String>());
    FileSystem fs = path.getFileSystem(conf);
    long fileLen = fs.getFileStatus(path).getLen();
    FSDataInputStream data = fs.open(path);
    // 4-byte offset + "PAR1"
    data.seek(fileLen - 8);
    long footerLen = BytesUtils.readIntLittleEndian(data);
    long startFooter = fileLen - footerLen - 8;
    assertEquals("Footer should start after second row group without padding", secondRowGroupEnds, startFooter);
    ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path);
    assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
    assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
    assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
    assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
    HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
    expectedEncoding.add(PLAIN);
    expectedEncoding.add(BIT_PACKED);
    assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());
    // verify block starting positions with padding
    assertEquals("First row group should start after magic", 4, readFooter.getBlocks().get(0).getStartingPos());
    assertTrue("First row group should end before the block size (120)", firstRowGroupEnds < 120);
    assertEquals("Second row group should start at the block size", 120, readFooter.getBlocks().get(1).getStartingPos());
    {
        // read first block of col #1
        try (ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path, Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(SCHEMA.getColumnDescription(PATH1)))) {
            PageReadStore pages = r.readNextRowGroup();
            assertEquals(3, pages.getRowCount());
            validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
            validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
            assertNull(r.readNextRowGroup());
        }
    }
    {
        try (ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path, readFooter.getBlocks(), Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)))) {
            PageReadStore pages = r.readNextRowGroup();
            assertEquals(3, pages.getRowCount());
            validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
            validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
            validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
            validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
            validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));
            pages = r.readNextRowGroup();
            assertEquals(4, pages.getRowCount());
            validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
            validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));
            assertNull(r.readNextRowGroup());
        }
    }
    PrintFooter.main(new String[] { path.toString() });
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Encoding(org.apache.parquet.column.Encoding) PageReadStore(org.apache.parquet.column.page.PageReadStore) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) File(java.io.File) Test(org.junit.Test)
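
The offsets asserted above follow from a simple padding rule. The test constructor sets the row group alignment size to 120 bytes and the maximum padding to 60 bytes; the first row group ends around offset 109, so the 11-byte gap to the next 120-byte boundary fits inside the padding budget, the writer pads, and the second row group starts exactly at 120. The snippet below is illustrative arithmetic only, not the actual parquet-mr alignment code, which may differ in detail.

// Illustrative only: the padding decision implied by the assertions in testAlignmentWithPadding.
long rowGroupSize = 120;   // alignment size passed to the test constructor
long maxPadding = 60;      // maximum padding passed to the test constructor
long pos = 109;            // firstRowGroupEnds, per the "should be 109" comment above

long nextBoundary = ((pos + rowGroupSize - 1) / rowGroupSize) * rowGroupSize; // 120
long gap = nextBoundary - pos;                                                // 11
if (gap <= maxPadding) {
    // pad 11 bytes so the next row group starts on the 120-byte boundary (the asserted 120)
} else {
    // gap too large to pad: the next row group would start right where the previous one ended
}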

Example 30 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From the class CompressionConverter, method processBlocks.

public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, MessageType schema, String createdBy, CompressionCodecName codecName) throws IOException {
    int blockIndex = 0;
    PageReadStore store = reader.readNextRowGroup();
    while (store != null) {
        writer.startBlock(store.getRowCount());
        BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
        List<ColumnChunkMetaData> columnsInOrder = blockMetaData.getColumns();
        Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
        for (int i = 0; i < columnsInOrder.size(); i += 1) {
            ColumnChunkMetaData chunk = columnsInOrder.get(i);
            ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
            ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
            writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
            processChunk(reader, writer, chunk, createdBy, codecName);
            writer.endColumn();
        }
        writer.endBlock();
        store = reader.readNextRowGroup();
        blockIndex++;
    }
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) ColumnReadStoreImpl(org.apache.parquet.column.impl.ColumnReadStoreImpl) Converter(org.apache.parquet.io.api.Converter) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) LoggerFactory(org.slf4j.LoggerFactory) DataPageHeaderV2(org.apache.parquet.format.DataPageHeaderV2) DictionaryPage(org.apache.parquet.column.page.DictionaryPage) ParquetMetadataConverter(org.apache.parquet.format.converter.ParquetMetadataConverter) Util(org.apache.parquet.format.Util) OffsetIndex(org.apache.parquet.internal.column.columnindex.OffsetIndex) BytesInput(org.apache.parquet.bytes.BytesInput) CompressionCodecFactory(org.apache.parquet.compression.CompressionCodecFactory) Map(java.util.Map) InputFile(org.apache.parquet.io.InputFile) PrimitiveConverter(org.apache.parquet.io.api.PrimitiveConverter) SeekableInputStream(org.apache.parquet.io.SeekableInputStream) PageReadStore(org.apache.parquet.column.page.PageReadStore) ParquetProperties(org.apache.parquet.column.ParquetProperties) Statistics(org.apache.parquet.column.statistics.Statistics) Logger(org.slf4j.Logger) ParquetFileWriter(org.apache.parquet.hadoop.ParquetFileWriter) GroupConverter(org.apache.parquet.io.api.GroupConverter) DictionaryPageHeader(org.apache.parquet.format.DictionaryPageHeader) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) DataPageHeader(org.apache.parquet.format.DataPageHeader) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) ParquetReadOptions(org.apache.parquet.ParquetReadOptions) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) PageHeader(org.apache.parquet.format.PageHeader) ParquetEncodingException(org.apache.parquet.io.ParquetEncodingException) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata)
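
One structural note on processBlocks: descriptorsMap depends only on the schema, and a single ColumnReadStoreImpl per row group would suffice, yet both are rebuilt for every column chunk. Below is a hedged sketch of the same traversal with those pieces hoisted, assuming (as the code suggests) they are read-only and safe to reuse across the columns of a row group; the behaviour is intended to be identical.

// Sketch of the same loop with the loop-invariant pieces hoisted; identifiers are the ones used
// in processBlocks above.
Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream()
    .collect(Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x)); // depends only on the schema
int blockIndex = 0;
PageReadStore store = reader.readNextRowGroup();
while (store != null) {
    writer.startBlock(store.getRowCount());
    // one ColumnReadStoreImpl per row group; getColumnReader(...) is still called per column
    ColumnReadStoreImpl crStore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
    for (ColumnChunkMetaData chunk : meta.getBlocks().get(blockIndex).getColumns()) {
        ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
        writer.startColumn(columnDescriptor, crStore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
        processChunk(reader, writer, chunk, createdBy, codecName);
        writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
}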

Aggregations

PageReadStore (org.apache.parquet.column.page.PageReadStore): 31 usages
Configuration (org.apache.hadoop.conf.Configuration): 22 usages
Path (org.apache.hadoop.fs.Path): 22 usages
IOException (java.io.IOException): 14 usages
MessageType (org.apache.parquet.schema.MessageType): 14 usages
Test (org.junit.Test): 13 usages
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 12 usages
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 10 usages
MessageColumnIO (org.apache.parquet.io.MessageColumnIO): 8 usages
SimpleGroup (org.apache.parquet.example.data.simple.SimpleGroup): 7 usages
GroupRecordConverter (org.apache.parquet.example.data.simple.convert.GroupRecordConverter): 7 usages
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 7 usages
ColumnIOFactory (org.apache.parquet.io.ColumnIOFactory): 7 usages
RecordReader (org.apache.parquet.io.RecordReader): 7 usages
DataPageV1 (org.apache.parquet.column.page.DataPageV1): 6 usages
Encoding (org.apache.parquet.column.Encoding): 5 usages
HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile): 5 usages
File (java.io.File): 4 usages
List (java.util.List): 4 usages
Vector (org.apache.ignite.ml.math.primitives.vector.Vector): 4 usages