Example 1 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From class TestParquetFileWriter, method testAlignmentWithNoPaddingNeeded:

@Test
public void testAlignmentWithNoPaddingNeeded() throws Exception {
    File testFile = temp.newFile();
    Path path = new Path(testFile.toURI());
    Configuration conf = new Configuration();
    // uses the test constructor
    ParquetFileWriter w = new ParquetFileWriter(conf, SCHEMA, path, 100, 50);
    w.start();
    w.startBlock(3);
    w.startColumn(C1, 5, CODEC);
    long c1Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c1Ends = w.getPos();
    w.startColumn(C2, 6, CODEC);
    long c2Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c2Ends = w.getPos();
    w.endBlock();
    // should be 109
    long firstRowGroupEnds = w.getPos();
    w.startBlock(4);
    w.startColumn(C1, 7, CODEC);
    w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(C2, 8, CODEC);
    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    long secondRowGroupEnds = w.getPos();
    w.end(new HashMap<String, String>());
    FileSystem fs = path.getFileSystem(conf);
    long fileLen = fs.getFileStatus(path).getLen();
    FSDataInputStream data = fs.open(path);
    // the file ends with a 4-byte little-endian footer length followed by the 4-byte magic "PAR1"
    data.seek(fileLen - 8);
    long footerLen = BytesUtils.readIntLittleEndian(data);
    long startFooter = fileLen - footerLen - 8;
    assertEquals("Footer should start after second row group without padding", secondRowGroupEnds, startFooter);
    ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path);
    assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
    assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
    assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
    assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
    HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
    expectedEncoding.add(PLAIN);
    expectedEncoding.add(BIT_PACKED);
    assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());
    // verify block starting positions without padding
    assertEquals("First row group should start after magic", 4, readFooter.getBlocks().get(0).getStartingPos());
    assertTrue("First row group should end past the block size (100), so no padding is used", firstRowGroupEnds > 100);
    assertEquals("Second row group should start after no padding", 109, readFooter.getBlocks().get(1).getStartingPos());
    {
        // read first block of col #1
        ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path, Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(SCHEMA.getColumnDescription(PATH1)));
        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        assertNull(r.readNextRowGroup());
    }
    {
        // read all blocks of col #1 and #2
        ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path, readFooter.getBlocks(), Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)));
        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));
        pages = r.readNextRowGroup();
        assertEquals(4, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
        validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));
        assertNull(r.readNextRowGroup());
    }
    PrintFooter.main(new String[] { path.toString() });
}
Also used: Path(org.apache.hadoop.fs.Path), Configuration(org.apache.hadoop.conf.Configuration), Encoding(org.apache.parquet.column.Encoding), PageReadStore(org.apache.parquet.column.page.PageReadStore), FileSystem(org.apache.hadoop.fs.FileSystem), FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream), File(java.io.File), Test(org.junit.Test)
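
The fileLen - footerLen - 8 arithmetic this test relies on follows directly from the Parquet file layout: a file ends with a 4-byte little-endian footer length followed by the 4-byte magic "PAR1". A minimal, dependency-free sketch of locating the footer the same way (the file name is a placeholder, not part of the test above):

import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;

public class FooterLocator {
    public static void main(String[] args) throws Exception {
        try (RandomAccessFile f = new RandomAccessFile("example.parquet", "r")) { // placeholder path
            long fileLen = f.length();
            // the last 8 bytes are: 4-byte little-endian footer length, then "PAR1"
            f.seek(fileLen - 8);
            int b0 = f.read(), b1 = f.read(), b2 = f.read(), b3 = f.read();
            long footerLen = (b0 & 0xff) | ((b1 & 0xff) << 8) | ((b2 & 0xff) << 16) | ((long) (b3 & 0xff) << 24);
            byte[] magic = new byte[4];
            f.readFully(magic);
            System.out.println("magic: " + new String(magic, StandardCharsets.US_ASCII)); // expect PAR1
            System.out.println("footer starts at byte " + (fileLen - footerLen - 8));
        }
    }
}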

Example 2 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From class ShowPagesCommand, method run:

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
    String source = targets.get(0);
    ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source));
    MessageType schema = reader.getFileMetaData().getSchema();
    Map<ColumnDescriptor, PrimitiveType> columns = Maps.newLinkedHashMap();
    if (this.columns == null || this.columns.isEmpty()) {
        for (ColumnDescriptor descriptor : schema.getColumns()) {
            columns.put(descriptor, primitive(schema, descriptor.getPath()));
        }
    } else {
        for (String column : this.columns) {
            columns.put(descriptor(column, schema), primitive(column, schema));
        }
    }
    CompressionCodecName codec = reader.getRowGroups().get(0).getColumns().get(0).getCodec();
    // accumulate formatted lines to print by column
    Map<String, List<String>> formatted = Maps.newLinkedHashMap();
    PageFormatter formatter = new PageFormatter();
    PageReadStore pageStore;
    int rowGroupNum = 0;
    while ((pageStore = reader.readNextRowGroup()) != null) {
        for (ColumnDescriptor descriptor : columns.keySet()) {
            List<String> lines = formatted.get(columnName(descriptor));
            if (lines == null) {
                lines = Lists.newArrayList();
                formatted.put(columnName(descriptor), lines);
            }
            formatter.setContext(rowGroupNum, columns.get(descriptor), codec);
            PageReader pages = pageStore.getPageReader(descriptor);
            DictionaryPage dict = pages.readDictionaryPage();
            if (dict != null) {
                lines.add(formatter.format(dict));
            }
            DataPage page;
            while ((page = pages.readPage()) != null) {
                lines.add(formatter.format(page));
            }
        }
        rowGroupNum += 1;
    }
    // TODO: Show total column size and overall size per value in the column summary line
    for (String columnName : formatted.keySet()) {
        console.info(String.format("\nColumn: %s\n%s", columnName, StringUtils.leftPad("", 80, '-')));
        console.info(formatter.getHeader());
        for (String line : formatted.get(columnName)) {
            console.info(line);
        }
        console.info("");
    }
    return 0;
}
Also used: DataPage(org.apache.parquet.column.page.DataPage), ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader), ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor), PageReader(org.apache.parquet.column.page.PageReader), Util.minMaxAsString(org.apache.parquet.cli.Util.minMaxAsString), Util.encodingAsString(org.apache.parquet.cli.Util.encodingAsString), CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName), PageReadStore(org.apache.parquet.column.page.PageReadStore), PrimitiveType(org.apache.parquet.schema.PrimitiveType), List(java.util.List), MessageType(org.apache.parquet.schema.MessageType), DictionaryPage(org.apache.parquet.column.page.DictionaryPage)
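
Stripped of the CLI scaffolding and formatting, the core loop of ShowPagesCommand reduces to a few calls: open a ParquetFileReader, pull row groups with readNextRowGroup(), and walk each column's pages through its PageReader. A minimal sketch of that pattern, assuming the input path is supplied on the command line and is not tied to any example above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.schema.MessageType;

public class ListPages {
    public static void main(String[] args) throws Exception {
        Path path = new Path(args[0]);
        ParquetFileReader reader = ParquetFileReader.open(new Configuration(), path);
        try {
            MessageType schema = reader.getFileMetaData().getSchema();
            PageReadStore rowGroup;
            int rowGroupNum = 0;
            while ((rowGroup = reader.readNextRowGroup()) != null) {
                System.out.println("row group " + rowGroupNum + ": " + rowGroup.getRowCount() + " rows");
                for (ColumnDescriptor descriptor : schema.getColumns()) {
                    PageReader pages = rowGroup.getPageReader(descriptor);
                    // a column chunk has at most one dictionary page, read before the data pages
                    DictionaryPage dict = pages.readDictionaryPage();
                    if (dict != null) {
                        System.out.println("  dictionary page: " + dict.getDictionarySize() + " entries");
                    }
                    DataPage page;
                    while ((page = pages.readPage()) != null) {
                        System.out.println("  data page: " + page.getValueCount() + " values");
                    }
                }
                rowGroupNum += 1;
            }
        } finally {
            reader.close();
        }
    }
}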

Example 3 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From class TestParquetFileWriter, method testWriteRead:

@Test
public void testWriteRead() throws Exception {
    File testFile = temp.newFile();
    testFile.delete();
    Path path = new Path(testFile.toURI());
    Configuration configuration = new Configuration();
    ParquetFileWriter w = new ParquetFileWriter(configuration, SCHEMA, path);
    w.start();
    w.startBlock(3);
    w.startColumn(C1, 5, CODEC);
    long c1Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c1Ends = w.getPos();
    w.startColumn(C2, 6, CODEC);
    long c2Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c2Ends = w.getPos();
    w.endBlock();
    w.startBlock(4);
    w.startColumn(C1, 7, CODEC);
    w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(C2, 8, CODEC);
    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.end(new HashMap<String, String>());
    ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
    assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
    assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
    assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
    assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
    HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
    expectedEncoding.add(PLAIN);
    expectedEncoding.add(BIT_PACKED);
    assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());
    {
        // read first block of col #1
        ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path, Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(SCHEMA.getColumnDescription(PATH1)));
        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        assertNull(r.readNextRowGroup());
    }
    {
        // read all blocks of col #1 and #2
        ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path, readFooter.getBlocks(), Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)));
        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));
        pages = r.readNextRowGroup();
        assertEquals(4, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
        validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));
        assertNull(r.readNextRowGroup());
    }
    PrintFooter.main(new String[] { path.toString() });
}
Also used: Path(org.apache.hadoop.fs.Path), Configuration(org.apache.hadoop.conf.Configuration), Encoding(org.apache.parquet.column.Encoding), PageReadStore(org.apache.parquet.column.page.PageReadStore), File(java.io.File), Test(org.junit.Test)
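
One portability note: later parquet-mr releases deprecate the static ParquetFileReader.readFooter methods used in these tests; as far as I recall, the replacement is to open the reader from an InputFile and ask it for the footer. A minimal sketch under that assumption (the path is a placeholder):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class FooterViaInputFile {
    public static void main(String[] args) throws Exception {
        Path path = new Path("example.parquet"); // placeholder path
        Configuration conf = new Configuration();
        // try-with-resources works because ParquetFileReader is Closeable
        try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
            ParquetMetadata footer = reader.getFooter();
            System.out.println("row groups: " + footer.getBlocks().size());
        }
    }
}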

Example 4 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From class TestColumnChunkPageWriteStore, method test:

@Test
public void test() throws Exception {
    Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
    Path root = file.getParent();
    FileSystem fs = file.getFileSystem(conf);
    if (fs.exists(root)) {
        fs.delete(root, true);
    }
    fs.mkdirs(root);
    MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
    ColumnDescriptor col = schema.getColumns().get(0);
    Encoding dataEncoding = PLAIN;
    int valueCount = 10;
    int d = 1;
    int r = 2;
    int v = 3;
    BytesInput definitionLevels = BytesInput.fromInt(d);
    BytesInput repetitionLevels = BytesInput.fromInt(r);
    Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
    BytesInput data = BytesInput.fromInt(v);
    int rowCount = 5;
    int nullCount = 1;
    {
        ParquetFileWriter writer = new ParquetFileWriter(conf, schema, file);
        writer.start();
        writer.startBlock(rowCount);
        {
            ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator());
            PageWriter pageWriter = store.getPageWriter(col);
            pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics);
            store.flushToFileWriter(writer);
        }
        writer.endBlock();
        writer.end(new HashMap<String, String>());
    }
    {
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
        ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
        PageReadStore rowGroup = reader.readNextRowGroup();
        PageReader pageReader = rowGroup.getPageReader(col);
        DataPageV2 page = (DataPageV2) pageReader.readPage();
        assertEquals(rowCount, page.getRowCount());
        assertEquals(nullCount, page.getNullCount());
        assertEquals(valueCount, page.getValueCount());
        assertEquals(d, intValue(page.getDefinitionLevels()));
        assertEquals(r, intValue(page.getRepetitionLevels()));
        assertEquals(dataEncoding, page.getDataEncoding());
        assertEquals(v, intValue(page.getData()));
        assertEquals(statistics.toString(), page.getStatistics().toString());
        reader.close();
    }
}
Also used: Path(org.apache.hadoop.fs.Path), BytesInput(org.apache.parquet.bytes.BytesInput), HashMap(java.util.HashMap), ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata), ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor), PageReader(org.apache.parquet.column.page.PageReader), Encoding(org.apache.parquet.column.Encoding), DataPageV2(org.apache.parquet.column.page.DataPageV2), HeapByteBufferAllocator(org.apache.parquet.bytes.HeapByteBufferAllocator), PageReadStore(org.apache.parquet.column.page.PageReadStore), FileSystem(org.apache.hadoop.fs.FileSystem), MessageType(org.apache.parquet.schema.MessageType), PageWriter(org.apache.parquet.column.page.PageWriter), Test(org.junit.Test)
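
The cast to DataPageV2 in this test is safe only because the page was written through writePageV2. Code that may read either page version can dispatch through the DataPage.Visitor callback instead of instanceof checks; a small sketch of that approach:

import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.DataPageV1;
import org.apache.parquet.column.page.DataPageV2;

public class PageDescriber {
    // returns a one-line summary regardless of which page version was read
    static String describe(DataPage page) {
        return page.accept(new DataPage.Visitor<String>() {
            @Override
            public String visit(DataPageV1 v1) {
                return "v1 page: " + v1.getValueCount() + " values, value encoding " + v1.getValueEncoding();
            }
            @Override
            public String visit(DataPageV2 v2) {
                return "v2 page: " + v2.getValueCount() + " values, " + v2.getNullCount() + " nulls, data encoding " + v2.getDataEncoding();
            }
        });
    }
}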

Example 5 with PageReadStore

Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

From class CheckParquet251Command, method check:

private String check(String file) throws IOException {
    Path path = qualifiedPath(file);
    ParquetMetadata footer = ParquetFileReader.readFooter(getConf(), path, ParquetMetadataConverter.NO_FILTER);
    FileMetaData meta = footer.getFileMetaData();
    String createdBy = meta.getCreatedBy();
    if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
        // create fake metadata that will read corrupt stats and return them
        FileMetaData fakeMeta = new FileMetaData(meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);
        // get just the binary columns
        List<ColumnDescriptor> columns = Lists.newArrayList();
        Iterables.addAll(columns, Iterables.filter(meta.getSchema().getColumns(), new Predicate<ColumnDescriptor>() {

            @Override
            public boolean apply(@Nullable ColumnDescriptor input) {
                return input != null && input.getType() == BINARY;
            }
        }));
        // now check to see if the data is actually corrupt
        ParquetFileReader reader = new ParquetFileReader(getConf(), fakeMeta, path, footer.getBlocks(), columns);
        try {
            PageStatsValidator validator = new PageStatsValidator();
            for (PageReadStore pages = reader.readNextRowGroup(); pages != null; pages = reader.readNextRowGroup()) {
                validator.validate(columns, pages);
            }
        } catch (BadStatsException e) {
            return e.getMessage();
        }
    }
    return null;
}
Also used: Path(org.apache.hadoop.fs.Path), ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata), PageReadStore(org.apache.parquet.column.page.PageReadStore), ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor), ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader), FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData), Nullable(javax.annotation.Nullable), Predicate(com.google.common.base.Predicate)
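
PageStatsValidator re-reads the pages to compare their contents against the stored statistics. The statistics themselves are also available directly from the footer's column chunk metadata, which is what readers consult for predicate filtering. A minimal sketch of dumping them, assuming a footer obtained the same way as in check() above:

import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

static void dumpStats(ParquetMetadata footer) {
    for (BlockMetaData block : footer.getBlocks()) {
        for (ColumnChunkMetaData chunk : block.getColumns()) {
            Statistics<?> stats = chunk.getStatistics();
            // statistics may be absent or empty, e.g. when a writer chose not to record them
            if (stats != null && !stats.isEmpty()) {
                System.out.println(chunk.getPath() + ": min=" + stats.genericGetMin() + ", max=" + stats.genericGetMax() + ", nulls=" + stats.getNumNulls());
            }
        }
    }
}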

Aggregations

PageReadStore (org.apache.parquet.column.page.PageReadStore): 10 usages
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 6 usages
Path (org.apache.hadoop.fs.Path): 5 usages
Configuration (org.apache.hadoop.conf.Configuration): 4 usages
Encoding (org.apache.parquet.column.Encoding): 4 usages
Test (org.junit.Test): 4 usages
File (java.io.File): 3 usages
IOException (java.io.IOException): 3 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 3 usages
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 3 usages
MessageType (org.apache.parquet.schema.MessageType): 3 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 2 usages
PageReader (org.apache.parquet.column.page.PageReader): 2 usages
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 2 usages
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 2 usages
Predicate (com.google.common.base.Predicate): 1 usage
HashMap (java.util.HashMap): 1 usage
List (java.util.List): 1 usage
Nullable (javax.annotation.Nullable): 1 usage
BytesInput (org.apache.parquet.bytes.BytesInput): 1 usage