
Example 1 with Encoding

Use of org.apache.parquet.column.Encoding in project drill by axbaretto.

From the class PageReader, method next().

/**
 * Grab the next page.
 *
 * @return true if another page was present
 * @throws IOException if an error occurs while reading the page
 */
public boolean next() throws IOException {
    Stopwatch timer = Stopwatch.createUnstarted();
    currentPageCount = -1;
    valuesRead = 0;
    valuesReadyToRead = 0;
    // TODO - the metadata for total size appears to be incorrect for Impala-generated files, need to find cause
    // and submit a bug report
    long totalValueCount = parentColumnReader.columnChunkMetaData.getValueCount();
    if (parentColumnReader.totalValuesRead >= totalValueCount) {
        return false;
    }
    clearBuffers();
    nextInternal();
    if (pageData == null || pageHeader == null) {
        // TODO: Is this an error condition or a normal condition??
        return false;
    }
    timer.start();
    currentPageCount = pageHeader.data_page_header.num_values;
    final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);
    final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);
    final Encoding valueEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding);
    byteLength = pageHeader.uncompressed_page_size;
    final ByteBuffer pageDataBuffer = pageData.nioBuffer(0, pageData.capacity());
    readPosInBytes = 0;
    if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
        repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
        repetitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
        // The first repetition level is always 0, and each subsequent 0 marks the start of a new record. Because this
        // is a one-way stream of integers we cannot know a record's length until we hit the next 0, so we consume that
        // leading zero here to simplify the reading process and let the first value be read the same way as all of the
        // rest. Effectively we 'read' the non-existent value in front of the first record, which gives direct access
        // to the first list of repetition levels.
        readPosInBytes = repetitionLevels.getNextOffset();
        repetitionLevels.readInteger();
    }
    if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0) {
        parentColumnReader.currDefLevel = -1;
        definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
        definitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
        readPosInBytes = definitionLevels.getNextOffset();
        if (!valueEncoding.usesDictionary()) {
            valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
            valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
        }
    }
    if (parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
        valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
        valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
    }
    if (valueEncoding.usesDictionary()) {
        // initialize two dictionary readers: one determines the length of each value, the other
        // actually copies the values out into the vectors
        dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary);
        dictionaryLengthDeterminingReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
        dictionaryValueReader = new DictionaryValuesReader(dictionary);
        dictionaryValueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
        parentColumnReader.usingDictionary = true;
    } else {
        parentColumnReader.usingDictionary = false;
    }
    // readPosInBytes is used for actually reading the values after we determine how many will fit in the vector.
    // readyToReadPosInBytes serves a similar purpose for vector types where we must count up the values that will
    // fit one record at a time, such as variable-length data. Both operations must start in the same location, after
    // the definition and repetition level data that is stored alongside the page data itself.
    readyToReadPosInBytes = readPosInBytes;
    long timeDecode = timer.elapsed(TimeUnit.NANOSECONDS);
    stats.numDataPagesDecoded.incrementAndGet();
    stats.timeDataPageDecode.addAndGet(timeDecode);
    return true;
}
Also used: Stopwatch (com.google.common.base.Stopwatch), Encoding (org.apache.parquet.column.Encoding), DictionaryValuesReader (org.apache.parquet.column.values.dictionary.DictionaryValuesReader), ByteBuffer (java.nio.ByteBuffer)
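
For reference, the core pattern in the method above is that each Encoding hands back a ValuesReader for a given column and value type, which is then positioned inside the raw page buffer. A minimal, self-contained sketch of that pattern follows. It is an illustration rather than Drill's code: the class and variable names are invented for the example, and it assumes the pre-1.10 ValuesReader.initFromPage(int, ByteBuffer, int) signature that the method above also uses (newer parquet-column releases take a ByteBufferInputStream instead).

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.ValuesType;
import org.apache.parquet.column.values.ValuesReader;

public class LevelDecodingSketch {

    /**
     * Decode the definition levels stored at the front of an uncompressed data page.
     * Hypothetical helper for illustration only.
     */
    public static int[] readDefinitionLevels(Encoding levelEncoding, ColumnDescriptor descriptor,
            ByteBuffer pageBuffer, int valueCount) throws IOException {
        // the Encoding knows which concrete reader implementation decodes this value type
        ValuesReader levels = levelEncoding.getValuesReader(descriptor, ValuesType.DEFINITION_LEVEL);
        // level data precedes the value data, so the reader starts at offset 0 of the page
        levels.initFromPage(valueCount, pageBuffer, 0);
        int[] out = new int[valueCount];
        for (int i = 0; i < valueCount; i++) {
            out[i] = levels.readInteger();
        }
        return out;
    }
}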

Example 2 with Encoding

Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.

From the class DictionaryPageReader, method hasDictionaryPage().

private boolean hasDictionaryPage(ColumnChunkMetaData column) {
    EncodingStats stats = column.getEncodingStats();
    if (stats != null) {
        // ensure there is a dictionary page and that it is used to encode data pages
        return stats.hasDictionaryPages() && stats.hasDictionaryEncodedPages();
    }
    Set<Encoding> encodings = column.getEncodings();
    return (encodings.contains(PLAIN_DICTIONARY) || encodings.contains(RLE_DICTIONARY));
}
Also used: EncodingStats (org.apache.parquet.column.EncodingStats), Encoding (org.apache.parquet.column.Encoding)
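
The same two-path check can be applied to every column chunk in a footer to decide which chunks are worth fetching a dictionary page for. Below is a minimal sketch built only from the metadata accessors already shown on this page; the class and method names are placeholders, not parquet-mr API.

import java.util.Set;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class DictionaryUsageSketch {

    /** Mirrors hasDictionaryPage above: prefer EncodingStats, fall back to the legacy encoding set. */
    static boolean mayHaveDictionaryPage(ColumnChunkMetaData column) {
        EncodingStats stats = column.getEncodingStats();
        if (stats != null) {
            // stats may be absent for files written by older writers, hence the null check above
            return stats.hasDictionaryPages() && stats.hasDictionaryEncodedPages();
        }
        // older files only expose the flat set of encodings used anywhere in the chunk
        Set<Encoding> encodings = column.getEncodings();
        return encodings.contains(Encoding.PLAIN_DICTIONARY) || encodings.contains(Encoding.RLE_DICTIONARY);
    }

    /** Prints, per column chunk, whether a dictionary page should be expected. */
    static void report(ParquetMetadata footer) {
        for (BlockMetaData block : footer.getBlocks()) {
            for (ColumnChunkMetaData column : block.getColumns()) {
                System.out.println(column.getPath().toDotString() + " -> " + mayHaveDictionaryPage(column));
            }
        }
    }
}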

Example 3 with Encoding

Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.

From the class TestParquetWriterNewPage, method test().

@Test
public void test() throws Exception {
    Configuration conf = new Configuration();
    Path root = new Path("target/tests/TestParquetWriter/");
    FileSystem fs = root.getFileSystem(conf);
    if (fs.exists(root)) {
        fs.delete(root, true);
    }
    fs.mkdirs(root);
    MessageType schema = parseMessageType("message test { " + "required binary binary_field; " + "required int32 int32_field; " + "required int64 int64_field; " + "required boolean boolean_field; " + "required float float_field; " + "required double double_field; " + "required fixed_len_byte_array(3) flba_field; " + "required int96 int96_field; " + "optional binary null_field; " + "} ");
    GroupWriteSupport.setSchema(schema, conf);
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    Map<String, Encoding> expected = new HashMap<String, Encoding>();
    expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
    expected.put("1000-" + PARQUET_1_0, PLAIN);
    expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
    expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
    for (int modulo : asList(10, 1000)) {
        for (WriterVersion version : WriterVersion.values()) {
            Path file = new Path(root, version.name() + "_" + modulo);
            ParquetWriter<Group> writer = new ParquetWriter<Group>(file, new GroupWriteSupport(), UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
            for (int i = 0; i < 1000; i++) {
                writer.write(f.newGroup().append("binary_field", "test" + (i % modulo)).append("int32_field", 32).append("int64_field", 64l).append("boolean_field", true).append("float_field", 1.0f).append("double_field", 2.0d).append("flba_field", "foo").append("int96_field", Binary.fromConstantByteArray(new byte[12])));
            }
            writer.close();
            ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
            for (int i = 0; i < 1000; i++) {
                Group group = reader.read();
                assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
                assertEquals(32, group.getInteger("int32_field", 0));
                assertEquals(64l, group.getLong("int64_field", 0));
                assertEquals(true, group.getBoolean("boolean_field", 0));
                assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
                assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
                assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
                assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96_field", 0));
                assertEquals(0, group.getFieldRepetitionCount("null_field"));
            }
            reader.close();
            ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
            for (BlockMetaData blockMetaData : footer.getBlocks()) {
                for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
                    if (column.getPath().toDotString().equals("binary_field")) {
                        String key = modulo + "-" + version;
                        Encoding expectedEncoding = expected.get(key);
                        assertTrue(key + ":" + column.getEncodings() + " should contain " + expectedEncoding, column.getEncodings().contains(expectedEncoding));
                    }
                }
            }
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Group (org.apache.parquet.example.data.Group), BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), GroupReadSupport (org.apache.parquet.hadoop.example.GroupReadSupport), Configuration (org.apache.hadoop.conf.Configuration), HashMap (java.util.HashMap), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), SimpleGroupFactory (org.apache.parquet.example.data.simple.SimpleGroupFactory), Encoding (org.apache.parquet.column.Encoding), WriterVersion (org.apache.parquet.column.ParquetProperties.WriterVersion), GroupWriteSupport (org.apache.parquet.hadoop.example.GroupWriteSupport), FileSystem (org.apache.hadoop.fs.FileSystem), MessageTypeParser.parseMessageType (org.apache.parquet.schema.MessageTypeParser.parseMessageType), MessageType (org.apache.parquet.schema.MessageType), Test (org.junit.Test)
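
The nine-value ParquetWriter constructor call in the loop above is easy to misread, so the sketch below restates it with each positional argument named. The values mirror the test, and the parameter order follows the deprecated ParquetWriter(Path, WriteSupport, CompressionCodecName, blockSize, pageSize, dictionaryPageSize, enableDictionary, validating, WriterVersion, Configuration) constructor; this is worth double-checking against the parquet-mr version in use, and the class name is a placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ParquetProperties.WriterVersion;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import static org.apache.parquet.hadoop.metadata.CompressionCodecName.UNCOMPRESSED;

public class WriterArgsSketch {

    /** Restates the writer construction from the test; assumes the schema was already set on conf via GroupWriteSupport.setSchema. */
    static ParquetWriter<Group> openWriter(Path file, WriterVersion version, Configuration conf) throws java.io.IOException {
        int blockSize = 1024;            // target row group size in bytes (tiny, to force several pages)
        int pageSize = 1024;             // target data page size in bytes
        int dictionaryPageSize = 512;    // maximum dictionary size before the writer falls back to non-dictionary encoding
        boolean enableDictionary = true; // allow PLAIN_DICTIONARY / RLE_DICTIONARY
        boolean validating = false;      // do not validate written records against the schema
        return new ParquetWriter<Group>(file, new GroupWriteSupport(), UNCOMPRESSED,
                blockSize, pageSize, dictionaryPageSize, enableDictionary, validating, version, conf);
    }
}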

Example 4 with Encoding

Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.

From the class TestParquetFileWriter, method testAlignmentWithNoPaddingNeeded().

@Test
public void testAlignmentWithNoPaddingNeeded() throws Exception {
    File testFile = temp.newFile();
    Path path = new Path(testFile.toURI());
    Configuration conf = new Configuration();
    // uses the test constructor
    ParquetFileWriter w = new ParquetFileWriter(conf, SCHEMA, path, 100, 50);
    w.start();
    w.startBlock(3);
    w.startColumn(C1, 5, CODEC);
    long c1Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c1Ends = w.getPos();
    w.startColumn(C2, 6, CODEC);
    long c2Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c2Ends = w.getPos();
    w.endBlock();
    // should be 109
    long firstRowGroupEnds = w.getPos();
    w.startBlock(4);
    w.startColumn(C1, 7, CODEC);
    w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(C2, 8, CODEC);
    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    long secondRowGroupEnds = w.getPos();
    w.end(new HashMap<String, String>());
    FileSystem fs = path.getFileSystem(conf);
    long fileLen = fs.getFileStatus(path).getLen();
    FSDataInputStream data = fs.open(path);
    // the last 8 bytes of the file are the 4-byte footer length followed by the 4-byte "PAR1" magic
    data.seek(fileLen - 8);
    long footerLen = BytesUtils.readIntLittleEndian(data);
    long startFooter = fileLen - footerLen - 8;
    assertEquals("Footer should start after second row group without padding", secondRowGroupEnds, startFooter);
    ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path);
    assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
    assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
    assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
    assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
    HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
    expectedEncoding.add(PLAIN);
    expectedEncoding.add(BIT_PACKED);
    assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());
    // verify block starting positions with padding
    assertEquals("First row group should start after magic", 4, readFooter.getBlocks().get(0).getStartingPos());
    assertTrue("First row group should end before the block size (120)", firstRowGroupEnds > 100);
    assertEquals("Second row group should start after no padding", 109, readFooter.getBlocks().get(1).getStartingPos());
    {
        // read first block of col #1
        ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path, Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(SCHEMA.getColumnDescription(PATH1)));
        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        assertNull(r.readNextRowGroup());
    }
    {
        // read all blocks of col #1 and #2
        ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path, readFooter.getBlocks(), Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)));
        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));
        pages = r.readNextRowGroup();
        assertEquals(4, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
        validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));
        assertNull(r.readNextRowGroup());
    }
    PrintFooter.main(new String[] { path.toString() });
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), Encoding (org.apache.parquet.column.Encoding), PageReadStore (org.apache.parquet.column.page.PageReadStore), FileSystem (org.apache.hadoop.fs.FileSystem), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), File (java.io.File), Test (org.junit.Test)
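
The footer arithmetic in this test relies on the Parquet file layout: a file ends with the serialized footer, a 4-byte little-endian footer length, and the 4-byte "PAR1" magic. The sketch below isolates that calculation using the same Hadoop and parquet-mr calls the test uses; the class name is a placeholder and a well-formed file is assumed.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.bytes.BytesUtils;

public class FooterLocatorSketch {

    /** Returns the byte offset at which the serialized footer starts. */
    static long footerStart(Configuration conf, Path path) throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        long fileLen = fs.getFileStatus(path).getLen();
        try (FSDataInputStream in = fs.open(path)) {
            // the last 8 bytes hold the 4-byte little-endian footer length followed by the "PAR1" magic
            in.seek(fileLen - 8);
            long footerLen = BytesUtils.readIntLittleEndian(in);
            return fileLen - footerLen - 8;
        }
    }
}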

Example 5 with Encoding

Use of org.apache.parquet.column.Encoding in project parquet-mr by apache.

From the class TestParquetFileWriter, method testWriteRead().

@Test
public void testWriteRead() throws Exception {
    File testFile = temp.newFile();
    testFile.delete();
    Path path = new Path(testFile.toURI());
    Configuration configuration = new Configuration();
    ParquetFileWriter w = new ParquetFileWriter(configuration, SCHEMA, path);
    w.start();
    w.startBlock(3);
    w.startColumn(C1, 5, CODEC);
    long c1Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES1), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c1Ends = w.getPos();
    w.startColumn(C2, 6, CODEC);
    long c2Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(1, 4, BytesInput.from(BYTES2), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c2Ends = w.getPos();
    w.endBlock();
    w.startBlock(4);
    w.startColumn(C1, 7, CODEC);
    w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(C2, 8, CODEC);
    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.end(new HashMap<String, String>());
    ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
    assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
    assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
    assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
    assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
    HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
    expectedEncoding.add(PLAIN);
    expectedEncoding.add(BIT_PACKED);
    assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());
    {
        // read first block of col #1
        ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path, Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(SCHEMA.getColumnDescription(PATH1)));
        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        assertNull(r.readNextRowGroup());
    }
    {
        // read all blocks of col #1 and #2
        ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path, readFooter.getBlocks(), Arrays.asList(SCHEMA.getColumnDescription(PATH1), SCHEMA.getColumnDescription(PATH2)));
        PageReadStore pages = r.readNextRowGroup();
        assertEquals(3, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 2, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH1, 3, BytesInput.from(BYTES1));
        validateContains(SCHEMA, pages, PATH2, 2, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 3, BytesInput.from(BYTES2));
        validateContains(SCHEMA, pages, PATH2, 1, BytesInput.from(BYTES2));
        pages = r.readNextRowGroup();
        assertEquals(4, pages.getRowCount());
        validateContains(SCHEMA, pages, PATH1, 7, BytesInput.from(BYTES3));
        validateContains(SCHEMA, pages, PATH2, 8, BytesInput.from(BYTES4));
        assertNull(r.readNextRowGroup());
    }
    PrintFooter.main(new String[] { path.toString() });
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), Encoding (org.apache.parquet.column.Encoding), PageReadStore (org.apache.parquet.column.page.PageReadStore), File (java.io.File), Test (org.junit.Test)

Aggregations

Encoding (org.apache.parquet.column.Encoding): 21 usages
Path (org.apache.hadoop.fs.Path): 6 usages
Test (org.junit.Test): 6 usages
Configuration (org.apache.hadoop.conf.Configuration): 5 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 4 usages
EncodingStats (org.apache.parquet.column.EncodingStats): 4 usages
PageReadStore (org.apache.parquet.column.page.PageReadStore): 4 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 4 usages
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 4 usages
File (java.io.File): 3 usages
IOException (java.io.IOException): 3 usages
HashMap (java.util.HashMap): 3 usages
DrillRuntimeException (org.apache.drill.common.exceptions.DrillRuntimeException): 3 usages
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 3 usages
MessageType (org.apache.parquet.schema.MessageType): 3 usages
ByteBuffer (java.nio.ByteBuffer): 2 usages
HashSet (java.util.HashSet): 2 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 2 usages
BytesInput (org.apache.parquet.bytes.BytesInput): 2 usages
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 2 usages