Search in sources :

Example 1 with ColumnMetaData

use of org.apache.parquet.format.ColumnMetaData in project parquet-mr by apache.

In class ParquetMetadataConverter, method fromParquetMetadata:

/**
 * Converts the Thrift-level footer metadata into the parquet-mr metadata model.
 *
 * @param parquetMetadata the Thrift {@code FileMetaData} read from the file footer
 * @return the converted {@code ParquetMetadata} (schema, row groups, key/value pairs)
 * @throws IOException if schema or statistics conversion fails
 * @throws ParquetDecodingException if column chunks of one row group live in different files
 */
public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException {
    MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders());
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    List<RowGroup> rowGroups = parquetMetadata.getRow_groups();
    if (rowGroups != null) {
        for (RowGroup rowGroup : rowGroups) {
            BlockMetaData block = new BlockMetaData();
            block.setRowCount(rowGroup.getNum_rows());
            block.setTotalByteSize(rowGroup.getTotal_byte_size());
            List<ColumnChunk> chunks = rowGroup.getColumns();
            // The first chunk's file path is the reference; every chunk must match it.
            String filePath = chunks.get(0).getFile_path();
            for (ColumnChunk chunk : chunks) {
                String chunkPath = chunk.getFile_path();
                boolean sameFile = (filePath == null) ? chunkPath == null : filePath.equals(chunkPath);
                if (!sameFile) {
                    throw new ParquetDecodingException("all column chunks of the same row group must be in the same file for now");
                }
                ColumnMetaData metaData = chunk.meta_data;
                ColumnPath path = getPath(metaData);
                ColumnChunkMetaData column = ColumnChunkMetaData.get(
                        path,
                        messageType.getType(path.toArray()).asPrimitiveType(),
                        fromFormatCodec(metaData.codec),
                        convertEncodingStats(metaData.getEncoding_stats()),
                        fromFormatEncodings(metaData.encodings),
                        fromParquetStatistics(parquetMetadata.getCreated_by(), metaData.statistics, messageType.getType(path.toArray()).asPrimitiveType()),
                        metaData.data_page_offset,
                        metaData.dictionary_page_offset,
                        metaData.num_values,
                        metaData.total_compressed_size,
                        metaData.total_uncompressed_size);
                // TODO: index_page_offset and per-chunk key_value_metadata are not converted yet.
                block.addColumn(column);
            }
            block.setPath(filePath);
            blocks.add(block);
        }
    }
    // Flatten the footer's key/value list into a plain map.
    Map<String, String> keyValueMetaData = new HashMap<String, String>();
    List<KeyValue> keyValues = parquetMetadata.getKey_value_metadata();
    if (keyValues != null) {
        for (KeyValue keyValue : keyValues) {
            keyValueMetaData.put(keyValue.key, keyValue.value);
        }
    }
    return new ParquetMetadata(
            new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, parquetMetadata.getCreated_by()),
            blocks);
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetDecodingException(org.apache.parquet.io.ParquetDecodingException) KeyValue(org.apache.parquet.format.KeyValue) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) RowGroup(org.apache.parquet.format.RowGroup) ArrayList(java.util.ArrayList) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) ColumnChunk(org.apache.parquet.format.ColumnChunk) ColumnMetaData(org.apache.parquet.format.ColumnMetaData) MessageType(org.apache.parquet.schema.MessageType)

Example 2 with ColumnMetaData

use of org.apache.parquet.format.ColumnMetaData in project parquet-mr by apache.

In class ParquetMetadataConverter, method getOffset:

// Visible for testing
/**
 * Returns the starting file offset of a column chunk: the smaller of the data
 * page offset and the dictionary page offset (when a dictionary offset is set).
 */
static long getOffset(ColumnChunk columnChunk) {
    ColumnMetaData md = columnChunk.getMeta_data();
    long dataPageOffset = md.getData_page_offset();
    if (!md.isSetDictionary_page_offset()) {
        return dataPageOffset;
    }
    return Math.min(dataPageOffset, md.getDictionary_page_offset());
}
Also used : ColumnMetaData(org.apache.parquet.format.ColumnMetaData)

Example 3 with ColumnMetaData

use of org.apache.parquet.format.ColumnMetaData in project parquet-mr by apache.

In class TestParquetMetadataConverter, method metadata:

/**
 * Builds a minimal Thrift {@code FileMetaData} fixture with one single-column
 * row group per entry in {@code sizes}; chunks are laid out back to back, each
 * starting where the previous one ended.
 *
 * @param sizes total_compressed_size of each row group's single column chunk
 * @return a FileMetaData with {@code sizes.length} row groups and an empty schema
 */
private FileMetaData metadata(long... sizes) {
    List<SchemaElement> schema = emptyList();
    // Presize: one row group per size entry.
    List<RowGroup> rowGroups = new ArrayList<RowGroup>(sizes.length);
    long offset = 0;
    for (long size : sizes) {
        ColumnChunk columnChunk = new ColumnChunk(offset);
        // Fixed num_values=10; uncompressed size is twice the compressed size.
        // Use uppercase 'L' suffix — lowercase 'l' (original: 10l) is easily misread as '1'.
        columnChunk.setMeta_data(new ColumnMetaData(
                INT32,
                Collections.<org.apache.parquet.format.Encoding>emptyList(),
                Collections.<String>emptyList(),
                UNCOMPRESSED,
                10L,
                size * 2,
                size,
                offset));
        rowGroups.add(new RowGroup(Arrays.asList(columnChunk), size, 1));
        offset += size;
    }
    return new FileMetaData(1, schema, sizes.length, rowGroups);
}
Also used : RowGroup(org.apache.parquet.format.RowGroup) ArrayList(java.util.ArrayList) ColumnChunk(org.apache.parquet.format.ColumnChunk) SchemaElement(org.apache.parquet.format.SchemaElement) ColumnMetaData(org.apache.parquet.format.ColumnMetaData) FileMetaData(org.apache.parquet.format.FileMetaData)

Example 4 with ColumnMetaData

use of org.apache.parquet.format.ColumnMetaData in project presto by prestodb.

In class PrimitiveColumnWriter, method getColumnMetaData:

// Returns a ColumnMetaData whose data page offset is a placeholder (-1);
// the caller is expected to patch in the real offset later.
private ColumnMetaData getColumnMetaData() {
    checkState(getDataStreamsCalled);
    ColumnMetaData metaData = new ColumnMetaData(
            ParquetTypeConverter.getType(columnDescriptor.getPrimitiveType().getPrimitiveTypeName()),
            encodings.stream().map(parquetMetadataConverter::getEncoding).collect(toImmutableList()),
            ImmutableList.copyOf(columnDescriptor.getPath()),
            compressionCodec.getParquetCompressionCodec(),
            totalRows,
            totalUnCompressedSize,
            totalCompressedSize,
            -1);
    metaData.setStatistics(ParquetMetadataConverter.toParquetStatistics(columnStatistics));
    return metaData;
}
Also used : ColumnMetaData(org.apache.parquet.format.ColumnMetaData)

Example 5 with ColumnMetaData

use of org.apache.parquet.format.ColumnMetaData in project presto by prestodb.

In class ParquetWriter, method updateColumnMetadataOffset:

/**
 * Rebases every column's data page offset so the chunks are laid out
 * contiguously starting at {@code offset}. All other fields, including
 * statistics, are carried over unchanged.
 *
 * @param columns source column metadata, in layout order
 * @param offset file offset at which the first column chunk begins
 * @return an immutable list of copies with updated offsets
 */
private List<ColumnMetaData> updateColumnMetadataOffset(List<ColumnMetaData> columns, long offset) {
    ImmutableList.Builder<ColumnMetaData> rebased = ImmutableList.builder();
    long nextOffset = offset;
    for (ColumnMetaData source : columns) {
        ColumnMetaData updated = new ColumnMetaData(
                source.getType(),
                source.getEncodings(),
                source.getPath_in_schema(),
                source.getCodec(),
                source.getNum_values(),
                source.getTotal_uncompressed_size(),
                source.getTotal_compressed_size(),
                nextOffset);
        updated.setStatistics(source.getStatistics());
        rebased.add(updated);
        // The next chunk starts right after this one's compressed bytes.
        nextOffset += source.getTotal_compressed_size();
    }
    return rebased.build();
}
Also used : ImmutableList(com.google.common.collect.ImmutableList) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ColumnMetaData(org.apache.parquet.format.ColumnMetaData)

Aggregations

ColumnMetaData (org.apache.parquet.format.ColumnMetaData)7 ArrayList (java.util.ArrayList)4 ColumnChunk (org.apache.parquet.format.ColumnChunk)4 RowGroup (org.apache.parquet.format.RowGroup)4 ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData)3 HashMap (java.util.HashMap)2 FileMetaData (org.apache.parquet.format.FileMetaData)2 KeyValue (org.apache.parquet.format.KeyValue)2 SchemaElement (org.apache.parquet.format.SchemaElement)2 BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData)2 ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath)2 ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata)2 MessageType (org.apache.parquet.schema.MessageType)2 ParquetCorruptionException (com.facebook.presto.parquet.ParquetCorruptionException)1 ParquetDataSource (com.facebook.presto.parquet.ParquetDataSource)1 ParquetValidationUtils.validateParquet (com.facebook.presto.parquet.ParquetValidationUtils.validateParquet)1 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)1 Slice (io.airlift.slice.Slice)1 Slices.wrappedBuffer (io.airlift.slice.Slices.wrappedBuffer)1