Use of parquet.hadoop.metadata.ColumnChunkMetaData in project presto by prestodb.
The class ParquetMetadataReader, method readFooter.
public static ParquetMetadata readFooter(FileSystem fileSystem, Path file)
        throws IOException
{
    FileStatus fileStatus = fileSystem.getFileStatus(file);
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        // Parquet File Layout:
        //
        // MAGIC
        // variable: Data
        // variable: Metadata
        // 4 bytes: MetadataLength
        // MAGIC
        long length = fileStatus.getLen();
        validateParquet(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);

        // read the 4-byte footer length (little endian) and verify the trailing magic
        long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length;
        inputStream.seek(metadataLengthIndex);
        int metadataLength = readIntLittleEndian(inputStream);
        byte[] magic = new byte[MAGIC.length];
        inputStream.readFully(magic);
        validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));

        // the Thrift-encoded file metadata starts metadataLength bytes before the length field
        long metadataIndex = metadataLengthIndex - metadataLength;
        validateParquet(metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex, "Corrupted Parquet file: %s metadata index: %s out of range", file, metadataIndex);
        inputStream.seek(metadataIndex);
        FileMetaData fileMetaData = readFileMetaData(inputStream);
        List<SchemaElement> schema = fileMetaData.getSchema();
        validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);
        MessageType messageType = readParquetSchema(schema);

        // convert each Thrift RowGroup into a BlockMetaData holding one ColumnChunkMetaData per column chunk
        List<BlockMetaData> blocks = new ArrayList<>();
        List<RowGroup> rowGroups = fileMetaData.getRow_groups();
        if (rowGroups != null) {
            for (RowGroup rowGroup : rowGroups) {
                BlockMetaData blockMetaData = new BlockMetaData();
                blockMetaData.setRowCount(rowGroup.getNum_rows());
                blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
                List<ColumnChunk> columns = rowGroup.getColumns();
                validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
                String filePath = columns.get(0).getFile_path();
                for (ColumnChunk columnChunk : columns) {
                    validateParquet(
                            (filePath == null && columnChunk.getFile_path() == null)
                                    || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                            "all column chunks of the same row group must be in the same file");
                    ColumnMetaData metaData = columnChunk.meta_data;
                    String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
                    ColumnPath columnPath = ColumnPath.get(path);
                    ColumnChunkMetaData column = ColumnChunkMetaData.get(
                            columnPath,
                            messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(),
                            CompressionCodecName.fromParquet(metaData.codec),
                            readEncodings(metaData.encodings),
                            readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()),
                            metaData.data_page_offset,
                            metaData.dictionary_page_offset,
                            metaData.num_values,
                            metaData.total_compressed_size,
                            metaData.total_uncompressed_size);
                    blockMetaData.addColumn(column);
                }
                blockMetaData.setPath(filePath);
                blocks.add(blockMetaData);
            }
        }

        // copy the optional key/value metadata
        Map<String, String> keyValueMetaData = new HashMap<>();
        List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
        if (keyValueList != null) {
            for (KeyValue keyValue : keyValueList) {
                keyValueMetaData.put(keyValue.key, keyValue.value);
            }
        }
        return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    }
}
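For context, a minimal sketch of how readFooter might be invoked. The class name ReadFooterExample, the command-line argument, and the printed fields are assumptions for illustration; ParquetMetadataReader is assumed to be importable from the Presto source shown above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public final class ReadFooterExample
{
    public static void main(String[] args)
            throws Exception
    {
        // args[0] is assumed to be the location of a Parquet file (HDFS or local file system)
        Path file = new Path(args[0]);
        FileSystem fileSystem = file.getFileSystem(new Configuration());

        // ParquetMetadataReader is the Presto class shown above; import it from wherever it lives in your tree
        ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, file);
        System.out.println("schema: " + parquetMetadata.getFileMetaData().getSchema());
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            System.out.printf("row group: %d rows, %d bytes, %d columns%n",
                    block.getRowCount(), block.getTotalByteSize(), block.getColumns().size());
        }
    }
}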
Use of parquet.hadoop.metadata.ColumnChunkMetaData in project presto by prestodb.
The class ParquetReader, method readPrimitive.
private Block readPrimitive(ColumnDescriptor columnDescriptor, Type type, IntList offsets)
        throws IOException
{
    ParquetColumnReader columnReader = columnReadersMap.get(columnDescriptor);
    if (columnReader.getPageReader() == null) {
        validateParquet(currentBlockMetadata.getRowCount() > 0, "Row group has 0 rows");
        ColumnChunkMetaData metadata = getColumnChunkMetaData(columnDescriptor);
        long startingPosition = metadata.getStartingPos();
        int totalSize = toIntExact(metadata.getTotalSize());

        // read the entire compressed column chunk into memory and hand its pages to the column reader
        byte[] buffer = allocateBlock(totalSize);
        dataSource.readFully(startingPosition, buffer);
        ParquetColumnChunkDescriptor descriptor = new ParquetColumnChunkDescriptor(columnDescriptor, metadata, totalSize);
        ParquetColumnChunk columnChunk = new ParquetColumnChunk(descriptor, buffer, 0);
        columnReader.setPageReader(columnChunk.readAllPages());
    }
    return columnReader.readPrimitive(type, offsets);
}
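readPrimitive relies on a getColumnChunkMetaData helper that this snippet does not include. A minimal sketch of such a lookup, assuming it matches the descriptor's path against the chunk paths of the current row group (the same comparison used in ParquetPredicateUtils below); the exception type and message are placeholders, not Presto's actual code.

// Sketch only: resolve the chunk metadata for a descriptor by comparing column paths.
private ColumnChunkMetaData getColumnChunkMetaData(ColumnDescriptor columnDescriptor)
        throws IOException
{
    for (ColumnChunkMetaData metadata : currentBlockMetadata.getColumns()) {
        if (metadata.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))) {
            return metadata;
        }
    }
    // placeholder error handling for the sketch
    throw new IOException("Metadata is missing for column: " + columnDescriptor);
}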
Use of parquet.hadoop.metadata.ColumnChunkMetaData in project presto by prestodb.
The class ParquetPredicateUtils, method getDictionariesByColumnOrdinal.
private static Map<Integer, ParquetDictionaryDescriptor> getDictionariesByColumnOrdinal(BlockMetaData blockMetadata, ParquetDataSource dataSource, MessageType requestedSchema, TupleDomain<HiveColumnHandle> effectivePredicate)
{
    ImmutableMap.Builder<Integer, ParquetDictionaryDescriptor> dictionaries = ImmutableMap.builder();
    for (int ordinal = 0; ordinal < blockMetadata.getColumns().size(); ordinal++) {
        ColumnChunkMetaData columnChunkMetaData = blockMetadata.getColumns().get(ordinal);
        for (int i = 0; i < requestedSchema.getColumns().size(); i++) {
            ColumnDescriptor columnDescriptor = requestedSchema.getColumns().get(i);
            // only read a dictionary if the column participates in the predicate and
            // every data page of the chunk is dictionary encoded
            if (isColumnPredicate(columnDescriptor, effectivePredicate)
                    && columnChunkMetaData.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))
                    && isOnlyDictionaryEncodingPages(columnChunkMetaData.getEncodings())) {
                try {
                    int totalSize = toIntExact(columnChunkMetaData.getTotalSize());
                    byte[] buffer = new byte[totalSize];
                    dataSource.readFully(columnChunkMetaData.getStartingPos(), buffer);
                    Optional<ParquetDictionaryPage> dictionaryPage = readDictionaryPage(buffer, columnChunkMetaData.getCodec());
                    dictionaries.put(ordinal, new ParquetDictionaryDescriptor(columnDescriptor, dictionaryPage));
                }
                catch (IOException ignored) {
                    // if the dictionary cannot be read, skip dictionary-based pruning for this column
                }
                break;
            }
        }
    }
    return dictionaries.build();
}
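isOnlyDictionaryEncodingPages is also not shown in this snippet. A hedged sketch of the kind of check it performs, assuming a chunk qualifies when at least one page uses PLAIN_DICTIONARY and the only other encodings present are the repetition/definition level encodings (RLE, BIT_PACKED); Presto's exact rule may differ. Assumed imports: java.util.EnumSet, java.util.HashSet, java.util.Set, parquet.column.Encoding.

// Sketch only: treat a column chunk as fully dictionary encoded when, apart from the
// level encodings, every encoding present is a dictionary encoding.
private static boolean isOnlyDictionaryEncodingPages(Set<Encoding> encodings)
{
    if (!encodings.contains(Encoding.PLAIN_DICTIONARY)) {
        return false;
    }
    Set<Encoding> remaining = new HashSet<>(encodings);
    remaining.removeAll(EnumSet.of(Encoding.PLAIN_DICTIONARY, Encoding.RLE, Encoding.BIT_PACKED));
    return remaining.isEmpty();
}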