Use of org.apache.parquet.format.SchemaElement in project drill by apache.
In class ParquetFooterStatCollector, method collectColStat:
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
  Stopwatch timer = Stopwatch.createStarted();

  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =
      ParquetReaderUtility.detectCorruptDates(footer, new ArrayList<>(fields), autoCorrectCorruptDates);

  // map from column name to ColumnDescriptor
  Map<SchemaPath, ColumnDescriptor> columnDescMap = new HashMap<>();

  // map from column name to ColumnChunkMetaData
  final Map<SchemaPath, ColumnChunkMetaData> columnChkMetaMap = new HashMap<>();

  // map from column name to MajorType
  final Map<SchemaPath, TypeProtos.MajorType> columnTypeMap = new HashMap<>();

  // map from column name to SchemaElement
  final Map<SchemaPath, SchemaElement> schemaElementMap = new HashMap<>();

  // map from column name to column statistics
  final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();

  final org.apache.parquet.format.FileMetaData fileMetaData =
      new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);

  for (final ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
    final SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getPath());
    if (fields.contains(schemaPath)) {
      columnDescMap.put(schemaPath, column);
    }
  }

  for (final SchemaElement se : fileMetaData.getSchema()) {
    final SchemaPath schemaPath = SchemaPath.getSimplePath(se.getName());
    if (fields.contains(schemaPath)) {
      schemaElementMap.put(schemaPath, se);
    }
  }

  for (final ColumnChunkMetaData colMetaData : footer.getBlocks().get(rowGroupIndex).getColumns()) {
    final SchemaPath schemaPath = SchemaPath.getCompoundPath(colMetaData.getPath().toArray());
    if (fields.contains(schemaPath)) {
      columnChkMetaMap.put(schemaPath, colMetaData);
    }
  }

  for (final SchemaPath path : fields) {
    if (columnDescMap.containsKey(path) && schemaElementMap.containsKey(path) && columnChkMetaMap.containsKey(path)) {
      ColumnDescriptor columnDesc = columnDescMap.get(path);
      SchemaElement se = schemaElementMap.get(path);
      ColumnChunkMetaData metaData = columnChkMetaMap.get(path);

      TypeProtos.MajorType type = ParquetToDrillTypeConverter.toMajorType(columnDesc.getType(), se.getType_length(),
          getDataMode(columnDesc), se, options);
      columnTypeMap.put(path, type);

      Statistics stat = metaData.getStatistics();
      if (type.getMinorType() == TypeProtos.MinorType.DATE) {
        stat = convertDateStatIfNecessary(metaData.getStatistics(), containsCorruptDates);
      }

      statMap.put(path, new ColumnStatistics(stat, type));
    } else {
      final String columnName = path.getRootSegment().getPath();
      if (implicitColValues.containsKey(columnName)) {
        TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
        Statistics stat = new BinaryStatistics();
        stat.setNumNulls(0);
        byte[] val = implicitColValues.get(columnName).getBytes();
        stat.setMinMaxFromBytes(val, val);
        statMap.put(path, new ColumnStatistics(stat, type));
      }
    }
  }
  if (logger.isDebugEnabled()) {
    logger.debug("Took {} ms to collect column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
  }

  return statMap;
}
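As a quick illustration of how the method above might be called: a minimal sketch, assuming a ParquetFooterStatCollector has already been constructed elsewhere for one row group (its constructor arguments are not shown in this example), and using made-up column names.

// Hypothetical usage sketch -- the helper name and column names are assumptions, not project code.
static Map<SchemaPath, ColumnStatistics> statsForOrderColumns(ParquetFooterStatCollector collector) {
  Set<SchemaPath> projected = new HashSet<>();
  projected.add(SchemaPath.getSimplePath("o_orderkey"));   // column names are made up for this sketch
  projected.add(SchemaPath.getSimplePath("o_orderdate"));  // a DATE column would exercise the corrupt-date handling
  return collector.collectColStat(projected);
}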
Use of org.apache.parquet.format.SchemaElement in project drill by apache.
In class ParquetReaderUtility, method getColNameToSchemaElementMapping:
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  HashMap<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  for (SchemaElement se : fileMetaData.getSchema()) {
    schemaElements.put(se.getName(), se);
  }
  return schemaElements;
}
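A small sketch of how the returned mapping might be inspected; how the footer is obtained is left out here (any ParquetMetadata read elsewhere will do).

// Sketch: dump each column's Thrift-level physical type and fixed-width length.
static void printSchemaElements(ParquetMetadata footer) {
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
  for (Map.Entry<String, SchemaElement> entry : schemaElements.entrySet()) {
    SchemaElement se = entry.getValue();
    System.out.println(entry.getKey() + " -> type=" + se.getType() + ", type_length=" + se.getType_length());
  }
}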
Use of org.apache.parquet.format.SchemaElement in project drill by apache.
In class ParquetSchema, method loadParquetSchema:
/**
 * Scan the Parquet footer, then map each Parquet column to the list of columns
 * we want to read. Track those to be read.
 */
private void loadParquetSchema() {
  // TODO - figure out how to deal with this better once we add nested reading, note also look where this map is used below
  // store a map from column name to converted types if they are non-null
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);

  // loop to add up the length of the fixed width columns and build the schema
  for (ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
    ParquetColumnMetadata columnMetadata = new ParquetColumnMetadata(column);
    columnMetadata.resolveDrillType(schemaElements, options);
    if (!fieldSelected(columnMetadata.field)) {
      continue;
    }
    selectedColumnMetadata.add(columnMetadata);
  }
}
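The schemaElements map built above is what lets each ColumnDescriptor be paired with the Thrift-level SchemaElement carrying its converted type and type length. A minimal sketch of that pairing, assuming a flat (non-nested) schema where the descriptor's leaf path element matches the SchemaElement name; nested columns would need path-aware handling.

// Sketch: pair each ColumnDescriptor with its SchemaElement by leaf name (flat-schema assumption).
static void pairDescriptorsWithSchemaElements(ParquetMetadata footer) {
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
  for (ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
    String[] path = column.getPath();
    SchemaElement se = schemaElements.get(path[path.length - 1]);
    if (se != null) {
      int typeLength = se.getType_length(); // feeds fixed-width sizing, as in the loop above
    }
  }
}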