Use of org.apache.parquet.hadoop.metadata.ColumnChunkMetaData in project drill by apache.
From the class ParquetFooterStatCollector, the method collectColStat.
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
  Stopwatch timer = Stopwatch.createStarted();
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =
      ParquetReaderUtility.detectCorruptDates(footer, new ArrayList<>(fields), autoCorrectCorruptDates);
  // map from column name to ColumnDescriptor
  Map<SchemaPath, ColumnDescriptor> columnDescMap = new HashMap<>();
  // map from column name to ColumnChunkMetaData
  final Map<SchemaPath, ColumnChunkMetaData> columnChkMetaMap = new HashMap<>();
  // map from column name to MajorType
  final Map<SchemaPath, TypeProtos.MajorType> columnTypeMap = new HashMap<>();
  // map from column name to SchemaElement
  final Map<SchemaPath, SchemaElement> schemaElementMap = new HashMap<>();
  // map from column name to column statistics
  final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
  final org.apache.parquet.format.FileMetaData fileMetaData =
      new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  for (final ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
    final SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getPath());
    if (fields.contains(schemaPath)) {
      columnDescMap.put(schemaPath, column);
    }
  }
  for (final SchemaElement se : fileMetaData.getSchema()) {
    final SchemaPath schemaPath = SchemaPath.getSimplePath(se.getName());
    if (fields.contains(schemaPath)) {
      schemaElementMap.put(schemaPath, se);
    }
  }
  for (final ColumnChunkMetaData colMetaData : footer.getBlocks().get(rowGroupIndex).getColumns()) {
    final SchemaPath schemaPath = SchemaPath.getCompoundPath(colMetaData.getPath().toArray());
    if (fields.contains(schemaPath)) {
      columnChkMetaMap.put(schemaPath, colMetaData);
    }
  }
  for (final SchemaPath path : fields) {
    if (columnDescMap.containsKey(path) && schemaElementMap.containsKey(path) && columnChkMetaMap.containsKey(path)) {
      ColumnDescriptor columnDesc = columnDescMap.get(path);
      SchemaElement se = schemaElementMap.get(path);
      ColumnChunkMetaData metaData = columnChkMetaMap.get(path);
      TypeProtos.MajorType type = ParquetToDrillTypeConverter.toMajorType(
          columnDesc.getType(), se.getType_length(), getDataMode(columnDesc), se, options);
      columnTypeMap.put(path, type);
      Statistics stat = metaData.getStatistics();
      if (type.getMinorType() == TypeProtos.MinorType.DATE) {
        stat = convertDateStatIfNecessary(metaData.getStatistics(), containsCorruptDates);
      }
      statMap.put(path, new ColumnStatistics(stat, type));
    } else {
      final String columnName = path.getRootSegment().getPath();
      if (implicitColValues.containsKey(columnName)) {
        TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
        Statistics stat = new BinaryStatistics();
        stat.setNumNulls(0);
        byte[] val = implicitColValues.get(columnName).getBytes();
        stat.setMinMaxFromBytes(val, val);
        statMap.put(path, new ColumnStatistics(stat, type));
      }
    }
  }
  if (logger.isDebugEnabled()) {
    logger.debug("Took {} ms to collect column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
  }
  return statMap;
}
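The method above depends on indexing one row group's ColumnChunkMetaData by column path before the per-field loop. Below is a minimal, self-contained sketch of that pattern; the class name RowGroupColumnIndex and the rowGroupIndex parameter are illustrative, not part of Drill.

import java.util.HashMap;
import java.util.Map;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class RowGroupColumnIndex {
  // Index one row group's column chunk metadata by dotted column path,
  // so per-column statistics can be fetched without a linear scan.
  public static Map<String, ColumnChunkMetaData> indexColumns(ParquetMetadata footer, int rowGroupIndex) {
    BlockMetaData rowGroup = footer.getBlocks().get(rowGroupIndex);
    Map<String, ColumnChunkMetaData> byPath = new HashMap<>();
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      // getPath() returns a ColumnPath; toDotString() renders it as e.g. "a.b.c"
      byPath.put(col.getPath().toDotString(), col);
    }
    return byPath;
  }
}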
Use of org.apache.parquet.hadoop.metadata.ColumnChunkMetaData in project drill by apache.
From the class DrillParquetReader, the method setup.
@Override
public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {
  try {
    this.operatorContext = context;
    schema = footer.getFileMetaData().getSchema();
    MessageType projection = null;
    if (isStarQuery()) {
      projection = schema;
    } else {
      columnsNotFound = new ArrayList<SchemaPath>();
      projection = getProjection(schema, getColumns(), columnsNotFound);
      if (projection == null) {
        projection = schema;
      }
      if (columnsNotFound != null && columnsNotFound.size() > 0) {
        nullFilledVectors = new ArrayList<>();
        for (SchemaPath col : columnsNotFound) {
          nullFilledVectors.add((NullableIntVector) output.addField(
              MaterializedField.create(col.getAsUnescapedPath(),
                  org.apache.drill.common.types.Types.optional(TypeProtos.MinorType.INT)),
              (Class<? extends ValueVector>) TypeHelper.getValueVectorClass(
                  TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)));
        }
        if (columnsNotFound.size() == getColumns().size()) {
          noColumnsFound = true;
        }
      }
    }
    logger.debug("Requesting schema {}", projection);
    ColumnIOFactory factory = new ColumnIOFactory(false);
    MessageColumnIO columnIO = factory.getColumnIO(projection, schema);
    Map<ColumnPath, ColumnChunkMetaData> paths = new HashMap<>();
    for (ColumnChunkMetaData md : footer.getBlocks().get(entry.getRowGroupIndex()).getColumns()) {
      paths.put(md.getPath(), md);
    }
    Path filePath = new Path(entry.getPath());
    BlockMetaData blockMetaData = footer.getBlocks().get(entry.getRowGroupIndex());
    recordCount = (int) blockMetaData.getRowCount();
    pageReadStore = new ColumnChunkIncReadStore(recordCount,
        CodecFactory.createDirectCodecFactory(fileSystem.getConf(),
            new ParquetDirectByteBufferAllocator(operatorContext.getAllocator()), 0),
        operatorContext.getAllocator(), fileSystem, filePath);
    for (String[] path : schema.getPaths()) {
      Type type = schema.getType(path);
      if (type.isPrimitive()) {
        ColumnChunkMetaData md = paths.get(ColumnPath.get(path));
        pageReadStore.addColumn(schema.getColumnDescription(path), md);
      }
    }
    if (!noColumnsFound) {
      writer = new VectorContainerWriter(output);
      // Discard the columns not found in the schema when creating the DrillParquetRecordMaterializer,
      // since they have already been added to the output.
      final Collection<SchemaPath> columns = columnsNotFound == null || columnsNotFound.size() == 0
          ? getColumns()
          : CollectionUtils.subtract(getColumns(), columnsNotFound);
      recordMaterializer = new DrillParquetRecordMaterializer(output, writer, projection, columns,
          fragmentContext.getOptions(), containsCorruptedDates);
      primitiveVectors = writer.getMapVector().getPrimitiveVectors();
      recordReader = columnIO.getRecordReader(pageReadStore, recordMaterializer);
    }
  } catch (Exception e) {
    handleAndRaise("Failure in setting up reader", e);
  }
}
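setup() assumes a footer has already been read for the file being scanned. Below is a hedged sketch of obtaining such a footer with the parquet-hadoop API of this era (ParquetFileReader.readFooter is deprecated in later releases) and walking its row groups; the file path argument is a placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterDump {
  public static void main(String[] args) throws Exception {
    // args[0] is the path of a Parquet file, e.g. /tmp/example.parquet
    Path file = new Path(args[0]);
    ParquetMetadata footer = ParquetFileReader.readFooter(new Configuration(), file);
    int i = 0;
    for (BlockMetaData rowGroup : footer.getBlocks()) {
      long totalSize = 0;
      for (ColumnChunkMetaData col : rowGroup.getColumns()) {
        // getTotalSize() is the compressed size of the column chunk in bytes
        totalSize += col.getTotalSize();
      }
      System.out.printf("row group %d: %d rows, %d bytes of column chunks%n",
          i++, rowGroup.getRowCount(), totalSize);
    }
  }
}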
Use of org.apache.parquet.hadoop.metadata.ColumnChunkMetaData in project drill by apache.
From the class ParquetSchema, the method buildChunkMap.
Map<String, Integer> buildChunkMap(BlockMetaData rowGroupMetadata) {
  // The column chunk metadata is not guaranteed to be in the same order as the columns in the schema,
  // so a map is constructed for fast access to the ColumnChunkMetaData corresponding
  // to an element in the schema.
  Map<String, Integer> columnChunkMetadataPositionsInList = new HashMap<>();
  int colChunkIndex = 0;
  for (ColumnChunkMetaData colChunk : rowGroupMetadata.getColumns()) {
    columnChunkMetadataPositionsInList.put(Arrays.toString(colChunk.getPath().toArray()), colChunkIndex);
    colChunkIndex++;
  }
  return columnChunkMetadataPositionsInList;
}
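A hypothetical consumer of buildChunkMap: given a column path taken from the schema, the map recovers the index of the matching chunk in rowGroupMetadata.getColumns(). The ChunkLookup class and lookupChunk helper are illustrative, not part of Drill.

import java.util.Arrays;
import java.util.Map;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;

public class ChunkLookup {
  static ColumnChunkMetaData lookupChunk(BlockMetaData rowGroupMetadata,
      Map<String, Integer> chunkMap, String[] columnPath) {
    // The key must mirror buildChunkMap: Arrays.toString over the path segments.
    Integer index = chunkMap.get(Arrays.toString(columnPath));
    return index == null ? null : rowGroupMetadata.getColumns().get(index);
  }
}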
Use of org.apache.parquet.hadoop.metadata.ColumnChunkMetaData in project drill by apache.
From the class Metadata, the method getParquetFileMetadata_v3.
/**
 * Get the metadata for a single file.
 *
 * @param parquetTableMetadata the table-level metadata that accumulates column type info across files
 * @param file the file whose footer is read
 * @return the {@link ParquetFileMetadata_v3} for the file
 * @throws IOException if the footer cannot be read
 */
private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata, FileStatus file) throws IOException {
  ParquetMetadata metadata = ParquetFileReader.readFooter(fs.getConf(), file);
  MessageType schema = metadata.getFileMetaData().getSchema();
  Map<SchemaPath, ColTypeInfo> colTypeInfoMap = Maps.newHashMap();
  for (String[] path : schema.getPaths()) {
    colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
  }
  List<RowGroupMetadata_v3> rowGroupMetadataList = Lists.newArrayList();
  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =
      ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if (logger.isDebugEnabled()) {
    logger.debug(containsCorruptDates.toString());
  }
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata_v3> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata_v3 columnMetadata;
      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());
      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColTypeInfo colTypeInfo = colTypeInfoMap.get(columnSchemaName);
      ColumnTypeMetadata_v3 columnTypeMetadata = new ColumnTypeMetadata_v3(columnName, col.getType(),
          colTypeInfo.originalType, colTypeInfo.precision, colTypeInfo.scale,
          colTypeInfo.repetitionLevel, colTypeInfo.definitionLevel);
      if (parquetTableMetadata.columnTypeInfo == null) {
        parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
      }
      // Save the column schema info; it is later merged into a single list.
      parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v3.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only when they are present and non-empty
        Object minValue = null;
        Object maxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null) {
          minValue = stats.genericGetMin();
          maxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            minValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) minValue);
            maxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) maxValue);
          }
        }
        columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), minValue, maxValue, stats.getNumNulls());
      } else {
        columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), null, null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }
    // Note: the schema is still read even when the row group contains no rows.
    if (rowGroup.getRowCount() == 0) {
      continue;
    }
    RowGroupMetadata_v3 rowGroupMeta = new RowGroupMetadata_v3(rowGroup.getStartingPos(), length,
        rowGroup.getRowCount(), getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
    rowGroupMetadataList.add(rowGroupMeta);
  }
  String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();
  return new ParquetFileMetadata_v3(path, file.getLen(), rowGroupMetadataList);
}
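The statsAvailable guard above is the standard defensive pattern for Parquet statistics, which may be missing or empty for a chunk. Below is a minimal sketch of the same check in isolation; StatsProbe is an illustrative name.

import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;

public class StatsProbe {
  // Summarize a column chunk's statistics, guarding against absent stats first.
  static String describe(ColumnChunkMetaData col) {
    Statistics<?> stats = col.getStatistics();
    if (stats == null || stats.isEmpty()) {
      return col.getPath().toDotString() + ": no statistics";
    }
    return String.format("%s: min=%s max=%s nulls=%d",
        col.getPath().toDotString(), stats.genericGetMin(), stats.genericGetMax(), stats.getNumNulls());
  }
}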