Use of org.apache.hudi.common.model.HoodieColumnRangeMetadata in project hudi by apache.
Class ParquetUtils, method readRangeFromParquetMetadata.
/**
* Parses min/max statistics stored in Parquet footers for the provided columns.
*/
public List<HoodieColumnRangeMetadata<Comparable>> readRangeFromParquetMetadata(
    @Nonnull Configuration conf, @Nonnull Path parquetFilePath, @Nonnull List<String> cols) {
  ParquetMetadata metadata = readMetadata(conf, parquetFilePath);
  // Collect stats from all individual Parquet blocks
  Map<String, List<HoodieColumnRangeMetadata<Comparable>>> columnToStatsListMap =
      metadata.getBlocks().stream().sequential()
          .flatMap(blockMetaData -> blockMetaData.getColumns().stream()
              .filter(f -> cols.contains(f.getPath().toDotString()))
              .map(columnChunkMetaData -> new HoodieColumnRangeMetadata<Comparable>(
                  parquetFilePath.getName(),
                  columnChunkMetaData.getPath().toDotString(),
                  convertToNativeJavaType(
                      columnChunkMetaData.getPrimitiveType(),
                      columnChunkMetaData.getStatistics().genericGetMin()),
                  convertToNativeJavaType(
                      columnChunkMetaData.getPrimitiveType(),
                      columnChunkMetaData.getStatistics().genericGetMax()),
                  columnChunkMetaData.getStatistics().getNumNulls(),
                  columnChunkMetaData.getValueCount(),
                  columnChunkMetaData.getTotalSize(),
                  columnChunkMetaData.getTotalUncompressedSize())))
          .collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getColumnName));
  // Combine those into file-level statistics
  // NOTE: Inlining this var makes javac (1.8) upset (due to its inability to infer
  // expression type correctly)
  Stream<HoodieColumnRangeMetadata<Comparable>> stream =
      columnToStatsListMap.values().stream().map(this::getColumnRangeInFile);
  return stream.collect(Collectors.toList());
}
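To make the listing above concrete, here is a minimal, self-contained usage sketch. It only calls API that appears in the snippets on this page (BaseFileUtils.getInstance, readRangeFromParquetMetadata, getColumnName, getNullCount); the class name, file path, and column names are placeholders invented for the example.

// A usage sketch, not code from the Hudi repo: read per-column ranges for two
// placeholder columns of a single Parquet file.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.util.BaseFileUtils;
import org.apache.hudi.common.util.ParquetUtils;

import java.util.Arrays;
import java.util.List;

public class ColumnRangeReadExample { // hypothetical example class
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    Path parquetFilePath = new Path("/tmp/hoodie/2021/01/01/one_base_file.parquet"); // placeholder path
    List<String> cols = Arrays.asList("A", "B"); // placeholder column names

    ParquetUtils utils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
    List<HoodieColumnRangeMetadata<Comparable>> ranges =
        utils.readRangeFromParquetMetadata(conf, parquetFilePath, cols);

    // Each entry carries the file-level statistics for one of the requested columns.
    ranges.forEach(r ->
        System.out.println(r.getColumnName() + " -> nulls=" + r.getNullCount()));
  }
}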
Use of org.apache.hudi.common.model.HoodieColumnRangeMetadata in project hudi by apache.
Class HoodieAppendHandle, method processAppendResult.
private void processAppendResult(AppendResult result, List<IndexedRecord> recordList) {
  HoodieDeltaWriteStat stat = (HoodieDeltaWriteStat) this.writeStatus.getStat();
  if (stat.getPath() == null) {
    // First time writing to this log block.
    updateWriteStatus(stat, result);
  } else if (stat.getPath().endsWith(result.logFile().getFileName())) {
    // Append/continued writing to the same log file.
    stat.setLogOffset(Math.min(stat.getLogOffset(), result.offset()));
    stat.setFileSizeInBytes(stat.getFileSizeInBytes() + result.size());
    accumulateWriteCounts(stat, result);
    accumulateRuntimeStats(stat);
  } else {
    // Written to a newer log file, due to rollover or otherwise.
    initNewStatus();
    stat = (HoodieDeltaWriteStat) this.writeStatus.getStat();
    updateWriteStatus(stat, result);
  }
  if (config.isMetadataIndexColumnStatsForAllColumnsEnabled()) {
    Map<String, HoodieColumnRangeMetadata<Comparable>> columnRangeMap = stat.getRecordsStats().isPresent()
        ? stat.getRecordsStats().get().getStats()
        : new HashMap<>();
    final String filePath = stat.getPath();
    // Initialize a map of column name to a map of stats name to stats value.
    Map<String, Map<String, Object>> columnToStats = new HashMap<>();
    writeSchemaWithMetaFields.getFields().forEach(field -> columnToStats.putIfAbsent(field.name(), new HashMap<>()));
    // Collect stats for all columns in a single pass over every record, so that column stats
    // for all fields are eventually gathered.
    recordList.forEach(record ->
        aggregateColumnStats(record, writeSchemaWithMetaFields, columnToStats, config.isConsistentLogicalTimestampEnabled()));
    writeSchemaWithMetaFields.getFields().forEach(field ->
        accumulateColumnRanges(field, filePath, columnRangeMap, columnToStats));
    stat.setRecordsStats(new HoodieDeltaWriteStat.RecordsStats<>(columnRangeMap));
  }
  resetWriteCounts();
  assert stat.getRuntimeStats() != null;
  LOG.info(String.format("AppendHandle for partitionPath %s filePath %s, took %d ms.",
      partitionPath, stat.getPath(), stat.getRuntimeStats().getTotalUpsertTime()));
  timer.startTimer();
}
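The helpers aggregateColumnStats and accumulateColumnRanges referenced above are not part of this listing. Purely as an illustration of the idea they implement, the sketch below folds the values of one column into a single HoodieColumnRangeMetadata using the same eight-argument constructor seen in the other snippets on this page; the method name and the zeroed size fields are assumptions, not Hudi's actual helpers.

// Illustrative only: fold per-record values of one column into a single range entry.
// The method name and the size accounting are assumptions; Hudi's private helpers differ.
static HoodieColumnRangeMetadata<Comparable> foldColumnRange(
    String filePath, String columnName, List<Comparable> values) {
  Comparable min = null;
  Comparable max = null;
  long nullCount = 0;
  for (Comparable v : values) {
    if (v == null) {
      nullCount++;
      continue;
    }
    min = (min == null || v.compareTo(min) < 0) ? v : min;
    max = (max == null || v.compareTo(max) > 0) ? v : max;
  }
  // Total/uncompressed sizes are not tracked in this sketch, so they are left at 0.
  return new HoodieColumnRangeMetadata<>(filePath, columnName, min, max,
      nullCount, values.size(), 0, 0);
}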
Use of org.apache.hudi.common.model.HoodieColumnRangeMetadata in project hudi by apache.
Class ColumnStatsIndexHelper, method buildColumnStatsTableFor.
/**
* Parses min/max statistics from Parquet footers for the provided columns and composes a
* column-stats index table in the following format, with three statistics recorded for each
* linear/Z-curve/Hilbert-curve-ordered column. For example, if the original table contained
* column {@code A}:
*
* <pre>
* +---------------------------+------------+------------+-------------+
* | file | A_minValue | A_maxValue | A_num_nulls |
* +---------------------------+------------+------------+-------------+
* | one_base_file.parquet | 1 | 10 | 0 |
* | another_base_file.parquet | -10 | 0 | 5 |
* +---------------------------+------------+------------+-------------+
* </pre>
*
* NOTE: Currently {@link TimestampType} is not supported, since Parquet writer
* does not support statistics for it.
*
* TODO leverage metadata table after RFC-27 lands
* @VisibleForTesting
*
* @param sparkSession encompassing Spark session
* @param baseFilesPaths list of base-files paths to be sourced for column-stats index
* @param orderedColumnSchemas target ordered columns
* @return Spark's {@link Dataset} holding an index table
*/
@Nonnull
public static Dataset<Row> buildColumnStatsTableFor(
    @Nonnull SparkSession sparkSession,
    @Nonnull List<String> baseFilesPaths,
    @Nonnull List<StructField> orderedColumnSchemas) {
  SparkContext sc = sparkSession.sparkContext();
  JavaSparkContext jsc = new JavaSparkContext(sc);
  SerializableConfiguration serializableConfiguration = new SerializableConfiguration(sc.hadoopConfiguration());
  int numParallelism = (baseFilesPaths.size() / 3 + 1);
  List<HoodieColumnRangeMetadata<Comparable>> colMinMaxInfos;
  String previousJobDescription = sc.getLocalProperty(SPARK_JOB_DESCRIPTION);
  try {
    jsc.setJobDescription("Listing parquet column statistics");
    colMinMaxInfos = jsc.parallelize(baseFilesPaths, numParallelism)
        .mapPartitions(paths -> {
          ParquetUtils utils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
          Iterable<String> iterable = () -> paths;
          return StreamSupport.stream(iterable.spliterator(), false)
              .flatMap(path -> utils.readRangeFromParquetMetadata(
                      serializableConfiguration.value(),
                      new Path(path),
                      orderedColumnSchemas.stream().map(StructField::name).collect(Collectors.toList()))
                  .stream())
              .iterator();
        })
        .collect();
  } finally {
    jsc.setJobDescription(previousJobDescription);
  }
  // Group the columns' metadata by the file-paths of the files they belong to
  Map<String, List<HoodieColumnRangeMetadata<Comparable>>> filePathToColumnMetadataMap =
      colMinMaxInfos.stream().collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getFilePath));
  JavaRDD<Row> allMetaDataRDD = jsc.parallelize(new ArrayList<>(filePathToColumnMetadataMap.values()), 1)
      .map(fileColumnsMetadata -> {
        int colSize = fileColumnsMetadata.size();
        if (colSize == 0) {
          return null;
        }
        String filePath = fileColumnsMetadata.get(0).getFilePath();
        List<Object> indexRow = new ArrayList<>();
        // The first column of the index row is the target file-path
        indexRow.add(filePath);
        // For each column
        orderedColumnSchemas.forEach(colSchema -> {
          String colName = colSchema.name();
          HoodieColumnRangeMetadata<Comparable> colMetadata = fileColumnsMetadata.stream()
              .filter(s -> s.getColumnName().trim().equalsIgnoreCase(colName))
              .findFirst()
              .orElse(null);
          DataType colType = colSchema.dataType();
          if (colMetadata == null || colType == null) {
            throw new HoodieException(String.format("Cannot collect min/max statistics for column (%s)", colSchema));
          }
          Pair<Object, Object> minMaxValue = fetchMinMaxValues(colType, colMetadata);
          indexRow.add(minMaxValue.getLeft());   // min
          indexRow.add(minMaxValue.getRight());  // max
          indexRow.add(colMetadata.getNullCount());
        });
        return Row$.MODULE$.apply(JavaConversions.asScalaBuffer(indexRow));
      })
      .filter(Objects::nonNull);
  StructType indexSchema = composeIndexSchema(orderedColumnSchemas);
  return sparkSession.createDataFrame(allMetaDataRDD, indexSchema);
}
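For orientation, a hedged sketch of how buildColumnStatsTableFor might be invoked from an existing SparkSession (assumed to be in scope as sparkSession); the base-file paths and the single ordered column "A" are placeholders, and imports are elided as in the listings above.

// Minimal invocation sketch (placeholder paths and column): build the column-stats table
// for one ordered column "A" over two base files, then inspect it.
List<String> baseFilesPaths = Arrays.asList(
    "/tmp/hoodie/one_base_file.parquet",
    "/tmp/hoodie/another_base_file.parquet");
List<StructField> orderedColumnSchemas = Collections.singletonList(
    new StructField("A", DataTypes.IntegerType, true, Metadata.empty()));

Dataset<Row> indexTable =
    ColumnStatsIndexHelper.buildColumnStatsTableFor(sparkSession, baseFilesPaths, orderedColumnSchemas);

// Expected columns per the javadoc above: file, A_minValue, A_maxValue, A_num_nulls.
indexTable.show(false);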
Use of org.apache.hudi.common.model.HoodieColumnRangeMetadata in project hudi by apache.
Class HoodieTableMetadataUtil, method getColumnStats.
private static Stream<HoodieRecord> getColumnStats(final String partitionPath, final String filePathWithPartition,
    HoodieTableMetaClient datasetMetaClient, List<String> columnsToIndex, boolean isDeleted) {
  final String partition = getPartition(partitionPath);
  final int offset = partition.equals(NON_PARTITIONED_NAME)
      ? (filePathWithPartition.startsWith("/") ? 1 : 0)
      : partition.length() + 1;
  final String fileName = filePathWithPartition.substring(offset);
  if (filePathWithPartition.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
    final Path fullFilePath = new Path(datasetMetaClient.getBasePath(), filePathWithPartition);
    List<HoodieColumnRangeMetadata<Comparable>> columnRangeMetadataList;
    if (!isDeleted) {
      columnRangeMetadataList = new ParquetUtils()
          .readRangeFromParquetMetadata(datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex);
    } else {
      // TODO we should delete records instead of stubbing them
      columnRangeMetadataList = columnsToIndex.stream()
          .map(entry -> new HoodieColumnRangeMetadata<Comparable>(fileName, entry, null, null, 0, 0, 0, 0))
          .collect(Collectors.toList());
    }
    return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadataList, isDeleted);
  } else {
    throw new HoodieException("Column range index not supported for filePathWithPartition " + fileName);
  }
}
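As a final illustration, the snippet below mirrors the deleted-file branch above: every indexed column gets a stubbed range (null min/max, zero counts) for the removed file, and the stubs are turned into column-stats records flagged as deletions. The partition path, file name, and column names are placeholders.

// Illustrative sketch of the deleted-file branch: stub ranges for a removed file, then
// convert them into metadata-table records marked as deletions. Names are placeholders.
String fileName = "one_base_file.parquet";
List<String> columnsToIndex = Arrays.asList("A", "B");

List<HoodieColumnRangeMetadata<Comparable>> stubbedRanges = columnsToIndex.stream()
    .map(col -> new HoodieColumnRangeMetadata<Comparable>(fileName, col, null, null, 0, 0, 0, 0))
    .collect(Collectors.toList());

// The stubs become column-stats records for the metadata table, with isDeleted = true.
Stream<HoodieRecord> records =
    HoodieMetadataPayload.createColumnStatsRecords("2021/01/01", stubbedRanges, true);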