Use of org.apache.hudi.common.util.ParquetUtils in project hudi by apache.
The class HoodieSparkBootstrapSchemaProvider, method getBootstrapSourceSchemaParquet:
private static Schema getBootstrapSourceSchemaParquet(HoodieWriteConfig writeConfig, HoodieEngineContext context, Path filePath) {
  // Read the Parquet schema from the source file's footer
  MessageType parquetSchema = new ParquetUtils().readSchema(context.getHadoopConf().get(), filePath);
  // Convert the Parquet schema to a Spark StructType using Spark's default conversion flags
  ParquetToSparkSchemaConverter converter = new ParquetToSparkSchemaConverter(
      Boolean.parseBoolean(SQLConf.PARQUET_BINARY_AS_STRING().defaultValueString()),
      Boolean.parseBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP().defaultValueString()));
  StructType sparkSchema = converter.convert(parquetSchema);
  // Derive a sanitized record name and namespace from the table name, then convert to Avro
  String tableName = HoodieAvroUtils.sanitizeName(writeConfig.getTableName());
  String structName = tableName + "_record";
  String recordNamespace = "hoodie." + tableName;
  return AvroConversionUtils.convertStructTypeToAvroSchema(sparkSchema, structName, recordNamespace);
}
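As a quick orientation, the footer-reading step above can also be exercised on its own. The following is a minimal sketch, assuming the usual Hadoop/Parquet/Hudi imports and a hypothetical source file path:

  // Hypothetical bootstrap source file; any readable Parquet file works here
  Path sourceFile = new Path("/data/source/partition=2021-01-01/part-00000.parquet");
  Configuration hadoopConf = new Configuration();

  // Same call as in the provider above: read the schema from the Parquet footer
  MessageType parquetSchema = new ParquetUtils().readSchema(hadoopConf, sourceFile);
  System.out.println(parquetSchema);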
Use of org.apache.hudi.common.util.ParquetUtils in project hudi by apache.
The class ColumnStatsIndexHelper, method buildColumnStatsTableFor:
/**
 * Parses min/max statistics from Parquet footers for the provided columns and composes a
 * column-stats index table in the following format, with 3 statistics recorded for each
 * linear/Z-curve/Hilbert-curve-ordered column. For example, if the original table contained
 * column {@code A}:
 *
 * <pre>
 * +---------------------------+------------+------------+-------------+
 * | file                      | A_minValue | A_maxValue | A_num_nulls |
 * +---------------------------+------------+------------+-------------+
 * | one_base_file.parquet     | 1          | 10         | 0           |
 * | another_base_file.parquet | -10        | 0          | 5           |
 * +---------------------------+------------+------------+-------------+
 * </pre>
 *
 * NOTE: Currently {@link TimestampType} is not supported, since the Parquet writer
 * does not produce statistics for it.
 *
 * TODO leverage metadata table after RFC-27 lands
 * @VisibleForTesting
 *
 * @param sparkSession encompassing Spark session
 * @param baseFilesPaths list of base-file paths to be sourced for the column-stats index
 * @param orderedColumnSchemas schemas of the target ordered columns
 * @return Spark's {@link Dataset} holding the index table
 */
@Nonnull
public static Dataset<Row> buildColumnStatsTableFor(
    @Nonnull SparkSession sparkSession,
    @Nonnull List<String> baseFilesPaths,
    @Nonnull List<StructField> orderedColumnSchemas) {
  SparkContext sc = sparkSession.sparkContext();
  JavaSparkContext jsc = new JavaSparkContext(sc);
  SerializableConfiguration serializableConfiguration = new SerializableConfiguration(sc.hadoopConfiguration());
  int numParallelism = (baseFilesPaths.size() / 3 + 1);
  List<HoodieColumnRangeMetadata<Comparable>> colMinMaxInfos;
  String previousJobDescription = sc.getLocalProperty(SPARK_JOB_DESCRIPTION);
  try {
    jsc.setJobDescription("Listing parquet column statistics");
    colMinMaxInfos = jsc.parallelize(baseFilesPaths, numParallelism)
        .mapPartitions(paths -> {
          ParquetUtils utils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
          Iterable<String> iterable = () -> paths;
          return StreamSupport.stream(iterable.spliterator(), false)
              .flatMap(path ->
                  utils.readRangeFromParquetMetadata(
                          serializableConfiguration.value(),
                          new Path(path),
                          orderedColumnSchemas.stream().map(StructField::name).collect(Collectors.toList()))
                      .stream())
              .iterator();
        })
        .collect();
  } finally {
    jsc.setJobDescription(previousJobDescription);
  }
  // Group each column's metadata by the file-path of the file it belongs to
  Map<String, List<HoodieColumnRangeMetadata<Comparable>>> filePathToColumnMetadataMap =
      colMinMaxInfos.stream().collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getFilePath));
  JavaRDD<Row> allMetaDataRDD =
      jsc.parallelize(new ArrayList<>(filePathToColumnMetadataMap.values()), 1)
          .map(fileColumnsMetadata -> {
            int colSize = fileColumnsMetadata.size();
            if (colSize == 0) {
              return null;
            }
            String filePath = fileColumnsMetadata.get(0).getFilePath();
            List<Object> indexRow = new ArrayList<>();
            // First column of the index row is the target file-path
            indexRow.add(filePath);
            // For each target column, append its min value, max value and null count
            orderedColumnSchemas.forEach(colSchema -> {
              String colName = colSchema.name();
              HoodieColumnRangeMetadata<Comparable> colMetadata = fileColumnsMetadata.stream()
                  .filter(s -> s.getColumnName().trim().equalsIgnoreCase(colName))
                  .findFirst()
                  .orElse(null);
              DataType colType = colSchema.dataType();
              if (colMetadata == null || colType == null) {
                throw new HoodieException(String.format("Cannot collect min/max statistics for column (%s)", colSchema));
              }
              Pair<Object, Object> minMaxValue = fetchMinMaxValues(colType, colMetadata);
              indexRow.add(minMaxValue.getLeft());   // min
              indexRow.add(minMaxValue.getRight());  // max
              indexRow.add(colMetadata.getNullCount());
            });
            return Row$.MODULE$.apply(JavaConversions.asScalaBuffer(indexRow));
          })
          .filter(Objects::nonNull);
  StructType indexSchema = composeIndexSchema(orderedColumnSchemas);
  return sparkSession.createDataFrame(allMetaDataRDD, indexSchema);
}
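A minimal usage sketch of the helper above, assuming an existing SparkSession named sparkSession and the standard Spark/Hudi imports; the file paths and the single ordered column "A" are hypothetical, chosen to mirror the Javadoc example:

  // Hypothetical base files and one target column "A" of integer type
  List<String> baseFiles = Arrays.asList(
      "/warehouse/tbl/one_base_file.parquet",
      "/warehouse/tbl/another_base_file.parquet");
  List<StructField> orderedColumns = Collections.singletonList(
      new StructField("A", DataTypes.IntegerType, true, Metadata.empty()));

  Dataset<Row> colStatsIndex =
      ColumnStatsIndexHelper.buildColumnStatsTableFor(sparkSession, baseFiles, orderedColumns);
  // Expected columns per the Javadoc above: file, A_minValue, A_maxValue, A_num_nulls
  colStatsIndex.show(false);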
Use of org.apache.hudi.common.util.ParquetUtils in project hudi by apache.
The class HoodieTableMetadataUtil, method getColumnStats:
private static Stream<HoodieRecord> getColumnStats(final String partitionPath, final String filePathWithPartition, HoodieTableMetaClient datasetMetaClient, List<String> columnsToIndex, boolean isDeleted) {
  final String partition = getPartition(partitionPath);
  // Strip the partition prefix (and leading slash for non-partitioned tables) to get the file name
  final int offset = partition.equals(NON_PARTITIONED_NAME)
      ? (filePathWithPartition.startsWith("/") ? 1 : 0)
      : partition.length() + 1;
  final String fileName = filePathWithPartition.substring(offset);
  if (filePathWithPartition.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
    final Path fullFilePath = new Path(datasetMetaClient.getBasePath(), filePathWithPartition);
    List<HoodieColumnRangeMetadata<Comparable>> columnRangeMetadataList;
    if (!isDeleted) {
      // Read per-column range metadata (min/max/null counts) from the Parquet footer
      columnRangeMetadataList = new ParquetUtils()
          .readRangeFromParquetMetadata(datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex);
    } else {
      // TODO we should delete records instead of stubbing them
      columnRangeMetadataList = columnsToIndex.stream()
          .map(entry -> new HoodieColumnRangeMetadata<Comparable>(fileName, entry, null, null, 0, 0, 0, 0))
          .collect(Collectors.toList());
    }
    return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadataList, isDeleted);
  } else {
    throw new HoodieException("Column range index not supported for filePathWithPartition " + fileName);
  }
}
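For reference, ParquetUtils.readRangeFromParquetMetadata, as used in both snippets above, can also be called directly. A minimal sketch, assuming the usual Hadoop/Hudi imports; the base-file path and the column names "A" and "B" are hypothetical:

  Configuration conf = new Configuration();
  Path baseFile = new Path("/warehouse/tbl/2021/01/01/some_base_file.parquet");
  List<String> columnsToIndex = Arrays.asList("A", "B");

  // Collect per-column range metadata from the Parquet footer and print it
  List<HoodieColumnRangeMetadata<Comparable>> ranges =
      new ParquetUtils().readRangeFromParquetMetadata(conf, baseFile, columnsToIndex);
  for (HoodieColumnRangeMetadata<Comparable> range : ranges) {
    System.out.println(range.getColumnName() + ": min=" + range.getMinValue()
        + ", max=" + range.getMaxValue() + ", nulls=" + range.getNullCount());
  }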