
Example 1 with HoodieColumnRangeMetadata

Use of org.apache.hudi.common.model.HoodieColumnRangeMetadata in project hudi by apache.

From the class ParquetUtils, the method readRangeFromParquetMetadata:

/**
 * Parses min/max statistics stored in Parquet footers for the requested columns.
 */
public List<HoodieColumnRangeMetadata<Comparable>> readRangeFromParquetMetadata(@Nonnull Configuration conf, @Nonnull Path parquetFilePath, @Nonnull List<String> cols) {
    ParquetMetadata metadata = readMetadata(conf, parquetFilePath);
    // Collect stats from all individual Parquet blocks
    Map<String, List<HoodieColumnRangeMetadata<Comparable>>> columnToStatsListMap =
        metadata.getBlocks().stream().sequential()
            .flatMap(blockMetaData -> blockMetaData.getColumns().stream()
                .filter(f -> cols.contains(f.getPath().toDotString()))
                .map(columnChunkMetaData -> new HoodieColumnRangeMetadata<Comparable>(
                    parquetFilePath.getName(),
                    columnChunkMetaData.getPath().toDotString(),
                    convertToNativeJavaType(columnChunkMetaData.getPrimitiveType(), columnChunkMetaData.getStatistics().genericGetMin()),
                    convertToNativeJavaType(columnChunkMetaData.getPrimitiveType(), columnChunkMetaData.getStatistics().genericGetMax()),
                    columnChunkMetaData.getStatistics().getNumNulls(),
                    columnChunkMetaData.getValueCount(),
                    columnChunkMetaData.getTotalSize(),
                    columnChunkMetaData.getTotalUncompressedSize())))
            .collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getColumnName));
    // Combine those into file-level statistics
    // NOTE: Inlining this var makes javac (1.8) upset (due to its inability to infer
    // expression type correctly)
    Stream<HoodieColumnRangeMetadata<Comparable>> stream = columnToStatsListMap.values().stream().map(this::getColumnRangeInFile);
    return stream.collect(Collectors.toList());
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) AvroReadSupport(org.apache.parquet.avro.AvroReadSupport) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) HashMap(java.util.HashMap) Function(java.util.function.Function) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) BigDecimal(java.math.BigDecimal) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) BigInteger(java.math.BigInteger) OriginalType(org.apache.parquet.schema.OriginalType) Nonnull(javax.annotation.Nonnull) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) MetadataNotFoundException(org.apache.hudi.exception.MetadataNotFoundException) ParquetReader(org.apache.parquet.hadoop.ParquetReader) Set(java.util.Set) DecimalMetadata(org.apache.parquet.schema.DecimalMetadata) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) Binary(org.apache.parquet.io.api.Binary) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) Stream(java.util.stream.Stream) AvroParquetReader(org.apache.parquet.avro.AvroParquetReader) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils)
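
For context, here is a minimal usage sketch of this method. The base-file path and column names are hypothetical placeholders; substitute real values for an actual table.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
import org.apache.hudi.common.util.ParquetUtils;

public class ReadColumnRangesSketch {
    public static void main(String[] args) {
        // Hypothetical base-file path and column list.
        Configuration conf = new Configuration();
        Path parquetFilePath = new Path("/tmp/hudi_table/2021/12/01/one_base_file.parquet");
        List<String> cols = Arrays.asList("A", "B");

        // Reads per-column min/max/null-count statistics from the Parquet footer.
        List<HoodieColumnRangeMetadata<Comparable>> ranges =
            new ParquetUtils().readRangeFromParquetMetadata(conf, parquetFilePath, cols);

        ranges.forEach(r -> System.out.println(
            r.getColumnName() + " in " + r.getFilePath() + ": nulls=" + r.getNullCount()));
    }
}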

Example 2 with HoodieColumnRangeMetadata

Use of org.apache.hudi.common.model.HoodieColumnRangeMetadata in project hudi by apache.

From the class HoodieAppendHandle, the method processAppendResult:

private void processAppendResult(AppendResult result, List<IndexedRecord> recordList) {
    HoodieDeltaWriteStat stat = (HoodieDeltaWriteStat) this.writeStatus.getStat();
    if (stat.getPath() == null) {
        // first time writing to this log block.
        updateWriteStatus(stat, result);
    } else if (stat.getPath().endsWith(result.logFile().getFileName())) {
        // append/continued writing to the same log file
        stat.setLogOffset(Math.min(stat.getLogOffset(), result.offset()));
        stat.setFileSizeInBytes(stat.getFileSizeInBytes() + result.size());
        accumulateWriteCounts(stat, result);
        accumulateRuntimeStats(stat);
    } else {
        // written to a newer log file, due to rollover/otherwise.
        initNewStatus();
        stat = (HoodieDeltaWriteStat) this.writeStatus.getStat();
        updateWriteStatus(stat, result);
    }
    if (config.isMetadataIndexColumnStatsForAllColumnsEnabled()) {
        Map<String, HoodieColumnRangeMetadata<Comparable>> columnRangeMap = stat.getRecordsStats().isPresent() ? stat.getRecordsStats().get().getStats() : new HashMap<>();
        final String filePath = stat.getPath();
        // initialize map of column name to map of stats name to stats value
        Map<String, Map<String, Object>> columnToStats = new HashMap<>();
        writeSchemaWithMetaFields.getFields().forEach(field -> columnToStats.putIfAbsent(field.name(), new HashMap<>()));
        // collect stats for every column from each record; iterating over all records eventually yields column stats for all fields.
        recordList.forEach(record -> aggregateColumnStats(record, writeSchemaWithMetaFields, columnToStats, config.isConsistentLogicalTimestampEnabled()));
        writeSchemaWithMetaFields.getFields().forEach(field -> accumulateColumnRanges(field, filePath, columnRangeMap, columnToStats));
        stat.setRecordsStats(new HoodieDeltaWriteStat.RecordsStats<>(columnRangeMap));
    }
    resetWriteCounts();
    assert stat.getRuntimeStats() != null;
    LOG.info(String.format("AppendHandle for partitionPath %s filePath %s, took %d ms.", partitionPath, stat.getPath(), stat.getRuntimeStats().getTotalUpsertTime()));
    timer.startTimer();
}
Also used : HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) HoodieDeltaWriteStat(org.apache.hudi.common.model.HoodieDeltaWriteStat) HashMap(java.util.HashMap) Map(java.util.Map)
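
The helpers aggregateColumnStats and accumulateColumnRanges are not shown in this excerpt. As a simplified, hypothetical sketch of the underlying idea (not Hudi's actual helper code), merging a single record's column value into running per-column statistics could look like this:

import java.util.HashMap;
import java.util.Map;

// Hypothetical stand-alone sketch of per-column min/max/null-count accumulation;
// not Hudi's actual aggregateColumnStats/accumulateColumnRanges implementation.
@SuppressWarnings({"rawtypes", "unchecked"})
public class ColumnStatsAccumulatorSketch {

    static class RunningStats {
        Comparable min;
        Comparable max;
        long nullCount;
    }

    // Merge a single record's value for the given column into the running stats.
    static void merge(Map<String, RunningStats> statsByColumn, String column, Comparable value) {
        RunningStats stats = statsByColumn.computeIfAbsent(column, k -> new RunningStats());
        if (value == null) {
            stats.nullCount++;
            return;
        }
        if (stats.min == null || value.compareTo(stats.min) < 0) {
            stats.min = value;
        }
        if (stats.max == null || value.compareTo(stats.max) > 0) {
            stats.max = value;
        }
    }

    public static void main(String[] args) {
        Map<String, RunningStats> statsByColumn = new HashMap<>();
        merge(statsByColumn, "A", 10);
        merge(statsByColumn, "A", -3);
        merge(statsByColumn, "A", null);
        RunningStats a = statsByColumn.get("A");
        // Prints: A: min=-3, max=10, nulls=1
        System.out.println("A: min=" + a.min + ", max=" + a.max + ", nulls=" + a.nullCount);
    }
}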

Example 3 with HoodieColumnRangeMetadata

Use of org.apache.hudi.common.model.HoodieColumnRangeMetadata in project hudi by apache.

From the class ColumnStatsIndexHelper, the method buildColumnStatsTableFor:

/**
 * Parses min/max statistics from Parquet footers for the provided columns and composes a
 * column-stats index table in the following format, with 3 statistics recorded for each
 * linear/Z-curve/Hilbert-curve-ordered column. For example, if the original table contained
 * column {@code A}:
 *
 * <pre>
 * +---------------------------+------------+------------+-------------+
 * |          file             | A_minValue | A_maxValue | A_num_nulls |
 * +---------------------------+------------+------------+-------------+
 * | one_base_file.parquet     |          1 |         10 |           0 |
 * | another_base_file.parquet |        -10 |          0 |           5 |
 * +---------------------------+------------+------------+-------------+
 * </pre>
 *
 * NOTE: Currently {@link TimestampType} is not supported, since the Parquet writer
 * does not support statistics for it.
 *
 * TODO leverage metadata table after RFC-27 lands
 * @VisibleForTesting
 *
 * @param sparkSession encompassing Spark session
 * @param baseFilesPaths list of base-files paths to be sourced for column-stats index
 * @param orderedColumnSchemas target ordered columns
 * @return Spark's {@link Dataset} holding an index table
 */
@Nonnull
public static Dataset<Row> buildColumnStatsTableFor(@Nonnull SparkSession sparkSession, @Nonnull List<String> baseFilesPaths, @Nonnull List<StructField> orderedColumnSchemas) {
    SparkContext sc = sparkSession.sparkContext();
    JavaSparkContext jsc = new JavaSparkContext(sc);
    SerializableConfiguration serializableConfiguration = new SerializableConfiguration(sc.hadoopConfiguration());
    int numParallelism = (baseFilesPaths.size() / 3 + 1);
    List<HoodieColumnRangeMetadata<Comparable>> colMinMaxInfos;
    String previousJobDescription = sc.getLocalProperty(SPARK_JOB_DESCRIPTION);
    try {
        jsc.setJobDescription("Listing parquet column statistics");
        colMinMaxInfos = jsc.parallelize(baseFilesPaths, numParallelism).mapPartitions(paths -> {
            ParquetUtils utils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
            Iterable<String> iterable = () -> paths;
            return StreamSupport.stream(iterable.spliterator(), false)
                .flatMap(path -> utils.readRangeFromParquetMetadata(
                        serializableConfiguration.value(),
                        new Path(path),
                        orderedColumnSchemas.stream().map(StructField::name).collect(Collectors.toList()))
                    .stream())
                .iterator();
        }).collect();
    } finally {
        jsc.setJobDescription(previousJobDescription);
    }
    // Group column's metadata by file-paths of the files it belongs to
    Map<String, List<HoodieColumnRangeMetadata<Comparable>>> filePathToColumnMetadataMap = colMinMaxInfos.stream().collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getFilePath));
    JavaRDD<Row> allMetaDataRDD = jsc.parallelize(new ArrayList<>(filePathToColumnMetadataMap.values()), 1).map(fileColumnsMetadata -> {
        int colSize = fileColumnsMetadata.size();
        if (colSize == 0) {
            return null;
        }
        String filePath = fileColumnsMetadata.get(0).getFilePath();
        List<Object> indexRow = new ArrayList<>();
        // The first column of the Z-index row is the target file path
        indexRow.add(filePath);
        // For each column
        orderedColumnSchemas.forEach(colSchema -> {
            String colName = colSchema.name();
            HoodieColumnRangeMetadata<Comparable> colMetadata = fileColumnsMetadata.stream()
                .filter(s -> s.getColumnName().trim().equalsIgnoreCase(colName))
                .findFirst()
                .orElse(null);
            DataType colType = colSchema.dataType();
            if (colMetadata == null || colType == null) {
                throw new HoodieException(String.format("Cannot collect min/max statistics for column (%s)", colSchema));
            }
            Pair<Object, Object> minMaxValue = fetchMinMaxValues(colType, colMetadata);
            // min
            indexRow.add(minMaxValue.getLeft());
            // max
            indexRow.add(minMaxValue.getRight());
            indexRow.add(colMetadata.getNullCount());
        });
        return Row$.MODULE$.apply(JavaConversions.asScalaBuffer(indexRow));
    }).filter(Objects::nonNull);
    StructType indexSchema = composeIndexSchema(orderedColumnSchemas);
    return sparkSession.createDataFrame(allMetaDataRDD, indexSchema);
}
Also used : BinaryType(org.apache.spark.sql.types.BinaryType) DataType(org.apache.spark.sql.types.DataType) HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) DecimalType(org.apache.spark.sql.types.DecimalType) FileStatus(org.apache.hadoop.fs.FileStatus) ByteBuffer(java.nio.ByteBuffer) Logger(org.apache.log4j.Logger) BigDecimal(java.math.BigDecimal) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) DoubleType(org.apache.spark.sql.types.DoubleType) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) DataTypeUtils.areCompatible(org.apache.hudi.util.DataTypeUtils.areCompatible) IntegerType(org.apache.spark.sql.types.IntegerType) SparkContext(org.apache.spark.SparkContext) StringType(org.apache.spark.sql.types.StringType) LongType(org.apache.spark.sql.types.LongType) UUID(java.util.UUID) TimestampType(org.apache.spark.sql.types.TimestampType) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) List(java.util.List) BooleanType(org.apache.spark.sql.types.BooleanType) Dataset(org.apache.spark.sql.Dataset) BaseFileUtils(org.apache.hudi.common.util.BaseFileUtils) SerializableConfiguration(org.apache.spark.util.SerializableConfiguration) FloatType(org.apache.spark.sql.types.FloatType) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) LongType$(org.apache.spark.sql.types.LongType$) StructType$(org.apache.spark.sql.types.StructType$) ArrayList(java.util.ArrayList) ByteType(org.apache.spark.sql.types.ByteType) StreamSupport(java.util.stream.StreamSupport) Nonnull(javax.annotation.Nonnull) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession) Metadata(org.apache.spark.sql.types.Metadata) StringType$(org.apache.spark.sql.types.StringType$) JavaConversions(scala.collection.JavaConversions) Row$(org.apache.spark.sql.Row$) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) ShortType(org.apache.spark.sql.types.ShortType) ParquetUtils(org.apache.hudi.common.util.ParquetUtils) LogManager(org.apache.log4j.LogManager) DateType(org.apache.spark.sql.types.DateType) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
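
A minimal invocation sketch follows. The Spark session, file paths, and column schema are hypothetical, and the import for ColumnStatsIndexHelper is omitted because its package is not shown in this excerpt.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
// import for ColumnStatsIndexHelper omitted: its package is not shown in this excerpt.

public class ColumnStatsIndexSketch {
    public static void main(String[] args) {
        // Hypothetical local session; in practice this runs inside an existing Spark job.
        SparkSession spark = SparkSession.builder()
            .appName("column-stats-index-sketch")
            .master("local[*]")
            .getOrCreate();

        // Hypothetical base-file paths and target column schema.
        List<String> baseFilePaths = Arrays.asList(
            "/tmp/hudi_table/one_base_file.parquet",
            "/tmp/hudi_table/another_base_file.parquet");
        List<StructField> orderedColumns = Arrays.asList(
            new StructField("A", DataTypes.IntegerType, true, Metadata.empty()));

        // Builds one row per base file: file, A_minValue, A_maxValue, A_num_nulls.
        Dataset<Row> indexTable =
            ColumnStatsIndexHelper.buildColumnStatsTableFor(spark, baseFilePaths, orderedColumns);
        indexTable.show(false);

        spark.stop();
    }
}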

Example 4 with HoodieColumnRangeMetadata

Use of org.apache.hudi.common.model.HoodieColumnRangeMetadata in project hudi by apache.

From the class HoodieTableMetadataUtil, the method getColumnStats:

private static Stream<HoodieRecord> getColumnStats(final String partitionPath, final String filePathWithPartition, HoodieTableMetaClient datasetMetaClient, List<String> columnsToIndex, boolean isDeleted) {
    final String partition = getPartition(partitionPath);
    final int offset = partition.equals(NON_PARTITIONED_NAME) ? (filePathWithPartition.startsWith("/") ? 1 : 0) : partition.length() + 1;
    final String fileName = filePathWithPartition.substring(offset);
    if (filePathWithPartition.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
        final Path fullFilePath = new Path(datasetMetaClient.getBasePath(), filePathWithPartition);
        List<HoodieColumnRangeMetadata<Comparable>> columnRangeMetadataList;
        if (!isDeleted) {
            columnRangeMetadataList = new ParquetUtils().readRangeFromParquetMetadata(datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex);
        } else {
            // TODO we should delete records instead of stubbing them
            columnRangeMetadataList = columnsToIndex.stream().map(entry -> new HoodieColumnRangeMetadata<Comparable>(fileName, entry, null, null, 0, 0, 0, 0)).collect(Collectors.toList());
        }
        return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadataList, isDeleted);
    } else {
        throw new HoodieException("Column range index not supported for filePathWithPartition " + fileName);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) ParquetUtils(org.apache.hudi.common.util.ParquetUtils) HoodieException(org.apache.hudi.exception.HoodieException) HoodieAvroUtils.getNestedFieldValAsString(org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldValAsString)
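
For clarity, the partition-prefix stripping at the top of the method can be illustrated in isolation; the partition and file path below are made-up values.

// Hypothetical stand-alone illustration of the fileName/offset computation above.
public class PartitionPrefixSketch {
    public static void main(String[] args) {
        String partition = "2021/12/01";
        String filePathWithPartition = "2021/12/01/one_base_file.parquet";

        // Non-partitioned tables only need to drop a possible leading slash;
        // partitioned tables drop the "<partition>/" prefix.
        boolean nonPartitioned = false;
        int offset = nonPartitioned
            ? (filePathWithPartition.startsWith("/") ? 1 : 0)
            : partition.length() + 1;

        // Prints: one_base_file.parquet
        System.out.println(filePathWithPartition.substring(offset));
    }
}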

Aggregations

HoodieColumnRangeMetadata (org.apache.hudi.common.model.HoodieColumnRangeMetadata): 4
Map (java.util.Map): 3
Path (org.apache.hadoop.fs.Path): 3
IOException (java.io.IOException): 2
BigDecimal (java.math.BigDecimal): 2
ArrayList (java.util.ArrayList): 2
HashMap (java.util.HashMap): 2
List (java.util.List): 2
Collectors (java.util.stream.Collectors): 2
Nonnull (javax.annotation.Nonnull): 2
FSUtils (org.apache.hudi.common.fs.FSUtils): 2
ParquetUtils (org.apache.hudi.common.util.ParquetUtils): 2
HoodieException (org.apache.hudi.exception.HoodieException): 2
LogManager (org.apache.log4j.LogManager): 2
Logger (org.apache.log4j.Logger): 2
BigInteger (java.math.BigInteger): 1
ByteBuffer (java.nio.ByteBuffer): 1
Arrays (java.util.Arrays): 1
HashSet (java.util.HashSet): 1
Objects (java.util.Objects): 1