Search in sources :

Example 1 with ParquetUtils

use of org.apache.hudi.common.util.ParquetUtils in project hudi by apache.

In the class HoodieSparkBootstrapSchemaProvider, the method getBootstrapSourceSchemaParquet:

private static Schema getBootstrapSourceSchemaParquet(HoodieWriteConfig writeConfig, HoodieEngineContext context, Path filePath) {
    MessageType parquetSchema = new ParquetUtils().readSchema(context.getHadoopConf().get(), filePath);
    ParquetToSparkSchemaConverter converter = new ParquetToSparkSchemaConverter(
        Boolean.parseBoolean(SQLConf.PARQUET_BINARY_AS_STRING().defaultValueString()),
        Boolean.parseBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP().defaultValueString()));
    StructType sparkSchema = converter.convert(parquetSchema);
    String tableName = HoodieAvroUtils.sanitizeName(writeConfig.getTableName());
    String structName = tableName + "_record";
    String recordNamespace = "hoodie." + tableName;
    return AvroConversionUtils.convertStructTypeToAvroSchema(sparkSchema, structName, recordNamespace);
}
Also used : ParquetUtils(org.apache.hudi.common.util.ParquetUtils) StructType(org.apache.spark.sql.types.StructType) ParquetToSparkSchemaConverter(org.apache.spark.sql.execution.datasources.parquet.ParquetToSparkSchemaConverter) MessageType(org.apache.parquet.schema.MessageType)
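
Below is a minimal standalone sketch (not taken from the Hudi sources) of the same footer-read-and-convert sequence. The file path, struct name, and record namespace are illustrative placeholders, and the org.apache.hudi.AvroConversionUtils import location is an assumption.

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
// NOTE: the AvroConversionUtils package below is an assumption (hudi-spark-common).
import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.common.util.ParquetUtils;
import org.apache.parquet.schema.MessageType;
import org.apache.spark.sql.execution.datasources.parquet.ParquetToSparkSchemaConverter;
import org.apache.spark.sql.internal.SQLConf;
import org.apache.spark.sql.types.StructType;

public class BootstrapSchemaSketch {

    public static void main(String[] args) {
        // Hypothetical source file; a real caller would point at a bootstrap base file.
        Path sourceFile = new Path("/tmp/bootstrap/source_file.parquet");

        // Read the Parquet schema straight from the file footer.
        MessageType parquetSchema = new ParquetUtils().readSchema(new Configuration(), sourceFile);

        // Convert Parquet -> Spark -> Avro, mirroring getBootstrapSourceSchemaParquet above.
        ParquetToSparkSchemaConverter converter = new ParquetToSparkSchemaConverter(
            Boolean.parseBoolean(SQLConf.PARQUET_BINARY_AS_STRING().defaultValueString()),
            Boolean.parseBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP().defaultValueString()));
        StructType sparkSchema = converter.convert(parquetSchema);

        // Struct name and namespace are illustrative placeholders.
        Schema avroSchema = AvroConversionUtils.convertStructTypeToAvroSchema(
            sparkSchema, "my_table_record", "hoodie.my_table");
        System.out.println(avroSchema.toString(true));
    }
}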

Example 2 with ParquetUtils

use of org.apache.hudi.common.util.ParquetUtils in project hudi by apache.

In the class ColumnStatsIndexHelper, the method buildColumnStatsTableFor:

/**
 * Parses min/max statistics from Parquet footers for the provided columns and composes a
 * column-stats index table in the following format, with 3 statistics recorded for each
 * linear/Z-curve/Hilbert-curve-ordered column. For example, if the original table contained
 * column {@code A}:
 *
 * <pre>
 * +---------------------------+------------+------------+-------------+
 * |          file             | A_minValue | A_maxValue | A_num_nulls |
 * +---------------------------+------------+------------+-------------+
 * | one_base_file.parquet     |          1 |         10 |           0 |
 * | another_base_file.parquet |        -10 |          0 |           5 |
 * +---------------------------+------------+------------+-------------+
 * </pre>
 *
 * NOTE: Currently {@link TimestampType} is not supported, since the Parquet writer
 * does not support statistics for it.
 *
 * TODO leverage metadata table after RFC-27 lands
 * @VisibleForTesting
 *
 * @param sparkSession encompassing Spark session
 * @param baseFilesPaths list of base-files paths to be sourced for column-stats index
 * @param orderedColumnSchemas target ordered columns
 * @return Spark's {@link Dataset} holding an index table
 */
@Nonnull
public static Dataset<Row> buildColumnStatsTableFor(
        @Nonnull SparkSession sparkSession,
        @Nonnull List<String> baseFilesPaths,
        @Nonnull List<StructField> orderedColumnSchemas) {
    SparkContext sc = sparkSession.sparkContext();
    JavaSparkContext jsc = new JavaSparkContext(sc);
    SerializableConfiguration serializableConfiguration = new SerializableConfiguration(sc.hadoopConfiguration());
    int numParallelism = (baseFilesPaths.size() / 3 + 1);
    List<HoodieColumnRangeMetadata<Comparable>> colMinMaxInfos;
    String previousJobDescription = sc.getLocalProperty(SPARK_JOB_DESCRIPTION);
    try {
        jsc.setJobDescription("Listing parquet column statistics");
        colMinMaxInfos = jsc.parallelize(baseFilesPaths, numParallelism).mapPartitions(paths -> {
            ParquetUtils utils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
            Iterable<String> iterable = () -> paths;
            return StreamSupport.stream(iterable.spliterator(), false)
                .flatMap(path -> utils.readRangeFromParquetMetadata(
                        serializableConfiguration.value(),
                        new Path(path),
                        orderedColumnSchemas.stream().map(StructField::name).collect(Collectors.toList()))
                    .stream())
                .iterator();
        }).collect();
    } finally {
        jsc.setJobDescription(previousJobDescription);
    }
    // Group column's metadata by file-paths of the files it belongs to
    Map<String, List<HoodieColumnRangeMetadata<Comparable>>> filePathToColumnMetadataMap = colMinMaxInfos.stream().collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getFilePath));
    JavaRDD<Row> allMetaDataRDD = jsc.parallelize(new ArrayList<>(filePathToColumnMetadataMap.values()), 1).map(fileColumnsMetadata -> {
        int colSize = fileColumnsMetadata.size();
        if (colSize == 0) {
            return null;
        }
        String filePath = fileColumnsMetadata.get(0).getFilePath();
        List<Object> indexRow = new ArrayList<>();
        // The first column of the Z-index row is the target file path
        indexRow.add(filePath);
        // For each column
        orderedColumnSchemas.forEach(colSchema -> {
            String colName = colSchema.name();
            HoodieColumnRangeMetadata<Comparable> colMetadata = fileColumnsMetadata.stream().filter(s -> s.getColumnName().trim().equalsIgnoreCase(colName)).findFirst().orElse(null);
            DataType colType = colSchema.dataType();
            if (colMetadata == null || colType == null) {
                throw new HoodieException(String.format("Cannot collect min/max statistics for column (%s)", colSchema));
            }
            Pair<Object, Object> minMaxValue = fetchMinMaxValues(colType, colMetadata);
            // min
            indexRow.add(minMaxValue.getLeft());
            // max
            indexRow.add(minMaxValue.getRight());
            indexRow.add(colMetadata.getNullCount());
        });
        return Row$.MODULE$.apply(JavaConversions.asScalaBuffer(indexRow));
    }).filter(Objects::nonNull);
    StructType indexSchema = composeIndexSchema(orderedColumnSchemas);
    return sparkSession.createDataFrame(allMetaDataRDD, indexSchema);
}
Also used : BinaryType(org.apache.spark.sql.types.BinaryType) DataType(org.apache.spark.sql.types.DataType) HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) DecimalType(org.apache.spark.sql.types.DecimalType) FileStatus(org.apache.hadoop.fs.FileStatus) ByteBuffer(java.nio.ByteBuffer) Logger(org.apache.log4j.Logger) BigDecimal(java.math.BigDecimal) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) DoubleType(org.apache.spark.sql.types.DoubleType) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) DataTypeUtils.areCompatible(org.apache.hudi.util.DataTypeUtils.areCompatible) IntegerType(org.apache.spark.sql.types.IntegerType) SparkContext(org.apache.spark.SparkContext) StringType(org.apache.spark.sql.types.StringType) LongType(org.apache.spark.sql.types.LongType) UUID(java.util.UUID) TimestampType(org.apache.spark.sql.types.TimestampType) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) List(java.util.List) BooleanType(org.apache.spark.sql.types.BooleanType) Dataset(org.apache.spark.sql.Dataset) BaseFileUtils(org.apache.hudi.common.util.BaseFileUtils) SerializableConfiguration(org.apache.spark.util.SerializableConfiguration) FloatType(org.apache.spark.sql.types.FloatType) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) LongType$(org.apache.spark.sql.types.LongType$) StructType$(org.apache.spark.sql.types.StructType$) ArrayList(java.util.ArrayList) ByteType(org.apache.spark.sql.types.ByteType) StreamSupport(java.util.stream.StreamSupport) Nonnull(javax.annotation.Nonnull) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession) Metadata(org.apache.spark.sql.types.Metadata) StringType$(org.apache.spark.sql.types.StringType$) JavaConversions(scala.collection.JavaConversions) Row$(org.apache.spark.sql.Row$) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) ShortType(org.apache.spark.sql.types.ShortType) ParquetUtils(org.apache.hudi.common.util.ParquetUtils) LogManager(org.apache.log4j.LogManager) DateType(org.apache.spark.sql.types.DateType) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
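
For context, here is a minimal driver-side sketch (assumed, not from the Hudi code base) of how buildColumnStatsTableFor might be called. The base-file paths and indexed column are placeholders, and the ColumnStatsIndexHelper package location is an assumption that may differ between Hudi releases.

import java.util.Arrays;
import java.util.List;
// NOTE: the ColumnStatsIndexHelper package below is an assumption and may differ between releases.
import org.apache.hudi.index.columnstats.ColumnStatsIndexHelper;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;

public class ColumnStatsIndexSketch {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .master("local[2]")
            .appName("column-stats-index-sketch")
            .getOrCreate();

        // Hypothetical base files and the column to index; real paths come from the table's file listing.
        List<String> baseFilePaths = Arrays.asList(
            "/tmp/hudi_table/one_base_file.parquet",
            "/tmp/hudi_table/another_base_file.parquet");
        List<StructField> orderedColumns = Arrays.asList(
            new StructField("A", DataTypes.IntegerType, true, Metadata.empty()));

        // One row per base file: | file | A_minValue | A_maxValue | A_num_nulls |
        Dataset<Row> indexTable =
            ColumnStatsIndexHelper.buildColumnStatsTableFor(spark, baseFilePaths, orderedColumns);
        indexTable.show(false);

        spark.stop();
    }
}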

Example 3 with ParquetUtils

use of org.apache.hudi.common.util.ParquetUtils in project hudi by apache.

In the class HoodieTableMetadataUtil, the method getColumnStats:

private static Stream<HoodieRecord> getColumnStats(final String partitionPath, final String filePathWithPartition, HoodieTableMetaClient datasetMetaClient, List<String> columnsToIndex, boolean isDeleted) {
    final String partition = getPartition(partitionPath);
    final int offset = partition.equals(NON_PARTITIONED_NAME) ? (filePathWithPartition.startsWith("/") ? 1 : 0) : partition.length() + 1;
    final String fileName = filePathWithPartition.substring(offset);
    if (filePathWithPartition.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
        final Path fullFilePath = new Path(datasetMetaClient.getBasePath(), filePathWithPartition);
        List<HoodieColumnRangeMetadata<Comparable>> columnRangeMetadataList;
        if (!isDeleted) {
            columnRangeMetadataList = new ParquetUtils().readRangeFromParquetMetadata(datasetMetaClient.getHadoopConf(), fullFilePath, columnsToIndex);
        } else {
            // TODO we should delete records instead of stubbing them
            columnRangeMetadataList = columnsToIndex.stream().map(entry -> new HoodieColumnRangeMetadata<Comparable>(fileName, entry, null, null, 0, 0, 0, 0)).collect(Collectors.toList());
        }
        return HoodieMetadataPayload.createColumnStatsRecords(partitionPath, columnRangeMetadataList, isDeleted);
    } else {
        throw new HoodieException("Column range index not supported for filePathWithPartition " + fileName);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) ParquetUtils(org.apache.hudi.common.util.ParquetUtils) HoodieException(org.apache.hudi.exception.HoodieException) HoodieAvroUtils.getNestedFieldValAsString(org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldValAsString)
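
A small standalone sketch (assumed, not part of the Hudi sources) that reads the same per-column ranges directly from a Parquet footer. The file path and column names are placeholders, and the min/max getter names on HoodieColumnRangeMetadata are assumed.

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
import org.apache.hudi.common.util.ParquetUtils;

public class ColumnRangeSketch {

    public static void main(String[] args) {
        // Hypothetical base file and columns to collect statistics for.
        Path baseFile = new Path("/tmp/hudi_table/one_base_file.parquet");
        List<String> columnsToIndex = Arrays.asList("A", "B");

        List<HoodieColumnRangeMetadata<Comparable>> ranges =
            new ParquetUtils().readRangeFromParquetMetadata(new Configuration(), baseFile, columnsToIndex);

        for (HoodieColumnRangeMetadata<Comparable> range : ranges) {
            // Getter names for min/max are assumed from HoodieColumnRangeMetadata's accessors.
            System.out.println(range.getFilePath() + "." + range.getColumnName()
                + ": min=" + range.getMinValue()
                + ", max=" + range.getMaxValue()
                + ", nulls=" + range.getNullCount());
        }
    }
}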

Aggregations

ParquetUtils (org.apache.hudi.common.util.ParquetUtils)3
Path (org.apache.hadoop.fs.Path)2
HoodieColumnRangeMetadata (org.apache.hudi.common.model.HoodieColumnRangeMetadata)2
HoodieException (org.apache.hudi.exception.HoodieException)2
StructType (org.apache.spark.sql.types.StructType)2
IOException (java.io.IOException)1
BigDecimal (java.math.BigDecimal)1
ByteBuffer (java.nio.ByteBuffer)1
ArrayList (java.util.ArrayList)1
Arrays (java.util.Arrays)1
List (java.util.List)1
Map (java.util.Map)1
Objects (java.util.Objects)1
UUID (java.util.UUID)1
Collectors (java.util.stream.Collectors)1
StreamSupport (java.util.stream.StreamSupport)1
Nonnull (javax.annotation.Nonnull)1
FileStatus (org.apache.hadoop.fs.FileStatus)1
FileSystem (org.apache.hadoop.fs.FileSystem)1
HoodieAvroUtils.getNestedFieldValAsString (org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldValAsString)1