
Example 36 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class HoodieSparkBootstrapSchemaProvider, method getBootstrapSourceSchemaOrc:

private static Schema getBootstrapSourceSchemaOrc(HoodieWriteConfig writeConfig, HoodieEngineContext context, Path filePath) {
    Reader orcReader = null;
    try {
        orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(context.getHadoopConf().get()));
    } catch (IOException e) {
        throw new HoodieException("Could not determine schema from the data files.");
    }
    TypeDescription orcSchema = orcReader.getSchema();
    String tableName = HoodieAvroUtils.sanitizeName(writeConfig.getTableName());
    String structName = tableName + "_record";
    String recordNamespace = "hoodie." + tableName;
    return AvroOrcUtils.createAvroSchemaWithDefaultValue(orcSchema, structName, recordNamespace, true);
}
Also used : Reader(org.apache.orc.Reader) TypeDescription(org.apache.orc.TypeDescription) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException)
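
For context, here is a minimal, self-contained sketch of the same pattern outside Hudi's bootstrap code (the class and method names below are illustrative, not part of Hudi); it chains the original IOException as the cause so the underlying failure is not lost:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.exception.HoodieException;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;

public class OrcSchemaProbe {

    // Reads the ORC schema of a single data file, translating the checked
    // IOException into Hudi's unchecked HoodieException with the cause attached.
    public static TypeDescription readOrcSchema(Configuration hadoopConf, Path filePath) {
        try {
            Reader orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(hadoopConf));
            return orcReader.getSchema();
        } catch (IOException e) {
            throw new HoodieException("Could not determine schema from " + filePath, e);
        }
    }
}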

Example 37 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class ColumnStatsIndexHelper, method buildColumnStatsTableFor:

/**
 * Parses min/max statistics from Parquet footers for the provided columns and composes a
 * column-stats index table in the following format, with 3 statistics for each
 * linear/Z-curve/Hilbert-curve-ordered column. For example, if the original table contained
 * column {@code A}:
 *
 * <pre>
 * +---------------------------+------------+------------+-------------+
 * |          file             | A_minValue | A_maxValue | A_num_nulls |
 * +---------------------------+------------+------------+-------------+
 * | one_base_file.parquet     |          1 |         10 |           0 |
 * | another_base_file.parquet |        -10 |          0 |           5 |
 * +---------------------------+------------+------------+-------------+
 * </pre>
 *
 * NOTE: Currently {@link TimestampType} is not supported, since the Parquet writer
 * does not support statistics for it.
 *
 * TODO leverage metadata table after RFC-27 lands
 * @VisibleForTesting
 *
 * @param sparkSession encompassing Spark session
 * @param baseFilesPaths list of base-file paths to be sourced for the column-stats index
 * @param orderedColumnSchemas target ordered columns
 * @return Spark's {@link Dataset} holding an index table
 */
@Nonnull
public static Dataset<Row> buildColumnStatsTableFor(@Nonnull SparkSession sparkSession, @Nonnull List<String> baseFilesPaths, @Nonnull List<StructField> orderedColumnSchemas) {
    SparkContext sc = sparkSession.sparkContext();
    JavaSparkContext jsc = new JavaSparkContext(sc);
    SerializableConfiguration serializableConfiguration = new SerializableConfiguration(sc.hadoopConfiguration());
    int numParallelism = (baseFilesPaths.size() / 3 + 1);
    List<HoodieColumnRangeMetadata<Comparable>> colMinMaxInfos;
    String previousJobDescription = sc.getLocalProperty(SPARK_JOB_DESCRIPTION);
    try {
        jsc.setJobDescription("Listing parquet column statistics");
        colMinMaxInfos = jsc.parallelize(baseFilesPaths, numParallelism).mapPartitions(paths -> {
            ParquetUtils utils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
            Iterable<String> iterable = () -> paths;
            return StreamSupport.stream(iterable.spliterator(), false)
                    .flatMap(path -> utils.readRangeFromParquetMetadata(
                            serializableConfiguration.value(),
                            new Path(path),
                            orderedColumnSchemas.stream().map(StructField::name).collect(Collectors.toList()))
                            .stream())
                    .iterator();
        }).collect();
    } finally {
        jsc.setJobDescription(previousJobDescription);
    }
    // Group column's metadata by file-paths of the files it belongs to
    Map<String, List<HoodieColumnRangeMetadata<Comparable>>> filePathToColumnMetadataMap = colMinMaxInfos.stream().collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getFilePath));
    JavaRDD<Row> allMetaDataRDD = jsc.parallelize(new ArrayList<>(filePathToColumnMetadataMap.values()), 1).map(fileColumnsMetadata -> {
        int colSize = fileColumnsMetadata.size();
        if (colSize == 0) {
            return null;
        }
        String filePath = fileColumnsMetadata.get(0).getFilePath();
        List<Object> indexRow = new ArrayList<>();
        // The first column of the Z-index row is the target file path
        indexRow.add(filePath);
        // For each column
        orderedColumnSchemas.forEach(colSchema -> {
            String colName = colSchema.name();
            HoodieColumnRangeMetadata<Comparable> colMetadata = fileColumnsMetadata.stream().filter(s -> s.getColumnName().trim().equalsIgnoreCase(colName)).findFirst().orElse(null);
            DataType colType = colSchema.dataType();
            if (colMetadata == null || colType == null) {
                throw new HoodieException(String.format("Cannot collect min/max statistics for column (%s)", colSchema));
            }
            Pair<Object, Object> minMaxValue = fetchMinMaxValues(colType, colMetadata);
            // min
            indexRow.add(minMaxValue.getLeft());
            // max
            indexRow.add(minMaxValue.getRight());
            indexRow.add(colMetadata.getNullCount());
        });
        return Row$.MODULE$.apply(JavaConversions.asScalaBuffer(indexRow));
    }).filter(Objects::nonNull);
    StructType indexSchema = composeIndexSchema(orderedColumnSchemas);
    return sparkSession.createDataFrame(allMetaDataRDD, indexSchema);
}
Also used : BinaryType(org.apache.spark.sql.types.BinaryType) DataType(org.apache.spark.sql.types.DataType) HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) DecimalType(org.apache.spark.sql.types.DecimalType) FileStatus(org.apache.hadoop.fs.FileStatus) ByteBuffer(java.nio.ByteBuffer) Logger(org.apache.log4j.Logger) BigDecimal(java.math.BigDecimal) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) DoubleType(org.apache.spark.sql.types.DoubleType) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) DataTypeUtils.areCompatible(org.apache.hudi.util.DataTypeUtils.areCompatible) IntegerType(org.apache.spark.sql.types.IntegerType) SparkContext(org.apache.spark.SparkContext) StringType(org.apache.spark.sql.types.StringType) LongType(org.apache.spark.sql.types.LongType) UUID(java.util.UUID) TimestampType(org.apache.spark.sql.types.TimestampType) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) List(java.util.List) BooleanType(org.apache.spark.sql.types.BooleanType) Dataset(org.apache.spark.sql.Dataset) BaseFileUtils(org.apache.hudi.common.util.BaseFileUtils) SerializableConfiguration(org.apache.spark.util.SerializableConfiguration) FloatType(org.apache.spark.sql.types.FloatType) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) LongType$(org.apache.spark.sql.types.LongType$) StructType$(org.apache.spark.sql.types.StructType$) ArrayList(java.util.ArrayList) ByteType(org.apache.spark.sql.types.ByteType) StreamSupport(java.util.stream.StreamSupport) Nonnull(javax.annotation.Nonnull) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession) Metadata(org.apache.spark.sql.types.Metadata) StringType$(org.apache.spark.sql.types.StringType$) JavaConversions(scala.collection.JavaConversions) Row$(org.apache.spark.sql.Row$) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) ShortType(org.apache.spark.sql.types.ShortType) ParquetUtils(org.apache.hudi.common.util.ParquetUtils) LogManager(org.apache.log4j.LogManager) DateType(org.apache.spark.sql.types.DateType) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
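
A hedged usage sketch of the method above (the session setup, paths, and column names are placeholders, and the import for ColumnStatsIndexHelper is omitted since its package is not shown here):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;

public class ColumnStatsIndexExample {
    public static void main(String[] args) {
        // Local session for illustration only; in a real job the session already exists.
        SparkSession spark = SparkSession.builder()
                .appName("column-stats-index-example")
                .master("local[*]")
                .getOrCreate();

        // Placeholder base-file paths; these would normally come from the table's file-system view.
        List<String> baseFilePaths = Arrays.asList(
                "/tmp/hudi_table/one_base_file.parquet",
                "/tmp/hudi_table/another_base_file.parquet");

        // Target ordered column, matching the example table with column A in the javadoc above.
        List<StructField> orderedColumns = Arrays.asList(
                new StructField("A", DataTypes.IntegerType, true, Metadata.empty()));

        Dataset<Row> columnStats =
                ColumnStatsIndexHelper.buildColumnStatsTableFor(spark, baseFilePaths, orderedColumns);

        // Expected columns: file | A_minValue | A_maxValue | A_num_nulls
        columnStats.show(false);

        spark.stop();
    }
}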

Example 38 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class TestIncrementalFSViewSync, method testRestore:

private void testRestore(SyncableFileSystemView view, List<String> newRestoreInstants, Map<String, List<String>> instantsToFiles, List<HoodieInstant> rolledBackInstants, String emptyRestoreInstant, boolean isRestore, int totalReplacedFileSlicesPerPartition, int totalFilesAddedPerPartitionPerInstant) {
    assertEquals(newRestoreInstants.size(), rolledBackInstants.size());
    long initialFileSlices = partitions.stream().mapToLong(p -> view.getAllFileSlices(p).count()).findAny().getAsLong();
    final int numFileSlicesAddedPerInstant = (totalFilesAddedPerPartitionPerInstant - totalReplacedFileSlicesPerPartition);
    final long expectedLatestFileSlices = fileIdsPerPartition.size() + (rolledBackInstants.size()) * numFileSlicesAddedPerInstant;
    IntStream.range(0, newRestoreInstants.size()).forEach(idx -> {
        HoodieInstant instant = rolledBackInstants.get(idx);
        try {
            boolean isDeltaCommit = HoodieTimeline.DELTA_COMMIT_ACTION.equalsIgnoreCase(instant.getAction());
            performRestore(instant, instantsToFiles.get(instant.getTimestamp()), newRestoreInstants.get(idx), isRestore);
            final long expTotalFileSlicesPerPartition = isDeltaCommit ? initialFileSlices : initialFileSlices - ((idx + 1) * (fileIdsPerPartition.size() - totalReplacedFileSlicesPerPartition));
            view.sync();
            assertTrue(view.getLastInstant().isPresent());
            LOG.info("Last Instant is :" + view.getLastInstant().get());
            if (isRestore) {
                assertEquals(newRestoreInstants.get(idx), view.getLastInstant().get().getTimestamp());
                assertEquals(HoodieTimeline.RESTORE_ACTION, view.getLastInstant().get().getAction());
            }
            assertEquals(State.COMPLETED, view.getLastInstant().get().getState());
            if (HoodieTimeline.compareTimestamps(newRestoreInstants.get(idx), HoodieTimeline.GREATER_THAN_OR_EQUALS, emptyRestoreInstant)) {
                partitions.forEach(p -> assertEquals(0, view.getLatestFileSlices(p).count()));
            } else {
                partitions.forEach(p -> assertEquals(expectedLatestFileSlices - (idx + 1) * numFileSlicesAddedPerInstant, view.getLatestFileSlices(p).count()));
            }
            partitions.forEach(p -> assertEquals(expTotalFileSlicesPerPartition, view.getAllFileSlices(p).count()));
            metaClient.reloadActiveTimeline();
            SyncableFileSystemView newView = getFileSystemView(metaClient);
            areViewsConsistent(view, newView, expTotalFileSlicesPerPartition * partitions.size());
        } catch (IOException e) {
            throw new HoodieException(e);
        }
    });
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException)

Example 39 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class TestIncrementalFSViewSync, method generateDataForInstant:

private List<Pair<String, HoodieWriteStat>> generateDataForInstant(String baseInstant, String instant, boolean deltaCommit, List<String> fileIds) {
    return partitions.stream().flatMap(p -> fileIds.stream().map(f -> {
        try {
            File file = new File(basePath + "/" + p + "/" + (deltaCommit ? FSUtils.makeLogFileName(f, ".log", baseInstant, Integer.parseInt(instant), TEST_WRITE_TOKEN) : FSUtils.makeDataFileName(instant, TEST_WRITE_TOKEN, f)));
            file.createNewFile();
            HoodieWriteStat w = new HoodieWriteStat();
            w.setFileId(f);
            w.setPath(String.format("%s/%s", p, file.getName()));
            return Pair.of(p, w);
        } catch (IOException e) {
            throw new HoodieException(e);
        }
    })).collect(Collectors.toList());
}
Also used : BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieException(org.apache.hudi.exception.HoodieException) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) COMPACTION_ACTION(org.apache.hudi.common.table.timeline.HoodieTimeline.COMPACTION_ACTION) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) Set(java.util.Set) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) UUID(java.util.UUID) HoodieCommonTestHarness(org.apache.hudi.common.testutils.HoodieCommonTestHarness) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) Test(org.junit.jupiter.api.Test) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) IntStream(java.util.stream.IntStream) HoodieCleaningPolicy(org.apache.hudi.common.model.HoodieCleaningPolicy) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) ArrayList(java.util.ArrayList) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) Files(java.nio.file.Files) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) File(java.io.File) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) Paths(java.nio.file.Paths) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) HoodieRollbackStat(org.apache.hudi.common.HoodieRollbackStat) Comparator(java.util.Comparator) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
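
Both test helpers above lean on the same idiom: a checked IOException raised inside a stream lambda is rethrown as the unchecked HoodieException, because java.util.function interfaces cannot declare checked exceptions. A minimal standalone sketch of the idiom (class, method, and file names are illustrative):

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.hudi.exception.HoodieException;

public class LambdaExceptionWrapping {

    // Creates an empty marker file per name, wrapping the checked IOException in
    // an unchecked HoodieException so it can escape the map() lambda.
    static List<File> touchAll(File dir, Stream<String> fileNames) {
        return fileNames.map(name -> {
            try {
                File f = new File(dir, name);
                f.createNewFile();
                return f;
            } catch (IOException e) {
                // Stream lambdas cannot propagate checked exceptions, so rethrow unchecked.
                throw new HoodieException(e);
            }
        }).collect(Collectors.toList());
    }

    public static void main(String[] args) {
        List<File> created = touchAll(new File("/tmp"), Stream.of("a.log", "b.log"));
        created.forEach(f -> System.out.println("created " + f.getAbsolutePath()));
    }
}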

Example 40 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class TestHoodieActiveTimeline, method testCreateNewInstantTime:

@Test
public void testCreateNewInstantTime() throws Exception {
    String lastInstantTime = HoodieActiveTimeline.createNewInstantTime();
    for (int i = 0; i < 3; ++i) {
        String newInstantTime = HoodieActiveTimeline.createNewInstantTime();
        assertTrue(HoodieTimeline.compareTimestamps(lastInstantTime, HoodieTimeline.LESSER_THAN, newInstantTime));
        lastInstantTime = newInstantTime;
    }
    // An all-zeros timestamp can be parsed
    HoodieActiveTimeline.parseDateFromInstantTime("00000000000000");
    // Multiple thread test
    final int numChecks = 100000;
    final int numThreads = 100;
    final long milliSecondsInYear = 365L * 24 * 3600 * 1000; // long literal avoids silent int overflow
    ExecutorService executorService = Executors.newFixedThreadPool(numThreads);
    List<Future> futures = new ArrayList<>(numThreads);
    for (int idx = 0; idx < numThreads; ++idx) {
        futures.add(executorService.submit(() -> {
            Date date = new Date(System.currentTimeMillis() + (int) (Math.random() * numThreads) * milliSecondsInYear);
            final String expectedFormat = HoodieActiveTimeline.formatDate(date);
            for (int tidx = 0; tidx < numChecks; ++tidx) {
                final String curFormat = HoodieActiveTimeline.formatDate(date);
                if (!curFormat.equals(expectedFormat)) {
                    throw new HoodieException("Format error: expected=" + expectedFormat + ", curFormat=" + curFormat);
                }
            }
        }));
    }
    executorService.shutdown();
    assertTrue(executorService.awaitTermination(10, TimeUnit.SECONDS));
    // Future.get() is required to surface exceptions thrown in the worker threads
    for (Future f : futures) {
        f.get();
    }
}
Also used : ExecutorService(java.util.concurrent.ExecutorService) ArrayList(java.util.ArrayList) Future(java.util.concurrent.Future) HoodieException(org.apache.hudi.exception.HoodieException) Date(java.util.Date) Test(org.junit.jupiter.api.Test)
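
The final loop over the futures matters: an exception thrown inside a pooled task is captured by its Future rather than propagated, so the test only fails if Future.get() is called on every submitted task. A small sketch of that behavior (names are illustrative):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

public class FutureGetDemo {
    public static void main(String[] args) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(2);
        List<Future<?>> futures = new ArrayList<>();

        // The exception thrown here is stored in the Future, not rethrown in main.
        Runnable failingTask = () -> {
            throw new IllegalStateException("worker failed");
        };
        futures.add(pool.submit(failingTask));

        pool.shutdown();
        pool.awaitTermination(10, TimeUnit.SECONDS);

        for (Future<?> f : futures) {
            try {
                // get() rethrows the worker's failure wrapped in an ExecutionException,
                // which is why the test above iterates over its futures at the end.
                f.get();
            } catch (ExecutionException e) {
                System.out.println("Worker threw: " + e.getCause());
            }
        }
    }
}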

Aggregations

HoodieException (org.apache.hudi.exception.HoodieException) 171
IOException (java.io.IOException) 87
Path (org.apache.hadoop.fs.Path) 45
Schema (org.apache.avro.Schema) 35
HoodieIOException (org.apache.hudi.exception.HoodieIOException) 35
List (java.util.List) 30
ArrayList (java.util.ArrayList) 27
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 23
Collectors (java.util.stream.Collectors) 21
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 19
Option (org.apache.hudi.common.util.Option) 19
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 18
Map (java.util.Map) 16
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 16
GenericRecord (org.apache.avro.generic.GenericRecord) 15
Arrays (java.util.Arrays) 14
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile) 14
Logger (org.apache.log4j.Logger) 14
FileStatus (org.apache.hadoop.fs.FileStatus) 13
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata) 13