Example 61 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class TestHoodieDeltaStreamerWithMultiWriter, the method testLatestCheckpointCarryOverWithMultipleWriters:

@Disabled
@ParameterizedTest
@EnumSource(value = HoodieTableType.class, names = { "COPY_ON_WRITE" })
void testLatestCheckpointCarryOverWithMultipleWriters(HoodieTableType tableType) throws Exception {
    // NOTE: Override the LockProvider with InProcessLockProvider, since ZooKeeper locks work in unit tests but fail on Jenkins with connection timeouts
    basePath = Paths.get(URI.create(basePath().replaceAll("/$", ""))).toString();
    propsFilePath = basePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER;
    tableBasePath = basePath + "/testtable_" + tableType;
    prepareInitialConfigs(fs(), basePath, "foo");
    TypedProperties props = prepareMultiWriterProps(fs(), basePath, propsFilePath);
    props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.InProcessLockProvider");
    props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
    UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), propsFilePath);
    // Keep it higher than batch-size to test continuous mode
    int totalRecords = 3000;
    HoodieDeltaStreamer.Config prepJobConfig = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName()));
    prepJobConfig.continuousMode = true;
    prepJobConfig.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords));
    prepJobConfig.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
    HoodieDeltaStreamer prepJob = new HoodieDeltaStreamer(prepJobConfig, jsc());
    // Prepare base dataset with some commits
    deltaStreamerTestRunner(prepJob, prepJobConfig, (r) -> {
        if (tableType.equals(HoodieTableType.MERGE_ON_READ)) {
            TestHoodieDeltaStreamer.TestHelpers.assertAtleastNDeltaCommits(3, tableBasePath, fs());
            TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(1, tableBasePath, fs());
        } else {
            TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(3, tableBasePath, fs());
        }
        TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext());
        TestHoodieDeltaStreamer.TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext());
        return true;
    });
    // create a backfill job with checkpoint from the first instant
    HoodieDeltaStreamer.Config cfgBackfillJob = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName()));
    cfgBackfillJob.continuousMode = false;
    HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(hadoopConf()).setBasePath(tableBasePath).build();
    HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    HoodieCommitMetadata commitMetadataForFirstInstant = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(timeline.firstInstant().get()).get(), HoodieCommitMetadata.class);
    // run the backfill job
    props = prepareMultiWriterProps(fs(), basePath, propsFilePath);
    props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.InProcessLockProvider");
    props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
    UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), propsFilePath);
    // get current checkpoint after preparing base dataset with some commits
    HoodieCommitMetadata commitMetadataForLastInstant = getLatestMetadata(meta);
    // Set checkpoint to the last successful position
    cfgBackfillJob.checkpoint = commitMetadataForLastInstant.getMetadata(CHECKPOINT_KEY);
    cfgBackfillJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords));
    cfgBackfillJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
    HoodieDeltaStreamer backfillJob = new HoodieDeltaStreamer(cfgBackfillJob, jsc());
    backfillJob.sync();
    meta.reloadActiveTimeline();
    int totalCommits = meta.getCommitsTimeline().filterCompletedInstants().countInstants();
    // Add a new commit to the timeline that does not carry the checkpoint in its extra metadata
    addCommitToTimeline(meta);
    meta.reloadActiveTimeline();
    verifyCommitMetadataCheckpoint(meta, null);
    cfgBackfillJob.checkpoint = null;
    // if deltastreamer checkpoint fetch does not walk back to older commits, this sync will fail
    new HoodieDeltaStreamer(cfgBackfillJob, jsc()).sync();
    meta.reloadActiveTimeline();
    Assertions.assertEquals(totalCommits + 2, meta.getCommitsTimeline().filterCompletedInstants().countInstants());
    verifyCommitMetadataCheckpoint(meta, "00008");
}
Also used: HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient), HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer), HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline), TypedProperties (org.apache.hudi.common.config.TypedProperties), EnumSource (org.junit.jupiter.params.provider.EnumSource), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), Disabled (org.junit.jupiter.api.Disabled)
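
The helper getLatestMetadata(meta) called above is not shown on this page. Below is a minimal sketch of the checkpoint walk-back behavior this test exercises, assuming CHECKPOINT_KEY is the DeltaStreamer checkpoint metadata key and that the timeline classes listed above are imported; this is a sketch, not the actual implementation.

// Minimal sketch, not Hudi's actual code: walk completed commits newest-first
// and return the first checkpoint found in extra metadata. Assumes imports for
// java.io.IOException, java.util.stream.Collectors and org.apache.hudi.common.util.Option.
static Option<String> findLatestCheckpoint(HoodieTableMetaClient meta) throws IOException {
    HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    for (HoodieInstant instant : timeline.getReverseOrderedInstants().collect(Collectors.toList())) {
        HoodieCommitMetadata metadata =
            HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class);
        String checkpoint = metadata.getMetadata(CHECKPOINT_KEY);
        if (checkpoint != null) {
            // Found a commit that carries a checkpoint; stop walking back.
            return Option.of(checkpoint);
        }
    }
    return Option.empty();
}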

Example 62 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class HoodieDeltaStreamerTestBase, the method addCommitToTimeline:

static void addCommitToTimeline(HoodieTableMetaClient metaClient, WriteOperationType writeOperationType, String commitActionType, Map<String, String> extraMetadata) throws IOException {
    HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
    commitMetadata.setOperationType(writeOperationType);
    extraMetadata.forEach((k, v) -> commitMetadata.getExtraMetadata().put(k, v));
    String commitTime = HoodieActiveTimeline.createNewInstantTime();
    metaClient.getActiveTimeline().createNewInstant(new HoodieInstant(HoodieInstant.State.REQUESTED, commitActionType, commitTime));
    metaClient.getActiveTimeline().createNewInstant(new HoodieInstant(HoodieInstant.State.INFLIGHT, commitActionType, commitTime));
    metaClient.getActiveTimeline().saveAsComplete(new HoodieInstant(HoodieInstant.State.INFLIGHT, commitActionType, commitTime), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
}
}
Also used: HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)
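
A hypothetical call site for this helper (the checkpoint key string and surrounding names are assumptions, not code from the Hudi test base): seed the timeline with a synthetic commit whose extra metadata carries a DeltaStreamer checkpoint, as the carry-over test in Example 61 expects.

// Assumes java.util.HashMap/Map are imported and a metaClient is in scope.
Map<String, String> extraMetadata = new HashMap<>();
// "deltastreamer.checkpoint.key" is assumed here to be the CHECKPOINT_KEY value.
extraMetadata.put("deltastreamer.checkpoint.key", "00008");
addCommitToTimeline(metaClient, WriteOperationType.UPSERT, HoodieTimeline.COMMIT_ACTION, extraMetadata);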

Example 63 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class BaseCommitActionExecutor, the method saveWorkloadProfileMetadataToInflight:

/**
 * Save the workload profile in an intermediate file (here re-using commit files). This is useful when performing
 * rollback for MOR tables: only updates are recorded in the workload profile metadata, since updates to log blocks
 * are unknown across batches, while inserts (which are new parquet files) are rolled back based on commit time.
 * TODO: create a new WorkloadProfile metadata file instead of using HoodieCommitMetadata.
 */
void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, String instantTime) throws HoodieCommitException {
    try {
        HoodieCommitMetadata metadata = new HoodieCommitMetadata();
        profile.getOutputPartitionPaths().forEach(path -> {
            WorkloadStat partitionStat = profile.getOutputWorkloadStat(path);
            HoodieWriteStat insertStat = new HoodieWriteStat();
            insertStat.setNumInserts(partitionStat.getNumInserts());
            insertStat.setFileId("");
            insertStat.setPrevCommit(HoodieWriteStat.NULL_COMMIT);
            metadata.addWriteStat(path, insertStat);
            Map<String, Pair<String, Long>> updateLocationMap = partitionStat.getUpdateLocationToCount();
            Map<String, Pair<String, Long>> insertLocationMap = partitionStat.getInsertLocationToCount();
            Stream.concat(updateLocationMap.keySet().stream(), insertLocationMap.keySet().stream()).distinct().forEach(fileId -> {
                HoodieWriteStat writeStat = new HoodieWriteStat();
                writeStat.setFileId(fileId);
                Pair<String, Long> updateLocation = updateLocationMap.get(fileId);
                Pair<String, Long> insertLocation = insertLocationMap.get(fileId);
                // TODO: is writing baseCommitTime possible here?
                writeStat.setPrevCommit(updateLocation != null ? updateLocation.getKey() : insertLocation.getKey());
                if (updateLocation != null) {
                    writeStat.setNumUpdateWrites(updateLocation.getValue());
                }
                if (insertLocation != null) {
                    writeStat.setNumInserts(insertLocation.getValue());
                }
                metadata.addWriteStat(path, writeStat);
            });
        });
        metadata.setOperationType(operationType);
        HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
        String commitActionType = getCommitActionType();
        HoodieInstant requested = new HoodieInstant(State.REQUESTED, commitActionType, instantTime);
        activeTimeline.transitionRequestedToInflight(requested, Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)), config.shouldAllowMultiWriteOnSameInstant());
    } catch (IOException io) {
        throw new HoodieCommitException("Failed to commit " + instantTime + " unable to save inflight metadata ", io);
    }
}
Also used: HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), HoodieWriteStat (org.apache.hudi.common.model.HoodieWriteStat), HoodieCommitException (org.apache.hudi.exception.HoodieCommitException), WorkloadStat (org.apache.hudi.table.WorkloadStat), HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline), IOException (java.io.IOException), HoodieIOException (org.apache.hudi.exception.HoodieIOException), Pair (org.apache.hudi.common.util.collection.Pair)
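
Reading the profile back during rollback is symmetric. A minimal sketch, assuming an inflight instant exists for instantTime and reusing this executor's table field; this is not Hudi's actual rollback code.

// Sketch: recover the workload profile saved above from the inflight commit
// file and list the fileIds it touched (fromBytes throws IOException, so the
// caller must handle it).
HoodieInstant inflight = new HoodieInstant(State.INFLIGHT, getCommitActionType(), instantTime);
HoodieCommitMetadata profileMetadata = HoodieCommitMetadata.fromBytes(
    table.getActiveTimeline().getInstantDetails(inflight).get(), HoodieCommitMetadata.class);
profileMetadata.getPartitionToWriteStats().forEach((partition, stats) ->
    stats.forEach(stat -> System.out.println(partition + " -> " + stat.getFileId())));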

Example 64 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class RunCompactionActionExecutor, the method execute:

@Override
public HoodieWriteMetadata<HoodieData<WriteStatus>> execute() {
    HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline();
    compactor.preCompact(table, pendingCompactionTimeline, instantTime);
    HoodieWriteMetadata<HoodieData<WriteStatus>> compactionMetadata = new HoodieWriteMetadata<>();
    try {
        // Generate the compaction plan.
        // TODO: should support configurable commit metadata.
        HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan(table.getMetaClient(), instantTime);
        HoodieData<WriteStatus> statuses = compactor.compact(context, compactionPlan, table, config, instantTime, compactionHandler);
        compactor.maybePersist(statuses, config);
        context.setJobStatus(this.getClass().getSimpleName(), "Preparing compaction metadata");
        List<HoodieWriteStat> updateStatusMap = statuses.map(WriteStatus::getStat).collectAsList();
        HoodieCommitMetadata metadata = new HoodieCommitMetadata(true);
        for (HoodieWriteStat stat : updateStatusMap) {
            metadata.addWriteStat(stat.getPartitionPath(), stat);
        }
        metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, config.getSchema());
        compactionMetadata.setWriteStatuses(statuses);
        compactionMetadata.setCommitted(false);
        compactionMetadata.setCommitMetadata(Option.of(metadata));
    } catch (IOException e) {
        throw new HoodieCompactionException("Could not compact " + config.getBasePath(), e);
    }
    return compactionMetadata;
}
Also used: HoodieData (org.apache.hudi.common.data.HoodieData), HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), HoodieCompactionException (org.apache.hudi.exception.HoodieCompactionException), HoodieWriteStat (org.apache.hudi.common.model.HoodieWriteStat), HoodieCompactionPlan (org.apache.hudi.avro.model.HoodieCompactionPlan), HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline), HoodieWriteMetadata (org.apache.hudi.table.action.HoodieWriteMetadata), IOException (java.io.IOException), WriteStatus (org.apache.hudi.client.WriteStatus)
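
The executor returns with setCommitted(false); the caller finalizes the compaction commit later. A small sketch of inspecting the pending result (the executor variable and the surrounding caller are assumptions):

// Assumed caller: run the compaction and inspect the not-yet-committed result.
HoodieWriteMetadata<HoodieData<WriteStatus>> result = executor.execute();
HoodieCommitMetadata pending = result.getCommitMetadata().get();
System.out.println("Compaction wrote " + pending.fetchTotalRecordsWritten()
    + " records with " + pending.fetchTotalWriteErrors() + " write errors (not yet committed)");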

Example 65 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class TestHoodieClientOnCopyOnWriteStorage, the method testCommitWritesRelativePaths:

/**
 * Test to ensure commit metadata points to valid files.
 */
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testCommitWritesRelativePaths(boolean populateMetaFields) throws Exception {
    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false);
    addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build())) {
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
        HoodieSparkTable table = HoodieSparkTable.create(cfgBuilder.build(), context, metaClient);
        String instantTime = "000";
        client.startCommitWithTime(instantTime);
        List<HoodieRecord> records = dataGen.generateInserts(instantTime, 200);
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
        JavaRDD<WriteStatus> result = client.bulkInsert(writeRecords, instantTime);
        assertTrue(client.commit(instantTime, result), "Commit should succeed");
        assertTrue(testTable.commitExists(instantTime), "After explicit commit, commit file should be created");
        // Get base file paths from commit metadata
        String actionType = metaClient.getCommitActionType();
        HoodieInstant commitInstant = new HoodieInstant(false, actionType, instantTime);
        HoodieTimeline commitTimeline = metaClient.getCommitTimeline().filterCompletedInstants();
        HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commitInstant).get(), HoodieCommitMetadata.class);
        String basePath = table.getMetaClient().getBasePath();
        Collection<String> commitPathNames = commitMetadata.getFileIdAndFullPaths(basePath).values();
        // Read from commit file
        try (FSDataInputStream inputStream = fs.open(testTable.getCommitFilePath(instantTime))) {
            String everything = FileIOUtils.readAsUTFString(inputStream);
            HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything, HoodieCommitMetadata.class);
            HashMap<String, String> paths = metadata.getFileIdAndFullPaths(basePath);
            // Compare values in both to make sure they are equal.
            for (String pathName : paths.values()) {
                assertTrue(commitPathNames.contains(pathName));
            }
        }
    }
}
Also used: HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient), HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), WriteStatus (org.apache.hudi.client.WriteStatus), HoodieSparkTable (org.apache.hudi.table.HoodieSparkTable), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), MethodSource (org.junit.jupiter.params.provider.MethodSource)
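
As a follow-up check (an assumption, not part of the original test), the full paths recovered from the commit metadata can be verified to exist on the file system, which is what "points to valid files" ultimately means:

// Sketch: every path recorded in the commit metadata should resolve to a real
// file under the table base path. Assumes org.apache.hadoop.fs.Path is imported.
for (String fullPath : commitMetadata.getFileIdAndFullPaths(basePath).values()) {
    assertTrue(fs.exists(new Path(fullPath)), "Commit metadata references missing file: " + fullPath);
}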

Aggregations

Classes most frequently used together with HoodieCommitMetadata across the project (usage counts):

- HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 139
- HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 64
- ArrayList (java.util.ArrayList): 54
- HashMap (java.util.HashMap): 49
- List (java.util.List): 48
- HoodieWriteStat (org.apache.hudi.common.model.HoodieWriteStat): 44
- IOException (java.io.IOException): 42
- Test (org.junit.jupiter.api.Test): 41
- HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 40
- Map (java.util.Map): 38
- Path (org.apache.hadoop.fs.Path): 36
- HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline): 34
- ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 34
- File (java.io.File): 26
- HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 26
- Option (org.apache.hudi.common.util.Option): 25
- Schema (org.apache.avro.Schema): 22
- HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 21
- Collectors (java.util.stream.Collectors): 20
- HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 20