Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method testUpsertsInternal.
/**
 * Test one of the HoodieWriteClient upsert (prepped) APIs.
 *
 * @param config Write config
 * @param writeFn One of the Hoodie write function APIs
 * @param isPrepped Whether the prepped variant of the write API is exercised
 * @throws Exception in case of error
 */
private void testUpsertsInternal(HoodieWriteConfig config, Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPrepped) throws Exception {
// Force using older timeline layout
HoodieWriteConfig hoodieWriteConfig = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY)
    .withRollbackUsingMarkers(true)
    .withProps(config.getProps())
    .withTimelineLayoutVersion(VERSION_0)
    .build();
HoodieTableMetaClient.withPropertyBuilder()
    .fromMetaClient(metaClient)
    .setTimelineLayoutVersion(VERSION_0)
    .setPopulateMetaFields(config.populateMetaFields())
    .initTable(metaClient.getHadoopConf(), metaClient.getBasePath());
SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig);
// Write 1 (only inserts)
String newCommitTime = "001";
String initCommitTime = "000";
int numRecords = 200;
insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords, SparkRDDWriteClient::insert, isPrepped, true, numRecords, config.populateMetaFields());
// Write 2 (updates)
String prevCommitTime = newCommitTime;
newCommitTime = "004";
numRecords = 100;
String commitTimeBetweenPrevAndNew = "002";
updateBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime,
    Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), initCommitTime, numRecords, writeFn,
    isPrepped, true, numRecords, 200, 2, config.populateMetaFields());
// Delete 1
prevCommitTime = newCommitTime;
newCommitTime = "005";
numRecords = 50;
deleteBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime, initCommitTime, numRecords, SparkRDDWriteClient::delete, isPrepped, true, 0, 150, config.populateMetaFields());
// Now simulate an upgrade and perform a restore operation
HoodieWriteConfig newConfig = getConfigBuilder().withProps(config.getProps()).withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION).build();
client = getHoodieWriteClient(newConfig);
client.savepoint("004", "user1", "comment1");
client.restoreToInstant("004");
assertFalse(metaClient.reloadActiveTimeline().getRollbackTimeline().lastInstant().isPresent());
// Check the entire dataset has all records still
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
for (int i = 0; i < fullPartitionPaths.length; i++) {
fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
}
assertEquals(200, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), "Must contain " + 200 + " records");
// Perform Delete again on upgraded dataset.
prevCommitTime = newCommitTime;
newCommitTime = "006";
numRecords = 50;
deleteBatch(newConfig, client, newCommitTime, prevCommitTime, initCommitTime, numRecords, SparkRDDWriteClient::delete, isPrepped, true, 0, 150);
HoodieActiveTimeline activeTimeline = new HoodieActiveTimeline(metaClient, false);
List<HoodieInstant> instants = activeTimeline.getCommitTimeline().getInstants().collect(Collectors.toList());
assertEquals(5, instants.size());
assertEquals(new HoodieInstant(COMPLETED, COMMIT_ACTION, "001"), instants.get(0));
assertEquals(new HoodieInstant(COMPLETED, COMMIT_ACTION, "004"), instants.get(1));
// The new timeline format should contain all states of the instants
assertEquals(new HoodieInstant(REQUESTED, COMMIT_ACTION, "006"), instants.get(2));
assertEquals(new HoodieInstant(INFLIGHT, COMMIT_ACTION, "006"), instants.get(3));
assertEquals(new HoodieInstant(COMPLETED, COMMIT_ACTION, "006"), instants.get(4));
final HoodieWriteConfig cfg = hoodieWriteConfig;
final String instantTime = "007";
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build();
String basePathStr = basePath;
HoodieTable table = getHoodieTable(metaClient, cfg);
String extension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
jsc.parallelize(Arrays.asList(1)).map(e -> {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
    metaClient.getActiveTimeline().getInstantDetails(
        metaClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get()).get(),
    HoodieCommitMetadata.class);
String filePath = commitMetadata.getPartitionToWriteStats().values().stream()
    .flatMap(w -> w.stream())
    .filter(s -> s.getPath().endsWith(extension))
    .findAny().map(ee -> ee.getPath()).orElse(null);
String partitionPath = commitMetadata.getPartitionToWriteStats().values().stream()
    .flatMap(w -> w.stream())
    .filter(s -> s.getPath().endsWith(extension))
    .findAny().map(ee -> ee.getPartitionPath()).orElse(null);
Path baseFilePath = new Path(basePathStr, filePath);
HoodieBaseFile baseFile = new HoodieBaseFile(baseFilePath.toString());
try {
HoodieMergeHandle handle = new HoodieMergeHandle(cfg, instantTime, table, new HashMap<>(), partitionPath,
    FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier(),
    config.populateMetaFields() ? Option.empty()
        : Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))));
WriteStatus writeStatus = new WriteStatus(false, 0.0);
writeStatus.setStat(new HoodieWriteStat());
writeStatus.getStat().setNumWrites(0);
handle.performMergeDataValidationCheck(writeStatus);
} catch (HoodieCorruptedDataException e1) {
fail("Exception not expected because merge validation check is disabled");
}
try {
final String newInstantTime = "006";
cfg.getProps().setProperty("hoodie.merge.data.validation.enabled", "true");
HoodieWriteConfig cfg2 = HoodieWriteConfig.newBuilder().withProps(cfg.getProps()).build();
HoodieMergeHandle handle = new HoodieMergeHandle(cfg2, newInstantTime, table, new HashMap<>(), partitionPath,
    FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier(),
    config.populateMetaFields() ? Option.empty()
        : Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))));
WriteStatus writeStatus = new WriteStatus(false, 0.0);
writeStatus.setStat(new HoodieWriteStat());
writeStatus.getStat().setNumWrites(0);
handle.performMergeDataValidationCheck(writeStatus);
fail("The above line should have thrown an exception");
} catch (HoodieCorruptedDataException e2) {
// expected
}
return true;
}).collect();
}
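For reference, the timeline inspection at the end of this test can be captured in a small standalone helper. This is a minimal sketch, not part of the test class: the method name and the hadoopConf/basePath parameters are illustrative, and the import paths are assumed from the Hudi 0.x module layout; the timeline calls themselves are the ones exercised above.

import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;

// Illustrative helper: list the completed instants on the commit timeline of a table,
// mirroring the assertions at the end of testUpsertsInternal.
static List<HoodieInstant> listCompletedCommits(Configuration hadoopConf, String basePath) {
  HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
      .setConf(hadoopConf)
      .setBasePath(basePath)
      .build();
  // Construct the active timeline the same way the test does (same boolean argument).
  HoodieActiveTimeline activeTimeline = new HoodieActiveTimeline(metaClient, false);
  return activeTimeline.getCommitTimeline()
      .filterCompletedInstants()
      .getInstants()
      .collect(Collectors.toList());
}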
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
The class TestMultiFS, method readLocalWriteHDFS.
@Test
public void readLocalWriteHDFS() throws Exception {
// Initialize table and filesystem
HoodieTableMetaClient.withPropertyBuilder().setTableType(tableType).setTableName(tableName).setPayloadClass(HoodieAvroPayload.class).initTable(hadoopConf, dfsBasePath);
// Create write client to write some records in
HoodieWriteConfig cfg = getHoodieWriteConfig(dfsBasePath);
HoodieWriteConfig localConfig = getHoodieWriteConfig(tablePath);
HoodieTableMetaClient.withPropertyBuilder()
    .setTableType(tableType)
    .setTableName(tableName)
    .setPayloadClass(HoodieAvroPayload.class)
    .setRecordKeyFields(localConfig.getProps().getProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()))
    .setPartitionFields(localConfig.getProps().getProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()))
    .initTable(hadoopConf, tablePath);
try (SparkRDDWriteClient hdfsWriteClient = getHoodieWriteClient(cfg);
SparkRDDWriteClient localWriteClient = getHoodieWriteClient(localConfig)) {
// Write generated data to hdfs (only inserts)
String readCommitTime = hdfsWriteClient.startCommit();
LOG.info("Starting commit " + readCommitTime);
List<HoodieRecord> records = dataGen.generateInserts(readCommitTime, 100);
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
hdfsWriteClient.upsert(writeRecords, readCommitTime);
// Read from hdfs
FileSystem fs = FSUtils.getFs(dfsBasePath, HoodieTestUtils.getDefaultHadoopConf());
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(dfsBasePath).build();
HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
Dataset<Row> readRecords = HoodieClientTestUtils.readCommit(dfsBasePath, sqlContext, timeline, readCommitTime);
assertEquals(readRecords.count(), records.size(), "Should contain 100 records");
// Write to local
HoodieTableMetaClient.withPropertyBuilder().setTableType(tableType).setTableName(tableName).setPayloadClass(HoodieAvroPayload.class).initTable(hadoopConf, tablePath);
String writeCommitTime = localWriteClient.startCommit();
LOG.info("Starting write commit " + writeCommitTime);
List<HoodieRecord> localRecords = dataGen.generateInserts(writeCommitTime, 100);
JavaRDD<HoodieRecord> localWriteRecords = jsc.parallelize(localRecords, 1);
LOG.info("Writing to path: " + tablePath);
localWriteClient.upsert(localWriteRecords, writeCommitTime);
LOG.info("Reading from path: " + tablePath);
fs = FSUtils.getFs(tablePath, HoodieTestUtils.getDefaultHadoopConf());
metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(tablePath).build();
timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
Dataset<Row> localReadRecords = HoodieClientTestUtils.readCommit(tablePath, sqlContext, timeline, writeCommitTime);
assertEquals(localReadRecords.count(), localRecords.size(), "Should contain 100 records");
}
}
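The HDFS and local read paths in this test differ only in the base path, so the shared pattern can be sketched once. The helper name and parameters below are made up for illustration, and the import paths (notably for FSUtils and HoodieClientTestUtils) are assumed from the Hudi 0.x test layout; the calls mirror the ones in the test.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.testutils.HoodieClientTestUtils;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

// Illustrative helper: resolve the filesystem for an arbitrary base path (HDFS or local),
// rebuild the commit timeline from the table's meta client, and read back the records
// written by a single commit.
static Dataset<Row> readCommitFromAnyFs(String basePath, String commitTime,
    Configuration conf, SQLContext sqlContext) {
  FileSystem fs = FSUtils.getFs(basePath, conf);
  HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
      .setConf(fs.getConf())
      .setBasePath(basePath)
      .build();
  HoodieTimeline commitTimeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
  return HoodieClientTestUtils.readCommit(basePath, sqlContext, commitTimeline, commitTime);
}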
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
The class SparkBootstrapCommitActionExecutor, method commit.
protected void commit(Option<Map<String, String>> extraMetadata, HoodieWriteMetadata<HoodieData<WriteStatus>> result, List<HoodieWriteStat> stats) {
String actionType = table.getMetaClient().getCommitActionType();
LOG.info("Committing " + instantTime + ", action Type " + actionType);
// Create a Hoodie table which encapsulates the commits and files visible
HoodieSparkTable table = HoodieSparkTable.create(config, context);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
HoodieCommitMetadata metadata = new HoodieCommitMetadata();
result.setCommitted(true);
stats.forEach(stat -> metadata.addWriteStat(stat.getPartitionPath(), stat));
result.setWriteStats(stats);
// Finalize write
finalizeWrite(instantTime, stats, result);
// add in extra metadata
if (extraMetadata.isPresent()) {
extraMetadata.get().forEach(metadata::addMetadata);
}
metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, getSchemaToStoreInCommit());
metadata.setOperationType(operationType);
writeTableMetadata(metadata, actionType);
try {
activeTimeline.saveAsComplete(new HoodieInstant(true, actionType, instantTime), Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
LOG.info("Committed " + instantTime);
} catch (IOException e) {
throw new HoodieCommitException("Failed to complete commit " + config.getBasePath() + " at time " + instantTime, e);
}
result.setCommitMetadata(Option.of(metadata));
}
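The metadata assembly in this commit() can be summarized as a short sketch. The helper below is not part of SparkBootstrapCommitActionExecutor; its name and parameter list are illustrative, and the WriteOperationType type and import paths are assumed from the Hudi 0.x layout. Only calls that appear in the method above are used.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.util.Option;

// Illustrative helper: collect per-file write stats and optional extra metadata into a
// HoodieCommitMetadata, record the schema and operation type, and serialize it to the
// JSON bytes that saveAsComplete expects.
static byte[] buildCommitMetadataBytes(List<HoodieWriteStat> stats,
    Option<Map<String, String>> extraMetadata, String schema,
    WriteOperationType operationType) throws IOException {
  HoodieCommitMetadata metadata = new HoodieCommitMetadata();
  stats.forEach(stat -> metadata.addWriteStat(stat.getPartitionPath(), stat));
  if (extraMetadata.isPresent()) {
    extraMetadata.get().forEach(metadata::addMetadata);
  }
  metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, schema);
  metadata.setOperationType(operationType);
  return metadata.toJsonString().getBytes(StandardCharsets.UTF_8);
}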
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
The class BaseSparkCommitActionExecutor, method commit.
protected void commit(Option<Map<String, String>> extraMetadata, HoodieWriteMetadata<HoodieData<WriteStatus>> result, List<HoodieWriteStat> writeStats) {
String actionType = getCommitActionType();
LOG.info("Committing " + instantTime + ", action Type " + actionType + ", operation Type " + operationType);
result.setCommitted(true);
result.setWriteStats(writeStats);
// Finalize write
finalizeWrite(instantTime, writeStats, result);
try {
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
HoodieCommitMetadata metadata = result.getCommitMetadata().get();
writeTableMetadata(metadata, actionType);
activeTimeline.saveAsComplete(new HoodieInstant(true, getCommitActionType(), instantTime), Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
LOG.info("Committed " + instantTime);
result.setCommitMetadata(Option.of(metadata));
} catch (IOException e) {
throw new HoodieCommitException("Failed to complete commit " + config.getBasePath() + " at time " + instantTime, e);
}
}
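Both commit() implementations finish the same way: the inflight instant is transitioned to completed on the active timeline, with the serialized commit metadata attached. A minimal sketch of that final step, with placeholder names and import paths assumed from the Hudi 0.x layout:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieCommitException;

// Illustrative helper: mark an inflight instant as complete, attaching the commit metadata.
static void completeInstant(HoodieActiveTimeline activeTimeline, String actionType,
    String instantTime, HoodieCommitMetadata metadata) {
  try {
    activeTimeline.saveAsComplete(
        // 'true' selects the inflight instant, exactly as in the executors above
        new HoodieInstant(true, actionType, instantTime),
        Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
  } catch (IOException e) {
    throw new HoodieCommitException("Failed to complete commit at time " + instantTime, e);
  }
}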
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
The class RollbackUtils, method generateRollbackRequestsUsingFileListingMOR.
/**
* Generate all rollback requests needed to roll back this action for a MOR table, without actually performing the rollback.
*
* @param instantToRollback Instant to Rollback
* @param table instance of {@link HoodieTable} to use.
* @param context instance of {@link HoodieEngineContext} to use.
* @return list of rollback requests
*/
public static List<ListingBasedRollbackRequest> generateRollbackRequestsUsingFileListingMOR(HoodieInstant instantToRollback, HoodieTable table, HoodieEngineContext context) throws IOException {
String commit = instantToRollback.getTimestamp();
HoodieWriteConfig config = table.getConfig();
List<String> partitions = FSUtils.getAllPartitionPaths(context, table.getMetaClient().getBasePath(), false, false);
if (partitions.isEmpty()) {
return new ArrayList<>();
}
int sparkPartitions = Math.max(Math.min(partitions.size(), config.getRollbackParallelism()), 1);
context.setJobStatus(RollbackUtils.class.getSimpleName(), "Generate all rollback requests");
return context.flatMap(partitions, partitionPath -> {
HoodieActiveTimeline activeTimeline = table.getMetaClient().reloadActiveTimeline();
List<ListingBasedRollbackRequest> partitionRollbackRequests = new ArrayList<>();
switch(instantToRollback.getAction()) {
case HoodieTimeline.COMMIT_ACTION:
case HoodieTimeline.REPLACE_COMMIT_ACTION:
LOG.info("Rolling back commit action.");
partitionRollbackRequests.add(ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath));
break;
case HoodieTimeline.COMPACTION_ACTION:
// If a deltacommit is present after the compaction commit being rolled back, delete only the newly
// created base files; otherwise the rollback must also delete any log files written with this
// compaction commit as their base commit.
boolean higherDeltaCommits = !activeTimeline.getDeltaCommitTimeline().filterCompletedInstants().findInstantsAfter(commit, 1).empty();
if (higherDeltaCommits) {
// A higher deltacommit exists, which means the compaction was only scheduled and has not yet
// completed while writers kept appending. In this scenario we should delete only the newly created
// base files, and not the log files created with this instant as baseCommit, since updates would
// have been written to those log files.
LOG.info("Rolling back compaction. There are higher delta commits. So only deleting data files");
partitionRollbackRequests.add(ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataFilesOnlyAction(partitionPath));
} else {
// No deltacommits present after this compaction commit (inflight or requested). In this case, we
// can also delete any log files that were created with this compaction commit as base
// commit.
LOG.info("Rolling back compaction plan. There are NO higher delta commits. So deleting both data and" + " log files");
partitionRollbackRequests.add(ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath));
}
break;
case HoodieTimeline.DELTA_COMMIT_ACTION:
// --------------------------------------------------------------------------------------------------
// (A) The following cases are possible if index.canIndexLogFiles and/or index.isGlobal
// --------------------------------------------------------------------------------------------------
// (A.1) Failed first commit - Inserts were written to log files and HoodieWriteStat has no entries. In
// this scenario we would want to delete these log files.
// (A.2) Failed recurring commit - Inserts/Updates written to log files. In this scenario,
// HoodieWriteStat will have the baseCommitTime for the first log file written, add rollback blocks.
// (A.3) Rollback triggered for first commit - Inserts were written to the log files but the commit is
// being reverted. In this scenario, HoodieWriteStat will be `null` for the attribute prevCommitTime,
// and hence we will end up deleting these log files. This is done so there are no orphan log files
// lying around.
// (A.4) Rollback triggered for recurring commits - Inserts/Updates are being rolled back, the actions
// taken in this scenario is a combination of (A.2) and (A.3)
// ---------------------------------------------------------------------------------------------------
// (B) The following cases are possible if !index.canIndexLogFiles and/or !index.isGlobal
// ---------------------------------------------------------------------------------------------------
// (B.1) Failed first commit - Inserts were written to base files and HoodieWriteStat has no entries.
// In this scenario, we delete all the base files written for the failed commit.
// (B.2) Failed recurring commits - Inserts were written to base files and updates to log files. In
// this scenario, perform (A.1) and for updates written to log files, write rollback blocks.
// (B.3) Rollback triggered for first commit - Same as (B.1)
// (B.4) Rollback triggered for recurring commits - Same as (B.2) plus we need to delete the log files
// as well if the base file gets deleted.
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(table.getMetaClient().getCommitTimeline().getInstantDetails(instantToRollback).get(), HoodieCommitMetadata.class);
// In case all data was inserts and the commit failed, delete the file belonging to that commit
// We do not know fileIds for inserts (first inserts are either log files or base files),
// delete all files for the corresponding failed commit, if present (same as COW)
partitionRollbackRequests.add(ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath));
// append rollback blocks for updates and inserts as A.2 and B.2
if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) {
partitionRollbackRequests.addAll(generateAppendRollbackBlocksAction(partitionPath, instantToRollback, commitMetadata, table));
}
break;
default:
break;
}
return partitionRollbackRequests.stream();
}, Math.min(partitions.size(), sparkPartitions)).stream().filter(Objects::nonNull).collect(Collectors.toList());
}
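The compaction branch above hinges on a single timeline check: does any completed deltacommit exist after the instant being rolled back? A minimal sketch of that check, using only the calls shown in the method (the helper name and parameters are illustrative):

import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;

// Illustrative helper: true if at least one completed deltacommit exists after the given
// compaction instant; in that case only base files are deleted, otherwise log files with
// this instant as baseCommit are deleted as well.
static boolean hasLaterDeltaCommits(HoodieTableMetaClient metaClient, String compactionInstantTime) {
  HoodieActiveTimeline activeTimeline = metaClient.reloadActiveTimeline();
  return !activeTimeline.getDeltaCommitTimeline()
      .filterCompletedInstants()
      .findInstantsAfter(compactionInstantTime, 1)
      .empty();
}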