Usage example of org.apache.hudi.client.SparkRDDWriteClient in the Apache Hudi project: method testTableOperationsImpl of the class TestHoodieBackedMetadata.
/**
 * Test all major table operations (bulk insert, insert, upsert, compaction, delete,
 * clean, restore) with the given table, config and context, validating the metadata
 * table after each significant step.
 *
 * @param engineContext - Engine context
 * @param writeConfig - Write config
 * @throws IOException if any filesystem interaction during validation fails
 */
private void testTableOperationsImpl(HoodieSparkEngineContext engineContext, HoodieWriteConfig writeConfig) throws IOException {
  try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) {
    // Write 1 (Bulk insert)
    String newCommitTime = "0000001";
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
    client.startCommitWithTime(newCommitTime);
    List<WriteStatus> writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect();
    assertNoWriteErrors(writeStatuses);
    validateMetadata(client);
    // Write 2 (inserts)
    newCommitTime = "0000002";
    client.startCommitWithTime(newCommitTime);
    // Validate while a commit is inflight to ensure metadata reads ignore it.
    validateMetadata(client);
    records = dataGen.generateInserts(newCommitTime, 20);
    writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
    assertNoWriteErrors(writeStatuses);
    validateMetadata(client);
    // Write 3 (updates)
    newCommitTime = "0000003";
    client.startCommitWithTime(newCommitTime);
    records = dataGen.generateUniqueUpdates(newCommitTime, 10);
    writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
    assertNoWriteErrors(writeStatuses);
    // Write 4 (updates and inserts)
    newCommitTime = "0000004";
    client.startCommitWithTime(newCommitTime);
    records = dataGen.generateUpdates(newCommitTime, 10);
    writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
    assertNoWriteErrors(writeStatuses);
    validateMetadata(client);
    // Compaction
    if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) {
      newCommitTime = "0000005";
      client.scheduleCompactionAtInstant(newCommitTime, Option.empty());
      client.compact(newCommitTime);
      validateMetadata(client);
    }
    // Write 5 (updates and inserts)
    newCommitTime = "0000006";
    client.startCommitWithTime(newCommitTime);
    records = dataGen.generateUpdates(newCommitTime, 5);
    writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
    assertNoWriteErrors(writeStatuses);
    // Compaction
    if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) {
      newCommitTime = "0000007";
      client.scheduleCompactionAtInstant(newCommitTime, Option.empty());
      client.compact(newCommitTime);
      validateMetadata(client);
    }
    // Deletes
    newCommitTime = "0000009";
    records = dataGen.generateDeletes(newCommitTime, 10);
    JavaRDD<HoodieKey> deleteKeys = jsc.parallelize(records, 1).map(r -> r.getKey());
    client.startCommitWithTime(newCommitTime);
    client.delete(deleteKeys, newCommitTime);
    // Clean. The clean action needs its own instant time: reusing "0000009" (the
    // delete's instant) would place two actions at the same timestamp on the timeline.
    newCommitTime = "0000010";
    client.clean(newCommitTime);
    validateMetadata(client);
    // Restore to before the deletes and the clean.
    client.restoreToInstant("0000006");
    validateMetadata(client);
  }
}
Usage example of org.apache.hudi.client.SparkRDDWriteClient in the Apache Hudi project: method testRollbackDuringUpgradeForDoubleLocking of the class TestHoodieBackedMetadata.
/**
 * When the table needs to be upgraded and multi-writer is enabled, Hudi rolls back partial
 * commits. The upgrade itself happens within a lock, and hence the rollback should not
 * attempt to lock again.
 *
 * @throws IOException if filesystem access fails
 * @throws InterruptedException if the test thread is interrupted
 */
@Test
public void testRollbackDuringUpgradeForDoubleLocking() throws IOException, InterruptedException {
  init(HoodieTableType.COPY_ON_WRITE, false);
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  // Perform a commit. This should bootstrap the metadata table with latest version.
  List<HoodieRecord> records;
  JavaRDD<WriteStatus> writeStatuses;
  String commitTimestamp = HoodieActiveTimeline.createNewInstantTime();
  Properties properties = new Properties();
  properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks");
  properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "3");
  properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
  // Lazy failed-writes cleaning + OCC so partial commits survive until rolled back under lock.
  HoodieWriteConfig writeConfig = getWriteConfigBuilder(false, true, false).withCompactionConfig(HoodieCompactionConfig.newBuilder().withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withAutoClean(false).build()).withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL).withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class).build()).withProperties(properties).build();
  try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) {
    records = dataGen.generateInserts(commitTimestamp, 5);
    client.startCommitWithTime(commitTimestamp);
    writeStatuses = client.insert(jsc.parallelize(records, 1), commitTimestamp);
    client.commit(commitTimestamp, writeStatuses);
  }
  // Metadata table should have been bootstrapped
  assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist");
  FileStatus oldStatus = fs.getFileStatus(new Path(metadataTableBasePath));
  // Trigger a partial commit: start a commit and write, but never call commit().
  metaClient.reloadActiveTimeline();
  commitTimestamp = HoodieActiveTimeline.createNewInstantTime();
  try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) {
    records = dataGen.generateInserts(commitTimestamp, 5);
    client.startCommitWithTime(commitTimestamp);
    writeStatuses = client.insert(jsc.parallelize(records, 1), commitTimestamp);
  }
  // set hoodie.table.version to 2 in hoodie.properties file
  changeTableVersion(HoodieTableVersion.TWO);
  writeConfig = getWriteConfigBuilder(true, true, false).withRollbackUsingMarkers(false).withCompactionConfig(HoodieCompactionConfig.newBuilder().withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withAutoClean(false).build()).withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL).withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class).build()).withProperties(properties).build();
  // With next commit the table should be re-bootstrapped and partial commit should be rolled back.
  metaClient.reloadActiveTimeline();
  commitTimestamp = HoodieActiveTimeline.createNewInstantTime();
  try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) {
    records = dataGen.generateInserts(commitTimestamp, 5);
    client.startCommitWithTime(commitTimestamp);
    writeStatuses = client.insert(jsc.parallelize(records, 1), commitTimestamp);
    assertNoWriteErrors(writeStatuses.collect());
  }
  initMetaClient();
  // JUnit 5 expects assertEquals(expected, actual); keep that order so failure messages read correctly.
  assertEquals(HoodieTableVersion.FOUR.versionCode(), metaClient.getTableConfig().getTableVersion().versionCode());
  assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist");
  FileStatus newStatus = fs.getFileStatus(new Path(metadataTableBasePath));
  // Re-bootstrap recreates the metadata table directory, so its mtime must advance.
  assertTrue(oldStatus.getModificationTime() < newStatus.getModificationTime());
}
Usage example of org.apache.hudi.client.SparkRDDWriteClient in the Apache Hudi project: method testUpgradeDowngrade of the class TestHoodieBackedMetadata.
/**
 * Verifies that the metadata table is deleted and re-bootstrapped when the dataset is
 * upgraded from an older table version, and deleted again when downgraded.
 *
 * @throws IOException if filesystem access fails
 */
@Test
public void testUpgradeDowngrade() throws IOException {
  init(HoodieTableType.COPY_ON_WRITE, false);
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  // Perform a commit. This should bootstrap the metadata table with latest version.
  List<HoodieRecord> records;
  List<WriteStatus> writeStatuses;
  String commitTimestamp = HoodieActiveTimeline.createNewInstantTime();
  HoodieWriteConfig writeConfig = getWriteConfig(true, true);
  try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) {
    records = dataGen.generateInserts(commitTimestamp, 5);
    client.startCommitWithTime(commitTimestamp);
    writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), commitTimestamp).collect();
    assertNoWriteErrors(writeStatuses);
  }
  // Metadata table should have been bootstrapped
  assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist");
  FileStatus oldStatus = fs.getFileStatus(new Path(metadataTableBasePath));
  // set hoodie.table.version to 2 in hoodie.properties file
  changeTableVersion(HoodieTableVersion.TWO);
  // With next commit the table should be deleted (as part of upgrade) and then re-bootstrapped automatically
  commitTimestamp = HoodieActiveTimeline.createNewInstantTime();
  metaClient.reloadActiveTimeline();
  FileStatus prevStatus = fs.getFileStatus(new Path(metadataTableBasePath));
  try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
    records = dataGen.generateInserts(commitTimestamp, 5);
    client.startCommitWithTime(commitTimestamp);
    writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), commitTimestamp).collect();
    assertNoWriteErrors(writeStatuses);
  }
  assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist");
  FileStatus currentStatus = fs.getFileStatus(new Path(metadataTableBasePath));
  // Re-bootstrap recreates the metadata table directory, so its mtime must advance.
  assertTrue(currentStatus.getModificationTime() > prevStatus.getModificationTime());
  initMetaClient();
  // JUnit 5 expects assertEquals(expected, actual); keep that order so failure messages read correctly.
  assertEquals(HoodieTableVersion.FOUR.versionCode(), metaClient.getTableConfig().getTableVersion().versionCode());
  assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist");
  FileStatus newStatus = fs.getFileStatus(new Path(metadataTableBasePath));
  assertTrue(oldStatus.getModificationTime() < newStatus.getModificationTime());
  // Test downgrade by running the downgrader
  new UpgradeDowngrade(metaClient, writeConfig, context, SparkUpgradeDowngradeHelper.getInstance()).run(HoodieTableVersion.TWO, null);
  assertEquals(HoodieTableVersion.TWO.versionCode(), metaClient.getTableConfig().getTableVersion().versionCode());
  // Downgrade to a pre-metadata-table version must remove the metadata table.
  assertFalse(fs.exists(new Path(metadataTableBasePath)), "Metadata table should not exist");
}
Usage example of org.apache.hudi.client.SparkRDDWriteClient in the Apache Hudi project: method testMultiWriterForDoubleLocking of the class TestHoodieBackedMetadata.
/**
 * Tests that when inline cleaning is enabled and auto commit is set to true, there is no
 * double locking: auto clean is triggered within post-commit, which already runs under a lock.
 *
 * @throws Exception on any unexpected failure
 */
@Test
public void testMultiWriterForDoubleLocking() throws Exception {
  init(HoodieTableType.COPY_ON_WRITE);
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  Properties properties = new Properties();
  properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks");
  properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
  // Inline auto-clean retaining 4 commits, OCC with an in-process lock provider.
  HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false).withCompactionConfig(HoodieCompactionConfig.newBuilder().withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withAutoClean(true).retainCommits(4).build()).withAutoCommit(false).withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL).withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class).build()).withProperties(properties).build();
  // try-with-resources: the original leaked the client; every other test in this class closes it.
  try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, writeConfig)) {
    String partitionPath = dataGen.getPartitionPaths()[0];
    for (int j = 0; j < 6; j++) {
      String newCommitTime = "000000" + j;
      List<HoodieRecord> records = dataGen.generateInsertsForPartition(newCommitTime, 100, partitionPath);
      writeClient.startCommitWithTime(newCommitTime);
      // Parameterize the RDD type instead of using a raw JavaRDD.
      JavaRDD<WriteStatus> writeStatuses = writeClient.insert(jsc.parallelize(records, 1), newCommitTime);
      writeClient.commit(newCommitTime, writeStatuses);
    }
    // Ensure all commits were synced to the Metadata Table
    HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build();
    LOG.warn("total commits in metadata table " + metadataMetaClient.getActiveTimeline().getCommitsTimeline().countInstants());
    // 6 commits and 2 cleaner commits. (JUnit 5 order: assertEquals(expected, actual).)
    assertEquals(8, metadataMetaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().countInstants());
    assertTrue(metadataMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().countInstants() <= 1);
    // Validation
    validateMetadata(writeClient);
  }
}
Usage example of org.apache.hudi.client.SparkRDDWriteClient in the Apache Hudi project: method testCleaningArchivingAndCompaction of the class TestHoodieBackedMetadata.
/**
 * Instants on the Metadata Table should be archived as per config, but we always keep at least
 * the number of instants as on the dataset.
 * <p>
 * The Metadata Table should be automatically compacted as per config.
 */
@Disabled
public void testCleaningArchivingAndCompaction() throws Exception {
  init(HoodieTableType.COPY_ON_WRITE, false);
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  final int maxDeltaCommitsBeforeCompaction = 3;
  HoodieWriteConfig config = getWriteConfigBuilder(true, true, false).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).archiveCommitsWith(40, 60).retainCommits(1).withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommitsBeforeCompaction).build()).withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 4).withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.NEVER).retainCommits(1).retainFileVersions(1).withAutoClean(true).withAsyncClean(false).build()).build();
  List<HoodieRecord> records;
  String newCommitTime;
  try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, config)) {
    // compaction will be attempted.
    for (int i = 0; i < maxDeltaCommitsBeforeCompaction - 2; ++i) {
      newCommitTime = HoodieActiveTimeline.createNewInstantTime();
      records = dataGen.generateInserts(newCommitTime, 5);
      client.startCommitWithTime(newCommitTime);
      client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
    }
    HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build();
    HoodieTableMetaClient datasetMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(config.getBasePath()).build();
    // There should not be any compaction yet and we have not performed more than maxDeltaCommitsBeforeCompaction
    // deltacommits (1 will be due to bootstrap).
    // All assertEquals below use the JUnit 5 (expected, actual) argument order.
    HoodieActiveTimeline metadataTimeline = metadataMetaClient.reloadActiveTimeline();
    assertEquals(0, metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants());
    assertEquals(maxDeltaCommitsBeforeCompaction - 1, metadataTimeline.getCommitsTimeline().filterCompletedInstants().countInstants());
    assertEquals(0, datasetMetaClient.getArchivedTimeline().reload().countInstants());
    // Next commit will initiate a compaction
    newCommitTime = HoodieActiveTimeline.createNewInstantTime();
    records = dataGen.generateInserts(newCommitTime, 5);
    client.startCommitWithTime(newCommitTime);
    client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
    metadataTimeline = metadataMetaClient.reloadActiveTimeline();
    assertEquals(1, metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants());
    assertEquals(maxDeltaCommitsBeforeCompaction + 1, metadataTimeline.getCommitsTimeline().filterCompletedInstants().countInstants());
    assertEquals(0, datasetMetaClient.getArchivedTimeline().reload().countInstants());
    // More than maxDeltaCommitsBeforeCompaction commits
    String inflightCommitTime = newCommitTime;
    for (int i = 0; i < maxDeltaCommitsBeforeCompaction + 1; ++i) {
      newCommitTime = HoodieActiveTimeline.createNewInstantTime();
      records = dataGen.generateInserts(newCommitTime, 5);
      client.startCommitWithTime(newCommitTime);
      client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
      if (i == 0) {
        // Mark this commit inflight so compactions don't take place
        FileCreateUtils.deleteCommit(basePath, newCommitTime);
        FileCreateUtils.createInflightCommit(basePath, newCommitTime);
        inflightCommitTime = newCommitTime;
      }
    }
    // Ensure no more compactions took place due to the leftover inflight commit
    metadataTimeline = metadataMetaClient.reloadActiveTimeline();
    assertEquals(1, metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants());
    assertEquals(((2 * maxDeltaCommitsBeforeCompaction) + (maxDeltaCommitsBeforeCompaction) + 1), metadataTimeline.getDeltaCommitTimeline().filterCompletedInstants().countInstants());
    // Complete commit
    FileCreateUtils.createCommit(basePath, inflightCommitTime);
    // Next commit should lead to compaction
    newCommitTime = HoodieActiveTimeline.createNewInstantTime();
    records = dataGen.generateInserts(newCommitTime, 5);
    client.startCommitWithTime(newCommitTime);
    client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
    // Ensure compactions took place
    metadataTimeline = metadataMetaClient.reloadActiveTimeline();
    assertEquals(2, metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants());
    assertEquals(((2 * maxDeltaCommitsBeforeCompaction) + (maxDeltaCommitsBeforeCompaction + 1) + 2), metadataTimeline.getDeltaCommitTimeline().filterCompletedInstants().countInstants());
    assertTrue(datasetMetaClient.getArchivedTimeline().reload().countInstants() > 0);
    validateMetadata(client);
  }
}
Aggregations