use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
the class TestHoodieBackedMetadata method testTableOperationsWithRestore.
/**
 * Test several table operations with restore. This test uses SparkRDDWriteClient.
 * Once restore support is ready in HoodieTestTable, rewrite this test to use it.
 */
@ParameterizedTest
@EnumSource(HoodieTableType.class)
public void testTableOperationsWithRestore(HoodieTableType tableType) throws Exception {
  init(tableType);
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false).withRollbackUsingMarkers(false).build();
  testTableOperationsImpl(engineContext, writeConfig);
}
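The restore itself presumably happens inside testTableOperationsImpl, which this method delegates to. For orientation, issuing a restore directly on a SparkRDDWriteClient looks roughly like this (a minimal, hedged sketch; writeClient and the instant "0000002" are hypothetical placeholders, not part of the test):

// Hedged sketch: revert the table to its state as of the given instant.
SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, writeConfig);
writeClient.restoreToInstant("0000002");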
use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
the class TestHoodieBackedMetadata method testMetadataMultiWriter.
/**
* Test multi-writer on metadata table with optimistic concurrency.
*/
@Test
public void testMetadataMultiWriter() throws Exception {
  init(HoodieTableType.COPY_ON_WRITE);
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  Properties properties = new Properties();
  properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks");
  properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "1000");
  properties.setProperty(LockConfiguration.LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY, "20");
  HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false)
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY)
          .withAutoClean(false).build())
      .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL)
      .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class).build())
      .withProperties(properties)
      .build();
  ExecutorService executors = Executors.newFixedThreadPool(dataGen.getPartitionPaths().length);
  // Create clients in advance
  SparkRDDWriteClient[] writeClients = new SparkRDDWriteClient[dataGen.getPartitionPaths().length];
  for (int i = 0; i < dataGen.getPartitionPaths().length; i++) {
    writeClients[i] = new SparkRDDWriteClient(engineContext, writeConfig);
  }
  // Parallel commits for separate partitions
  List<Future> futures = new LinkedList<>();
  for (int i = 0; i < dataGen.getPartitionPaths().length; ++i) {
    final int index = i;
    String newCommitTime = "000000" + (index + 1);
    Future future = executors.submit(() -> {
      List<HoodieRecord> records = dataGen.generateInsertsForPartition(newCommitTime, 100, dataGen.getPartitionPaths()[index]);
      SparkRDDWriteClient writeClient = writeClients[index];
      writeClient.startCommitWithTime(newCommitTime);
      List<WriteStatus> writeStatuses = writeClient.insert(jsc.parallelize(records, 1), newCommitTime).collect();
      assertNoWriteErrors(writeStatuses);
    });
    futures.add(future);
  }
  // Wait for all commits to complete
  for (Future future : futures) {
    future.get();
  }
  // Ensure all commits were synced to the Metadata Table
  HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build();
  assertEquals(metadataMetaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().countInstants(), 4);
  assertTrue(metadataMetaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "0000001")));
  assertTrue(metadataMetaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "0000002")));
  assertTrue(metadataMetaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "0000003")));
  // Compaction may occur if the commits completed in order
  assertTrue(metadataMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().countInstants() <= 1);
  // Validation
  validateMetadata(writeClients[0]);
}
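One detail worth noting: the snippet never shuts down the fixed thread pool, so a long-running suite would leak threads. A minimal addition once all futures have completed (assuming java.util.concurrent.TimeUnit is available; not part of the original test):

// Shut the pool down after all parallel commits finish.
executors.shutdown();
executors.awaitTermination(1, TimeUnit.MINUTES);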
use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
the class TestHoodieBackedMetadata method testReattemptOfFailedClusteringCommit.
/**
 * Suppose a clustering commit succeeded in the metadata table but failed before committing to the data table.
 * The next time clustering kicks in, Hudi will roll back the pending clustering (in the data table) and re-attempt
 * it with the same instant time. This test ensures the second attempt succeeds with the metadata table enabled.
 * The same applies to any table service whose instant time is fixed: however many times the operation fails, the
 * re-attempt is made with the same commit time. Clustering is used here to exercise the scenario.
 */
@Test
public void testReattemptOfFailedClusteringCommit() throws Exception {
  tableType = HoodieTableType.COPY_ON_WRITE;
  init(tableType);
  context = new HoodieSparkEngineContext(jsc);
  HoodieWriteConfig config = getSmallInsertWriteConfig(2000, TRIP_EXAMPLE_SCHEMA, 10, false);
  SparkRDDWriteClient client = getHoodieWriteClient(config);
  // Write 1 (bulk insert)
  String newCommitTime = "0000001";
  List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
  client.startCommitWithTime(newCommitTime);
  List<WriteStatus> writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
  assertNoWriteErrors(writeStatuses);
  validateMetadata(client);
  // Write 2 (inserts)
  newCommitTime = "0000002";
  client.startCommitWithTime(newCommitTime);
  records = dataGen.generateInserts(newCommitTime, 20);
  writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
  assertNoWriteErrors(writeStatuses);
  validateMetadata(client);
  // Set up the clustering config.
  HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder()
      .withClusteringMaxNumGroups(10)
      .withClusteringSortColumns("_row_key")
      .withInlineClustering(true)
      .withClusteringTargetPartitions(0)
      .withInlineClusteringNumCommits(1)
      .build();
  HoodieWriteConfig newWriteConfig = getConfigBuilder(TRIP_EXAMPLE_SCHEMA, HoodieIndex.IndexType.BLOOM, HoodieFailedWritesCleaningPolicy.EAGER)
      .withAutoCommit(false)
      .withClusteringConfig(clusteringConfig)
      .build();
  // Trigger clustering.
  SparkRDDWriteClient newClient = getHoodieWriteClient(newWriteConfig);
  String clusteringCommitTime = newClient.scheduleClustering(Option.empty()).get().toString();
  HoodieWriteMetadata<JavaRDD<WriteStatus>> clusterMetadata = newClient.cluster(clusteringCommitTime, true);
  // Collect the replaced file ids for validation later.
  Set<HoodieFileGroupId> replacedFileIds = new HashSet<>();
  clusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles ->
      partitionFiles.getValue().stream().forEach(file ->
          replacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file))));
  // Trigger a new write to mimic other writes succeeding before the re-attempt.
  newCommitTime = "0000003";
  client.startCommitWithTime(newCommitTime);
  records = dataGen.generateInserts(newCommitTime, 20);
  writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
  assertNoWriteErrors(writeStatuses);
  validateMetadata(client);
  // Manually remove the completed clustering instant from the .hoodie folder to mimic clustering that succeeded
  // in the metadata table but failed in the data table.
  FileCreateUtils.deleteReplaceCommit(basePath, clusteringCommitTime);
  HoodieWriteMetadata<JavaRDD<WriteStatus>> updatedClusterMetadata = newClient.cluster(clusteringCommitTime, true);
  metaClient.reloadActiveTimeline();
  Set<HoodieFileGroupId> updatedReplacedFileIds = new HashSet<>();
  updatedClusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles ->
      partitionFiles.getValue().stream().forEach(file ->
          updatedReplacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file))));
  assertEquals(replacedFileIds, updatedReplacedFileIds);
  validateMetadata(client);
}
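FileCreateUtils.deleteReplaceCommit is a test utility that removes the completed replacecommit instant file from the timeline. Assuming the usual <instant>.replacecommit naming under the .hoodie metafolder, its effect is roughly equivalent to the following (a hedged sketch, not the utility's actual implementation):

// Delete only the completed instant file; the requested/inflight files remain,
// so the clustering plan is still pending on the data-table timeline.
fs.delete(new Path(basePath + "/.hoodie/" + clusteringCommitTime + ".replacecommit"), false);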
use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
the class TestHoodieBackedMetadata method testRollbackOfPartiallyFailedCommitWithNewPartitions.
/**
 * Tests rollback of a commit that adds new partitions which were not present in the Hudi table prior to the
 * commit being rolled back.
 *
 * @throws Exception
 */
@Test
public void testRollbackOfPartiallyFailedCommitWithNewPartitions() throws Exception {
  init(HoodieTableType.COPY_ON_WRITE);
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext,
      getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, true, true, false, true, false, false).build(), true)) {
    String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 10);
    List<HoodieRecord> upsertRecords = new ArrayList<>();
    for (HoodieRecord entry : records) {
      if (entry.getPartitionPath().equals(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
          || entry.getPartitionPath().equals(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)) {
        upsertRecords.add(entry);
      }
    }
    List<WriteStatus> writeStatuses = client.upsert(jsc.parallelize(upsertRecords, 1), newCommitTime).collect();
    assertNoWriteErrors(writeStatuses);
    validateMetadata(client);
    newCommitTime = HoodieActiveTimeline.createNewInstantTime();
    client.startCommitWithTime(newCommitTime);
    records = dataGen.generateInserts(newCommitTime, 20);
    writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
    assertNoWriteErrors(writeStatuses);
    validateMetadata(client);
    // There is no way to simulate a failed commit on the main dataset, so we simply delete the completed
    // instant file so that only the inflight is left over.
    String commitInstantFileName = HoodieTimeline.makeCommitFileName(newCommitTime);
    assertTrue(fs.delete(new Path(basePath + Path.SEPARATOR + HoodieTableMetaClient.METAFOLDER_NAME, commitInstantFileName), false));
  }
  try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext,
      getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, true, true, false, true, false, false).build(), true)) {
    String newCommitTime = client.startCommit();
    // Next insert; with the EAGER cleaning policy this also rolls back the partially failed commit above.
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
    List<WriteStatus> writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
    assertNoWriteErrors(writeStatuses);
    validateMetadata(client);
  }
}
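Because the config uses HoodieFailedWritesCleaningPolicy.EAGER, the leftover inflight instant is rolled back automatically when the second client starts its commit. If the rollback had to be issued by hand instead, the write client exposes an explicit call (a hedged sketch of the alternative path; the test itself relies on the automatic rollback):

// Hypothetical explicit variant: roll back the partially failed instant directly.
client.rollback(newCommitTime);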
use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
the class TestHoodieBackedMetadata method testTableOperationsForMetaIndexImpl.
private void testTableOperationsForMetaIndexImpl(final HoodieWriteConfig writeConfig) throws Exception {
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  testTableOperationsImpl(engineContext, writeConfig);
}
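All of these examples hinge on wrapping a JavaSparkContext in a HoodieSparkEngineContext, which the test harness provides as jsc. Outside such a harness, constructing one looks roughly like this (a minimal sketch with a hypothetical local Spark setup; the classes come from org.apache.spark, org.apache.spark.api.java, and org.apache.hudi.client.common):

SparkConf sparkConf = new SparkConf().setMaster("local[2]").setAppName("hudi-engine-context-example");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
// HoodieSparkEngineContext adapts the Spark context for Hudi's engine-agnostic client APIs.
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);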