Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
The class TestHoodieBackedMetadata, method testUpgradeDowngrade.
@Test
public void testUpgradeDowngrade() throws IOException {
  init(HoodieTableType.COPY_ON_WRITE, false);
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);

  // Perform a commit. This should bootstrap the metadata table with the latest version.
  List<HoodieRecord> records;
  List<WriteStatus> writeStatuses;
  String commitTimestamp = HoodieActiveTimeline.createNewInstantTime();
  HoodieWriteConfig writeConfig = getWriteConfig(true, true);
  try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) {
    records = dataGen.generateInserts(commitTimestamp, 5);
    client.startCommitWithTime(commitTimestamp);
    writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), commitTimestamp).collect();
    assertNoWriteErrors(writeStatuses);
  }

  // Metadata table should have been bootstrapped
  assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist");
  FileStatus oldStatus = fs.getFileStatus(new Path(metadataTableBasePath));

  // Set hoodie.table.version to 2 in the hoodie.properties file
  changeTableVersion(HoodieTableVersion.TWO);

  // With the next commit the metadata table should be deleted (as part of the upgrade) and then re-bootstrapped automatically
  commitTimestamp = HoodieActiveTimeline.createNewInstantTime();
  metaClient.reloadActiveTimeline();
  FileStatus prevStatus = fs.getFileStatus(new Path(metadataTableBasePath));
  try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
    records = dataGen.generateInserts(commitTimestamp, 5);
    client.startCommitWithTime(commitTimestamp);
    writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), commitTimestamp).collect();
    assertNoWriteErrors(writeStatuses);
  }
  assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist");
  FileStatus currentStatus = fs.getFileStatus(new Path(metadataTableBasePath));
  assertTrue(currentStatus.getModificationTime() > prevStatus.getModificationTime());

  initMetaClient();
  assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.FOUR.versionCode());
  assertTrue(fs.exists(new Path(metadataTableBasePath)), "Metadata table should exist");
  FileStatus newStatus = fs.getFileStatus(new Path(metadataTableBasePath));
  assertTrue(oldStatus.getModificationTime() < newStatus.getModificationTime());

  // Test downgrade by running the downgrader
  new UpgradeDowngrade(metaClient, writeConfig, context, SparkUpgradeDowngradeHelper.getInstance())
      .run(HoodieTableVersion.TWO, null);
  assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.TWO.versionCode());
  assertFalse(fs.exists(new Path(metadataTableBasePath)), "Metadata table should not exist");
}
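The downgrade at the end of the test is driven by the UpgradeDowngrade helper together with SparkUpgradeDowngradeHelper. As a rough sketch of the same call outside a test harness, assuming a HoodieTableMetaClient, HoodieWriteConfig and HoodieSparkEngineContext have already been built (the variable names are illustrative, and the second argument to run is passed as null exactly as in the test above):

// Sketch only: downgrade an existing table to version TWO, reusing the API shown above.
// metaClient, writeConfig and engineContext are assumed to exist already.
new UpgradeDowngrade(metaClient, writeConfig, engineContext, SparkUpgradeDowngradeHelper.getInstance())
    .run(HoodieTableVersion.TWO, null);
// As the test demonstrates, upgrading back to the latest version happens automatically
// on the next commit made by the write client against the downgraded table.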
Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
The class TestHoodieBackedMetadata, method testMultiWriterForDoubleLocking.
/**
 * Tests that when inline cleaning is enabled and auto commit is set to true, there is no double locking,
 * because auto clean is triggered within post-commit, which already runs under a lock.
 *
 * @throws Exception
 */
@Test
public void testMultiWriterForDoubleLocking() throws Exception {
  init(HoodieTableType.COPY_ON_WRITE);
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  Properties properties = new Properties();
  properties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks");
  properties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
  HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false)
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY)
          .withAutoClean(true)
          .retainCommits(4)
          .build())
      .withAutoCommit(false)
      .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL)
      .withLockConfig(HoodieLockConfig.newBuilder().withLockProvider(InProcessLockProvider.class).build())
      .withProperties(properties)
      .build();
  SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, writeConfig);

  String partitionPath = dataGen.getPartitionPaths()[0];
  for (int j = 0; j < 6; j++) {
    String newCommitTime = "000000" + j;
    List<HoodieRecord> records = dataGen.generateInsertsForPartition(newCommitTime, 100, partitionPath);
    writeClient.startCommitWithTime(newCommitTime);
    JavaRDD<WriteStatus> writeStatuses = writeClient.insert(jsc.parallelize(records, 1), newCommitTime);
    writeClient.commit(newCommitTime, writeStatuses);
  }

  // Ensure all commits were synced to the Metadata Table
  HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build();
  LOG.warn("total commits in metadata table " + metadataMetaClient.getActiveTimeline().getCommitsTimeline().countInstants());

  // 6 commits and 2 cleaner commits.
  assertEquals(metadataMetaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().countInstants(), 8);
  assertTrue(metadataMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().countInstants() <= 1);

  // Validation
  validateMetadata(writeClient);
}
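The javadoc above is the key point of this test: with OPTIMISTIC_CONCURRENCY_CONTROL and an in-process lock provider, inline auto clean runs inside the post-commit step, which is already under the lock, so no second acquisition is attempted. A minimal sketch of just the concurrency-related part of that configuration, assuming basePath points at the table; the builder calls mirror the ones used in the test, and other required options (such as the schema) are omitted here:

// Sketch only: optimistic concurrency control with an in-process lock provider,
// mirroring the test config above.
Properties lockProperties = new Properties();
lockProperties.setProperty(FILESYSTEM_LOCK_PATH_PROP_KEY, basePath + "/.hoodie/.locks");
lockProperties.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");

HoodieWriteConfig concurrentWriteConfig = HoodieWriteConfig.newBuilder()
    .withPath(basePath)
    .withAutoCommit(false)
    .withWriteConcurrencyMode(WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL)
    .withLockConfig(HoodieLockConfig.newBuilder()
        .withLockProvider(InProcessLockProvider.class)
        .build())
    .withProperties(lockProperties)
    .build();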
Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
The class TestHoodieBackedMetadata, method testCleaningArchivingAndCompaction.
/**
 * Instants on the metadata table should be archived as per config, but we always keep at least as many instants
 * as on the dataset.
 * <p>
 * The metadata table should be automatically compacted as per config.
 */
@Disabled
public void testCleaningArchivingAndCompaction() throws Exception {
  init(HoodieTableType.COPY_ON_WRITE, false);
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  final int maxDeltaCommitsBeforeCompaction = 3;
  HoodieWriteConfig config = getWriteConfigBuilder(true, true, false)
      .withMetadataConfig(HoodieMetadataConfig.newBuilder()
          .enable(true)
          .archiveCommitsWith(40, 60)
          .retainCommits(1)
          .withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommitsBeforeCompaction)
          .build())
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .archiveCommitsWith(2, 4)
          .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.NEVER)
          .retainCommits(1)
          .retainFileVersions(1)
          .withAutoClean(true)
          .withAsyncClean(false)
          .build())
      .build();
  List<HoodieRecord> records;
  String newCommitTime;
  try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, config)) {
    // Initial commits. One deltacommit on the metadata table comes from bootstrap, so only
    // maxDeltaCommitsBeforeCompaction - 2 more commits fit before compaction will be attempted.
    for (int i = 0; i < maxDeltaCommitsBeforeCompaction - 2; ++i) {
      newCommitTime = HoodieActiveTimeline.createNewInstantTime();
      records = dataGen.generateInserts(newCommitTime, 5);
      client.startCommitWithTime(newCommitTime);
      client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
    }
    HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build();
    HoodieTableMetaClient datasetMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(config.getBasePath()).build();

    // There should not be any compaction yet, as we have not performed more than maxDeltaCommitsBeforeCompaction
    // deltacommits (1 will be due to bootstrap)
    HoodieActiveTimeline metadataTimeline = metadataMetaClient.reloadActiveTimeline();
    assertEquals(metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants(), 0);
    assertEquals(metadataTimeline.getCommitsTimeline().filterCompletedInstants().countInstants(), maxDeltaCommitsBeforeCompaction - 1);
    assertEquals(datasetMetaClient.getArchivedTimeline().reload().countInstants(), 0);

    // Next commit will initiate a compaction
    newCommitTime = HoodieActiveTimeline.createNewInstantTime();
    records = dataGen.generateInserts(newCommitTime, 5);
    client.startCommitWithTime(newCommitTime);
    client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
    metadataTimeline = metadataMetaClient.reloadActiveTimeline();
    assertEquals(metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants(), 1);
    assertEquals(metadataTimeline.getCommitsTimeline().filterCompletedInstants().countInstants(), maxDeltaCommitsBeforeCompaction + 1);
    assertEquals(datasetMetaClient.getArchivedTimeline().reload().countInstants(), 0);

    // More than maxDeltaCommitsBeforeCompaction commits
    String inflightCommitTime = newCommitTime;
    for (int i = 0; i < maxDeltaCommitsBeforeCompaction + 1; ++i) {
      newCommitTime = HoodieActiveTimeline.createNewInstantTime();
      records = dataGen.generateInserts(newCommitTime, 5);
      client.startCommitWithTime(newCommitTime);
      client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
      if (i == 0) {
        // Mark this commit inflight so compactions don't take place
        FileCreateUtils.deleteCommit(basePath, newCommitTime);
        FileCreateUtils.createInflightCommit(basePath, newCommitTime);
        inflightCommitTime = newCommitTime;
      }
    }

    // Ensure no more compactions took place due to the leftover inflight commit
    metadataTimeline = metadataMetaClient.reloadActiveTimeline();
    assertEquals(metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants(), 1);
    assertEquals(metadataTimeline.getDeltaCommitTimeline().filterCompletedInstants().countInstants(), ((2 * maxDeltaCommitsBeforeCompaction) + (maxDeltaCommitsBeforeCompaction) + 1));

    // Complete the commit
    FileCreateUtils.createCommit(basePath, inflightCommitTime);

    // Next commit should lead to compaction
    newCommitTime = HoodieActiveTimeline.createNewInstantTime();
    records = dataGen.generateInserts(newCommitTime, 5);
    client.startCommitWithTime(newCommitTime);
    client.insert(jsc.parallelize(records, 1), newCommitTime).collect();

    // Ensure compactions took place
    metadataTimeline = metadataMetaClient.reloadActiveTimeline();
    assertEquals(metadataTimeline.getCommitTimeline().filterCompletedInstants().countInstants(), 2);
    assertEquals(metadataTimeline.getDeltaCommitTimeline().filterCompletedInstants().countInstants(), ((2 * maxDeltaCommitsBeforeCompaction) + (maxDeltaCommitsBeforeCompaction + 1) + 2));
    assertTrue(datasetMetaClient.getArchivedTimeline().reload().countInstants() > 0);

    validateMetadata(client);
  }
}
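How often the metadata table is compacted is controlled by withMaxNumDeltaCommitsBeforeCompaction, which this test sets to 3 and then drives past with a mix of completed and inflight commits. A sketch of just the metadata-table tuning used above, in isolation; the values mirror the test, and the resulting object is assumed to be plugged into a write config via withMetadataConfig as shown in the test:

// Sketch only: metadata-table archiving, cleaning and compaction settings from the test above.
HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder()
    .enable(true)
    .archiveCommitsWith(40, 60)                  // archival bounds on the metadata table timeline, per the test values
    .retainCommits(1)                            // commits retained by the metadata table cleaner
    .withMaxNumDeltaCommitsBeforeCompaction(3)   // compact the metadata table after this many deltacommits
    .build();

// Combined with the rest of the write config the same way as above:
// getWriteConfigBuilder(...).withMetadataConfig(metadataConfig)...build()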
Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
The class TestSparkHoodieHBaseIndex, method setUp.
@BeforeEach
public void setUp() throws Exception {
  hadoopConf = jsc().hadoopConfiguration();
  hadoopConf.addResource(utility.getConfiguration());
  // Re-init the context here so the hadoopConf stays the same as the one used in this class
  context = new HoodieSparkEngineContext(jsc());
  basePath = utility.getDataTestDirOnTestFS(TABLE_NAME).toString();
  metaClient = getHoodieMetaClient(hadoopConf, basePath);
  dataGen = new HoodieTestDataGenerator();
}
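Across all of these snippets, HoodieSparkEngineContext is built directly from a JavaSparkContext. Outside a test fixture, one plausible way to obtain that context is from a SparkSession; the single-argument constructor matches the usage above, while the SparkSession setup itself is only illustrative:

// Sketch only: constructing a HoodieSparkEngineContext from a local SparkSession.
SparkSession spark = SparkSession.builder()
    .appName("hoodie-engine-context-example")
    .master("local[2]")
    .getOrCreate();
JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);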
Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.
The class TestMarkerBasedRollbackStrategy, method testMergeOnReadRollback.
@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testMergeOnReadRollback(boolean useFileListingMetadata) throws Exception {
  // Re-init as a MERGE_ON_READ table
  tearDown();
  tableType = HoodieTableType.MERGE_ON_READ;
  setUp();

  HoodieWriteConfig writeConfig = getConfigBuilder()
      .withRollbackUsingMarkers(true)
      .withAutoCommit(false)
      .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(useFileListingMetadata).build())
      .withPath(basePath)
      .build();
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, writeConfig)) {
    // Roll back the 2nd commit and ensure the stats reflect it.
    List<HoodieRollbackStat> stats = testRun(useFileListingMetadata, writeConfig, writeClient);
    assertEquals(3, stats.size());
    for (HoodieRollbackStat stat : stats) {
      assertEquals(0, stat.getSuccessDeleteFiles().size());
      assertEquals(0, stat.getFailedDeleteFiles().size());
      assertEquals(1, stat.getCommandBlocksCount().size());
      stat.getCommandBlocksCount().forEach((fileStatus, len) ->
          assertTrue(fileStatus.getPath().getName().contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())));
    }
  }
}
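Marker-based rollback is switched on with withRollbackUsingMarkers(true), and the test parameter toggles whether file listings come from the metadata table. A sketch of just those two settings on a write config, assuming basePath points at the table and any other required options (schema, table name, and so on) are filled in elsewhere:

// Sketch only: enable marker-based rollback and metadata-table-backed file listing,
// mirroring the config built in the test above.
HoodieWriteConfig rollbackConfig = HoodieWriteConfig.newBuilder()
    .withPath(basePath)
    .withRollbackUsingMarkers(true)
    .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).build())
    .build();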