Use of org.apache.hudi.client.SparkRDDWriteClient in the Apache Hudi project.
From the class TestHoodieClientOnCopyOnWriteStorage, method testSmallInsertHandlingForUpserts:
/**
* Test scenario of new file-group getting added during upsert().
*/
@Test
public void testSmallInsertHandlingForUpserts() throws Exception {
final String testPartitionPath = "2016/09/26";
final int insertSplitLimit = 100;
// setup the small file handling params
// hold up to 200 records max
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150));
dataGen = new HoodieTestDataGenerator(new String[] { testPartitionPath });
SparkRDDWriteClient client = getHoodieWriteClient(config);
BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
// Inserts => will write file1
String commitTime1 = "001";
client.startCommitWithTime(commitTime1);
// this writes ~500kb
List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, insertSplitLimit);
Set<String> keys1 = recordsToRecordKeySet(inserts1);
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime1).collect();
assertNoWriteErrors(statuses);
assertEquals(1, statuses.size(), "Just 1 file needs to be added.");
String file1 = statuses.get(0).getFileId();
assertEquals(100, fileUtils.readRowKeys(hadoopConf, new Path(basePath, statuses.get(0).getStat().getPath())).size(), "file should contain 100 records");
// Update + Inserts such that they just expand file1
String commitTime2 = "002";
client.startCommitWithTime(commitTime2);
List<HoodieRecord> inserts2 = dataGen.generateInserts(commitTime2, 40);
Set<String> keys2 = recordsToRecordKeySet(inserts2);
List<HoodieRecord> insertsAndUpdates2 = new ArrayList<>();
insertsAndUpdates2.addAll(inserts2);
insertsAndUpdates2.addAll(dataGen.generateUpdates(commitTime2, inserts1));
JavaRDD<HoodieRecord> insertAndUpdatesRDD2 = jsc.parallelize(insertsAndUpdates2, 1);
statuses = client.upsert(insertAndUpdatesRDD2, commitTime2).collect();
assertNoWriteErrors(statuses);
assertEquals(1, statuses.size(), "Just 1 file needs to be updated.");
assertEquals(file1, statuses.get(0).getFileId(), "Existing file should be expanded");
assertEquals(commitTime1, statuses.get(0).getStat().getPrevCommit(), "Existing file should be expanded");
Path newFile = new Path(basePath, statuses.get(0).getStat().getPath());
assertEquals(140, fileUtils.readRowKeys(hadoopConf, newFile).size(), "file should contain 140 records");
List<GenericRecord> records = fileUtils.readAvroRecords(hadoopConf, newFile);
for (GenericRecord record : records) {
  String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
  assertEquals(commitTime2, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(), "only expect commit2");
  assertTrue(keys2.contains(recordKey) || keys1.contains(recordKey), "key expected to be part of commit2");
}
// update + inserts such that file1 is updated and expanded, a new file2 is created.
String commitTime3 = "003";
client.startCommitWithTime(commitTime3);
List<HoodieRecord> insertsAndUpdates3 = dataGen.generateInserts(commitTime3, 200);
Set<String> keys3 = recordsToRecordKeySet(insertsAndUpdates3);
List<HoodieRecord> updates3 = dataGen.generateUpdates(commitTime3, inserts2);
insertsAndUpdates3.addAll(updates3);
JavaRDD<HoodieRecord> insertAndUpdatesRDD3 = jsc.parallelize(insertsAndUpdates3, 1);
statuses = client.upsert(insertAndUpdatesRDD3, commitTime3).collect();
assertNoWriteErrors(statuses);
assertEquals(2, statuses.size(), "2 files needs to be committed.");
HoodieTableMetaClient metadata = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
HoodieTable table = getHoodieTable(metadata, config);
BaseFileOnlyView fileSystemView = table.getBaseFileOnlyView();
List<HoodieBaseFile> files = fileSystemView.getLatestBaseFilesBeforeOrOn(testPartitionPath, commitTime3).collect(Collectors.toList());
int numTotalInsertsInCommit3 = 0;
int numTotalUpdatesInCommit3 = 0;
for (HoodieBaseFile file : files) {
  if (file.getFileName().contains(file1)) {
    assertEquals(commitTime3, file.getCommitTime(), "Existing file should be expanded");
    records = fileUtils.readAvroRecords(hadoopConf, new Path(file.getPath()));
    for (GenericRecord record : records) {
      String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
      String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
      if (recordCommitTime.equals(commitTime3)) {
        if (keys2.contains(recordKey)) {
          keys2.remove(recordKey);
          numTotalUpdatesInCommit3++;
        } else {
          numTotalInsertsInCommit3++;
        }
      }
    }
    assertEquals(0, keys2.size(), "All keys added in commit 2 must be updated in commit3 correctly");
  } else {
    assertEquals(commitTime3, file.getCommitTime(), "New file must be written for commit 3");
    records = fileUtils.readAvroRecords(hadoopConf, new Path(file.getPath()));
    for (GenericRecord record : records) {
      String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
      assertEquals(commitTime3, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(), "only expect commit3");
      assertTrue(keys3.contains(recordKey), "key expected to be part of commit3");
    }
    numTotalInsertsInCommit3 += records.size();
  }
}
assertEquals(numTotalUpdatesInCommit3, inserts2.size(), "Total updates in commit3 must add up");
assertEquals(numTotalInsertsInCommit3, keys3.size(), "Total inserts in commit3 must add up");
}
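The sizing knobs behind getSmallInsertWriteConfig are not shown above. A minimal sketch of what such a helper might build, assuming Hudi's standard HoodieWriteConfig/HoodieCompactionConfig builders (the helper body, the basePath field, and the exact builder methods are assumptions and may differ by Hudi version):

import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;

// Hypothetical reconstruction: files below smallFileSize are treated as "small" and are
// padded with new inserts on upsert, which is why commit 2 expands file1 instead of
// creating a second file, and commit 3 overflows into a new file-group.
private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schema, long smallFileSize) {
  return HoodieWriteConfig.newBuilder()
      .withPath(basePath)                           // basePath comes from the test harness
      .withSchema(schema)
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(smallFileSize)   // ~200 records' worth here, per the comment above
          .insertSplitSize(insertSplitSize)         // cap of 100 inserts per split in this test
          .build())
      .build();
}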
Use of org.apache.hudi.client.SparkRDDWriteClient in the Apache Hudi project.
From the class TestHoodieClientOnCopyOnWriteStorage, method testDeduplication:
/**
* Test Deduplication Logic for write function.
*
* @param writeFn One of the HoodieWriteClient non-prepped write APIs
* @throws Exception in case of failure
*/
private void testDeduplication(Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean populateMetaFields) throws Exception {
String newCommitTime = "001";
String recordKey = UUID.randomUUID().toString();
HoodieKey keyOne = new HoodieKey(recordKey, "2018-01-01");
HoodieRecord<RawTripTestPayload> recordOne = new HoodieAvroRecord(keyOne, dataGen.generateRandomValue(keyOne, newCommitTime));
HoodieKey keyTwo = new HoodieKey(recordKey, "2018-02-01");
HoodieRecord recordTwo = new HoodieAvroRecord(keyTwo, dataGen.generateRandomValue(keyTwo, newCommitTime));
// Same key and partition as keyTwo
HoodieRecord recordThree = new HoodieAvroRecord(keyTwo, dataGen.generateRandomValue(keyTwo, newCommitTime));
HoodieData<HoodieRecord<RawTripTestPayload>> records = HoodieJavaRDD.of(jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1));
// Global dedup should be done based on recordKey only
HoodieIndex index = mock(HoodieIndex.class);
when(index.isGlobal()).thenReturn(true);
List<HoodieRecord<RawTripTestPayload>> dedupedRecs = HoodieWriteHelper.newInstance().deduplicateRecords(records, index, 1).collectAsList();
assertEquals(1, dedupedRecs.size());
assertEquals(dedupedRecs.get(0).getPartitionPath(), recordThree.getPartitionPath());
assertNodupesWithinPartition(dedupedRecs);
// non-Global dedup should be done based on both recordKey and partitionPath
index = mock(HoodieIndex.class);
when(index.isGlobal()).thenReturn(false);
dedupedRecs = HoodieWriteHelper.newInstance().deduplicateRecords(records, index, 1).collectAsList();
assertEquals(2, dedupedRecs.size());
assertNodupesWithinPartition(dedupedRecs);
// Perform write-action and check
JavaRDD<HoodieRecord> recordList = jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1);
HoodieWriteConfig.Builder configBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).combineInput(true, true);
addConfigsForPopulateMetaFields(configBuilder, populateMetaFields);
try (SparkRDDWriteClient client = getHoodieWriteClient(configBuilder.build())) {
  client.startCommitWithTime(newCommitTime);
  List<WriteStatus> statuses = writeFn.apply(client, recordList, newCommitTime).collect();
  assertNoWriteErrors(statuses);
  assertEquals(2, statuses.size());
  assertNodupesInPartition(statuses.stream().map(WriteStatus::getWrittenRecords).flatMap(Collection::stream).collect(Collectors.toList()));
}
}
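testDeduplication is a private helper parameterized over the write API. The enclosing test class presumably invokes it with method references along the following lines (the wrapper names and annotations here are illustrative, not copied from the source):

@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testDeduplicationOnUpsert(boolean populateMetaFields) throws Exception {
  // SparkRDDWriteClient::upsert fits Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String>
  testDeduplication(SparkRDDWriteClient::upsert, populateMetaFields);
}

@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testDeduplicationOnInsert(boolean populateMetaFields) throws Exception {
  testDeduplication(SparkRDDWriteClient::insert, populateMetaFields);
}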
Use of org.apache.hudi.client.SparkRDDWriteClient in the Apache Hudi project.
From the class TestHoodieClientOnCopyOnWriteStorage, method testConsistencyCheckDuringFinalize:
/**
* Tests behavior of committing only when consistency is verified.
*/
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testConsistencyCheckDuringFinalize(boolean enableOptimisticConsistencyGuard) throws Exception {
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
String instantTime = "000";
HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard).build()).build();
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
Pair<Path, JavaRDD<WriteStatus>> result = testConsistencyCheck(metaClient, instantTime, enableOptimisticConsistencyGuard);
// Delete orphan marker and commit should succeed
metaClient.getFs().delete(result.getKey(), false);
if (!enableOptimisticConsistencyGuard) {
  assertTrue(client.commit(instantTime, result.getRight()), "Commit should succeed");
  assertTrue(testTable.commitExists(instantTime), "After explicit commit, commit file should be created");
  // Marker directory must be removed
  assertFalse(metaClient.getFs().exists(new Path(metaClient.getMarkerFolderPath(instantTime))));
} else {
  // With the optimistic guard, the earlier client.commit should already have succeeded.
  assertTrue(testTable.commitExists(instantTime), "After explicit commit, commit file should be created");
  // Marker directory must be removed
  assertFalse(metaClient.getFs().exists(new Path(metaClient.getMarkerFolderPath(instantTime))));
}
}
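Outside the test harness, the same consistency-guard switch is set directly on the write config. A minimal sketch, assuming ConsistencyGuardConfig's builder; withConsistencyCheckEnabled is an assumption here and the available options vary by Hudi version:

import org.apache.hudi.common.fs.ConsistencyGuardConfig;
import org.apache.hudi.config.HoodieWriteConfig;

// Sketch: require written files to be verified as visible before an explicit commit.
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
    .withPath(basePath)                                 // table base path, as in the tests above
    .withAutoCommit(false)                              // commit explicitly after inspecting WriteStatus
    .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder()
        .withConsistencyCheckEnabled(true)              // assumed switch to enable the guard at all
        .withEnableOptimisticConsistencyGuard(true)     // the variant toggled by the test parameter
        .build())
    .build();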
Use of org.apache.hudi.client.SparkRDDWriteClient in the Apache Hudi project.
From the class TestHoodieClientOnCopyOnWriteStorage, method testRollbackFailedCommitsToggleCleaningPolicy:
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testRollbackFailedCommitsToggleCleaningPolicy(boolean populateMetaFields) throws Exception {
HoodieTestUtils.init(hadoopConf, basePath);
HoodieFailedWritesCleaningPolicy cleaningPolicy = EAGER;
SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
// Perform 1 successful write to the table
writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, true);
// Perform 1 failed write to the table
writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "200", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, false);
client.close();
// Toggle cleaning policy to LAZY
cleaningPolicy = HoodieFailedWritesCleaningPolicy.LAZY;
// Perform 2 failed writes to table
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, false);
client.close();
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
writeBatch(client, "400", "300", Option.of(Arrays.asList("400")), "400", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, false);
client.close();
// Wait until enough time has passed so that the heartbeats of the 2 failed commits have expired
boolean conditionMet = false;
while (!conditionMet) {
  conditionMet = client.getHeartbeatClient().isHeartbeatExpired("400");
  Thread.sleep(2000);
}
client.clean();
HoodieActiveTimeline timeline = metaClient.getActiveTimeline().reload();
assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 3);
// Perform 2 failed commits
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
writeBatch(client, "500", "400", Option.of(Arrays.asList("300")), "300", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, false);
client.close();
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
writeBatch(client, "600", "500", Option.of(Arrays.asList("400")), "400", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, false);
client.close();
// Toggle cleaning policy to EAGER
cleaningPolicy = EAGER;
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
client.startCommit();
timeline = metaClient.getActiveTimeline().reload();
assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 5);
assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 1);
}
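getParallelWritingWriteConfig is a test helper whose body is not shown. A hedged reconstruction of what it plausibly builds, assuming the failed-writes cleaning policy and heartbeat interval go through the standard builders (in older Hudi releases the policy sits on HoodieCompactionConfig, in newer ones on HoodieCleanConfig):

import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;

// Hypothetical reconstruction: the cleaning policy drives eager vs. lazy rollback of the
// failed writes above, and a short heartbeat interval lets isHeartbeatExpired("400")
// turn true quickly in the polling loop.
private HoodieWriteConfig getParallelWritingWriteConfig(HoodieFailedWritesCleaningPolicy cleaningPolicy, boolean populateMetaFields) {
  HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder()
      .withPath(basePath)
      .withAutoCommit(false)
      .withHeartbeatIntervalInMs(3 * 1000)          // small interval so expired heartbeats are detected fast
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .withFailedWritesCleaningPolicy(cleaningPolicy)
          .withAutoClean(false)                     // the test calls client.clean() explicitly
          .build());
  // populateMetaFields would be wired in the same way addConfigsForPopulateMetaFields does elsewhere.
  return builder.build();
}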
Use of org.apache.hudi.client.SparkRDDWriteClient in the Apache Hudi project.
From the class TestHoodieClientOnCopyOnWriteStorage, method testMultiOperationsPerCommit:
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testMultiOperationsPerCommit(boolean populateMetaFields) throws IOException {
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false).withAllowMultiWriteOnSameInstant(true);
addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
HoodieWriteConfig cfg = cfgBuilder.build();
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
String firstInstantTime = "0000";
client.startCommitWithTime(firstInstantTime);
int numRecords = 200;
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(dataGen.generateInserts(firstInstantTime, numRecords), 1);
JavaRDD<WriteStatus> result = client.bulkInsert(writeRecords, firstInstantTime);
assertTrue(client.commit(firstInstantTime, result), "Commit should succeed");
assertTrue(testTable.commitExists(firstInstantTime), "After explicit commit, commit file should be created");
// Check the entire dataset has all records still
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
for (int i = 0; i < fullPartitionPaths.length; i++) {
  fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
}
assertEquals(numRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), "Must contain " + numRecords + " records");
String nextInstantTime = "0001";
client.startCommitWithTime(nextInstantTime);
JavaRDD<HoodieRecord> updateRecords = jsc.parallelize(dataGen.generateUpdates(nextInstantTime, numRecords), 1);
JavaRDD<HoodieRecord> insertRecords = jsc.parallelize(dataGen.generateInserts(nextInstantTime, numRecords), 1);
JavaRDD<WriteStatus> inserts = client.bulkInsert(insertRecords, nextInstantTime);
JavaRDD<WriteStatus> upserts = client.upsert(updateRecords, nextInstantTime);
assertTrue(client.commit(nextInstantTime, inserts.union(upserts)), "Commit should succeed");
assertTrue(testTable.commitExists(firstInstantTime), "After explicit commit, commit file should be created");
int totalRecords = 2 * numRecords;
assertEquals(totalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), "Must contain " + totalRecords + " records");
}
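The examples above obtain the client through getHoodieWriteClient(...) or construct it against the test's engine context. A condensed, application-style sketch of the same multiple-operations-per-instant pattern with the client built directly (cfg, insertRecords and updateRecords as in the test above; jsc is the JavaSparkContext):

import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.spark.api.java.JavaRDD;

// Two operations share one instant and are committed once; this relies on
// withAllowMultiWriteOnSameInstant(true) and withAutoCommit(false) in cfg.
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, cfg)) {
  String instant = "0001";                          // illustrative instant time
  client.startCommitWithTime(instant);
  JavaRDD<WriteStatus> inserts = client.bulkInsert(insertRecords, instant);
  JavaRDD<WriteStatus> upserts = client.upsert(updateRecords, instant);
  client.commit(instant, inserts.union(upserts));   // single commit covering both operations
}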