use of org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH in project hudi by apache.
the class TestHoodieClientOnCopyOnWriteStorage method testUpsertsUpdatePartitionPath.
/**
* This test ensures that with a global index, when update-partition-path is set to true in the config, an incoming record whose
* partition does not match what is in storage triggers the appropriate actions: the old record is deleted from the old partition
* and the new one is inserted into the new partition.
* Test structure:
* 1. Insert a 1st batch of records.
* 2. Insert a 2nd batch with a larger number of records, so that new file groups are created in the partitions.
* 3. Issue upserts to the records from batch 1 with different partition paths. This should ensure the records from batch 1 are
* deleted from their old partitions and upserted to the new ones.
*
* @param indexType index type to be tested for
* @param config instance of {@link HoodieWriteConfig} to use
* @param writeFn write function to be used for testing
*/
private void testUpsertsUpdatePartitionPath(IndexType indexType, HoodieWriteConfig config, Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn) throws Exception {
// instantiate client
HoodieWriteConfig hoodieWriteConfig = getConfigBuilder()
    .withProps(config.getProps())
    .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(10000).build())
    .withIndexConfig(HoodieIndexConfig.newBuilder()
        .withIndexType(indexType)
        .withBloomIndexUpdatePartitionPath(true)
        .withGlobalSimpleIndexUpdatePartitionPath(true)
        .build())
    .withTimelineLayoutVersion(VERSION_0)
    .build();
HoodieTableMetaClient.withPropertyBuilder()
    .fromMetaClient(metaClient)
    .setTimelineLayoutVersion(VERSION_0)
    .initTable(metaClient.getHadoopConf(), metaClient.getBasePath());
// Set rollback to LAZY so no inflights are deleted
hoodieWriteConfig.getProps().put(HoodieCompactionConfig.FAILED_WRITES_CLEANER_POLICY.key(), HoodieFailedWritesCleaningPolicy.LAZY.name());
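// note: the same policy can presumably also be set at build time via HoodieCompactionConfig.newBuilder()
// .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY), an assumed builder API;
// the props-based override is used here because hoodieWriteConfig is already built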
SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig);
// Write 1
String newCommitTime = "001";
int numRecords = 10;
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, numRecords);
Set<Pair<String, String>> expectedPartitionPathRecKeyPairs = new HashSet<>();
// populate expected partition path and record keys
for (HoodieRecord rec : records) {
expectedPartitionPathRecKeyPairs.add(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
}
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
JavaRDD<WriteStatus> result = writeFn.apply(client, writeRecords, newCommitTime);
result.collect();
// Check the entire dataset has all records
String[] fullPartitionPaths = getFullPartitionPaths();
assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
// verify one base file per partition
String[] fullExpectedPartitionPaths = getFullPartitionPaths(expectedPartitionPathRecKeyPairs.stream().map(Pair::getLeft).toArray(String[]::new));
Map<String, Long> baseFileCounts = getBaseFileCountsForPaths(basePath, fs, fullExpectedPartitionPaths);
assertTrue(baseFileCounts.entrySet().stream().allMatch(entry -> entry.getValue() == 1), "Expected exactly one base file per partition after the 1st batch of inserts");
// Write 2
newCommitTime = "002";
// use a larger batch so that new file groups are created
numRecords = 20;
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> recordsSecondBatch = dataGen.generateInserts(newCommitTime, numRecords);
// populate expected partition path and record keys
for (HoodieRecord rec : recordsSecondBatch) {
expectedPartitionPathRecKeyPairs.add(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
}
writeRecords = jsc.parallelize(recordsSecondBatch, 1);
result = writeFn.apply(client, writeRecords, newCommitTime);
result.collect();
// Check the entire dataset has all records
fullPartitionPaths = getFullPartitionPaths();
assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
// verify that there is more than one base file in at least one partition
// we can't guarantee how records are distributed across partitions, so verify that at least one partition has more than one base file.
baseFileCounts = getBaseFileCountsForPaths(basePath, fs, fullPartitionPaths);
assertTrue(baseFileCounts.entrySet().stream().anyMatch(entry -> entry.getValue() > 1), "At least one partition should have more than one base file after the 2nd batch of writes");
// Write 3 (upserts to records from batch 1 with diff partition path)
newCommitTime = "003";
// rotate each batch-1 record to a different partition path
List<HoodieRecord> recordsToUpsert = new ArrayList<>();
for (HoodieRecord rec : records) {
// remove older entry from expected partition path record key pairs
expectedPartitionPathRecKeyPairs.remove(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
String partitionPath = rec.getPartitionPath();
String newPartitionPath = null;
if (partitionPath.equalsIgnoreCase(DEFAULT_FIRST_PARTITION_PATH)) {
newPartitionPath = DEFAULT_SECOND_PARTITION_PATH;
} else if (partitionPath.equalsIgnoreCase(DEFAULT_SECOND_PARTITION_PATH)) {
newPartitionPath = DEFAULT_THIRD_PARTITION_PATH;
} else if (partitionPath.equalsIgnoreCase(DEFAULT_THIRD_PARTITION_PATH)) {
newPartitionPath = DEFAULT_FIRST_PARTITION_PATH;
} else {
throw new IllegalStateException("Unknown partition path " + rec.getPartitionPath());
}
recordsToUpsert.add(new HoodieAvroRecord(new HoodieKey(rec.getRecordKey(), newPartitionPath), (HoodieRecordPayload) rec.getData()));
// populate expected partition path and record keys
expectedPartitionPathRecKeyPairs.add(Pair.of(newPartitionPath, rec.getRecordKey()));
}
writeRecords = jsc.parallelize(recordsToUpsert, 1);
result = writeFn.apply(client, writeRecords, newCommitTime);
result.collect();
// Check the entire dataset has all records
fullPartitionPaths = getFullPartitionPaths();
assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
}
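For context, a hedged sketch of how this helper might be driven from a test case; the test name and the use of getConfigBuilder().build() are illustrative assumptions, not verbatim from the Hudi codebase:
// Hypothetical caller: exercises the helper with a global bloom index and plain upserts.
@Test
public void testUpsertsUpdatePartitionPathGlobalBloom() throws Exception {
  testUpsertsUpdatePartitionPath(IndexType.GLOBAL_BLOOM, getConfigBuilder().build(), SparkRDDWriteClient::upsert);
}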
use of org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH in project hudi by apache.
the class HoodieClientRollbackTestBase method insertOverwriteCommitDataWithTwoPartitions.
protected void insertOverwriteCommitDataWithTwoPartitions(List<FileSlice> firstPartitionCommit2FileSlices, List<FileSlice> secondPartitionCommit2FileSlices, HoodieWriteConfig cfg, boolean commitSecondInsertOverwrite) throws IOException {
// just generate two partitions
dataGen = new HoodieTestDataGenerator(new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH });
HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH }, basePath);
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
/**
* Write 1 (upsert)
*/
String newCommitTime = "001";
List<HoodieRecord> records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2);
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
client.startCommitWithTime(newCommitTime);
JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime);
Assertions.assertNoWriteErrors(statuses.collect());
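// explicit commit: these tests drive the commit lifecycle manually (auto-commit is presumably disabled in cfg)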
client.commit(newCommitTime, statuses);
// get fileIds written
HoodieTable table = this.getHoodieTable(metaClient, cfg);
SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient());
List<HoodieFileGroup> firstPartitionCommit1FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList());
assertEquals(1, firstPartitionCommit1FileGroups.size());
Set<String> partition1Commit1FileIds = firstPartitionCommit1FileGroups.get(0).getAllFileSlices().map(FileSlice::getFileId).collect(Collectors.toSet());
List<HoodieFileGroup> secondPartitionCommit1FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList());
assertEquals(1, secondPartitionCommit1FileGroups.size());
Set<String> partition2Commit1FileIds = secondPartitionCommit1FileGroups.get(0).getAllFileSlices().map(FileSlice::getFileId).collect(Collectors.toSet());
/**
* Write 2 (one insert_overwrite)
*/
String commitActionType = HoodieTimeline.REPLACE_COMMIT_ACTION;
newCommitTime = "002";
records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2);
writeRecords = jsc.parallelize(records, 1);
client.startCommitWithTime(newCommitTime, commitActionType);
HoodieWriteResult result = client.insertOverwrite(writeRecords, newCommitTime);
statuses = result.getWriteStatuses();
Assertions.assertNoWriteErrors(statuses.collect());
if (commitSecondInsertOverwrite) {
client.commit(newCommitTime, statuses, Option.empty(), commitActionType, result.getPartitionToReplaceFileIds());
}
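// when commitSecondInsertOverwrite is false, the replacecommit is left inflight,
// which lets subclasses exercise rollback of a pending insert_overwrite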
metaClient.reloadActiveTimeline();
// get new fileIds written as part of insert_overwrite
fsView = getFileSystemViewWithUnCommittedSlices(metaClient);
List<HoodieFileGroup> firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH)
    .filter(fg -> !partition1Commit1FileIds.contains(fg.getFileGroupId().getFileId()))
    .collect(Collectors.toList());
firstPartitionCommit2FileSlices.addAll(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
List<HoodieFileGroup> secondPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH)
    .filter(fg -> !partition2Commit1FileIds.contains(fg.getFileGroupId().getFileId()))
    .collect(Collectors.toList());
secondPartitionCommit2FileSlices.addAll(secondPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
assertEquals(1, firstPartitionCommit2FileSlices.size());
assertEquals(1, secondPartitionCommit2FileSlices.size());
}
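A hedged sketch of how a rollback test might consume this base helper; the variable names are illustrative:
// Hypothetical usage: collect the insert_overwrite file slices, leaving the replacecommit inflight.
List<FileSlice> firstPartitionCommit2FileSlices = new ArrayList<>();
List<FileSlice> secondPartitionCommit2FileSlices = new ArrayList<>();
insertOverwriteCommitDataWithTwoPartitions(firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices, cfg, false);
// rollback of commit "002" would then be asserted against the collected file slices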
use of org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH in project hudi by apache.
the class TestMergeOnReadRollbackActionExecutor method testRollbackForCanIndexLogFile.
@Test
public void testRollbackForCanIndexLogFile() throws IOException {
cleanupResources();
setUpDFS();
// 1. prepare data and assert the result
// just generate one partition
dataGen = new HoodieTestDataGenerator(new String[] { DEFAULT_FIRST_PARTITION_PATH });
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
    .withPath(basePath)
    .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
    .withParallelism(2, 2)
    .withBulkInsertParallelism(2)
    .withFinalizeWriteParallelism(2)
    .withDeleteParallelism(2)
    .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION)
    .withWriteStatusClass(MetadataMergeWriteStatus.class)
    .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
    .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
    .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).build())
    .forTable("test-trip-table")
    .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build())
    .withEmbeddedTimelineServerEnabled(true)
    .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder()
        // Fail the test if there is a problem connecting to the timeline-server
        .withEnableBackupForRemoteFileSystemView(false)
        .withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE)
        .build())
    .withRollbackUsingMarkers(false)
    .withAutoCommit(false)
    .build();
new HoodieTestDataGenerator().writePartitionMetadata(fs, new String[] { DEFAULT_FIRST_PARTITION_PATH }, basePath);
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
// Write 1 (only inserts)
String newCommitTime = "001";
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_FIRST_PARTITION_PATH);
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime);
org.apache.hudi.testutils.Assertions.assertNoWriteErrors(statuses.collect());
client.commit(newCommitTime, statuses);
// check fileSlice
HoodieTable table = this.getHoodieTable(metaClient, cfg);
SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient());
List<HoodieFileGroup> firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList());
assertEquals(1, firstPartitionCommit2FileGroups.size());
assertEquals(1, (int) firstPartitionCommit2FileGroups.get(0).getAllFileSlices().count());
assertFalse(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().findFirst().get().getBaseFile().isPresent());
assertEquals(1, firstPartitionCommit2FileGroups.get(0).getAllFileSlices().findFirst().get().getLogFiles().count());
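// no base file plus one log file: with the INMEMORY index, which can index log files,
// inserts are written directly to log files (hence "canIndexLogFile" in the test name)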
String generatedFileID = firstPartitionCommit2FileGroups.get(0).getFileGroupId().getFileId();
// check hoodieCommitMeta
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(table.getMetaClient().getCommitTimeline().getInstantDetails(new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, "001")).get(), HoodieCommitMetadata.class);
List<HoodieWriteStat> firstPartitionWriteStat = commitMetadata.getPartitionToWriteStats().get(DEFAULT_FIRST_PARTITION_PATH);
assertEquals(2, firstPartitionWriteStat.size());
// one writeStat has an empty fileId (the empty stat written for the partition)
assertTrue(firstPartitionWriteStat.stream().anyMatch(wStat -> StringUtils.isNullOrEmpty(wStat.getFileId())));
// and exactly one non-empty writeStat, which must contain inserts
assertEquals(1, firstPartitionWriteStat.stream().filter(wStat -> !StringUtils.isNullOrEmpty(wStat.getFileId())).count());
firstPartitionWriteStat.stream().filter(wStat -> !StringUtils.isNullOrEmpty(wStat.getFileId())).forEach(wStat -> assertTrue(wStat.getNumInserts() > 0));
// Write 2 (updates + inserts across two partitions)
newCommitTime = "002";
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> updateRecords = Collections.singletonList(dataGen.generateUpdateRecord(records.get(0).getKey(), newCommitTime));
List<HoodieRecord> insertRecordsInSamePartition = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_FIRST_PARTITION_PATH);
List<HoodieRecord> insertRecordsInOtherPartition = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_SECOND_PARTITION_PATH);
List<HoodieRecord> recordsToWrite = Stream.concat(Stream.concat(updateRecords.stream(), insertRecordsInSamePartition.stream()), insertRecordsInOtherPartition.stream()).collect(Collectors.toList());
writeRecords = jsc.parallelize(recordsToWrite, 1);
statuses = client.upsert(writeRecords, newCommitTime);
client.commit(newCommitTime, statuses);
table = this.getHoodieTable(metaClient, cfg);
commitMetadata = HoodieCommitMetadata.fromBytes(table.getMetaClient().getCommitTimeline().getInstantDetails(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, newCommitTime)).get(), HoodieCommitMetadata.class);
assertTrue(commitMetadata.getPartitionToWriteStats().containsKey(DEFAULT_FIRST_PARTITION_PATH));
assertTrue(commitMetadata.getPartitionToWriteStats().containsKey(DEFAULT_SECOND_PARTITION_PATH));
List<HoodieWriteStat> hoodieWriteStatOptionList = commitMetadata.getPartitionToWriteStats().get(DEFAULT_FIRST_PARTITION_PATH);
// Both update and insert record should enter same existing fileGroup due to small file handling
assertEquals(1, hoodieWriteStatOptionList.size());
assertEquals(generatedFileID, hoodieWriteStatOptionList.get(0).getFileId());
// check insert and update numbers
assertEquals(2, hoodieWriteStatOptionList.get(0).getNumInserts());
assertEquals(1, hoodieWriteStatOptionList.get(0).getNumUpdateWrites());
List<HoodieWriteStat> secondHoodieWriteStatOptionList = commitMetadata.getPartitionToWriteStats().get(DEFAULT_SECOND_PARTITION_PATH);
// All insert should enter one fileGroup
assertEquals(1, secondHoodieWriteStatOptionList.size());
String fileIdInPartitionTwo = secondHoodieWriteStatOptionList.get(0).getFileId();
assertEquals(2, secondHoodieWriteStatOptionList.get(0).getNumInserts());
// Rollback
HoodieInstant rollBackInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "002");
BaseRollbackPlanActionExecutor mergeOnReadRollbackPlanActionExecutor = new BaseRollbackPlanActionExecutor(context, cfg, table, "003", rollBackInstant, false, cfg.shouldRollbackUsingMarkers());
mergeOnReadRollbackPlanActionExecutor.execute().get();
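// the rollback plan for instant "003" is scheduled first; the executor below then performs
// the actual rollback of the inflight delta commit "002"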
MergeOnReadRollbackActionExecutor mergeOnReadRollbackActionExecutor = new MergeOnReadRollbackActionExecutor(context, cfg, table, "003", rollBackInstant, true, false);
// 3. assert the rollback stat
Map<String, HoodieRollbackPartitionMetadata> rollbackMetadata = mergeOnReadRollbackActionExecutor.execute().getPartitionMetadata();
assertEquals(2, rollbackMetadata.size());
// 4. assert the file groups after rollback, and compare to the rollback stats
// assert the first partition's data and log file sizes
HoodieRollbackPartitionMetadata partitionMetadata = rollbackMetadata.get(DEFAULT_FIRST_PARTITION_PATH);
assertTrue(partitionMetadata.getSuccessDeleteFiles().isEmpty());
assertTrue(partitionMetadata.getFailedDeleteFiles().isEmpty());
assertEquals(1, partitionMetadata.getRollbackLogFiles().size());
// assert the second partition's data and log file sizes
partitionMetadata = rollbackMetadata.get(DEFAULT_SECOND_PARTITION_PATH);
assertEquals(1, partitionMetadata.getSuccessDeleteFiles().size());
assertTrue(partitionMetadata.getFailedDeleteFiles().isEmpty());
assertTrue(partitionMetadata.getRollbackLogFiles().isEmpty());
}