Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method testUpsertsUpdatePartitionPath.
/**
* This test ensures that with a global index, when update partition path is set to true in the config, an incoming record whose
* partition does not match what is in storage triggers the appropriate actions: the old record is deleted from the old partition
* and the new one is inserted into the new partition.
* Test structure:
* 1. Insert a first batch.
* 2. Insert a second batch with a larger number of records so that new file groups are created for the partitions.
* 3. Issue upserts to records from batch 1 with a different partition path. This should ensure records from batch 1 are deleted
* from their old partitions and the new records are upserted to the new partitions.
*
* @param indexType index type to be tested
* @param config instance of {@link HoodieWriteConfig} to use
* @param writeFn write function to be used for testing
*/
private void testUpsertsUpdatePartitionPath(IndexType indexType, HoodieWriteConfig config, Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn) throws Exception {
// instantiate client
HoodieWriteConfig hoodieWriteConfig = getConfigBuilder()
    .withProps(config.getProps())
    .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(10000).build())
    .withIndexConfig(HoodieIndexConfig.newBuilder()
        .withIndexType(indexType)
        .withBloomIndexUpdatePartitionPath(true)
        .withGlobalSimpleIndexUpdatePartitionPath(true)
        .build())
    .withTimelineLayoutVersion(VERSION_0)
    .build();
HoodieTableMetaClient.withPropertyBuilder().fromMetaClient(metaClient).setTimelineLayoutVersion(VERSION_0).initTable(metaClient.getHadoopConf(), metaClient.getBasePath());
// Set rollback to LAZY so no inflights are deleted
hoodieWriteConfig.getProps().put(HoodieCompactionConfig.FAILED_WRITES_CLEANER_POLICY.key(), HoodieFailedWritesCleaningPolicy.LAZY.name());
SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig);
// Write 1
String newCommitTime = "001";
int numRecords = 10;
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, numRecords);
Set<Pair<String, String>> expectedPartitionPathRecKeyPairs = new HashSet<>();
// populate expected partition path and record keys
for (HoodieRecord rec : records) {
expectedPartitionPathRecKeyPairs.add(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
}
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
JavaRDD<WriteStatus> result = writeFn.apply(client, writeRecords, newCommitTime);
result.collect();
// Check the entire dataset has all records
String[] fullPartitionPaths = getFullPartitionPaths();
assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
// verify one basefile per partition
String[] fullExpectedPartitionPaths = getFullPartitionPaths(expectedPartitionPathRecKeyPairs.stream().map(Pair::getLeft).toArray(String[]::new));
Map<String, Long> baseFileCounts = getBaseFileCountsForPaths(basePath, fs, fullExpectedPartitionPaths);
for (Map.Entry<String, Long> entry : baseFileCounts.entrySet()) {
assertEquals(1, entry.getValue());
}
assertTrue(baseFileCounts.entrySet().stream().allMatch(entry -> entry.getValue() == 1));
// Write 2
newCommitTime = "002";
// so that a new file id is created
numRecords = 20;
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> recordsSecondBatch = dataGen.generateInserts(newCommitTime, numRecords);
// populate expected partition path and record keys
for (HoodieRecord rec : recordsSecondBatch) {
expectedPartitionPathRecKeyPairs.add(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
}
writeRecords = jsc.parallelize(recordsSecondBatch, 1);
result = writeFn.apply(client, writeRecords, newCommitTime);
result.collect();
// Check the entire dataset has all records
fullPartitionPaths = getFullPartitionPaths();
assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
// verify that there is more than one base file in the partitions
// we can't guarantee how records are distributed across partitions, so verify that at least one partition has more than one base file.
baseFileCounts = getBaseFileCountsForPaths(basePath, fs, fullPartitionPaths);
assertTrue(baseFileCounts.entrySet().stream().filter(entry -> entry.getValue() > 1).count() >= 1, "At least one partition should have more than 1 base file after 2nd batch of writes");
// Write 3 (upserts to records from batch 1 with diff partition path)
newCommitTime = "003";
// update to diff partition paths
List<HoodieRecord> recordsToUpsert = new ArrayList<>();
for (HoodieRecord rec : records) {
// remove older entry from expected partition path record key pairs
expectedPartitionPathRecKeyPairs.remove(Pair.of(rec.getPartitionPath(), rec.getRecordKey()));
String partitionPath = rec.getPartitionPath();
String newPartitionPath = null;
if (partitionPath.equalsIgnoreCase(DEFAULT_FIRST_PARTITION_PATH)) {
newPartitionPath = DEFAULT_SECOND_PARTITION_PATH;
} else if (partitionPath.equalsIgnoreCase(DEFAULT_SECOND_PARTITION_PATH)) {
newPartitionPath = DEFAULT_THIRD_PARTITION_PATH;
} else if (partitionPath.equalsIgnoreCase(DEFAULT_THIRD_PARTITION_PATH)) {
newPartitionPath = DEFAULT_FIRST_PARTITION_PATH;
} else {
throw new IllegalStateException("Unknown partition path " + rec.getPartitionPath());
}
recordsToUpsert.add(new HoodieAvroRecord(new HoodieKey(rec.getRecordKey(), newPartitionPath), (HoodieRecordPayload) rec.getData()));
// populate expected partition path and record keys
expectedPartitionPathRecKeyPairs.add(Pair.of(newPartitionPath, rec.getRecordKey()));
}
writeRecords = jsc.parallelize(recordsToUpsert, 1);
result = writeFn.apply(client, writeRecords, newCommitTime);
result.collect();
// Check the entire dataset has all records
fullPartitionPaths = getFullPartitionPaths();
assertPartitionPathRecordKeys(expectedPartitionPathRecKeyPairs, fullPartitionPaths);
}
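A minimal sketch of how this helper might be invoked from a parameterized test; the index type, the getConfig() helper, and the method reference are illustrative assumptions, not part of the listing above:
// hypothetical invocation: SparkRDDWriteClient::upsert matches the Function3 shape (client, records, commitTime) -> statuses
testUpsertsUpdatePartitionPath(IndexType.GLOBAL_BLOOM, getConfig(), SparkRDDWriteClient::upsert);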
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method performClustering.
private HoodieWriteMetadata<JavaRDD<WriteStatus>> performClustering(HoodieClusteringConfig clusteringConfig, boolean populateMetaFields, boolean completeClustering, String validatorClasses, String sqlQueryForEqualityValidation, String sqlQueryForSingleResultValidation, Pair<List<HoodieRecord>, List<String>> allRecords) throws IOException {
HoodiePreCommitValidatorConfig validatorConfig = HoodiePreCommitValidatorConfig.newBuilder()
    .withPreCommitValidator(StringUtils.nullToEmpty(validatorClasses))
    .withPrecommitValidatorEqualitySqlQueries(sqlQueryForEqualityValidation)
    .withPrecommitValidatorSingleResultSqlQueries(sqlQueryForSingleResultValidation)
    .build();
HoodieWriteConfig config = getConfigBuilder()
    .withAutoCommit(false)
    .withPreCommitValidatorConfig(validatorConfig)
    .withProps(populateMetaFields ? new Properties() : getPropertiesForKeyGen())
    .withClusteringConfig(clusteringConfig)
    .build();
// create client with new config.
SparkRDDWriteClient client = getHoodieWriteClient(config);
String clusteringCommitTime = client.scheduleClustering(Option.empty()).get().toString();
HoodieWriteMetadata<JavaRDD<WriteStatus>> clusterMetadata = client.cluster(clusteringCommitTime, completeClustering);
if (config.isPreserveHoodieCommitMetadataForClustering() && config.populateMetaFields()) {
verifyRecordsWrittenWithPreservedMetadata(new HashSet<>(allRecords.getRight()), allRecords.getLeft(), clusterMetadata.getWriteStatuses().collect());
} else {
verifyRecordsWritten(clusteringCommitTime, populateMetaFields, allRecords.getLeft(), clusterMetadata.getWriteStatuses().collect(), config);
}
Set<HoodieFileGroupId> replacedFileIds = new HashSet<>();
clusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles -> partitionFiles.getValue().stream().forEach(file -> replacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file))));
return clusterMetadata;
}
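A hedged sketch of calling this clustering helper; the HoodieClusteringConfig builder methods shown are assumptions and may differ across Hudi versions:
// assumed builder methods for a simple inline clustering config
HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder()
    .withInlineClustering(true)
    .withInlineClusteringNumCommits(1)
    .build();
// complete the clustering, populate meta fields, and skip pre-commit validators
HoodieWriteMetadata<JavaRDD<WriteStatus>> clusterMetadata =
    performClustering(clusteringConfig, true, true, "", "", "", allRecords);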
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method testConsistencyCheck.
private Pair<Path, JavaRDD<WriteStatus>> testConsistencyCheck(HoodieTableMetaClient metaClient, String instantTime, boolean enableOptimisticConsistencyGuard) throws Exception {
HoodieWriteConfig cfg = !enableOptimisticConsistencyGuard
    ? getConfigBuilder().withAutoCommit(false)
        .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder()
            .withConsistencyCheckEnabled(true)
            .withMaxConsistencyCheckIntervalMs(1)
            .withInitialConsistencyCheckIntervalMs(1)
            .withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard)
            .build())
        .build()
    : getConfigBuilder().withAutoCommit(false)
        .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder()
            .withConsistencyCheckEnabled(true)
            .withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard)
            .withOptimisticConsistencyGuardSleepTimeMs(1)
            .build())
        .build();
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
client.startCommitWithTime(instantTime);
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(dataGen.generateInserts(instantTime, 200), 1);
JavaRDD<WriteStatus> result = client.bulkInsert(writeRecords, instantTime);
result.collect();
// Create a dummy marker file to simulate the case that a marker file was created without data file.
// This should fail the commit
String partitionPath;
String markerFolderPath = metaClient.getMarkerFolderPath(instantTime);
if (cfg.getMarkersType() == MarkerType.TIMELINE_SERVER_BASED) {
String markerName = MarkerUtils.readTimelineServerBasedMarkersFromFileSystem(markerFolderPath, fs, context, 1).values().stream().flatMap(Collection::stream).findFirst().get();
partitionPath = new Path(markerFolderPath, markerName).getParent().toString();
} else {
partitionPath = Arrays.stream(fs.globStatus(new Path(String.format("%s/*/*/*/*", markerFolderPath)), path -> path.toString().contains(HoodieTableMetaClient.MARKER_EXTN))).limit(1).map(status -> status.getPath().getParent().toString()).collect(Collectors.toList()).get(0);
}
Option<Path> markerFilePath = WriteMarkersFactory.get(cfg.getMarkersType(), getHoodieTable(metaClient, cfg), instantTime).create(partitionPath, FSUtils.makeDataFileName(instantTime, "1-0-1", UUID.randomUUID().toString()), IOType.MERGE);
LOG.info("Created a dummy marker path=" + markerFilePath.get());
if (!enableOptimisticConsistencyGuard) {
Exception e = assertThrows(HoodieCommitException.class, () -> {
client.commit(instantTime, result);
}, "Commit should fail due to consistency check");
assertTrue(e.getCause() instanceof HoodieIOException);
} else {
// with optimistic CG, commit should succeed
client.commit(instantTime, result);
}
return Pair.of(markerFilePath.get(), result);
}
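A hedged sketch of driving this consistency-check helper on its own; the instant time and guard flag are illustrative:
// with the fail-safe (non-optimistic) consistency guard, the commit is expected to fail on the dummy marker
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
Pair<Path, JavaRDD<WriteStatus>> markerAndStatuses = testConsistencyCheck(metaClient, "00000000000010", false);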
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method testRollbackAfterConsistencyCheckFailureUsingFileList.
private void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean rollbackUsingMarkers, boolean enableOptimisticConsistencyGuard, boolean populateMetaFields) throws Exception {
String instantTime = "00000000000010";
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
Properties properties = new Properties();
if (!populateMetaFields) {
properties = getPropertiesForKeyGen();
}
HoodieWriteConfig cfg = !enableOptimisticConsistencyGuard
    ? getConfigBuilder().withRollbackUsingMarkers(rollbackUsingMarkers).withAutoCommit(false)
        .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder()
            .withConsistencyCheckEnabled(true)
            .withMaxConsistencyCheckIntervalMs(1)
            .withInitialConsistencyCheckIntervalMs(1)
            .withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard)
            .build())
        .build()
    : getConfigBuilder().withRollbackUsingMarkers(rollbackUsingMarkers).withAutoCommit(false)
        .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder()
            .withConsistencyCheckEnabled(true)
            .withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard)
            .withOptimisticConsistencyGuardSleepTimeMs(1)
            .build())
        .withProperties(properties)
        .build();
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
testConsistencyCheck(metaClient, instantTime, enableOptimisticConsistencyGuard);
if (!enableOptimisticConsistencyGuard) {
// Rollback of this commit should succeed with FailSafeCG
client.rollback(instantTime);
assertFalse(testTable.commitExists(instantTime), "After explicit rollback, commit file should not be present");
// Marker directory must be removed after rollback
assertFalse(metaClient.getFs().exists(new Path(metaClient.getMarkerFolderPath(instantTime))));
} else {
// if optimistic CG is enabled, commit should have succeeded.
assertTrue(testTable.commitExists(instantTime), "With optimistic CG, first commit should succeed. commit file should be present");
// Marker directory must be removed after rollback
assertFalse(metaClient.getFs().exists(new Path(metaClient.getMarkerFolderPath(instantTime))));
if (rollbackUsingMarkers) {
// rollback of a completed commit should fail if marker-based rollback is used.
try {
client.rollback(instantTime);
fail("Rollback of completed commit should throw exception");
} catch (HoodieRollbackException e) {
// ignore
}
} else {
// rollback of a completed commit should succeed if using list based rollback
client.rollback(instantTime);
assertFalse(testTable.commitExists(instantTime), "After explicit rollback, commit file should not be present");
}
}
}
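A minimal sketch of the parameter combinations this helper is meant to exercise; the values are illustrative:
// marker-based rollback with the fail-safe consistency guard and meta fields populated
testRollbackAfterConsistencyCheckFailureUsingFileList(true, false, true);
// list-based rollback with the optimistic consistency guard
testRollbackAfterConsistencyCheckFailureUsingFileList(false, true, true);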
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method verifyDeletePartitionsHandling.
/**
* 1) Do write1 (upsert) with 'batch1RecordsCount' records for the first partition.
* 2) Do write2 (upsert) with 'batch2RecordsCount' records for the second partition.
* 3) Do write3 (upsert) with 'batch3RecordsCount' records for the third partition.
* 4) Delete the first partition and verify the result.
* 5) Delete the second and third partitions and verify the result.
*/
private void verifyDeletePartitionsHandling(int batch1RecordsCount, int batch2RecordsCount, int batch3RecordsCount, boolean populateMetaFields) throws Exception {
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields ? new Properties() : getPropertiesForKeyGen());
SparkRDDWriteClient client = getHoodieWriteClient(config);
dataGen = new HoodieTestDataGenerator();
// Do Inserts for DEFAULT_FIRST_PARTITION_PATH
String commitTime1 = "001";
Set<String> batch1Buckets = this.insertPartitionRecordsWithCommit(client, batch1RecordsCount, commitTime1, DEFAULT_FIRST_PARTITION_PATH);
// Do Inserts for DEFAULT_SECOND_PARTITION_PATH
String commitTime2 = "002";
Set<String> batch2Buckets = this.insertPartitionRecordsWithCommit(client, batch2RecordsCount, commitTime2, DEFAULT_SECOND_PARTITION_PATH);
// Do Inserts for DEFAULT_THIRD_PARTITION_PATH
String commitTime3 = "003";
Set<String> batch3Buckets = this.insertPartitionRecordsWithCommit(client, batch3RecordsCount, commitTime3, DEFAULT_THIRD_PARTITION_PATH);
// delete DEFAULT_FIRST_PARTITION_PATH
String commitTime4 = "004";
Set<String> deletePartitionReplaceFileIds1 = deletePartitionWithCommit(client, commitTime4, Arrays.asList(DEFAULT_FIRST_PARTITION_PATH));
assertEquals(batch1Buckets, deletePartitionReplaceFileIds1);
List<HoodieBaseFile> baseFiles = HoodieClientTestUtils.getLatestBaseFiles(basePath, fs, String.format("%s/%s/*", basePath, DEFAULT_FIRST_PARTITION_PATH));
assertEquals(0, baseFiles.size());
baseFiles = HoodieClientTestUtils.getLatestBaseFiles(basePath, fs, String.format("%s/%s/*", basePath, DEFAULT_SECOND_PARTITION_PATH));
assertTrue(baseFiles.size() > 0);
baseFiles = HoodieClientTestUtils.getLatestBaseFiles(basePath, fs, String.format("%s/%s/*", basePath, DEFAULT_THIRD_PARTITION_PATH));
assertTrue(baseFiles.size() > 0);
// delete DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH
String commitTime5 = "005";
Set<String> deletePartitionReplaceFileIds2 = deletePartitionWithCommit(client, commitTime5, Arrays.asList(DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH));
Set<String> expectedFileId = new HashSet<>();
expectedFileId.addAll(batch2Buckets);
expectedFileId.addAll(batch3Buckets);
assertEquals(expectedFileId, deletePartitionReplaceFileIds2);
baseFiles = HoodieClientTestUtils.getLatestBaseFiles(basePath, fs,
    String.format("%s/%s/*", basePath, DEFAULT_FIRST_PARTITION_PATH),
    String.format("%s/%s/*", basePath, DEFAULT_SECOND_PARTITION_PATH),
    String.format("%s/%s/*", basePath, DEFAULT_THIRD_PARTITION_PATH));
assertEquals(0, baseFiles.size());
}
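A minimal usage sketch with arbitrary batch sizes (illustrative values; meta fields populated):
// write three batches, one per default partition, then delete the partitions and verify
verifyDeletePartitionsHandling(10, 20, 30, true);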