use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
the class TestHoodieClientOnCopyOnWriteStorage method performClustering.
private HoodieWriteMetadata<JavaRDD<WriteStatus>> performClustering(HoodieClusteringConfig clusteringConfig, boolean populateMetaFields, boolean completeClustering, String validatorClasses, String sqlQueryForEqualityValidation, String sqlQueryForSingleResultValidation, Pair<List<HoodieRecord>, List<String>> allRecords) throws IOException {
HoodiePreCommitValidatorConfig validatorConfig = HoodiePreCommitValidatorConfig.newBuilder().withPreCommitValidator(StringUtils.nullToEmpty(validatorClasses)).withPrecommitValidatorEqualitySqlQueries(sqlQueryForEqualityValidation).withPrecommitValidatorSingleResultSqlQueries(sqlQueryForSingleResultValidation).build();
HoodieWriteConfig config = getConfigBuilder().withAutoCommit(false).withPreCommitValidatorConfig(validatorConfig).withProps(populateMetaFields ? new Properties() : getPropertiesForKeyGen()).withClusteringConfig(clusteringConfig).build();
// create client with new config.
SparkRDDWriteClient client = getHoodieWriteClient(config);
String clusteringCommitTime = client.scheduleClustering(Option.empty()).get().toString();
HoodieWriteMetadata<JavaRDD<WriteStatus>> clusterMetadata = client.cluster(clusteringCommitTime, completeClustering);
if (config.isPreserveHoodieCommitMetadataForClustering() && config.populateMetaFields()) {
verifyRecordsWrittenWithPreservedMetadata(new HashSet<>(allRecords.getRight()), allRecords.getLeft(), clusterMetadata.getWriteStatuses().collect());
} else {
verifyRecordsWritten(clusteringCommitTime, populateMetaFields, allRecords.getLeft(), clusterMetadata.getWriteStatuses().collect(), config);
}
Set<HoodieFileGroupId> replacedFileIds = new HashSet<>();
clusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles -> partitionFiles.getValue().stream().forEach(file -> replacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file))));
return clusterMetadata;
}
use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
the class TestHoodieClientOnCopyOnWriteStorage method testConsistencyCheck.
private Pair<Path, JavaRDD<WriteStatus>> testConsistencyCheck(HoodieTableMetaClient metaClient, String instantTime, boolean enableOptimisticConsistencyGuard) throws Exception {
HoodieWriteConfig cfg = !enableOptimisticConsistencyGuard ? (getConfigBuilder().withAutoCommit(false).withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).withMaxConsistencyCheckIntervalMs(1).withInitialConsistencyCheckIntervalMs(1).withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard).build()).build()) : (getConfigBuilder().withAutoCommit(false).withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard).withOptimisticConsistencyGuardSleepTimeMs(1).build()).build());
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
client.startCommitWithTime(instantTime);
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(dataGen.generateInserts(instantTime, 200), 1);
JavaRDD<WriteStatus> result = client.bulkInsert(writeRecords, instantTime);
result.collect();
// Create a dummy marker file to simulate the case that a marker file was created without data file.
// This should fail the commit
String partitionPath;
String markerFolderPath = metaClient.getMarkerFolderPath(instantTime);
if (cfg.getMarkersType() == MarkerType.TIMELINE_SERVER_BASED) {
String markerName = MarkerUtils.readTimelineServerBasedMarkersFromFileSystem(markerFolderPath, fs, context, 1).values().stream().flatMap(Collection::stream).findFirst().get();
partitionPath = new Path(markerFolderPath, markerName).getParent().toString();
} else {
partitionPath = Arrays.stream(fs.globStatus(new Path(String.format("%s/*/*/*/*", markerFolderPath)), path -> path.toString().contains(HoodieTableMetaClient.MARKER_EXTN))).limit(1).map(status -> status.getPath().getParent().toString()).collect(Collectors.toList()).get(0);
}
Option<Path> markerFilePath = WriteMarkersFactory.get(cfg.getMarkersType(), getHoodieTable(metaClient, cfg), instantTime).create(partitionPath, FSUtils.makeDataFileName(instantTime, "1-0-1", UUID.randomUUID().toString()), IOType.MERGE);
LOG.info("Created a dummy marker path=" + markerFilePath.get());
if (!enableOptimisticConsistencyGuard) {
Exception e = assertThrows(HoodieCommitException.class, () -> {
client.commit(instantTime, result);
}, "Commit should fail due to consistency check");
assertTrue(e.getCause() instanceof HoodieIOException);
} else {
// with optimistic CG, commit should succeed
client.commit(instantTime, result);
}
return Pair.of(markerFilePath.get(), result);
}
use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
the class TestHoodieClientOnCopyOnWriteStorage method testInflightClusteringRollbackWhenUpdatesAllowed.
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testInflightClusteringRollbackWhenUpdatesAllowed(boolean rollbackPendingClustering) throws Exception {
// setup clustering config with update strategy to allow updates during ingestion
HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10).withClusteringTargetPartitions(0).withClusteringUpdatesStrategy("org.apache.hudi.client.clustering.update.strategy.SparkAllowUpdateStrategy").withRollbackPendingClustering(rollbackPendingClustering).withInlineClustering(true).withInlineClusteringNumCommits(1).build();
// start clustering, but don't commit keep it inflight
List<HoodieRecord> allRecords = testInsertAndClustering(clusteringConfig, true, false);
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
List<Pair<HoodieInstant, HoodieClusteringPlan>> pendingClusteringPlans = ClusteringUtils.getAllPendingClusteringPlans(metaClient).collect(Collectors.toList());
assertEquals(1, pendingClusteringPlans.size());
HoodieInstant pendingClusteringInstant = pendingClusteringPlans.get(0).getLeft();
assertEquals(pendingClusteringInstant.getState(), INFLIGHT);
// make an update to a filegroup within the partition that is pending clustering
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(EAGER);
addConfigsForPopulateMetaFields(cfgBuilder, true);
cfgBuilder.withClusteringConfig(clusteringConfig);
HoodieWriteConfig config = cfgBuilder.build();
SparkRDDWriteClient client = getHoodieWriteClient(config);
String commitTime = HoodieActiveTimeline.createNewInstantTime();
allRecords.addAll(dataGen.generateUpdates(commitTime, 200));
writeAndVerifyBatch(client, allRecords, commitTime, true);
// verify inflight clustering was rolled back
metaClient.reloadActiveTimeline();
pendingClusteringPlans = ClusteringUtils.getAllPendingClusteringPlans(metaClient).collect(Collectors.toList());
assertEquals(config.isRollbackPendingClustering() ? 0 : 1, pendingClusteringPlans.size());
}
use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
the class TestHoodieClientOnCopyOnWriteStorage method testPendingClusteringRollback.
@Test
public void testPendingClusteringRollback() throws Exception {
boolean populateMetaFields = true;
// setup clustering config.
HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10).withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).withInlineClustering(true).build();
// start clustering, but don't commit
List<HoodieRecord> allRecords = testInsertAndClustering(clusteringConfig, populateMetaFields, false);
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
List<Pair<HoodieInstant, HoodieClusteringPlan>> pendingClusteringPlans = ClusteringUtils.getAllPendingClusteringPlans(metaClient).collect(Collectors.toList());
assertEquals(1, pendingClusteringPlans.size());
HoodieInstant pendingClusteringInstant = pendingClusteringPlans.get(0).getLeft();
// complete another commit after pending clustering
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(EAGER);
addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
HoodieWriteConfig config = cfgBuilder.build();
SparkRDDWriteClient client = getHoodieWriteClient(config);
dataGen = new HoodieTestDataGenerator();
String commitTime = HoodieActiveTimeline.createNewInstantTime();
allRecords.addAll(dataGen.generateInserts(commitTime, 200));
assertThrows(HoodieUpsertException.class, () -> writeAndVerifyBatch(client, allRecords, commitTime, populateMetaFields));
// verify pending clustering can be rolled back (even though there is a completed commit greater than pending clustering)
client.rollback(pendingClusteringInstant.getTimestamp());
metaClient.reloadActiveTimeline();
// verify there are no pending clustering instants
assertEquals(0, ClusteringUtils.getAllPendingClusteringPlans(metaClient).count());
// delete rollback.completed instant to mimic failed rollback of clustering. and then trigger rollback of clustering again. same rollback instant should be used.
HoodieInstant rollbackInstant = metaClient.getActiveTimeline().getRollbackTimeline().lastInstant().get();
FileCreateUtils.deleteRollbackCommit(metaClient.getBasePath(), rollbackInstant.getTimestamp());
metaClient.reloadActiveTimeline();
// create replace commit requested meta file so that rollback will not throw FileNotFoundException
// create file slice with instantTime 001 and build clustering plan including this created 001 file slice.
HoodieClusteringPlan clusteringPlan = ClusteringTestUtils.createClusteringPlan(metaClient, pendingClusteringInstant.getTimestamp(), "1");
// create requested replace commit
HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder().setClusteringPlan(clusteringPlan).setOperationType(WriteOperationType.CLUSTER.name()).build();
FileCreateUtils.createRequestedReplaceCommit(metaClient.getBasePath(), pendingClusteringInstant.getTimestamp(), Option.of(requestedReplaceMetadata));
// trigger clustering again. no new rollback instants should be generated.
try {
client.cluster(pendingClusteringInstant.getTimestamp(), false);
// new replace commit metadata generated is fake one. so, clustering will fail. but the intention of test is ot check for duplicate rollback instants.
} catch (Exception e) {
// ignore.
}
metaClient.reloadActiveTimeline();
// verify that there is no new rollback instant generated
HoodieInstant newRollbackInstant = metaClient.getActiveTimeline().getRollbackTimeline().lastInstant().get();
assertEquals(rollbackInstant.getTimestamp(), newRollbackInstant.getTimestamp());
}
use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
the class TestHoodieIndex method testTagLocationAndDuplicateUpdate.
@ParameterizedTest
@MethodSource("indexTypeParams")
public void testTagLocationAndDuplicateUpdate(IndexType indexType, boolean populateMetaFields, boolean enableMetadataIndex) throws Exception {
setUp(indexType, populateMetaFields, enableMetadataIndex);
String newCommitTime = "001";
int totalRecords = 10 + random.nextInt(20);
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
HoodieSparkTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
writeClient.startCommitWithTime(newCommitTime);
JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
JavaRDD<HoodieRecord> javaRDD1 = tagLocation(index, writeRecords, hoodieTable);
// Duplicate upsert and ensure correctness is maintained
// We are trying to approximately imitate the case when the RDD is recomputed. For RDD creating, driver code is not
// recomputed. This includes the state transitions. We need to delete the inflight instance so that subsequent
// upsert will not run into conflicts.
metaClient.getFs().delete(new Path(metaClient.getMetaPath(), "001.inflight"));
writeClient.upsert(writeRecords, newCommitTime);
Assertions.assertNoWriteErrors(writeStatues.collect());
// Now commit this & update location of records inserted and validate no errors
writeClient.commit(newCommitTime, writeStatues);
// Now tagLocation for these records, hbaseIndex should tag them correctly
metaClient = HoodieTableMetaClient.reload(metaClient);
hoodieTable = HoodieSparkTable.create(config, context, metaClient);
JavaRDD<HoodieRecord> javaRDD = tagLocation(index, writeRecords, hoodieTable);
Map<String, String> recordKeyToPartitionPathMap = new HashMap();
List<HoodieRecord> hoodieRecords = writeRecords.collect();
hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath()));
assertEquals(totalRecords, javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size());
assertEquals(totalRecords, javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count());
assertEquals(totalRecords, javaRDD.filter(record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count());
javaRDD.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry.getRecordKey()), entry.getPartitionPath(), "PartitionPath mismatch"));
JavaRDD<HoodieKey> hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey());
JavaPairRDD<HoodieKey, Option<Pair<String, String>>> recordLocations = getRecordLocations(hoodieKeyJavaRDD, hoodieTable);
List<HoodieKey> hoodieKeys = hoodieKeyJavaRDD.collect();
assertEquals(totalRecords, recordLocations.collect().size());
assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count());
recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey"));
recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch"));
}
Aggregations