Use of org.apache.hudi.common.model.HoodieFileGroupId in project hudi by apache.
From the class TestClusteringUtils, method generateFileSlice.
private FileSlice generateFileSlice(String partitionPath, String fileId, String baseInstant) {
  FileSlice fs = new FileSlice(new HoodieFileGroupId(partitionPath, fileId), baseInstant);
  fs.setBaseFile(new HoodieBaseFile(FSUtils.makeDataFileName(baseInstant, "1-0-1", fileId)));
  return fs;
}
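A HoodieFileGroupId is just the (partition path, file id) pair that identifies a file group, and the snippets on this page rely on its value semantics when using it as a map or set key. Below is a minimal, hypothetical sketch of that behaviour; the partition path and file id values are made up, and value-based equals/hashCode is assumed (as the map usage in the later snippets implies).

import java.util.HashMap;
import java.util.Map;
import org.apache.hudi.common.model.HoodieFileGroupId;

public class FileGroupIdAsKeySketch {
  public static void main(String[] args) {
    // Hypothetical partition/file id values, used only for illustration.
    HoodieFileGroupId idA = new HoodieFileGroupId("2016/03/15", "file-1");
    HoodieFileGroupId idB = new HoodieFileGroupId("2016/03/15", "file-1");

    // Distinct instances built from the same pair act as the same map key.
    Map<HoodieFileGroupId, String> baseInstantByFileGroup = new HashMap<>();
    baseInstantByFileGroup.put(idA, "001");
    System.out.println(baseInstantByFileGroup.get(idB)); // prints 001 if equals/hashCode are value-based
  }
}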
Use of org.apache.hudi.common.model.HoodieFileGroupId in project hudi by apache.
From the class CompactionTestUtils, method setupAndValidateCompactionOperations.
public static Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> setupAndValidateCompactionOperations(
    HoodieTableMetaClient metaClient, boolean inflight, int numEntriesInPlan1, int numEntriesInPlan2,
    int numEntriesInPlan3, int numEntriesInPlan4) throws IOException {
  HoodieCompactionPlan plan1 = createCompactionPlan(metaClient, "000", "001", numEntriesInPlan1, true, true);
  HoodieCompactionPlan plan2 = createCompactionPlan(metaClient, "002", "003", numEntriesInPlan2, false, true);
  HoodieCompactionPlan plan3 = createCompactionPlan(metaClient, "004", "005", numEntriesInPlan3, true, false);
  HoodieCompactionPlan plan4 = createCompactionPlan(metaClient, "006", "007", numEntriesInPlan4, false, false);
  if (inflight) {
    scheduleInflightCompaction(metaClient, "001", plan1);
    scheduleInflightCompaction(metaClient, "003", plan2);
    scheduleInflightCompaction(metaClient, "005", plan3);
    scheduleInflightCompaction(metaClient, "007", plan4);
  } else {
    scheduleCompaction(metaClient, "001", plan1);
    scheduleCompaction(metaClient, "003", plan2);
    scheduleCompaction(metaClient, "005", plan3);
    scheduleCompaction(metaClient, "007", plan4);
  }
  createDeltaCommit(metaClient, "000");
  createDeltaCommit(metaClient, "002");
  createDeltaCommit(metaClient, "004");
  createDeltaCommit(metaClient, "006");
  Map<String, String> baseInstantsToCompaction = new HashMap<String, String>() {
    {
      put("000", "001");
      put("002", "003");
      put("004", "005");
      put("006", "007");
    }
  };
  List<Integer> expectedNumEntries =
      Arrays.asList(numEntriesInPlan1, numEntriesInPlan2, numEntriesInPlan3, numEntriesInPlan4);
  List<HoodieCompactionPlan> plans = CollectionUtils.createImmutableList(plan1, plan2, plan3, plan4);
  IntStream.range(0, 4).boxed().forEach(idx -> {
    if (expectedNumEntries.get(idx) > 0) {
      assertEquals(expectedNumEntries.get(idx).longValue(), plans.get(idx).getOperations().size(),
          "check if plan " + idx + " has exp entries");
    } else {
      assertNull(plans.get(idx).getOperations(), "Plan " + idx + " has null ops");
    }
  });
  metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf())
      .setBasePath(metaClient.getBasePath()).setLoadActiveTimelineOnLoad(true).build();
  Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> pendingCompactionMap =
      CompactionUtils.getAllPendingCompactionOperations(metaClient);
  Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> expPendingCompactionMap =
      generateExpectedCompactionOperations(Arrays.asList(plan1, plan2, plan3, plan4), baseInstantsToCompaction);
  // Ensure Compaction operations are fine.
  assertEquals(expPendingCompactionMap, pendingCompactionMap);
  return expPendingCompactionMap;
}
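For context, the pending-compaction map validated above is keyed by HoodieFileGroupId, so downstream checks can be grouped by partition directly from the keys. A hedged sketch follows; it assumes HoodieFileGroupId exposes a getPartitionPath() getter and that the Pair/HoodieCompactionOperation types are the ones imported by the snippet above, and the helper name is made up for illustration.

import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hudi.avro.model.HoodieCompactionOperation;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.util.collection.Pair;

public class PendingCompactionSummarySketch {
  // Hypothetical helper: counts pending compaction operations per partition path.
  static Map<String, Long> pendingOpsPerPartition(
      Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> pendingCompactionMap) {
    return pendingCompactionMap.keySet().stream()
        .collect(Collectors.groupingBy(HoodieFileGroupId::getPartitionPath, Collectors.counting()));
  }
}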
Use of org.apache.hudi.common.model.HoodieFileGroupId in project hudi by apache.
From the class BootstrapCommand, method showBootstrapIndexMapping.
@CliCommand(value = "bootstrap index showmapping", help = "Show bootstrap index mapping")
public String showBootstrapIndexMapping(
    @CliOption(key = {"partitionPath"}, unspecifiedDefaultValue = "", help = "A valid partition path") String partitionPath,
    @CliOption(key = {"fileIds"}, unspecifiedDefaultValue = "", help = "Valid fileIds split by comma") String fileIds,
    @CliOption(key = {"limit"}, unspecifiedDefaultValue = "-1", help = "Limit rows to be displayed") Integer limit,
    @CliOption(key = {"sortBy"}, unspecifiedDefaultValue = "", help = "Sorting Field") final String sortByField,
    @CliOption(key = {"desc"}, unspecifiedDefaultValue = "false", help = "Ordering") final boolean descending,
    @CliOption(key = {"headeronly"}, unspecifiedDefaultValue = "false", help = "Print Header Only") final boolean headerOnly) {
  if (partitionPath.isEmpty() && !fileIds.isEmpty()) {
    throw new IllegalStateException("PartitionPath is mandatory when passing fileIds.");
  }
  BootstrapIndex.IndexReader indexReader = createBootstrapIndexReader();
  List<String> indexedPartitions = indexReader.getIndexedPartitionPaths();
  if (!partitionPath.isEmpty() && !indexedPartitions.contains(partitionPath)) {
    return partitionPath + " is not a valid indexed partition";
  }
  List<BootstrapFileMapping> mappingList = new ArrayList<>();
  if (!fileIds.isEmpty()) {
    List<HoodieFileGroupId> fileGroupIds = Arrays.stream(fileIds.split(","))
        .map(fileId -> new HoodieFileGroupId(partitionPath, fileId)).collect(Collectors.toList());
    mappingList.addAll(indexReader.getSourceFileMappingForFileIds(fileGroupIds).values());
  } else if (!partitionPath.isEmpty()) {
    mappingList.addAll(indexReader.getSourceFileMappingForPartition(partitionPath));
  } else {
    for (String part : indexedPartitions) {
      mappingList.addAll(indexReader.getSourceFileMappingForPartition(part));
    }
  }
  final List<Comparable[]> rows = convertBootstrapSourceFileMapping(mappingList);
  final TableHeader header = new TableHeader().addTableHeaderField("Hudi Partition").addTableHeaderField("FileId")
      .addTableHeaderField("Source File Base Path").addTableHeaderField("Source File Partition")
      .addTableHeaderField("Source File Path");
  return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
}
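The command above turns the comma-separated --fileIds option into HoodieFileGroupId keys before querying the bootstrap index. Here is a small hedged sketch of that parsing step in isolation; the partition path and file ids are hypothetical, and only the HoodieFileGroupId constructor seen in the snippet is assumed.

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hudi.common.model.HoodieFileGroupId;

public class BootstrapLookupKeySketch {
  public static void main(String[] args) {
    String partitionPath = "2016/03/15";          // hypothetical partition path
    String fileIds = "fg-1, fg-2 ,fg-3";          // hypothetical --fileIds value

    // Trim whitespace and drop empty tokens before building lookup keys.
    List<HoodieFileGroupId> fileGroupIds = Arrays.stream(fileIds.split(","))
        .map(String::trim)
        .filter(id -> !id.isEmpty())
        .map(fileId -> new HoodieFileGroupId(partitionPath, fileId))
        .collect(Collectors.toList());

    fileGroupIds.forEach(System.out::println);
  }
}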
Use of org.apache.hudi.common.model.HoodieFileGroupId in project hudi by apache.
From the class TestHoodieBackedMetadata, method testReattemptOfFailedClusteringCommit.
/**
 * Say a clustering commit succeeded in the metadata table but failed before committing to the data table.
 * The next time clustering kicks in, Hudi rolls back the pending clustering (in the data table) and re-attempts
 * it with the same instant time. This test ensures the second attempt succeeds with metadata enabled.
 * The same applies to any table service whose instant time is fixed: however many times the operation fails,
 * the re-attempt is made with the same commit time. The test uses clustering to exercise the scenario.
 */
@Test
public void testReattemptOfFailedClusteringCommit() throws Exception {
  tableType = HoodieTableType.COPY_ON_WRITE;
  init(tableType);
  context = new HoodieSparkEngineContext(jsc);
  HoodieWriteConfig config = getSmallInsertWriteConfig(2000, TRIP_EXAMPLE_SCHEMA, 10, false);
  SparkRDDWriteClient client = getHoodieWriteClient(config);
  // Write 1 (bulk insert)
  String newCommitTime = "0000001";
  List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
  client.startCommitWithTime(newCommitTime);
  List<WriteStatus> writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
  assertNoWriteErrors(writeStatuses);
  validateMetadata(client);
  // Write 2 (inserts)
  newCommitTime = "0000002";
  client.startCommitWithTime(newCommitTime);
  records = dataGen.generateInserts(newCommitTime, 20);
  writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
  assertNoWriteErrors(writeStatuses);
  validateMetadata(client);
  // Set up the clustering config.
  HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10)
      .withClusteringSortColumns("_row_key").withInlineClustering(true).withClusteringTargetPartitions(0)
      .withInlineClusteringNumCommits(1).build();
  HoodieWriteConfig newWriteConfig = getConfigBuilder(TRIP_EXAMPLE_SCHEMA, HoodieIndex.IndexType.BLOOM,
      HoodieFailedWritesCleaningPolicy.EAGER).withAutoCommit(false).withClusteringConfig(clusteringConfig).build();
  // Trigger clustering.
  SparkRDDWriteClient newClient = getHoodieWriteClient(newWriteConfig);
  String clusteringCommitTime = newClient.scheduleClustering(Option.empty()).get().toString();
  HoodieWriteMetadata<JavaRDD<WriteStatus>> clusterMetadata = newClient.cluster(clusteringCommitTime, true);
  // Collect the replaced file ids for validation later.
  Set<HoodieFileGroupId> replacedFileIds = new HashSet<>();
  clusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles ->
      partitionFiles.getValue().stream().forEach(file ->
          replacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file))));
  // Trigger a new write to mimic other writes succeeding before the re-attempt.
  newCommitTime = "0000003";
  client.startCommitWithTime(newCommitTime);
  records = dataGen.generateInserts(newCommitTime, 20);
  writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
  assertNoWriteErrors(writeStatuses);
  validateMetadata(client);
  // Manually remove the completed clustering instant from the .hoodie folder to mimic clustering that
  // succeeded in the metadata table but failed in the data table.
  FileCreateUtils.deleteReplaceCommit(basePath, clusteringCommitTime);
  HoodieWriteMetadata<JavaRDD<WriteStatus>> updatedClusterMetadata = newClient.cluster(clusteringCommitTime, true);
  metaClient.reloadActiveTimeline();
  Set<HoodieFileGroupId> updatedReplacedFileIds = new HashSet<>();
  updatedClusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles ->
      partitionFiles.getValue().stream().forEach(file ->
          updatedReplacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file))));
  assertEquals(replacedFileIds, updatedReplacedFileIds);
  validateMetadata(client);
}
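The test above builds the replaced-file-id set twice with nested forEach calls. Below is an equivalent, stream-based sketch of that bookkeeping, written against a plain Map<String, List<String>> so it stays independent of the write-client types; the partition and file id values are illustrative only.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hudi.common.model.HoodieFileGroupId;

public class ReplacedFileGroupIdsSketch {
  // Flattens a partition -> replaced-file-ids map into a set of HoodieFileGroupId keys.
  static Set<HoodieFileGroupId> toFileGroupIds(Map<String, List<String>> partitionToReplaceFileIds) {
    return partitionToReplaceFileIds.entrySet().stream()
        .flatMap(e -> e.getValue().stream().map(fileId -> new HoodieFileGroupId(e.getKey(), fileId)))
        .collect(Collectors.toSet());
  }

  public static void main(String[] args) {
    Map<String, List<String>> replaced = new HashMap<>();
    replaced.put("2016/03/15", Arrays.asList("fg-1", "fg-2")); // hypothetical values
    System.out.println(toFileGroupIds(replaced).size());       // prints 2
  }
}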
Use of org.apache.hudi.common.model.HoodieFileGroupId in project hudi by apache.
From the class TestHoodieClientOnCopyOnWriteStorage, method testClustering.
private void testClustering(HoodieClusteringConfig clusteringConfig, boolean populateMetaFields,
    boolean completeClustering, boolean assertSameFileIds, String validatorClasses,
    String sqlQueryForEqualityValidation, String sqlQueryForSingleResultValidation,
    Pair<Pair<List<HoodieRecord>, List<String>>, Set<HoodieFileGroupId>> allRecords) throws IOException {
  HoodieWriteConfig config = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).withAutoCommit(false)
      .withClusteringConfig(clusteringConfig).withProps(getPropertiesForKeyGen()).build();
  HoodieWriteMetadata<JavaRDD<WriteStatus>> clusterMetadata = performClustering(clusteringConfig, populateMetaFields,
      completeClustering, validatorClasses, sqlQueryForEqualityValidation, sqlQueryForSingleResultValidation,
      allRecords.getLeft());
  if (assertSameFileIds) {
    Set<HoodieFileGroupId> replacedFileIds = clusterMetadata.getWriteStats().get().stream()
        .map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet());
    Set<HoodieFileGroupId> insertedFileIds = allRecords.getRight();
    assertEquals(insertedFileIds, replacedFileIds);
  }
  if (completeClustering) {
    String clusteringCommitTime = metaClient.reloadActiveTimeline().getCompletedReplaceTimeline()
        .getReverseOrderedInstants().findFirst().get().getTimestamp();
    verifyRecordsWritten(clusteringCommitTime, populateMetaFields, allRecords.getLeft().getLeft(),
        clusterMetadata.getWriteStatuses().collect(), config);
  }
}
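When the assertSameFileIds check above fails, a plain assertEquals on two sets of HoodieFileGroupId only reports that they differ. A small hedged sketch that computes a one-sided difference for a friendlier failure message; it uses only java.util plus the HoodieFileGroupId constructor from the snippet, with hypothetical values.

import java.util.HashSet;
import java.util.Set;
import org.apache.hudi.common.model.HoodieFileGroupId;

public class FileGroupIdSetDiffSketch {
  // Returns the elements present in 'expected' but missing from 'actual'.
  static Set<HoodieFileGroupId> missingFrom(Set<HoodieFileGroupId> expected, Set<HoodieFileGroupId> actual) {
    Set<HoodieFileGroupId> diff = new HashSet<>(expected);
    diff.removeAll(actual);
    return diff;
  }

  public static void main(String[] args) {
    Set<HoodieFileGroupId> inserted = new HashSet<>();
    inserted.add(new HoodieFileGroupId("2016/03/15", "fg-1")); // hypothetical file group
    Set<HoodieFileGroupId> replaced = new HashSet<>();
    System.out.println("missing from replaced set: " + missingFrom(inserted, replaced));
  }
}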