Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
From class PartitionAwareClusteringPlanStrategy, method generateClusteringPlan:
@Override
public Option<HoodieClusteringPlan> generateClusteringPlan() {
  HoodieTableMetaClient metaClient = getHoodieTable().getMetaClient();
  LOG.info("Scheduling clustering for " + metaClient.getBasePath());
  HoodieWriteConfig config = getWriteConfig();
  List<String> partitionPaths = FSUtils.getAllPartitionPaths(getEngineContext(), config.getMetadataConfig(), metaClient.getBasePath());
  // get matched partitions if set
  partitionPaths = getMatchedPartitions(config, partitionPaths);
  // filter the partition paths if needed to reduce list status
  partitionPaths = filterPartitionPaths(partitionPaths);
  if (partitionPaths.isEmpty()) {
    // In case no partitions could be picked, return no clustering plan
    return Option.empty();
  }
  List<HoodieClusteringGroup> clusteringGroups = getEngineContext().flatMap(partitionPaths, partitionPath -> {
    List<FileSlice> fileSlicesEligible = getFileSlicesEligibleForClustering(partitionPath).collect(Collectors.toList());
    return buildClusteringGroupsForPartition(partitionPath, fileSlicesEligible).limit(getWriteConfig().getClusteringMaxNumGroups());
  }, partitionPaths.size()).stream().limit(getWriteConfig().getClusteringMaxNumGroups()).collect(Collectors.toList());
  if (clusteringGroups.isEmpty()) {
    LOG.info("No data available to cluster");
    return Option.empty();
  }
  HoodieClusteringStrategy strategy = HoodieClusteringStrategy.newBuilder()
      .setStrategyClassName(getWriteConfig().getClusteringExecutionStrategyClass())
      .setStrategyParams(getStrategyParams())
      .build();
  return Option.of(HoodieClusteringPlan.newBuilder()
      .setStrategy(strategy)
      .setInputGroups(clusteringGroups)
      .setExtraMetadata(getExtraMetadata())
      .setVersion(getPlanVersion())
      .setPreserveHoodieMetadata(getWriteConfig().isPreserveHoodieCommitMetadataForClustering())
      .build());
}
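The strategy above delegates slice selection to getFileSlicesEligibleForClustering. As a rough, hypothetical sketch of that hook (not Hudi's actual implementation), the helper below walks the latest file slices of one partition through a HoodieTableFileSystemView and keeps only slices that already carry a base file; a real strategy would typically apply further filters, such as excluding slices involved in pending operations.

import java.util.stream.Stream;

import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;

public class ClusteringEligibilityExample {

  // Illustrative eligibility rule only: keep the latest slice of each file group
  // when it already has a committed base file.
  static Stream<FileSlice> eligibleSlices(HoodieTableFileSystemView fsView, String partitionPath) {
    return fsView.getLatestFileSlices(partitionPath)
        .filter(slice -> slice.getBaseFile().isPresent());
  }
}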
Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
From class TestHoodieBackedMetadata, method verifyMetadataRecordKeyExcludeFromPayloadBaseFiles:
/**
 * Verify metadata table base files for the records persisted based on the config. When
 * key deduplication is enabled, the records persisted in the base file on disk
 * should have an empty string as the key field in the payload.
 *
 * @param table            - Metadata table
 * @param enableMetaFields - Enable meta fields
 */
private void verifyMetadataRecordKeyExcludeFromPayloadBaseFiles(HoodieTable table, boolean enableMetaFields) throws IOException {
  table.getHoodieView().sync();
  List<FileSlice> fileSlices = table.getSliceView()
      .getLatestFileSlices(MetadataPartitionType.FILES.getPartitionPath()).collect(Collectors.toList());
  if (!fileSlices.get(0).getBaseFile().isPresent()) {
    throw new IllegalStateException("Base file not available!");
  }
  final HoodieBaseFile baseFile = fileSlices.get(0).getBaseFile().get();
  HoodieHFileReader hoodieHFileReader = new HoodieHFileReader(context.getHadoopConf().get(), new Path(baseFile.getPath()),
      new CacheConfig(context.getHadoopConf().get()));
  List<Pair<String, IndexedRecord>> records = hoodieHFileReader.readAllRecords();
  records.forEach(entry -> {
    if (enableMetaFields) {
      assertNotNull(((GenericRecord) entry.getSecond()).get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
    } else {
      assertNull(((GenericRecord) entry.getSecond()).get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
    }
    final String keyInPayload = (String) ((GenericRecord) entry.getSecond()).get(HoodieMetadataPayload.KEY_FIELD_NAME);
    assertFalse(keyInPayload.isEmpty());
  });
}
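The check-then-get pattern at the top of this test can be wrapped in a small lookup. The hypothetical helper below (class and method names are illustrative, not Hudi API) resolves the base-file path of the first latest file slice in a partition, using only accessors that appear in these examples: getLatestFileSlices, getBaseFile, and getPath.

import java.util.Optional;

import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.Option;

public class LatestBaseFileLookup {

  // Returns the base-file path of the first latest file slice, if that slice
  // already has a committed base file.
  static Optional<String> latestBaseFilePath(HoodieTableFileSystemView fsView, String partitionPath) {
    return fsView.getLatestFileSlices(partitionPath)
        .findFirst()
        .map(FileSlice::getBaseFile)     // Hudi Option<HoodieBaseFile>
        .filter(Option::isPresent)
        .map(baseFile -> baseFile.get().getPath());
  }
}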
Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
From class TestHoodieClientOnCopyOnWriteStorage, method testUpdateRejectForClustering:
@Test
public void testUpdateRejectForClustering() throws IOException {
  final String testPartitionPath = "2016/09/26";
  dataGen = new HoodieTestDataGenerator(new String[] { testPartitionPath });
  Properties props = new Properties();
  props.setProperty(ASYNC_CLUSTERING_ENABLE.key(), "true");
  HoodieWriteConfig config = getSmallInsertWriteConfig(100, TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), true, props);
  SparkRDDWriteClient client = getHoodieWriteClient(config);
  HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
  // 1. insert to generate 2 file groups
  String commitTime1 = "001";
  Pair<List<WriteStatus>, List<HoodieRecord>> upsertResult = insertBatchRecords(client, commitTime1, 600, 2);
  List<HoodieRecord> inserts1 = upsertResult.getValue();
  List<String> fileGroupIds1 = table.getFileSystemView().getAllFileGroups(testPartitionPath)
      .map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
  assertEquals(2, fileGroupIds1.size());
  // 2. generate a clustering plan for the fileGroupIds1 file groups
  String commitTime2 = "002";
  List<List<FileSlice>> firstInsertFileSlicesList = table.getFileSystemView().getAllFileGroups(testPartitionPath)
      .map(fileGroup -> fileGroup.getAllFileSlices().collect(Collectors.toList())).collect(Collectors.toList());
  List<FileSlice>[] fileSlices = (List<FileSlice>[]) firstInsertFileSlicesList.toArray(new List[firstInsertFileSlicesList.size()]);
  createRequestedReplaceInstant(this.metaClient, commitTime2, fileSlices);
  // 3. insert one record; no update-reject exception, and the small file is not merged, so a new file group is created
  String commitTime3 = "003";
  insertBatchRecords(client, commitTime3, 1, 1).getKey();
  List<String> fileGroupIds2 = table.getFileSystemView().getAllFileGroups(testPartitionPath)
      .map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
  assertEquals(3, fileGroupIds2.size());
  // 4. update one record in the two file groups pending clustering; the update must be rejected
  String commitTime4 = "004";
  client.startCommitWithTime(commitTime4);
  List<HoodieRecord> insertsAndUpdates3 = new ArrayList<>();
  insertsAndUpdates3.addAll(dataGen.generateUpdates(commitTime4, inserts1));
  String assertMsg = String.format("Not allowed to update the clustering files in partition: %s "
      + "For pending clustering operations, we are not going to support update for now.", testPartitionPath);
  assertThrows(HoodieUpsertException.class, () -> {
    client.upsert(jsc.parallelize(insertsAndUpdates3, 1), commitTime4).collect();
  }, assertMsg);
  // 5. insert one record; no update-reject exception, and the small file is merged
  String commitTime5 = "005";
  List<WriteStatus> statuses = insertBatchRecords(client, commitTime5, 1, 1).getKey();
  fileGroupIds2.removeAll(fileGroupIds1);
  assertEquals(fileGroupIds2.get(0), statuses.get(0).getFileId());
  List<String> firstInsertFileGroupIds4 = table.getFileSystemView().getAllFileGroups(testPartitionPath)
      .map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
  assertEquals(3, firstInsertFileGroupIds4.size());
}
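Step 2 above flattens getAllFileGroups into per-group lists of file slices before building the requested replace instant. A hypothetical helper (not part of the test) that performs the same traversal, keyed by file-group id, could look like this:

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;

public class FileSlicesByFileGroup {

  // Map each file-group id in the partition to all of its file slices.
  static Map<String, List<FileSlice>> slicesByFileGroup(HoodieTableFileSystemView fsView, String partitionPath) {
    return fsView.getAllFileGroups(partitionPath)
        .collect(Collectors.toMap(
            fileGroup -> fileGroup.getFileGroupId().getFileId(),
            fileGroup -> fileGroup.getAllFileSlices().collect(Collectors.toList())));
  }
}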
Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
From class TestCompactionAdminClient, method validateUnSchedulePlan:
/**
* Validate Unschedule operations.
*/
private List<Pair<HoodieLogFile, HoodieLogFile>> validateUnSchedulePlan(CompactionAdminClient client, String ingestionInstant,
    String compactionInstant, int numEntriesPerInstant, int expNumRenames, boolean skipUnSchedule) throws Exception {
  ensureValidCompactionPlan(compactionInstant);
  // Check suggested rename operations
  List<Pair<HoodieLogFile, HoodieLogFile>> renameFiles =
      client.getRenamingActionsForUnschedulingCompactionPlan(metaClient, compactionInstant, 1, Option.empty(), false);
  metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build();
  // Log files belonging to file-slices created because of the compaction request must be renamed
  Set<HoodieLogFile> gotLogFilesToBeRenamed = renameFiles.stream().map(Pair::getLeft).collect(Collectors.toSet());
  final HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
  Set<HoodieLogFile> expLogFilesToBeRenamed = fsView.getLatestFileSlices(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0])
      .filter(fs -> fs.getBaseInstantTime().equals(compactionInstant))
      .flatMap(FileSlice::getLogFiles)
      .collect(Collectors.toSet());
  assertEquals(expLogFilesToBeRenamed, gotLogFilesToBeRenamed,
      "Log files belonging to file-slices created because of compaction request must be renamed");
  if (skipUnSchedule) {
    // Do the renaming only but do not touch the compaction plan - needed for repair tests
    renameFiles.forEach(lfPair -> {
      try {
        renameLogFile(metaClient, lfPair.getLeft(), lfPair.getRight());
      } catch (IOException e) {
        throw new HoodieIOException(e.getMessage(), e);
      }
    });
  } else {
    validateRenameFiles(renameFiles, ingestionInstant, compactionInstant, fsView);
  }
  Map<String, Long> fileIdToCountsBeforeRenaming =
      fsView.getLatestMergedFileSlicesBeforeOrOn(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0], compactionInstant)
          .filter(fs -> fs.getBaseInstantTime().equals(ingestionInstant))
          .map(fs -> Pair.of(fs.getFileId(), fs.getLogFiles().count()))
          .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  // Call the main unschedule API
  client.unscheduleCompactionPlan(compactionInstant, false, 1, false);
  metaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build();
  final HoodieTableFileSystemView newFsView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
  // Expect every file-slice whose base-commit equals the compaction commit to contain no base file and no log files
  newFsView.getLatestFileSlicesBeforeOrOn(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0], compactionInstant, true)
      .filter(fs -> fs.getBaseInstantTime().equals(compactionInstant))
      .forEach(fs -> {
        assertFalse(fs.getBaseFile().isPresent(), "No Data file must be present");
        assertEquals(0, fs.getLogFiles().count(), "No Log Files");
      });
  // Ensure the same number of log-files per fileId before and after renaming
  Map<String, Long> fileIdToCountsAfterRenaming = newFsView.getAllFileGroups(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0])
      .flatMap(HoodieFileGroup::getAllFileSlices)
      .filter(fs -> fs.getBaseInstantTime().equals(ingestionInstant))
      .map(fs -> Pair.of(fs.getFileId(), fs.getLogFiles().count()))
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  assertEquals(fileIdToCountsBeforeRenaming, fileIdToCountsAfterRenaming, "Each File Id has same number of log-files");
  assertEquals(numEntriesPerInstant, fileIdToCountsAfterRenaming.size(), "Not Empty");
  assertEquals(expNumRenames, renameFiles.size(), "Expected number of renames");
  return renameFiles;
}
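The before/after comparison in this test boils down to counting log files per file id for slices anchored on a given instant. A hypothetical extraction of that check (class and method names are illustrative) is sketched below:

import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieFileGroup;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;

public class LogFileCountsByFileId {

  // Count log files per file id for every slice whose base instant matches.
  static Map<String, Long> logFileCounts(HoodieTableFileSystemView fsView, String partitionPath, String baseInstantTime) {
    return fsView.getAllFileGroups(partitionPath)
        .flatMap(HoodieFileGroup::getAllFileSlices)
        .filter(fs -> fs.getBaseInstantTime().equals(baseInstantTime))
        .collect(Collectors.toMap(FileSlice::getFileId, fs -> fs.getLogFiles().count()));
  }
}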
Use of org.apache.hudi.common.model.FileSlice in project hudi by apache.
From class TestFileSliceMetricUtils, method buildFileSlice:
private FileSlice buildFileSlice(long baseFileLen, List<Long> logFileLens) {
  final String baseFilePath = ".b5068208-e1a4-11e6-bf01-fe55135034f3_20170101134598.log.1";
  FileSlice slice = new FileSlice("partition_0", HoodieActiveTimeline.createNewInstantTime(), UUID.randomUUID().toString());
  HoodieBaseFile baseFile = new HoodieBaseFile(baseFilePath);
  baseFile.setFileLen(baseFileLen);
  slice.setBaseFile(baseFile);
  int logVersion = 1;
  for (long logFileLen : logFileLens) {
    String logFilePath = "." + UUID.randomUUID().toString() + "_20170101134598.log." + logVersion;
    HoodieLogFile logFile = new HoodieLogFile(logFilePath);
    logFile.setFileLen(logFileLen);
    slice.addLogFile(logFile);
    logVersion++;
  }
  return slice;
}
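A minimal usage sketch for the helper above, assuming it sits next to buildFileSlice inside TestFileSliceMetricUtils: build a slice with one base file and two log files, then inspect it through the same FileSlice accessors used throughout these examples.

import java.util.Arrays;

// Assumes this method lives in TestFileSliceMetricUtils, where buildFileSlice is in scope.
private void buildFileSliceUsageSketch() {
  FileSlice slice = buildFileSlice(1024L, Arrays.asList(256L, 512L));
  // The base file attached via setBaseFile is exposed through getBaseFile().
  assertTrue(slice.getBaseFile().isPresent());
  // One HoodieLogFile was added per entry in logFileLens.
  assertEquals(2, slice.getLogFiles().count());
}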