Use of org.apache.hudi.avro.model.HoodieClusteringPlan in project hudi by apache.
The class TestMetadataConversionUtils, method createReplace.
private void createReplace(String instantTime, WriteOperationType writeOperationType, Boolean isClustering) throws Exception {
  String fileId1 = "file-1";
  String fileId2 = "file-2";
  // create replace instant to mark fileId1 as deleted
  HoodieReplaceCommitMetadata replaceMetadata = new HoodieReplaceCommitMetadata();
  Map<String, List<String>> partitionFileIds = new HashMap<>();
  partitionFileIds.put(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, Arrays.asList(fileId2));
  replaceMetadata.setPartitionToReplaceFileIds(partitionFileIds);
  HoodieWriteStat writeStat = new HoodieWriteStat();
  writeStat.setFileId("file-1");
  replaceMetadata.addWriteStat(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, writeStat);
  replaceMetadata.setOperationType(writeOperationType);
  // in some cases requestedReplaceMetadata will be null,
  // e.g. insert_overwrite_table or insert_overwrite without clustering
  HoodieRequestedReplaceMetadata requestedReplaceMetadata = null;
  HoodieCommitMetadata inflightReplaceMetadata = null;
  if (isClustering) {
    requestedReplaceMetadata = new HoodieRequestedReplaceMetadata();
    requestedReplaceMetadata.setOperationType(writeOperationType.name());
    HoodieClusteringPlan clusteringPlan = new HoodieClusteringPlan();
    HoodieClusteringGroup clusteringGroup = new HoodieClusteringGroup();
    HoodieSliceInfo sliceInfo = new HoodieSliceInfo();
    clusteringGroup.setSlices(Arrays.asList(sliceInfo));
    clusteringPlan.setInputGroups(Arrays.asList(clusteringGroup));
    requestedReplaceMetadata.setClusteringPlan(clusteringPlan);
    requestedReplaceMetadata.setVersion(TimelineLayoutVersion.CURR_VERSION);
  } else {
    // inflightReplaceMetadata will be null for clustering, but not null
    // for insert_overwrite or insert_overwrite_table
    inflightReplaceMetadata = new HoodieCommitMetadata();
    inflightReplaceMetadata.setOperationType(writeOperationType);
    inflightReplaceMetadata.setCompacted(false);
  }
  HoodieTestTable.of(metaClient)
      .addReplaceCommit(instantTime, Option.ofNullable(requestedReplaceMetadata),
          Option.ofNullable(inflightReplaceMetadata), replaceMetadata)
      .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2);
}
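For orientation, here is a hedged sketch of how this helper could be driven from a test body; the instant times below are made up for illustration and are not taken from the project:

  // Hypothetical calls to the helper above (instant times are illustrative).
  // Without clustering: requestedReplaceMetadata stays null and inflight commit metadata is written.
  createReplace("001", WriteOperationType.INSERT_OVERWRITE, false);
  // With clustering: a HoodieClusteringPlan is attached to the requested replace metadata.
  createReplace("002", WriteOperationType.CLUSTER, true);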
Use of org.apache.hudi.avro.model.HoodieClusteringPlan in project hudi by apache.
The class TestSimpleConcurrentFileWritesConflictResolutionStrategy, method createReplace.
private void createReplace(String instantTime, WriteOperationType writeOperationType) throws Exception {
  String fileId1 = "file-1";
  String fileId2 = "file-2";
  // create replace instant to mark fileId1 as deleted
  HoodieReplaceCommitMetadata replaceMetadata = new HoodieReplaceCommitMetadata();
  Map<String, List<String>> partitionFileIds = new HashMap<>();
  partitionFileIds.put(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, Arrays.asList(fileId2));
  replaceMetadata.setPartitionToReplaceFileIds(partitionFileIds);
  HoodieWriteStat writeStat = new HoodieWriteStat();
  writeStat.setFileId("file-1");
  replaceMetadata.addWriteStat(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, writeStat);
  replaceMetadata.setOperationType(writeOperationType);
  // create the requested replace metadata with a clustering plan targeting fileId1
  HoodieRequestedReplaceMetadata requestedReplaceMetadata = new HoodieRequestedReplaceMetadata();
  requestedReplaceMetadata.setOperationType(WriteOperationType.CLUSTER.name());
  HoodieClusteringPlan clusteringPlan = new HoodieClusteringPlan();
  HoodieClusteringGroup clusteringGroup = new HoodieClusteringGroup();
  HoodieSliceInfo sliceInfo = new HoodieSliceInfo();
  sliceInfo.setFileId(fileId1);
  sliceInfo.setPartitionPath(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH);
  clusteringGroup.setSlices(Arrays.asList(sliceInfo));
  clusteringPlan.setInputGroups(Arrays.asList(clusteringGroup));
  requestedReplaceMetadata.setClusteringPlan(clusteringPlan);
  requestedReplaceMetadata.setVersion(TimelineLayoutVersion.CURR_VERSION);
  HoodieTestTable.of(metaClient)
      .addReplaceCommit(instantTime, Option.of(requestedReplaceMetadata), Option.empty(), replaceMetadata)
      .withBaseFilesInPartition(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, fileId1, fileId2);
}
Use of org.apache.hudi.avro.model.HoodieClusteringPlan in project hudi by apache.
The class TestUpsertPartitioner, method testUpsertPartitionerWithSmallFileHandlingAndClusteringPlan.
@Test
public void testUpsertPartitionerWithSmallFileHandlingAndClusteringPlan() throws Exception {
  final String testPartitionPath = DEFAULT_PARTITION_PATHS[0];
  // create HoodieWriteConfig with both inline and async clustering disabled
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder().build())
      .withClusteringConfig(HoodieClusteringConfig.newBuilder()
          .withInlineClustering(false)
          .withAsyncClustering(false)
          .build())
      .withStorageConfig(HoodieStorageConfig.newBuilder()
          .hfileMaxFileSize(1000 * 1024)
          .parquetMaxFileSize(1000 * 1024)
          .build())
      .build();
  // create a file slice at instant 001 and build a clustering plan that includes it
  HoodieClusteringPlan clusteringPlan = ClusteringTestUtils.createClusteringPlan(metaClient, "001", "1");
  // create requested replace commit
  HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder()
      .setClusteringPlan(clusteringPlan)
      .setOperationType(WriteOperationType.CLUSTER.name())
      .build();
  FileCreateUtils.createRequestedReplaceCommit(basePath, "002", Option.of(requestedReplaceMetadata));
  // create file slice 003
  FileCreateUtils.createBaseFile(basePath, testPartitionPath, "003", "3", 1);
  FileCreateUtils.createCommit(basePath, "003");
  metaClient = HoodieTableMetaClient.reload(metaClient);
  // generate new data to be ingested
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { testPartitionPath });
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("004", 100);
  WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
  HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient);
  // create UpsertPartitioner
  UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config);
  // file slice 001 is part of the pending clustering plan, so only file slice 003
  // is available as a small file for ingestion
  assertEquals(1, partitioner.smallFiles.size(), "Should have 1 small file to be ingested.");
}
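As a complementary check (not part of the test above), the pending plan written at instant 002 could be read back from the timeline. A minimal sketch, assuming ClusteringUtils exposes getClusteringPlan(metaClient, instant) returning an Option of (instant, plan) pairs:

  // Hypothetical verification: re-read the clustering plan from the requested replacecommit "002".
  // Assumes ClusteringUtils.getClusteringPlan(HoodieTableMetaClient, HoodieInstant) with this shape.
  HoodieInstant pendingReplace = new HoodieInstant(State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, "002");
  Option<Pair<HoodieInstant, HoodieClusteringPlan>> pendingPlan = ClusteringUtils.getClusteringPlan(metaClient, pendingReplace);
  assertTrue(pendingPlan.isPresent());
  assertFalse(pendingPlan.get().getRight().getInputGroups().isEmpty());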
Use of org.apache.hudi.avro.model.HoodieClusteringPlan in project hudi by apache.
The class SingleSparkJobExecutionStrategy, method performClustering.
@Override
public HoodieWriteMetadata<HoodieData<WriteStatus>> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) {
  JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(getEngineContext());
  final TaskContextSupplier taskContextSupplier = getEngineContext().getTaskContextSupplier();
  final SerializableSchema serializableSchema = new SerializableSchema(schema);
  final List<ClusteringGroupInfo> clusteringGroupInfos = clusteringPlan.getInputGroups().stream()
      .map(clusteringGroup -> ClusteringGroupInfo.create(clusteringGroup))
      .collect(Collectors.toList());
  String umask = engineContext.hadoopConfiguration().get("fs.permissions.umask-mode");
  Broadcast<String> umaskBroadcastValue = engineContext.broadcast(umask);
  JavaRDD<ClusteringGroupInfo> groupInfoJavaRDD = engineContext.parallelize(clusteringGroupInfos, clusteringGroupInfos.size());
  LOG.info("number of partitions for clustering " + groupInfoJavaRDD.getNumPartitions());
  JavaRDD<WriteStatus> writeStatusRDD = groupInfoJavaRDD.mapPartitions(clusteringOps -> {
    Configuration configuration = new Configuration();
    configuration.set("fs.permissions.umask-mode", umaskBroadcastValue.getValue());
    Iterable<ClusteringGroupInfo> clusteringOpsIterable = () -> clusteringOps;
    List<ClusteringGroupInfo> groupsInPartition = StreamSupport.stream(clusteringOpsIterable.spliterator(), false)
        .collect(Collectors.toList());
    return groupsInPartition.stream()
        .flatMap(clusteringOp -> runClusteringForGroup(clusteringOp, clusteringPlan.getStrategy().getStrategyParams(),
            Option.ofNullable(clusteringPlan.getPreserveHoodieMetadata()).orElse(false),
            serializableSchema, taskContextSupplier, instantTime))
        .iterator();
  });
  HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
  writeMetadata.setWriteStatuses(HoodieJavaRDD.of(writeStatusRDD));
  return writeMetadata;
}
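The method reads only three pieces of the plan: its input groups, its strategy params, and the preserveHoodieMetadata flag. A minimal hand-built plan that satisfies those reads might look as follows; this is a sketch assuming the Avro-generated HoodieClusteringStrategy class and its usual generated setters, and real plans come from a clustering plan strategy rather than manual construction:

  // Illustrative only: build the smallest plan that performClustering above can consume.
  // Assumes org.apache.hudi.avro.model.HoodieClusteringStrategy and its generated setters.
  HoodieClusteringPlan plan = new HoodieClusteringPlan();
  HoodieClusteringGroup group = new HoodieClusteringGroup();
  group.setSlices(Arrays.asList(new HoodieSliceInfo()));
  plan.setInputGroups(Arrays.asList(group));
  HoodieClusteringStrategy strategy = new HoodieClusteringStrategy();
  strategy.setStrategyParams(new HashMap<>());   // consumed via clusteringPlan.getStrategy().getStrategyParams()
  plan.setStrategy(strategy);
  plan.setPreserveHoodieMetadata(false);         // consumed via Option.ofNullable(...).orElse(false)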
Use of org.apache.hudi.avro.model.HoodieClusteringPlan in project hudi by apache.
The class TestHoodieTableFileSystemView, method testPendingClusteringOperations.
@Test
public void testPendingClusteringOperations() throws IOException {
  String partitionPath1 = "2020/06/27";
  new File(basePath + "/" + partitionPath1).mkdirs();
  // create 3 fileIds in partition1 - fileId1 and fileId2 are clustered later on
  String fileId1 = UUID.randomUUID().toString();
  String fileId2 = UUID.randomUUID().toString();
  String fileId3 = UUID.randomUUID().toString();
  assertFalse(roView.getLatestBaseFiles(partitionPath1)
      .anyMatch(dfile -> dfile.getFileId().equals(fileId1) || dfile.getFileId().equals(fileId2) || dfile.getFileId().equals(fileId3)),
      "No commit, should not find any data file");
  // Only one commit
  String commitTime1 = "1";
  String fileName1 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1);
  String fileName2 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2);
  String fileName3 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId3);
  new File(basePath + "/" + partitionPath1 + "/" + fileName1).createNewFile();
  new File(basePath + "/" + partitionPath1 + "/" + fileName2).createNewFile();
  new File(basePath + "/" + partitionPath1 + "/" + fileName3).createNewFile();
  HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline();
  HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime1);
  saveAsComplete(commitTimeline, instant1, Option.empty());
  refreshFsView();
  assertEquals(1, roView.getLatestBaseFiles(partitionPath1).filter(dfile -> dfile.getFileId().equals(fileId1)).count());
  assertEquals(1, roView.getLatestBaseFiles(partitionPath1).filter(dfile -> dfile.getFileId().equals(fileId2)).count());
  assertEquals(1, roView.getLatestBaseFiles(partitionPath1).filter(dfile -> dfile.getFileId().equals(fileId3)).count());
  List<FileSlice>[] fileSliceGroups = new List[] {
      Collections.singletonList(fsView.getLatestFileSlice(partitionPath1, fileId1).get()),
      Collections.singletonList(fsView.getLatestFileSlice(partitionPath1, fileId2).get())
  };
  // create pending clustering operation - fileId1 and fileId2 are being clustered in different groups
  HoodieClusteringPlan plan = ClusteringUtils.createClusteringPlan("strategy", new HashMap<>(), fileSliceGroups, Collections.emptyMap());
  String clusterTime = "2";
  HoodieInstant instant2 = new HoodieInstant(State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, clusterTime);
  HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder()
      .setClusteringPlan(plan)
      .setOperationType(WriteOperationType.CLUSTER.name())
      .build();
  metaClient.getActiveTimeline().saveToPendingReplaceCommit(instant2,
      TimelineMetadataUtils.serializeRequestedReplaceMetadata(requestedReplaceMetadata));
  // verify the view reports fileId1 and fileId2 (but not fileId3) as pending clustering
  refreshFsView();
  Set<String> fileIds = fsView.getFileGroupsInPendingClustering().map(e -> e.getLeft().getFileId()).collect(Collectors.toSet());
  assertTrue(fileIds.contains(fileId1));
  assertTrue(fileIds.contains(fileId2));
  assertFalse(fileIds.contains(fileId3));
}