Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestUpsertPartitioner, method testUpsertPartitionerWithSmallFileHandlingWithCanIndexLogFiles.
@Test
public void testUpsertPartitionerWithSmallFileHandlingWithCanIndexLogFiles() throws Exception {
// Note: this is used because it is the same partition path used in CompactionTestUtils.createCompactionPlan()
final String testPartitionPath = DEFAULT_PARTITION_PATHS[0];
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
    .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024).build())
    .withStorageConfig(HoodieStorageConfig.newBuilder().parquetMaxFileSize(1024).build())
    .withIndexConfig(HoodieIndexConfig.newBuilder()
        .withIndexType(HoodieIndex.IndexType.HBASE)
        .withHBaseIndexConfig(HoodieHBaseIndexConfig.newBuilder().build())
        .build())
    .build();
// Create file group with only one log file
FileCreateUtils.createLogFile(basePath, testPartitionPath, "001", "fg1", 1);
FileCreateUtils.createDeltaCommit(basePath, "001");
// Create another file group whose size is set to the max parquet file size, so it should not be considered during small-file sizing
FileCreateUtils.createBaseFile(basePath, testPartitionPath, "002", "fg2", 1024);
FileCreateUtils.createCommit(basePath, "002");
FileCreateUtils.createLogFile(basePath, testPartitionPath, "003", "fg2", 1);
FileCreateUtils.createDeltaCommit(basePath, "003");
// The partitioner will attempt to assign inserts to file groups, including base files created by inflight compaction
metaClient = HoodieTableMetaClient.reload(metaClient);
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { testPartitionPath });
// The default estimated record size will be 1024, based on the last file group created. Only 1 record can be added to the small file
List<HoodieRecord> insertRecords = dataGenerator.generateInserts("004", 1);
WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient);
SparkUpsertDeltaCommitPartitioner partitioner = new SparkUpsertDeltaCommitPartitioner(profile, context, table, config);
assertEquals(1, partitioner.numPartitions(), "Should have 1 partition");
assertEquals(BucketType.UPDATE, partitioner.getBucketInfo(0).bucketType, "Bucket 0 should be UPDATE");
assertEquals("fg1", partitioner.getBucketInfo(0).fileIdPrefix, "Insert should be assigned to fg1");
}
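The makeHoodieClientConfigBuilder() helper and fields such as basePath, jsc, context and metaClient come from the surrounding test harness and are not shown above. A minimal sketch of what such a builder helper could look like, assuming it only needs to point the writer at the test table and register the generator's schema (the real helper may set additional options):

// Hypothetical stand-in for the test harness's makeHoodieClientConfigBuilder() helper.
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() {
  return HoodieWriteConfig.newBuilder()
      .withPath(basePath)                                       // location of the test table
      .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)  // schema used by the data generator
      .withParallelism(2, 2)                                    // insert/upsert shuffle parallelism
      .forTable("test-table");                                  // hypothetical table name
}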
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestUpsertPartitioner, method testUpsertPartitionerWithSmallFileHandlingPickingMultipleCandidates.
@Test
public void testUpsertPartitionerWithSmallFileHandlingPickingMultipleCandidates() throws Exception {
final String partitionPath = DEFAULT_PARTITION_PATHS[0];
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
    .withMergeSmallFileGroupCandidatesLimit(3)
    .withStorageConfig(HoodieStorageConfig.newBuilder().parquetMaxFileSize(2048).build())
    .build();
// Bootstrap base files ("small-file targets")
FileCreateUtils.createBaseFile(basePath, partitionPath, "002", "fg-1", 1024);
FileCreateUtils.createBaseFile(basePath, partitionPath, "002", "fg-2", 1024);
FileCreateUtils.createBaseFile(basePath, partitionPath, "002", "fg-3", 1024);
FileCreateUtils.createCommit(basePath, "002");
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { partitionPath });
// The default estimated record size will be 1024, based on the last file group created.
// Only 1 record can be added to each small file, so the 3 inserts spread across the 3 candidate file groups
WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(dataGenerator.generateInserts("003", 3))));
HoodieTableMetaClient reloadedMetaClient = HoodieTableMetaClient.reload(this.metaClient);
HoodieSparkTable<?> table = HoodieSparkTable.create(config, context, reloadedMetaClient);
SparkUpsertDeltaCommitPartitioner<?> partitioner = new SparkUpsertDeltaCommitPartitioner<>(profile, context, table, config);
assertEquals(3, partitioner.numPartitions());
assertEquals(Arrays.asList(new BucketInfo(BucketType.UPDATE, "fg-1", partitionPath), new BucketInfo(BucketType.UPDATE, "fg-2", partitionPath), new BucketInfo(BucketType.UPDATE, "fg-3", partitionPath)), partitioner.getBucketInfos());
}
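With parquetMaxFileSize set to 2048 and each bootstrapped base file at 1024 bytes, every candidate file group can absorb roughly one more record at the estimated record size, which is why the three inserts fan out across the three small-file candidates. A small illustrative loop, not part of the original test, that prints the resulting assignment using only the accessors already exercised above:

// Illustrative only: dump how the partitioner spread the inserts across the candidate file groups.
for (int i = 0; i < partitioner.numPartitions(); i++) {
  BucketInfo bucket = partitioner.getBucketInfo(i);
  System.out.println("bucket " + i + " -> " + bucket.bucketType + " on file group " + bucket.fileIdPrefix);
}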
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestHoodieMergeHandle, method testUpsertsForMultipleRecordsInSameFile.
@ParameterizedTest
@MethodSource("testArguments")
public void testUpsertsForMultipleRecordsInSameFile(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws Exception {
// Create records in a single partition
String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0];
dataGen = new HoodieTestDataGenerator(new String[] { partitionPath });
// Build a common config with the parameterized configs that differ per test case
Properties properties = new Properties();
properties.setProperty(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType.name());
properties.setProperty(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), String.valueOf(isCompressionEnabled));
// Build a write config with bulk insert parallelism set
HoodieWriteConfig cfg = getConfigBuilder().withProperties(properties).build();
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
FileSystem fs = FSUtils.getFs(basePath, hadoopConf);
/**
* Write 1 (only inserts) This will do a bulk insert of 44 records of which there are 2 records repeated 21 times
* each. id1 (21 records), id2 (21 records), id3, id4
*/
String newCommitTime = "001";
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 4);
HoodieRecord record1 = records.get(0);
HoodieRecord record2 = records.get(1);
for (int i = 0; i < 20; i++) {
HoodieRecord dup = dataGen.generateUpdateRecord(record1.getKey(), newCommitTime);
records.add(dup);
}
for (int i = 0; i < 20; i++) {
HoodieRecord dup = dataGen.generateUpdateRecord(record2.getKey(), newCommitTime);
records.add(dup);
}
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
List<WriteStatus> statuses = client.bulkInsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses);
// verify that there is a commit
metaClient = HoodieTableMetaClient.reload(metaClient);
HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
assertEquals(1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting a single commit.");
assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be 001");
assertEquals(records.size(), HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), "Must contain 44 records");
/**
* Write 2 (insert) This will do a bulk insert of 1 record with the same row_key as record1 in the previous insert
* - id1. At this point, we will have 2 files with the row_keys as shown here - File 1 - id1 (21 records), id2 (21
* records), id3, id4 File 2 - id1
*/
newCommitTime = "002";
client.startCommitWithTime(newCommitTime);
// Do 1 more bulk insert with the same dup record1
List<HoodieRecord> newRecords = new ArrayList<>();
HoodieRecord sameAsRecord1 = dataGen.generateUpdateRecord(record1.getKey(), newCommitTime);
newRecords.add(sameAsRecord1);
writeRecords = jsc.parallelize(newRecords, 1);
statuses = client.bulkInsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses);
// verify that there are 2 commits
metaClient = HoodieTableMetaClient.reload(metaClient);
timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
assertEquals(2, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting two commits.");
assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be 002");
Dataset<Row> dataSet = getRecords();
assertEquals(45, dataSet.count(), "Must contain 45 records");
/**
* Write 3 (insert) This will bulk insert 2 completely new records. At this point, we will have 3 files with
* the row_keys as shown here - File 1 - id1 (21 records), id2 (21 records), id3, id4; File 2 - id1; File 3 - id5,
* id6
*/
newCommitTime = "003";
client.startCommitWithTime(newCommitTime);
newRecords = dataGen.generateInserts(newCommitTime, 2);
writeRecords = jsc.parallelize(newRecords, 1);
statuses = client.bulkInsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses);
// verify that there are now 3 commits
metaClient = HoodieTableMetaClient.reload(metaClient);
timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
assertEquals(3, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting three commits.");
assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be 003");
dataSet = getRecords();
assertEquals(47, dataSet.count(), "Must contain 47 records");
/**
* Write 4 (updates) This will generate 2 upsert records with id1 and id2. The rider and driver names in the
* update records will be rider-004 and driver-004. After the upsert is complete, all the records with id1 in File
* 1 and File 2 must be updated, all the records with id2 in File 1 must also be updated. Also, none of the other
* records in File 1, File 2 and File 3 must be updated.
*/
newCommitTime = "004";
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> updateRecords = new ArrayList<>();
// This exists in 001 and 002 and should be updated in both
sameAsRecord1 = dataGen.generateUpdateRecord(record1.getKey(), newCommitTime);
updateRecords.add(sameAsRecord1);
// This exists in 001 and should be updated
HoodieRecord sameAsRecord2 = dataGen.generateUpdateRecord(record2.getKey(), newCommitTime);
updateRecords.add(sameAsRecord2);
JavaRDD<HoodieRecord> updateRecordsRDD = jsc.parallelize(updateRecords, 1);
statuses = client.upsert(updateRecordsRDD, newCommitTime).collect();
// Verify there are no errors
assertNoWriteErrors(statuses);
// verify there are now 4 commits
timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
assertEquals(4, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting four commits.");
assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be 004");
// Check the entire dataset has 47 records still
dataSet = getRecords();
assertEquals(47, dataSet.count(), "Must contain 47 records");
Row[] rows = (Row[]) dataSet.collect();
int record1Count = 0;
int record2Count = 0;
for (Row row : rows) {
if (row.getAs("_hoodie_record_key").equals(record1.getKey().getRecordKey())) {
record1Count++;
// assert each duplicate record is updated
assertEquals("rider-004", row.getAs("rider"));
assertEquals("driver-004", row.getAs("driver"));
} else if (row.getAs("_hoodie_record_key").equals(record2.getKey().getRecordKey())) {
record2Count++;
// assert each duplicate record is updated
assertEquals("rider-004", row.getAs("rider"));
assertEquals("driver-004", row.getAs("driver"));
} else {
assertNotEquals("rider-004", row.getAs("rider"));
assertNotEquals("driver-004", row.getAs("driver"));
}
}
// Assert that id1 record count which has been updated to rider-004 and driver-004 is 22, which is the total
// number of records with row_key id1
assertEquals(22, record1Count);
// Assert that id2 record count which has been updated to rider-004 and driver-004 is 21, which is the total
// number of records with row_key id2
assertEquals(21, record2Count);
}
}
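The getRecords() helper used above is defined elsewhere in TestHoodieMergeHandle. A minimal sketch of what it could look like, assuming the table's base files can simply be read back as parquet under the test partition (the real helper may instead go through HoodieClientTestUtils and cover all default partitions):

// Hypothetical stand-in for the test class's getRecords() helper:
// read every base file under the test partition back through Spark SQL.
private Dataset<Row> getRecords() {
  String partitionGlob = basePath + "/" + HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "/*";
  return sqlContext.read().parquet(partitionGlob);
}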
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class HoodieClientRollbackTestBase, method twoUpsertCommitDataWithTwoPartitions.
protected void twoUpsertCommitDataWithTwoPartitions(List<FileSlice> firstPartitionCommit2FileSlices, List<FileSlice> secondPartitionCommit2FileSlices, HoodieWriteConfig cfg, boolean commitSecondUpsert) throws IOException {
// just generate two partitions
dataGen = new HoodieTestDataGenerator(new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH });
// 1. prepare data
HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH }, basePath);
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
/**
* Write 1 (only inserts)
*/
String newCommitTime = "001";
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2);
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime);
Assertions.assertNoWriteErrors(statuses.collect());
client.commit(newCommitTime, statuses);
/**
* Write 2 (updates)
*/
newCommitTime = "002";
client.startCommitWithTime(newCommitTime);
records = dataGen.generateUpdates(newCommitTime, records);
statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime);
Assertions.assertNoWriteErrors(statuses.collect());
if (commitSecondUpsert) {
client.commit(newCommitTime, statuses);
}
// 2. assert file group and get the first partition file slice
HoodieTable table = this.getHoodieTable(metaClient, cfg);
SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient());
List<HoodieFileGroup> firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList());
assertEquals(1, firstPartitionCommit2FileGroups.size());
firstPartitionCommit2FileSlices.addAll(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
// 3. assert file group and get the second partition file slice
List<HoodieFileGroup> secondPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList());
assertEquals(1, secondPartitionCommit2FileGroups.size());
secondPartitionCommit2FileSlices.addAll(secondPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
// 4. assert file slice
HoodieTableType tableType = this.getTableType();
if (tableType.equals(HoodieTableType.COPY_ON_WRITE)) {
assertEquals(2, firstPartitionCommit2FileSlices.size());
assertEquals(2, secondPartitionCommit2FileSlices.size());
} else {
assertEquals(1, firstPartitionCommit2FileSlices.size());
assertEquals(1, secondPartitionCommit2FileSlices.size());
}
}
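A sketch of how a rollback test might drive this helper, leaving the second upsert uncommitted and then rolling it back; getConfigBuilder() and getHoodieWriteClient() are assumed to be the usual test-base helpers, and the rollback call is illustrative rather than copied from a Hudi test:

// Illustrative caller: prepare commit 001 plus an uncommitted upsert 002, then roll 002 back.
List<FileSlice> firstPartitionSlices = new ArrayList<>();
List<FileSlice> secondPartitionSlices = new ArrayList<>();
HoodieWriteConfig cfg = getConfigBuilder().build();
twoUpsertCommitDataWithTwoPartitions(firstPartitionSlices, secondPartitionSlices, cfg, false);
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
  client.rollback("002"); // the uncommitted upsert is removed; commit 001 data stays intact
}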
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class HoodieClientRollbackTestBase, method insertOverwriteCommitDataWithTwoPartitions.
protected void insertOverwriteCommitDataWithTwoPartitions(List<FileSlice> firstPartitionCommit2FileSlices, List<FileSlice> secondPartitionCommit2FileSlices, HoodieWriteConfig cfg, boolean commitSecondInsertOverwrite) throws IOException {
// just generate two partitions
dataGen = new HoodieTestDataGenerator(new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH });
HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH }, basePath);
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
/**
* Write 1 (upsert)
*/
String newCommitTime = "001";
List<HoodieRecord> records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2);
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
client.startCommitWithTime(newCommitTime);
JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime);
Assertions.assertNoWriteErrors(statuses.collect());
client.commit(newCommitTime, statuses);
// get fileIds written
HoodieTable table = this.getHoodieTable(metaClient, cfg);
SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient());
List<HoodieFileGroup> firstPartitionCommit1FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList());
assertEquals(1, firstPartitionCommit1FileGroups.size());
Set<String> partition1Commit1FileIds = firstPartitionCommit1FileGroups.get(0).getAllFileSlices().map(FileSlice::getFileId).collect(Collectors.toSet());
List<HoodieFileGroup> secondPartitionCommit1FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList());
assertEquals(1, secondPartitionCommit1FileGroups.size());
Set<String> partition2Commit1FileIds = secondPartitionCommit1FileGroups.get(0).getAllFileSlices().map(FileSlice::getFileId).collect(Collectors.toSet());
/**
* Write 2 (one insert_overwrite)
*/
String commitActionType = HoodieTimeline.REPLACE_COMMIT_ACTION;
newCommitTime = "002";
records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2);
writeRecords = jsc.parallelize(records, 1);
client.startCommitWithTime(newCommitTime, commitActionType);
HoodieWriteResult result = client.insertOverwrite(writeRecords, newCommitTime);
statuses = result.getWriteStatuses();
Assertions.assertNoWriteErrors(statuses.collect());
if (commitSecondInsertOverwrite) {
client.commit(newCommitTime, statuses, Option.empty(), commitActionType, result.getPartitionToReplaceFileIds());
}
metaClient.reloadActiveTimeline();
// get new fileIds written as part of insert_overwrite
fsView = getFileSystemViewWithUnCommittedSlices(metaClient);
List<HoodieFileGroup> firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).filter(fg -> !partition1Commit1FileIds.contains(fg.getFileGroupId().getFileId())).collect(Collectors.toList());
firstPartitionCommit2FileSlices.addAll(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
List<HoodieFileGroup> secondPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).filter(fg -> !partition2Commit1FileIds.contains(fg.getFileGroupId().getFileId())).collect(Collectors.toList());
secondPartitionCommit2FileSlices.addAll(secondPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
assertEquals(1, firstPartitionCommit2FileSlices.size());
assertEquals(1, secondPartitionCommit2FileSlices.size());
}
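A possible extra check on the replace metadata: insertOverwrite reports, per partition, the file IDs it supersedes, and in this setup those should be exactly the commit-001 file IDs captured above. The snippet is illustrative, would sit inside the helper right after the statuses are collected, and assumes getPartitionToReplaceFileIds() returns a Map<String, List<String>>:

// Illustrative only: the replaced file IDs reported for commit 002 should match the commit-001 file groups.
Map<String, List<String>> partitionToReplacedFileIds = result.getPartitionToReplaceFileIds();
assertEquals(partition1Commit1FileIds, new HashSet<>(partitionToReplacedFileIds.get(DEFAULT_FIRST_PARTITION_PATH)));
assertEquals(partition2Commit1FileIds, new HashSet<>(partitionToReplacedFileIds.get(DEFAULT_SECOND_PARTITION_PATH)));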