Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestMergeOnReadRollbackActionExecutor, method setUpDFS.
private void setUpDFS() throws IOException {
initDFS();
initSparkContexts();
// just generate two partitions
dataGen = new HoodieTestDataGenerator(new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH });
initFileSystem();
initDFSMetaClient();
}
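For orientation, here is a minimal sketch of how the generator is typically driven right after this kind of setup. The commit time "001" and the record count are placeholders; the fields (fs, jsc, basePath, dataGen) and the generator calls are the same ones used in the tests on this page.
// A hedged sketch, assuming initDFS()/initSparkContexts()/initFileSystem() above have populated fs, jsc and basePath.
dataGen.writePartitionMetadata(fs, new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH }, basePath);
// Generate insert records tagged with a placeholder commit time and hand them to Spark.
List<HoodieRecord> inserts = dataGen.generateInsertsForPartition("001", 10, DEFAULT_FIRST_PARTITION_PATH);
JavaRDD<HoodieRecord> insertRdd = jsc.parallelize(inserts, 1);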
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestMergeOnReadRollbackActionExecutor, method testRollbackForCanIndexLogFile.
@Test
public void testRollbackForCanIndexLogFile() throws IOException {
cleanupResources();
setUpDFS();
// 1. prepare data and assert data result
// just generate one partition
dataGen = new HoodieTestDataGenerator(new String[] { DEFAULT_FIRST_PARTITION_PATH });
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
.withPath(basePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
.withParallelism(2, 2).withBulkInsertParallelism(2).withFinalizeWriteParallelism(2).withDeleteParallelism(2)
.withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION)
.withWriteStatusClass(MetadataMergeWriteStatus.class)
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build())
.withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).build())
.forTable("test-trip-table")
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build())
.withEmbeddedTimelineServerEnabled(true)
.withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder()
// Fail the test if there is a problem connecting to the timeline server
.withEnableBackupForRemoteFileSystemView(false)
.withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build())
.withRollbackUsingMarkers(false)
.withAutoCommit(false)
.build();
// 1. prepare data
new HoodieTestDataGenerator().writePartitionMetadata(fs, new String[] { DEFAULT_FIRST_PARTITION_PATH }, basePath);
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
// Write 1 (only inserts)
String newCommitTime = "001";
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_FIRST_PARTITION_PATH);
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime);
org.apache.hudi.testutils.Assertions.assertNoWriteErrors(statuses.collect());
client.commit(newCommitTime, statuses);
// check fileSlice
HoodieTable table = this.getHoodieTable(metaClient, cfg);
SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient());
List<HoodieFileGroup> firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList());
assertEquals(1, firstPartitionCommit2FileGroups.size());
assertEquals(1, (int) firstPartitionCommit2FileGroups.get(0).getAllFileSlices().count());
assertFalse(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().findFirst().get().getBaseFile().isPresent());
assertEquals(1, firstPartitionCommit2FileGroups.get(0).getAllFileSlices().findFirst().get().getLogFiles().count());
String generatedFileID = firstPartitionCommit2FileGroups.get(0).getFileGroupId().getFileId();
// check hoodieCommitMeta
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(table.getMetaClient().getCommitTimeline().getInstantDetails(new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, "001")).get(), HoodieCommitMetadata.class);
List<HoodieWriteStat> firstPartitionWriteStat = commitMetadata.getPartitionToWriteStats().get(DEFAULT_FIRST_PARTITION_PATH);
assertEquals(2, firstPartitionWriteStat.size());
// each partition has one empty writeStat (no fileId set)
assert firstPartitionWriteStat.stream().anyMatch(wStat -> StringUtils.isNullOrEmpty(wStat.getFileId()));
// and exactly one non-empty writeStat, which must contain inserts or updates
assertEquals(1, firstPartitionWriteStat.stream().filter(wStat -> !StringUtils.isNullOrEmpty(wStat.getFileId())).count());
firstPartitionWriteStat.stream().filter(wStat -> !StringUtils.isNullOrEmpty(wStat.getFileId())).forEach(wStat -> {
assert wStat.getNumInserts() > 0;
});
// Write 2 (one update plus inserts into both partitions)
newCommitTime = "002";
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> updateRecords = Collections.singletonList(dataGen.generateUpdateRecord(records.get(0).getKey(), newCommitTime));
List<HoodieRecord> insertRecordsInSamePartition = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_FIRST_PARTITION_PATH);
List<HoodieRecord> insertRecordsInOtherPartition = dataGen.generateInsertsForPartition(newCommitTime, 2, DEFAULT_SECOND_PARTITION_PATH);
List<HoodieRecord> recordsToBeWrite = Stream.concat(Stream.concat(updateRecords.stream(), insertRecordsInSamePartition.stream()), insertRecordsInOtherPartition.stream()).collect(Collectors.toList());
writeRecords = jsc.parallelize(recordsToBeWrite, 1);
statuses = client.upsert(writeRecords, newCommitTime);
client.commit(newCommitTime, statuses);
table = this.getHoodieTable(metaClient, cfg);
commitMetadata = HoodieCommitMetadata.fromBytes(table.getMetaClient().getCommitTimeline().getInstantDetails(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, newCommitTime)).get(), HoodieCommitMetadata.class);
assert commitMetadata.getPartitionToWriteStats().containsKey(DEFAULT_FIRST_PARTITION_PATH);
assert commitMetadata.getPartitionToWriteStats().containsKey(DEFAULT_SECOND_PARTITION_PATH);
List<HoodieWriteStat> hoodieWriteStatOptionList = commitMetadata.getPartitionToWriteStats().get(DEFAULT_FIRST_PARTITION_PATH);
// Both update and insert record should enter same existing fileGroup due to small file handling
assertEquals(1, hoodieWriteStatOptionList.size());
assertEquals(generatedFileID, hoodieWriteStatOptionList.get(0).getFileId());
// check insert and update numbers
assertEquals(2, hoodieWriteStatOptionList.get(0).getNumInserts());
assertEquals(1, hoodieWriteStatOptionList.get(0).getNumUpdateWrites());
List<HoodieWriteStat> secondHoodieWriteStatOptionList = commitMetadata.getPartitionToWriteStats().get(DEFAULT_SECOND_PARTITION_PATH);
// All insert should enter one fileGroup
assertEquals(1, secondHoodieWriteStatOptionList.size());
String fileIdInPartitionTwo = secondHoodieWriteStatOptionList.get(0).getFileId();
assertEquals(2, secondHoodieWriteStatOptionList.get(0).getNumInserts());
// Rollback
HoodieInstant rollBackInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "002");
BaseRollbackPlanActionExecutor mergeOnReadRollbackPlanActionExecutor = new BaseRollbackPlanActionExecutor(context, cfg, table, "003", rollBackInstant, false, cfg.shouldRollbackUsingMarkers());
mergeOnReadRollbackPlanActionExecutor.execute().get();
MergeOnReadRollbackActionExecutor mergeOnReadRollbackActionExecutor = new MergeOnReadRollbackActionExecutor(context, cfg, table, "003", rollBackInstant, true, false);
// 3. assert the rollback stat
Map<String, HoodieRollbackPartitionMetadata> rollbackMetadata = mergeOnReadRollbackActionExecutor.execute().getPartitionMetadata();
assertEquals(2, rollbackMetadata.size());
// 4. assert filegroup after rollback, and compare to the rollbackstat
// assert the first partition data and log file size
HoodieRollbackPartitionMetadata partitionMetadata = rollbackMetadata.get(DEFAULT_FIRST_PARTITION_PATH);
assertTrue(partitionMetadata.getSuccessDeleteFiles().isEmpty());
assertTrue(partitionMetadata.getFailedDeleteFiles().isEmpty());
assertEquals(1, partitionMetadata.getRollbackLogFiles().size());
// assert the second partition data and log file size
partitionMetadata = rollbackMetadata.get(DEFAULT_SECOND_PARTITION_PATH);
assertEquals(1, partitionMetadata.getSuccessDeleteFiles().size());
assertTrue(partitionMetadata.getFailedDeleteFiles().isEmpty());
assertTrue(partitionMetadata.getRollbackLogFiles().isEmpty());
}
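Pulled out of the test above, the rollback is a two-phase call: a rollback plan is scheduled first, then the merge-on-read executor applies it and reports per-partition results. A sketch under the same assumptions as the test (context, cfg and table already built, rolling back the inflight delta commit "002" at instant "003"):
HoodieInstant instantToRollback = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "002");
// Phase 1: schedule the rollback plan (listing-based here, since marker-based rollback is disabled in cfg).
new BaseRollbackPlanActionExecutor(context, cfg, table, "003", instantToRollback, false, cfg.shouldRollbackUsingMarkers()).execute().get();
// Phase 2: execute the plan; the returned metadata maps each partition to its deleted files and rollback log files.
Map<String, HoodieRollbackPartitionMetadata> rollbackStats = new MergeOnReadRollbackActionExecutor(context, cfg, table, "003", instantToRollback, true, false).execute().getPartitionMetadata();
rollbackStats.forEach((partition, stat) -> System.out.println(partition + " -> deleted: " + stat.getSuccessDeleteFiles().size() + ", rollback logs: " + stat.getRollbackLogFiles().size()));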
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestMergeOnReadRollbackActionExecutor, method setUp.
@BeforeEach
public void setUp() throws Exception {
initPath();
initSparkContexts();
// just generate two partitions
dataGen = new HoodieTestDataGenerator(new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH });
initFileSystem();
initMetaClient();
}
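Updates in these tests are built from the keys returned by earlier inserts: generateUpdateRecord re-uses an existing HoodieKey under a new commit time, which is how testRollbackForCanIndexLogFile above turns one of its first-write records into an update. That pattern in isolation (the commit times are placeholders):
List<HoodieRecord> inserts = dataGen.generateInsertsForPartition("001", 2, DEFAULT_FIRST_PARTITION_PATH);
// ... write and commit the inserts with a write client ...
// Then build an update for one of the inserted keys under a later commit time.
HoodieRecord update = dataGen.generateUpdateRecord(inserts.get(0).getKey(), "002");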
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestHoodieSparkCopyOnWriteTableArchiveWithReplace, method testDeletePartitionAndArchive.
@ParameterizedTest
@ValueSource(booleans = { false, true })
public void testDeletePartitionAndArchive(boolean metadataEnabled) throws IOException {
HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.COPY_ON_WRITE);
HoodieWriteConfig writeConfig = getConfigBuilder(true).withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).retainCommits(1).build()).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(metadataEnabled).build()).build();
try (SparkRDDWriteClient client = getHoodieWriteClient(writeConfig);
HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(DEFAULT_PARTITION_PATHS)) {
// 1st write batch; 3 commits for 3 partitions
String instantTime1 = HoodieActiveTimeline.createNewInstantTime(1000);
client.startCommitWithTime(instantTime1);
client.insert(jsc().parallelize(dataGen.generateInsertsForPartition(instantTime1, 10, DEFAULT_FIRST_PARTITION_PATH), 1), instantTime1);
String instantTime2 = HoodieActiveTimeline.createNewInstantTime(2000);
client.startCommitWithTime(instantTime2);
client.insert(jsc().parallelize(dataGen.generateInsertsForPartition(instantTime2, 10, DEFAULT_SECOND_PARTITION_PATH), 1), instantTime2);
String instantTime3 = HoodieActiveTimeline.createNewInstantTime(3000);
client.startCommitWithTime(instantTime3);
client.insert(jsc().parallelize(dataGen.generateInsertsForPartition(instantTime3, 1, DEFAULT_THIRD_PARTITION_PATH), 1), instantTime3);
final HoodieTimeline timeline1 = metaClient.getCommitsTimeline().filterCompletedInstants();
assertEquals(21, countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline1, Option.empty()));
// delete the 1st and the 2nd partition; 1 replace commit
final String instantTime4 = HoodieActiveTimeline.createNewInstantTime(4000);
client.startCommitWithTime(instantTime4, HoodieActiveTimeline.REPLACE_COMMIT_ACTION);
client.deletePartitions(Arrays.asList(DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH), instantTime4);
// 2nd write batch; 4 commits for the 3rd partition; the 3rd commit to trigger archiving the replace commit
for (int i = 5; i < 9; i++) {
String instantTime = HoodieActiveTimeline.createNewInstantTime(i * 1000);
client.startCommitWithTime(instantTime);
client.insert(jsc().parallelize(dataGen.generateInsertsForPartition(instantTime, 1, DEFAULT_THIRD_PARTITION_PATH), 1), instantTime);
}
// verify archived timeline
metaClient = HoodieTableMetaClient.reload(metaClient);
final HoodieTimeline archivedTimeline = metaClient.getArchivedTimeline();
assertTrue(archivedTimeline.containsInstant(instantTime1));
assertTrue(archivedTimeline.containsInstant(instantTime2));
assertTrue(archivedTimeline.containsInstant(instantTime3));
assertTrue(archivedTimeline.containsInstant(instantTime4), "should contain the replace commit.");
// verify records
final HoodieTimeline timeline2 = metaClient.getCommitTimeline().filterCompletedInstants();
assertEquals(5, countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline2, Option.empty()), "should only have the 5 records from the 3rd partition.");
}
}
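The archival behaviour this test exercises is driven entirely by the write config built before the try block. Broken out with comments, it reads roughly as follows; the knob semantics are paraphrased and may vary across Hudi versions.
// Aggressive archival so the replace commit leaves the active timeline quickly.
// archiveCommitsWith(2, 3): roughly, once more than 3 instants are active, archive down to 2.
// retainCommits(1): the cleaner keeps at least 1 completed commit.
HoodieWriteConfig writeConfig = getConfigBuilder(true)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).retainCommits(1).build())
.withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(metadataEnabled).build())
.build();
// The increasing offsets passed to createNewInstantTime keep the generated instant times strictly ordered.
String instantTime1 = HoodieActiveTimeline.createNewInstantTime(1000);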
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestBulkInsertInternalPartitioner, method generateTestRecordsForBulkInsert.
public static JavaRDD<HoodieRecord> generateTestRecordsForBulkInsert(JavaSparkContext jsc) {
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
// RDD partition 1
List<HoodieRecord> records1 = dataGenerator.generateInserts("0", 100);
// RDD partition 2
List<HoodieRecord> records2 = dataGenerator.generateInserts("0", 150);
return jsc.parallelize(records1, 1).union(jsc.parallelize(records2, 1));
}
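A hedged usage sketch for this helper: the 250 generated records (spread over two RDD partitions) can be written through a client's bulk insert path. The cfg and getHoodieWriteClient names are assumptions borrowed from the other examples on this page; bulkInsert is the standard SparkRDDWriteClient write API.
JavaRDD<HoodieRecord> records = generateTestRecordsForBulkInsert(jsc);
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
String instantTime = "001";
client.startCommitWithTime(instantTime);
// Bulk insert and fail fast on any per-record write error.
JavaRDD<WriteStatus> statuses = client.bulkInsert(records, instantTime);
org.apache.hudi.testutils.Assertions.assertNoWriteErrors(statuses.collect());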