Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
In class TestHoodieMergeOnReadTable, method testLogFileCountsAfterCompaction.
// TODO: Enable metadata virtual keys in this test once the feature HUDI-2593 is completed
@ParameterizedTest
@ValueSource(booleans = { false, true })
public void testLogFileCountsAfterCompaction(boolean preserveCommitMeta) throws Exception {
  boolean populateMetaFields = true;
  // insert 100 records
  HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true, false, HoodieIndex.IndexType.BLOOM, 1024 * 1024 * 1024L, HoodieClusteringConfig.newBuilder().build(), preserveCommitMeta);
  addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
  HoodieWriteConfig config = cfgBuilder.build();
  try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
    String newCommitTime = "100";
    writeClient.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
    JavaRDD<HoodieRecord> recordsRDD = jsc().parallelize(records, 1);
    writeClient.insert(recordsRDD, newCommitTime).collect();

    // Update all 100 records
    newCommitTime = "101";
    List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
    JavaRDD<HoodieRecord> updatedRecordsRDD = jsc().parallelize(updatedRecords, 1);
    HoodieReadClient readClient = new HoodieReadClient(context(), config);
    JavaRDD<HoodieRecord> updatedTaggedRecordsRDD = readClient.tagLocation(updatedRecordsRDD);
    writeClient.startCommitWithTime(newCommitTime);
    writeClient.upsertPreppedRecords(updatedTaggedRecordsRDD, newCommitTime).collect();

    // Write them to the corresponding avro log files
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(writeClient.getEngineContext().getHadoopConf().get(), config, writeClient.getEngineContext());
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS, metadataWriter);
    Set<String> allPartitions = updatedRecords.stream().map(record -> record.getPartitionPath()).collect(Collectors.groupingBy(partitionPath -> partitionPath)).keySet();
    assertEquals(allPartitions.size(), testTable.listAllBaseFiles().length);

    // Verify that each data file has one log file
    HoodieTable table = HoodieSparkTable.create(config, context(), metaClient, true);
    for (String partitionPath : dataGen.getPartitionPaths()) {
      List<FileSlice> groupedLogFiles = table.getSliceView().getLatestFileSlices(partitionPath).collect(Collectors.toList());
      for (FileSlice fileSlice : groupedLogFiles) {
        assertEquals(1, fileSlice.getLogFiles().count(), "There should be 1 log file written for the latest data file - " + fileSlice);
      }
    }

    // Do a compaction
    String compactionInstantTime = writeClient.scheduleCompaction(Option.empty()).get().toString();
    HoodieWriteMetadata<JavaRDD<WriteStatus>> result = writeClient.compact(compactionInstantTime);

    // Verify that the recently written compacted data files have no log files
    metaClient = HoodieTableMetaClient.reload(metaClient);
    table = HoodieSparkTable.create(config, context(), metaClient);
    HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
    assertTrue(HoodieTimeline.compareTimestamps(timeline.lastInstant().get().getTimestamp(), HoodieTimeline.GREATER_THAN, newCommitTime), "Compaction commit should be > than last insert");
    for (String partitionPath : dataGen.getPartitionPaths()) {
      List<FileSlice> groupedLogFiles = table.getSliceView().getLatestFileSlices(partitionPath).collect(Collectors.toList());
      for (FileSlice slice : groupedLogFiles) {
        assertEquals(0, slice.getLogFiles().count(), "After compaction there should be no log files visible on a full view");
      }
      assertTrue(result.getCommitMetadata().get().getWritePartitionPaths().stream().anyMatch(part -> part.contentEquals(partitionPath)));
    }

    // Check that the entire dataset still has all records
    String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
    for (int i = 0; i < fullPartitionPaths.length; i++) {
      fullPartitionPaths[i] = String.format("%s/%s/*", basePath(), dataGen.getPartitionPaths()[i]);
    }
    Dataset<Row> actual = HoodieClientTestUtils.read(jsc(), basePath(), sqlContext(), fs(), fullPartitionPaths);
    List<Row> rows = actual.collectAsList();
    assertEquals(updatedRecords.size(), rows.size());
    for (Row row : rows) {
      assertEquals(row.getAs(HoodieRecord.COMMIT_TIME_METADATA_FIELD), preserveCommitMeta ? newCommitTime : compactionInstantTime);
    }
  }
}
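The HoodieActiveTimeline usage in this test reduces to a small recurring pattern: reload the meta client, fetch the active timeline, and compare the timestamp of the latest instant against a known commit time. A minimal sketch of that pattern follows, built only from calls that appear in the test above; the class and method names are illustrative, not part of the Hudi codebase.

import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieTimeline;

public class TimelineChecks {
  // Returns true if the latest instant on the active timeline is strictly newer
  // than the given commit time, mirroring the compaction assertion above.
  // Assumes at least one completed instant exists (lastInstant().get() would fail otherwise).
  static boolean lastInstantIsAfter(HoodieTableMetaClient metaClient, String commitTime) {
    HoodieActiveTimeline timeline = HoodieTableMetaClient.reload(metaClient).getActiveTimeline();
    String latest = timeline.lastInstant().get().getTimestamp();
    return HoodieTimeline.compareTimestamps(latest, HoodieTimeline.GREATER_THAN, commitTime);
  }
}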
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
In class TestHoodieMergeHandle, method testUpsertsForMultipleRecordsInSameFile.
@ParameterizedTest
@MethodSource("testArguments")
public void testUpsertsForMultipleRecordsInSameFile(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws Exception {
  // Create records in a single partition
  String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0];
  dataGen = new HoodieTestDataGenerator(new String[] { partitionPath });

  // Build a common config with the differing configs applied as properties
  Properties properties = new Properties();
  properties.setProperty(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType.name());
  properties.setProperty(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), String.valueOf(isCompressionEnabled));

  // Build a write config with bulk insert parallelism set
  HoodieWriteConfig cfg = getConfigBuilder().withProperties(properties).build();
  try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
    FileSystem fs = FSUtils.getFs(basePath, hadoopConf);

    /**
     * Write 1 (only inserts): a bulk insert of 44 records, of which 2 records are repeated 21 times
     * each. id1 (21 records), id2 (21 records), id3, id4
     */
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 4);
    HoodieRecord record1 = records.get(0);
    HoodieRecord record2 = records.get(1);
    for (int i = 0; i < 20; i++) {
      HoodieRecord dup = dataGen.generateUpdateRecord(record1.getKey(), newCommitTime);
      records.add(dup);
    }
    for (int i = 0; i < 20; i++) {
      HoodieRecord dup = dataGen.generateUpdateRecord(record2.getKey(), newCommitTime);
      records.add(dup);
    }
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    List<WriteStatus> statuses = client.bulkInsert(writeRecords, newCommitTime).collect();
    assertNoWriteErrors(statuses);

    // verify that there is a commit
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
    assertEquals(1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting a single commit.");
    assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be 001");
    assertEquals(records.size(), HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), "Must contain 44 records");

    /**
     * Write 2 (insert): a bulk insert of 1 record with the same row_key as record1 in the previous
     * insert - id1. At this point there are 2 files with the row_keys shown here -
     * File 1 - id1 (21 records), id2 (21 records), id3, id4; File 2 - id1
     */
    newCommitTime = "002";
    client.startCommitWithTime(newCommitTime);
    // Do 1 more bulk insert with the same dup record1
    List<HoodieRecord> newRecords = new ArrayList<>();
    HoodieRecord sameAsRecord1 = dataGen.generateUpdateRecord(record1.getKey(), newCommitTime);
    newRecords.add(sameAsRecord1);
    writeRecords = jsc.parallelize(newRecords, 1);
    statuses = client.bulkInsert(writeRecords, newCommitTime).collect();
    assertNoWriteErrors(statuses);

    // verify that there are 2 commits
    metaClient = HoodieTableMetaClient.reload(metaClient);
    timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
    assertEquals(2, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting two commits.");
    assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be 002");
    Dataset<Row> dataSet = getRecords();
    assertEquals(45, dataSet.count(), "Must contain 45 records");

    /**
     * Write 3 (insert): a bulk insert of 2 completely new records. At this point there are 3 files with
     * the row_keys shown here - File 1 - id1 (21 records), id2 (21 records), id3, id4; File 2 - id1;
     * File 3 - id5, id6
     */
    newCommitTime = "003";
    client.startCommitWithTime(newCommitTime);
    newRecords = dataGen.generateInserts(newCommitTime, 2);
    writeRecords = jsc.parallelize(newRecords, 1);
    statuses = client.bulkInsert(writeRecords, newCommitTime).collect();
    assertNoWriteErrors(statuses);

    // verify that there are now 3 commits
    metaClient = HoodieTableMetaClient.reload(metaClient);
    timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
    assertEquals(3, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting three commits.");
    assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be 003");
    dataSet = getRecords();
    assertEquals(47, dataSet.count(), "Must contain 47 records");

    /**
     * Write 4 (updates): generates 2 upsert records with id1 and id2. The rider and driver names in the
     * update records will be rider-004 and driver-004. After the upsert completes, all the records with id1 in
     * File 1 and File 2 must be updated, and all the records with id2 in File 1 must also be updated.
     * None of the other records in File 1, File 2 and File 3 may be updated.
     */
    newCommitTime = "004";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> updateRecords = new ArrayList<>();

    // This exists in 001 and 002 and should be updated in both
    sameAsRecord1 = dataGen.generateUpdateRecord(record1.getKey(), newCommitTime);
    updateRecords.add(sameAsRecord1);

    // This exists in 001 and should be updated
    HoodieRecord sameAsRecord2 = dataGen.generateUpdateRecord(record2.getKey(), newCommitTime);
    updateRecords.add(sameAsRecord2);
    JavaRDD<HoodieRecord> updateRecordsRDD = jsc.parallelize(updateRecords, 1);
    statuses = client.upsert(updateRecordsRDD, newCommitTime).collect();

    // Verify there are no errors
    assertNoWriteErrors(statuses);

    // verify there are now 4 commits
    timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
    assertEquals(4, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting four commits.");
    assertEquals(timeline.lastInstant().get().getTimestamp(), newCommitTime, "Latest commit should be 004");

    // Check that the entire dataset still has 47 records
    dataSet = getRecords();
    assertEquals(47, dataSet.count(), "Must contain 47 records");
    Row[] rows = (Row[]) dataSet.collect();
    int record1Count = 0;
    int record2Count = 0;
    for (Row row : rows) {
      if (row.getAs("_hoodie_record_key").equals(record1.getKey().getRecordKey())) {
        record1Count++;
        // assert each duplicate record is updated
        assertEquals(row.getAs("rider"), "rider-004");
        assertEquals(row.getAs("driver"), "driver-004");
      } else if (row.getAs("_hoodie_record_key").equals(record2.getKey().getRecordKey())) {
        record2Count++;
        // assert each duplicate record is updated
        assertEquals(row.getAs("rider"), "rider-004");
        assertEquals(row.getAs("driver"), "driver-004");
      } else {
        assertNotEquals(row.getAs("rider"), "rider-004");
        assertNotEquals(row.getAs("driver"), "driver-004");
      }
    }

    // Assert that the id1 record count, now updated to rider-004 and driver-004, is 22, which is the total
    // number of records with row_key id1
    assertEquals(22, record1Count);

    // Assert that the id2 record count, now updated to rider-004 and driver-004, is 21, which is the total
    // number of records with row_key id2
    assertEquals(21, record2Count);
  }
}
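The commit-count and latest-commit assertions after each write above come from the same HoodieActiveTimeline access path: construct the active timeline from the meta client, narrow it to the commit timeline, and query instants. A minimal sketch of that access path is shown below, using only the calls that appear in the test; the helper class and method names are illustrative.

import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieTimeline;

public class CommitTimelineChecks {
  // Counts the commits strictly after the given instant time,
  // as done after each write in the test above.
  static int countCommitsAfter(HoodieTableMetaClient metaClient, String instantTime) {
    HoodieTimeline commitTimeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
    return commitTimeline.findInstantsAfter(instantTime, Integer.MAX_VALUE).countInstants();
  }

  // Timestamp of the latest commit on the commit timeline
  // (assumes at least one commit exists, as the test does).
  static String latestCommitTime(HoodieTableMetaClient metaClient) {
    return new HoodieActiveTimeline(metaClient).getCommitTimeline().lastInstant().get().getTimestamp();
  }
}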
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
In class TestHoodieTimelineArchiver, method testMergeSmallArchiveFilesRecoverFromDeleteFailed.
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testMergeSmallArchiveFilesRecoverFromDeleteFailed(boolean enableArchiveMerge) throws Exception {
  HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200);

  // do ingestion and trigger archive actions here.
  for (int i = 1; i < 8; i++) {
    testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2);
    archiveAndGetCommitsList(writeConfig);
  }

  // do a single merge of the small archive files
  HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient);
  HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table);
  FileStatus[] fsStatuses = metaClient.getFs().globStatus(new Path(metaClient.getArchivePath() + "/.commits_.archive*"));
  List<String> candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList());
  archiver.reOpenWriter();
  archiver.buildArchiveMergePlan(candidateFiles, new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME), ".commits_.archive.3_1-0-1");
  archiver.mergeArchiveFiles(Arrays.stream(fsStatuses).collect(Collectors.toList()));
  archiver.reOpenWriter();

  // delete only one of the small archive files to simulate a failed delete action.
  metaClient.getFs().delete(fsStatuses[0].getPath());

  // loading the archived timeline and the active timeline should still succeed
  HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false);
  HoodieArchivedTimeline archivedTimeLine = metaClient.getArchivedTimeline().reload();
  assertEquals(7 * 3, rawActiveTimeline.countInstants() + archivedTimeLine.countInstants());

  // do more archive actions with merging of small archive files.
  for (int i = 1; i < 10; i++) {
    testTable.doWriteOperation("1000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2);
    archiveAndGetCommitsList(writeConfig);
  }

  // check the result.
  HoodieActiveTimeline rawActiveTimeline1 = new HoodieActiveTimeline(metaClient, false);
  HoodieArchivedTimeline archivedTimeLine1 = metaClient.getArchivedTimeline().reload();
  assertEquals(16 * 3, archivedTimeLine1.countInstants() + rawActiveTimeline1.countInstants());
}
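This archiver test, and the two that follow, verify the same invariant: every ingested instant stays visible either on the raw active timeline or on the archived timeline, regardless of partial failures while merging small archive files. The expected counts are multiples of 3, presumably reflecting the requested, inflight, and completed instant files per write. A minimal sketch of that counting pattern is below; the boolean constructor flag is carried over from the tests above without interpreting it further, and the helper class name is illustrative.

import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline;

public class ArchiveVisibilityChecks {
  // Total instants visible across the raw active timeline (constructed with the
  // same boolean flag as the tests above) and the freshly reloaded archived timeline.
  static int totalVisibleInstants(HoodieTableMetaClient metaClient) {
    HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false);
    HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline().reload();
    return rawActiveTimeline.countInstants() + archivedTimeline.countInstants();
  }
}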
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
In class TestHoodieTimelineArchiver, method testLoadArchiveTimelineWithUncompletedMergeArchiveFile.
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testLoadArchiveTimelineWithUncompletedMergeArchiveFile(boolean enableArchiveMerge) throws Exception {
  HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200);
  for (int i = 1; i < 8; i++) {
    testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2);
    archiveAndGetCommitsList(writeConfig);
  }
  HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient);
  HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table);
  FileStatus[] fsStatuses = metaClient.getFs().globStatus(new Path(metaClient.getArchivePath() + "/.commits_.archive*"));
  List<String> candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList());
  archiver.reOpenWriter();
  archiver.buildArchiveMergePlan(candidateFiles, new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME), ".commits_.archive.3_1-0-1");
  archiver.mergeArchiveFiles(Arrays.stream(fsStatuses).collect(Collectors.toList()));
  HoodieLogFormat.Writer writer = archiver.reOpenWriter();

  String s = "Dummy Content";
  // corrupt the current merged archive file.
  FileIOUtils.createFileInPath(metaClient.getFs(), writer.getLogFile().getPath(), Option.of(s.getBytes()));

  // if there is only a damaged merged archive file, the exception raised while reading it should be ignored.
  HoodieActiveTimeline rawActiveTimeline1 = new HoodieActiveTimeline(metaClient, false);
  HoodieArchivedTimeline archivedTimeLine1 = metaClient.getArchivedTimeline();
  assertEquals(7 * 3, archivedTimeLine1.countInstants() + rawActiveTimeline1.countInstants());

  // if there is a damaged merged archive file plus another, ordinary damaged archive file,
  // Hudi must throw an exception while loading the archived timeline because parsing the damaged archive file fails.
  Path damagedFile = new Path(metaClient.getArchivePath(), ".commits_.archive.300_1-0-1");
  FileIOUtils.createFileInPath(metaClient.getFs(), damagedFile, Option.of(s.getBytes()));
  assertThrows(HoodieException.class, () -> metaClient.getArchivedTimeline().reload());
}
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
In class TestHoodieTimelineArchiver, method testMergeSmallArchiveFilesRecoverFromBuildPlanFailed.
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testMergeSmallArchiveFilesRecoverFromBuildPlanFailed(boolean enableArchiveMerge) throws Exception {
  HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200);

  // do ingestion and trigger archive actions here.
  for (int i = 1; i < 8; i++) {
    testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2);
    archiveAndGetCommitsList(writeConfig);
  }

  // build a merge-small-archive-files plan with dummy content;
  // this plan cannot be deserialized.
  HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient);
  HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table);
  FileStatus[] fsStatuses = metaClient.getFs().globStatus(new Path(metaClient.getArchivePath() + "/.commits_.archive*"));
  List<String> candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList());
  archiver.reOpenWriter();
  Path plan = new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME);
  archiver.buildArchiveMergePlan(candidateFiles, plan, ".commits_.archive.3_1-0-1");
  String s = "Dummy Content";
  // corrupt the current merge plan file.
  FileIOUtils.createFileInPath(metaClient.getFs(), plan, Option.of(s.getBytes()));

  // check that the damaged plan file does not block archived timeline loading.
  HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false);
  HoodieArchivedTimeline archivedTimeLine = metaClient.getArchivedTimeline().reload();
  assertEquals(7 * 3, rawActiveTimeline.countInstants() + archivedTimeLine.countInstants());

  // trigger several archive actions after the damaged merge-small-archive-files plan was left behind.
  for (int i = 1; i < 10; i++) {
    testTable.doWriteOperation("1000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2);
    archiveAndGetCommitsList(writeConfig);
  }

  // loading the archived timeline and the active timeline should still succeed
  HoodieActiveTimeline rawActiveTimeline1 = new HoodieActiveTimeline(metaClient, false);
  HoodieArchivedTimeline archivedTimeLine1 = metaClient.getArchivedTimeline().reload();
  // check the instant count
  assertEquals(16 * 3, archivedTimeLine1.countInstants() + rawActiveTimeline1.countInstants());

  // if there are damaged archive files in addition to the damaged plan, Hudi must throw an exception
  // while loading the archived timeline.
  Path damagedFile = new Path(metaClient.getArchivePath(), ".commits_.archive.300_1-0-1");
  FileIOUtils.createFileInPath(metaClient.getFs(), damagedFile, Option.of(s.getBytes()));
  assertThrows(HoodieException.class, () -> metaClient.getArchivedTimeline().reload());
}