Use of org.apache.hudi.common.model.HoodieWriteStat in project hudi by apache.
The class RunCompactionActionExecutor, method execute:
@Override
public HoodieWriteMetadata<HoodieData<WriteStatus>> execute() {
  HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline();
  compactor.preCompact(table, pendingCompactionTimeline, instantTime);
  HoodieWriteMetadata<HoodieData<WriteStatus>> compactionMetadata = new HoodieWriteMetadata<>();
  try {
    // generate compaction plan
    // should support configurable commit metadata
    HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan(table.getMetaClient(), instantTime);
    HoodieData<WriteStatus> statuses = compactor.compact(context, compactionPlan, table, config, instantTime, compactionHandler);
    compactor.maybePersist(statuses, config);
    context.setJobStatus(this.getClass().getSimpleName(), "Preparing compaction metadata");
    List<HoodieWriteStat> updateStatusMap = statuses.map(WriteStatus::getStat).collectAsList();
    HoodieCommitMetadata metadata = new HoodieCommitMetadata(true);
    for (HoodieWriteStat stat : updateStatusMap) {
      metadata.addWriteStat(stat.getPartitionPath(), stat);
    }
    metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, config.getSchema());
    compactionMetadata.setWriteStatuses(statuses);
    compactionMetadata.setCommitted(false);
    compactionMetadata.setCommitMetadata(Option.of(metadata));
  } catch (IOException e) {
    throw new HoodieCompactionException("Could not compact " + config.getBasePath(), e);
  }
  return compactionMetadata;
}
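For orientation, here is a minimal sketch, not part of the executor above, of how the HoodieWriteStat entries attached to the resulting HoodieCommitMetadata can be inspected after the compaction commit. The metadata variable stands in for the commit metadata built inside execute(), and LOG is a hypothetical logger; only HoodieCommitMetadata and HoodieWriteStat accessors are used.

// Hedged sketch: walk the per-partition write stats that execute() registered via addWriteStat(...)
// and total up how many records and bytes the compaction rewrote.
long totalWrites = 0L;
long totalBytes = 0L;
for (Map.Entry<String, List<HoodieWriteStat>> entry : metadata.getPartitionToWriteStats().entrySet()) {
  for (HoodieWriteStat stat : entry.getValue()) {
    totalWrites += stat.getNumWrites();
    totalBytes += stat.getTotalWriteBytes();
  }
}
LOG.info("Compaction at " + instantTime + " rewrote " + totalWrites + " records (" + totalBytes + " bytes)");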
Use of org.apache.hudi.common.model.HoodieWriteStat in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method testUpsertsInternal:
/**
* Test one of HoodieWriteClient upsert(Prepped) APIs.
*
* @param config Write Config
* @param writeFn One of the Hoodie write function APIs
* @throws Exception in case of error
*/
private void testUpsertsInternal(HoodieWriteConfig config,
    Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPrepped) throws Exception {
  // Force using older timeline layout
  HoodieWriteConfig hoodieWriteConfig = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY)
      .withRollbackUsingMarkers(true).withProps(config.getProps()).withTimelineLayoutVersion(VERSION_0).build();
  HoodieTableMetaClient.withPropertyBuilder().fromMetaClient(metaClient).setTimelineLayoutVersion(VERSION_0)
      .setPopulateMetaFields(config.populateMetaFields()).initTable(metaClient.getHadoopConf(), metaClient.getBasePath());
  SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig);
  // Write 1 (only inserts)
  String newCommitTime = "001";
  String initCommitTime = "000";
  int numRecords = 200;
  insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords, SparkRDDWriteClient::insert,
      isPrepped, true, numRecords, config.populateMetaFields());
  // Write 2 (updates)
  String prevCommitTime = newCommitTime;
  newCommitTime = "004";
  numRecords = 100;
  String commitTimeBetweenPrevAndNew = "002";
  updateBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime, Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)),
      initCommitTime, numRecords, writeFn, isPrepped, true, numRecords, 200, 2, config.populateMetaFields());
  // Delete 1
  prevCommitTime = newCommitTime;
  newCommitTime = "005";
  numRecords = 50;
  deleteBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime, initCommitTime, numRecords,
      SparkRDDWriteClient::delete, isPrepped, true, 0, 150, config.populateMetaFields());
  // Now simulate an upgrade and perform a restore operation
  HoodieWriteConfig newConfig = getConfigBuilder().withProps(config.getProps())
      .withTimelineLayoutVersion(TimelineLayoutVersion.CURR_VERSION).build();
  client = getHoodieWriteClient(newConfig);
  client.savepoint("004", "user1", "comment1");
  client.restoreToInstant("004");
  assertFalse(metaClient.reloadActiveTimeline().getRollbackTimeline().lastInstant().isPresent());
  // Check the entire dataset has all records still
  String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
  for (int i = 0; i < fullPartitionPaths.length; i++) {
    fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
  }
  assertEquals(200, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(),
      "Must contain " + 200 + " records");
  // Perform Delete again on upgraded dataset.
  prevCommitTime = newCommitTime;
  newCommitTime = "006";
  numRecords = 50;
  deleteBatch(newConfig, client, newCommitTime, prevCommitTime, initCommitTime, numRecords,
      SparkRDDWriteClient::delete, isPrepped, true, 0, 150);
  HoodieActiveTimeline activeTimeline = new HoodieActiveTimeline(metaClient, false);
  List<HoodieInstant> instants = activeTimeline.getCommitTimeline().getInstants().collect(Collectors.toList());
  assertEquals(5, instants.size());
  assertEquals(new HoodieInstant(COMPLETED, COMMIT_ACTION, "001"), instants.get(0));
  assertEquals(new HoodieInstant(COMPLETED, COMMIT_ACTION, "004"), instants.get(1));
  // New format should have all states of instants
  assertEquals(new HoodieInstant(REQUESTED, COMMIT_ACTION, "006"), instants.get(2));
  assertEquals(new HoodieInstant(INFLIGHT, COMMIT_ACTION, "006"), instants.get(3));
  assertEquals(new HoodieInstant(COMPLETED, COMMIT_ACTION, "006"), instants.get(4));
  final HoodieWriteConfig cfg = hoodieWriteConfig;
  final String instantTime = "007";
  HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build();
  String basePathStr = basePath;
  HoodieTable table = getHoodieTable(metaClient, cfg);
  String extension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
  jsc.parallelize(Arrays.asList(1)).map(e -> {
    HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
        metaClient.getActiveTimeline().getInstantDetails(metaClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get()).get(),
        HoodieCommitMetadata.class);
    String filePath = commitMetadata.getPartitionToWriteStats().values().stream().flatMap(w -> w.stream())
        .filter(s -> s.getPath().endsWith(extension)).findAny().map(ee -> ee.getPath()).orElse(null);
    String partitionPath = commitMetadata.getPartitionToWriteStats().values().stream().flatMap(w -> w.stream())
        .filter(s -> s.getPath().endsWith(extension)).findAny().map(ee -> ee.getPartitionPath()).orElse(null);
    Path baseFilePath = new Path(basePathStr, filePath);
    HoodieBaseFile baseFile = new HoodieBaseFile(baseFilePath.toString());
    try {
      HoodieMergeHandle handle = new HoodieMergeHandle(cfg, instantTime, table, new HashMap<>(), partitionPath,
          FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier(),
          config.populateMetaFields() ? Option.empty()
              : Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))));
      WriteStatus writeStatus = new WriteStatus(false, 0.0);
      writeStatus.setStat(new HoodieWriteStat());
      writeStatus.getStat().setNumWrites(0);
      handle.performMergeDataValidationCheck(writeStatus);
    } catch (HoodieCorruptedDataException e1) {
      fail("Exception not expected because merge validation check is disabled");
    }
    try {
      final String newInstantTime = "006";
      cfg.getProps().setProperty("hoodie.merge.data.validation.enabled", "true");
      HoodieWriteConfig cfg2 = HoodieWriteConfig.newBuilder().withProps(cfg.getProps()).build();
      HoodieMergeHandle handle = new HoodieMergeHandle(cfg2, newInstantTime, table, new HashMap<>(), partitionPath,
          FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier(),
          config.populateMetaFields() ? Option.empty()
              : Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))));
      WriteStatus writeStatus = new WriteStatus(false, 0.0);
      writeStatus.setStat(new HoodieWriteStat());
      writeStatus.getStat().setNumWrites(0);
      handle.performMergeDataValidationCheck(writeStatus);
      fail("The above line should have thrown an exception");
    } catch (HoodieCorruptedDataException e2) {
      // expected
    }
    return true;
  }).collect();
}
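The merge-validation calls above hinge on seeding a WriteStatus with a HoodieWriteStat whose counters disagree with what the handle actually wrote. A minimal sketch of that seeding pattern on its own, with illustrative values only:

// Hedged sketch: seed a WriteStatus with an empty HoodieWriteStat, as the test does before
// calling performMergeDataValidationCheck. Constructor args: trackSuccessRecords=false, failureFraction=0.0.
HoodieWriteStat stat = new HoodieWriteStat();
stat.setNumWrites(0);                       // claim zero records were written
WriteStatus writeStatus = new WriteStatus(false, 0.0);
writeStatus.setStat(stat);
// With hoodie.merge.data.validation.enabled=true, the merge handle compares this stat against the
// records it actually merged and throws HoodieCorruptedDataException on a mismatch, which is exactly
// the failure the second try block in the test expects.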
Use of org.apache.hudi.common.model.HoodieWriteStat in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method testMetadataStatsOnCommit:
/**
* Test to ensure commit metadata points to valid files.
*/
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testMetadataStatsOnCommit(boolean populateMetaFields) throws Exception {
  HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false);
  addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
  HoodieWriteConfig cfg = cfgBuilder.build();
  SparkRDDWriteClient client = getHoodieWriteClient(cfg);
  String instantTime0 = "000";
  client.startCommitWithTime(instantTime0);
  List<HoodieRecord> records0 = dataGen.generateInserts(instantTime0, 200);
  JavaRDD<HoodieRecord> writeRecords0 = jsc.parallelize(records0, 1);
  JavaRDD<WriteStatus> result0 = client.bulkInsert(writeRecords0, instantTime0);
  assertTrue(client.commit(instantTime0, result0), "Commit should succeed");
  assertTrue(testTable.commitExists(instantTime0), "After explicit commit, commit file should be created");
  // Read from commit file
  try (FSDataInputStream inputStream = fs.open(testTable.getCommitFilePath(instantTime0))) {
    String everything = FileIOUtils.readAsUTFString(inputStream);
    HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything, HoodieCommitMetadata.class);
    int inserts = 0;
    for (Map.Entry<String, List<HoodieWriteStat>> pstat : metadata.getPartitionToWriteStats().entrySet()) {
      for (HoodieWriteStat stat : pstat.getValue()) {
        inserts += stat.getNumInserts();
      }
    }
    assertEquals(200, inserts);
  }
  // Update + Inserts such that they just expand file1
  String instantTime1 = "001";
  client.startCommitWithTime(instantTime1);
  List<HoodieRecord> records1 = dataGen.generateUpdates(instantTime1, records0);
  JavaRDD<HoodieRecord> writeRecords1 = jsc.parallelize(records1, 1);
  JavaRDD<WriteStatus> result1 = client.upsert(writeRecords1, instantTime1);
  assertTrue(client.commit(instantTime1, result1), "Commit should succeed");
  assertTrue(testTable.commitExists(instantTime1), "After explicit commit, commit file should be created");
  // Read from commit file
  try (FSDataInputStream inputStream = fs.open(testTable.getCommitFilePath(instantTime1))) {
    String everything = FileIOUtils.readAsUTFString(inputStream);
    HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything, HoodieCommitMetadata.class);
    int inserts = 0;
    int upserts = 0;
    for (Map.Entry<String, List<HoodieWriteStat>> pstat : metadata.getPartitionToWriteStats().entrySet()) {
      for (HoodieWriteStat stat : pstat.getValue()) {
        inserts += stat.getNumInserts();
        upserts += stat.getNumUpdateWrites();
      }
    }
    assertEquals(0, inserts);
    assertEquals(200, upserts);
  }
}
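If the Hudi version in use exposes the aggregate helpers on HoodieCommitMetadata (an assumption worth verifying against your dependency), the manual loops over getPartitionToWriteStats() can be condensed. A minimal sketch, reusing the everything string read from the second commit file above:

// Hedged sketch: equivalent aggregation using HoodieCommitMetadata's fetchTotal* helpers,
// assuming they are available in the Hudi version being tested.
HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything, HoodieCommitMetadata.class);
assertEquals(0, metadata.fetchTotalInsertRecordsWritten());
assertEquals(200, metadata.fetchTotalUpdateRecordsWritten());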
Use of org.apache.hudi.common.model.HoodieWriteStat in project hudi by apache.
The class HoodieBulkInsertInternalWriterTestBase, method assertWriteStatuses:
protected void assertWriteStatuses(List<HoodieInternalWriteStatus> writeStatuses, int batches, int size,
    boolean areRecordsSorted, Option<List<String>> fileAbsPaths, Option<List<String>> fileNames) {
  if (areRecordsSorted) {
    assertEquals(batches, writeStatuses.size());
  } else {
    assertEquals(Math.min(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS.length, batches), writeStatuses.size());
  }
  Map<String, Long> sizeMap = new HashMap<>();
  if (!areRecordsSorted) {
    // per write status
    for (int i = 0; i < batches; i++) {
      String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[i % 3];
      if (!sizeMap.containsKey(partitionPath)) {
        sizeMap.put(partitionPath, 0L);
      }
      sizeMap.put(partitionPath, sizeMap.get(partitionPath) + size);
    }
  }
  int counter = 0;
  for (HoodieInternalWriteStatus writeStatus : writeStatuses) {
    // verify write status
    assertEquals(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[counter % 3], writeStatus.getPartitionPath());
    if (areRecordsSorted) {
      assertEquals(writeStatus.getTotalRecords(), size);
    } else {
      assertEquals(writeStatus.getTotalRecords(), sizeMap.get(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[counter % 3]));
    }
    assertNull(writeStatus.getGlobalError());
    assertEquals(writeStatus.getFailedRowsSize(), 0);
    assertEquals(writeStatus.getTotalErrorRecords(), 0);
    assertFalse(writeStatus.hasErrors());
    assertNotNull(writeStatus.getFileId());
    String fileId = writeStatus.getFileId();
    if (fileAbsPaths.isPresent()) {
      fileAbsPaths.get().add(basePath + "/" + writeStatus.getStat().getPath());
    }
    if (fileNames.isPresent()) {
      fileNames.get().add(writeStatus.getStat().getPath().substring(writeStatus.getStat().getPath().lastIndexOf('/') + 1));
    }
    HoodieWriteStat writeStat = writeStatus.getStat();
    if (areRecordsSorted) {
      assertEquals(size, writeStat.getNumInserts());
      assertEquals(size, writeStat.getNumWrites());
    } else {
      assertEquals(sizeMap.get(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[counter % 3]), writeStat.getNumInserts());
      assertEquals(sizeMap.get(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[counter % 3]), writeStat.getNumWrites());
    }
    assertEquals(fileId, writeStat.getFileId());
    assertEquals(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[counter++ % 3], writeStat.getPartitionPath());
    assertEquals(0, writeStat.getNumDeletes());
    assertEquals(0, writeStat.getNumUpdateWrites());
    assertEquals(0, writeStat.getTotalWriteErrors());
  }
}
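For reference, a minimal sketch of the HoodieWriteStat fields this helper inspects on each write status; the concrete values and file names are made up for illustration and are not taken from the test harness:

// Hedged sketch: populate the HoodieWriteStat fields that assertWriteStatuses checks
// (fileId, partitionPath, path, insert/write/delete/update counters, error count).
HoodieWriteStat stat = new HoodieWriteStat();
stat.setFileId("f1-0000");                                        // hypothetical file id
stat.setPartitionPath(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0]);
stat.setPath("2016/03/15/f1-0000_0-1-1_000.parquet");             // hypothetical relative path
stat.setNumInserts(100);                                          // records inserted into this file
stat.setNumWrites(100);                                           // total records written
stat.setNumDeletes(0);
stat.setNumUpdateWrites(0);
stat.setTotalWriteErrors(0);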
Use of org.apache.hudi.common.model.HoodieWriteStat in project hudi by apache.
The class RollbackUtils, method generateRollbackRequestsUsingFileListingMOR:
/**
 * Generate all rollback requests needed to roll back this action, without actually performing the rollback, for the MOR table type.
 *
 * @param instantToRollback Instant to roll back
 * @param table instance of {@link HoodieTable} to use.
 * @param context instance of {@link HoodieEngineContext} to use.
 * @return list of rollback requests
 */
public static List<ListingBasedRollbackRequest> generateRollbackRequestsUsingFileListingMOR(HoodieInstant instantToRollback,
    HoodieTable table, HoodieEngineContext context) throws IOException {
  String commit = instantToRollback.getTimestamp();
  HoodieWriteConfig config = table.getConfig();
  List<String> partitions = FSUtils.getAllPartitionPaths(context, table.getMetaClient().getBasePath(), false, false);
  if (partitions.isEmpty()) {
    return new ArrayList<>();
  }
  int sparkPartitions = Math.max(Math.min(partitions.size(), config.getRollbackParallelism()), 1);
  context.setJobStatus(RollbackUtils.class.getSimpleName(), "Generate all rollback requests");
  return context.flatMap(partitions, partitionPath -> {
    HoodieActiveTimeline activeTimeline = table.getMetaClient().reloadActiveTimeline();
    List<ListingBasedRollbackRequest> partitionRollbackRequests = new ArrayList<>();
    switch (instantToRollback.getAction()) {
      case HoodieTimeline.COMMIT_ACTION:
      case HoodieTimeline.REPLACE_COMMIT_ACTION:
        LOG.info("Rolling back commit action.");
        partitionRollbackRequests.add(ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath));
        break;
      case HoodieTimeline.COMPACTION_ACTION:
        // If there is no delta commit present after the current compaction commit, no special handling is needed;
        // otherwise we need to make sure that a compaction commit rollback does not delete the log files written
        // as part of the succeeding deltacommits.
        boolean higherDeltaCommits = !activeTimeline.getDeltaCommitTimeline().filterCompletedInstants().findInstantsAfter(commit, 1).empty();
        if (higherDeltaCommits) {
          // Higher deltacommits exist on top of this compaction, so updates have already been written to log
          // files with this compaction instant as the base commit. In this scenario we should delete only the
          // newly created base files and not the corresponding log files, since those updates must be preserved.
          LOG.info("Rolling back compaction. There are higher delta commits. So only deleting data files");
          partitionRollbackRequests.add(ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataFilesOnlyAction(partitionPath));
        } else {
          // No deltacommits present after this compaction commit (inflight or requested). In this case, we
          // can also delete any log files that were created with this compaction commit as base
          // commit.
          LOG.info("Rolling back compaction plan. There are NO higher delta commits. So deleting both data and"
              + " log files");
          partitionRollbackRequests.add(ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath));
        }
        break;
      case HoodieTimeline.DELTA_COMMIT_ACTION:
        // --------------------------------------------------------------------------------------------------
        // (A) The following cases are possible if index.canIndexLogFiles and/or index.isGlobal
        // --------------------------------------------------------------------------------------------------
        // (A.1) Failed first commit - Inserts were written to log files and HoodieWriteStat has no entries. In
        // this scenario we would want to delete these log files.
        // (A.2) Failed recurring commit - Inserts/Updates written to log files. In this scenario,
        // HoodieWriteStat will have the baseCommitTime for the first log file written, add rollback blocks.
        // (A.3) Rollback triggered for first commit - Inserts were written to the log files but the commit is
        // being reverted. In this scenario, HoodieWriteStat will be `null` for the attribute prevCommitTime,
        // and hence we will end up deleting these log files. This is done so there are no orphan log files
        // lying around.
        // (A.4) Rollback triggered for recurring commits - Inserts/Updates are being rolled back; the action
        // taken in this scenario is a combination of (A.2) and (A.3).
        // ---------------------------------------------------------------------------------------------------
        // (B) The following cases are possible if !index.canIndexLogFiles and/or !index.isGlobal
        // ---------------------------------------------------------------------------------------------------
        // (B.1) Failed first commit - Inserts were written to base files and HoodieWriteStat has no entries.
        // In this scenario, we delete all the base files written for the failed commit.
        // (B.2) Failed recurring commits - Inserts were written to base files and updates to log files. In
        // this scenario, perform (A.1) and for updates written to log files, write rollback blocks.
        // (B.3) Rollback triggered for first commit - Same as (B.1)
        // (B.4) Rollback triggered for recurring commits - Same as (B.2), plus we need to delete the log files
        // as well if the base file gets deleted.
        HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
            table.getMetaClient().getCommitTimeline().getInstantDetails(instantToRollback).get(), HoodieCommitMetadata.class);
        // In case all data was inserts and the commit failed, delete the files belonging to that commit.
        // We do not know fileIds for inserts (first inserts are either log files or base files), so
        // delete all files for the corresponding failed commit, if present (same as COW).
        partitionRollbackRequests.add(ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath));
        // Append rollback blocks for updates and inserts as in (A.2) and (B.2).
        if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) {
          partitionRollbackRequests.addAll(generateAppendRollbackBlocksAction(partitionPath, instantToRollback, commitMetadata, table));
        }
        break;
      default:
        break;
    }
    return partitionRollbackRequests.stream();
  }, Math.min(partitions.size(), sparkPartitions)).stream().filter(Objects::nonNull).collect(Collectors.toList());
}
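To make the deltacommit branch concrete, here is a minimal sketch of the kind of information that can be pulled from the commit metadata for one partition before appending rollback blocks. The helper shown is hypothetical (it is not the actual generateAppendRollbackBlocksAction); only HoodieCommitMetadata and HoodieWriteStat accessors are used, and prevCommit is read as the base instant the log file was appended onto.

// Hedged sketch: collect (fileId -> base instant) pairs for the log files written by the
// deltacommit being rolled back, so rollback blocks can target the right file groups.
static Map<String, String> logFileBaseInstantsForPartition(HoodieCommitMetadata commitMetadata, String partitionPath) {
  Map<String, String> fileIdToBaseInstant = new HashMap<>();
  for (HoodieWriteStat stat : commitMetadata.getPartitionToWriteStats().getOrDefault(partitionPath, Collections.emptyList())) {
    if (stat.getPrevCommit() != null) {
      fileIdToBaseInstant.put(stat.getFileId(), stat.getPrevCommit());
    }
  }
  return fileIdToBaseInstant;
}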