Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.
The class HoodieClientTestBase, method deleteBatch.
/**
 * Helper to delete a batch of hoodie keys and do regular assertions on the state after successful completion.
 *
 * @param client Hoodie Write Client
 * @param newCommitTime New Commit Timestamp to be used
 * @param prevCommitTime Commit Timestamp used in previous commit
 * @param initCommitTime Begin Timestamp (usually "000")
 * @param numRecordsInThisCommit Number of records to be deleted in this commit
 * @param keyGenFunction Key Generation function
 * @param deleteFn Write Function to be used for delete
 * @param assertForCommit Enable Assertion of Writes
 * @param expRecordsInThisCommit Expected number of records in this commit
 * @param expTotalRecords Expected number of records when scanned
 * @param filerForCommitTimeWithAssert Whether read-side assertions may filter by commit time (only meaningful when meta fields are populated)
 * @return RDD of write statuses produced by the delete operation
 * @throws Exception in case of error
 */
public JavaRDD<WriteStatus> deleteBatch(SparkRDDWriteClient client, String newCommitTime, String prevCommitTime,
                                        String initCommitTime, int numRecordsInThisCommit,
                                        Function<Integer, List<HoodieKey>> keyGenFunction,
                                        Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieKey>, String> deleteFn,
                                        boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords,
                                        boolean filerForCommitTimeWithAssert) throws Exception {
  // Delete 1 (only deletes)
  client.startCommitWithTime(newCommitTime);
  List<HoodieKey> keysToDelete = keyGenFunction.apply(numRecordsInThisCommit);
  JavaRDD<HoodieKey> deleteRecords = jsc.parallelize(keysToDelete, 1);
  JavaRDD<WriteStatus> result = deleteFn.apply(client, deleteRecords, newCommitTime);
  List<WriteStatus> statuses = result.collect();
  assertNoWriteErrors(statuses);

  // Check the partition metadata is written out
  assertPartitionMetadataForKeys(keysToDelete, fs);

  // Verify that there is a commit
  HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
  HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
  if (assertForCommit) {
    assertEquals(3, timeline.findInstantsAfter(initCommitTime, Integer.MAX_VALUE).countInstants(),
        "Expecting 3 commits.");
    assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(),
        "Latest commit should be " + newCommitTime);
    if (filerForCommitTimeWithAssert) {
      // If meta cols are disabled, we can't do assertions based on commit time
      assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
          "Must contain " + expRecordsInThisCommit + " records");
    }

    // Check the entire dataset still has all records
    String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
    for (int i = 0; i < fullPartitionPaths.length; i++) {
      fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
    }
    assertEquals(expTotalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(),
        "Must contain " + expTotalRecords + " records");

    if (filerForCommitTimeWithAssert) {
      // Check incremental consumption from prevCommitTime
      assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
          HoodieClientTestUtils.countRecordsOptionallySince(jsc, basePath, sqlContext, timeline, Option.of(prevCommitTime)),
          "Incremental consumption from " + prevCommitTime + " should give no records in latest commit,"
              + " since it is a delete operation");
    }
  }
  return result;
}
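For context, a hypothetical invocation of deleteBatch might look like the following. This is only a sketch: the commit times, record counts, and the generateUniqueDeletes helper are illustrative assumptions, with SparkRDDWriteClient::delete supplied as the delete function.

// Hypothetical usage sketch (not from the Hudi sources): delete 50 keys in commit "003",
// expecting 0 records in that commit and 150 records remaining in the dataset overall.
JavaRDD<WriteStatus> deleteStatuses = deleteBatch(client, "003", "002", "000", 50,
    numKeys -> dataGen.generateUniqueDeletes(numKeys),   // assumed helper producing List<HoodieKey>
    SparkRDDWriteClient::delete,                         // delete(JavaRDD<HoodieKey>, String) as the deleteFn
    true, 0, 150, true);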
Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.
The class HoodieClientTestBase, method writeBatchHelper.
private JavaRDD<WriteStatus> writeBatchHelper(SparkRDDWriteClient client, String newCommitTime, String prevCommitTime,
                                              Option<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime,
                                              int numRecordsInThisCommit, List<HoodieRecord> records,
                                              Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn,
                                              boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords,
                                              int expTotalCommits, boolean doCommit, boolean filterForCommitTimeWithAssert) throws IOException {
  // Write 1 (only inserts)
  client.startCommitWithTime(newCommitTime);
  JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
  JavaRDD<WriteStatus> result = writeFn.apply(client, writeRecords, newCommitTime);
  List<WriteStatus> statuses = result.collect();
  assertNoWriteErrors(statuses);
  if (doCommit) {
    client.commit(newCommitTime, result);
  }

  // Check the partition metadata is written out
  assertPartitionMetadataForRecords(records, fs);

  // Verify that there is a commit
  HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
  HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
  if (assertForCommit) {
    assertEquals(expTotalCommits, timeline.findInstantsAfter(initCommitTime, Integer.MAX_VALUE).countInstants(),
        "Expecting " + expTotalCommits + " commits.");
    assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(),
        "Latest commit should be " + newCommitTime);
    if (filterForCommitTimeWithAssert) {
      // When meta cols are disabled, we can't really do per-commit assertions.
      assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
          "Must contain " + expRecordsInThisCommit + " records");
    }

    // Check the entire dataset still has all records
    String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
    for (int i = 0; i < fullPartitionPaths.length; i++) {
      fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
    }
    assertEquals(expTotalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(),
        "Must contain " + expTotalRecords + " records");

    if (filterForCommitTimeWithAssert) {
      // Check incremental consumption from prevCommitTime
      assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
          HoodieClientTestUtils.countRecordsOptionallySince(jsc, basePath, sqlContext, timeline, Option.of(prevCommitTime)),
          "Incremental consumption from " + prevCommitTime + " should give all records in latest commit");
      if (commitTimesBetweenPrevAndNew.isPresent()) {
        commitTimesBetweenPrevAndNew.get().forEach(ct -> {
          assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
              HoodieClientTestUtils.countRecordsOptionallySince(jsc, basePath, sqlContext, timeline, Option.of(ct)),
              "Incremental consumption from " + ct + " should give all records in latest commit");
        });
      }
    }
  }
  return result;
}
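A plausible call into this helper wires one of the client's write APIs in as writeFn, for example SparkRDDWriteClient::upsert. The concrete commit times and record counts below are illustrative assumptions, not values from the Hudi tests.

// Hypothetical usage sketch: upsert 100 records as the second commit and expect 200 records in total.
List<HoodieRecord> updates = dataGen.generateUpdates("002", 100);   // assumed use of the test data generator
JavaRDD<WriteStatus> upsertStatuses = writeBatchHelper(client, "002", "001", Option.empty(), "000",
    100, updates, SparkRDDWriteClient::upsert,
    true, 100, 200, 2, false, true);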
Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.
The class HoodieClientTestUtils, method getCommitMetadataForInstant.
private static Option<HoodieCommitMetadata> getCommitMetadataForInstant(HoodieTableMetaClient metaClient, HoodieInstant instant) {
  try {
    HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    byte[] data = timeline.getInstantDetails(instant).get();
    return Option.of(HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class));
  } catch (Exception e) {
    throw new HoodieException("Failed to read schema from commit metadata", e);
  }
}
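A minimal sketch of how such a helper could be used to pull the writer schema out of the latest completed commit follows; the SCHEMA_KEY lookup via getMetadata is an assumption for illustration.

// Hypothetical usage sketch: read the commit metadata of the latest completed instant
// and look up the schema recorded in its extra metadata.
HoodieTimeline completedCommits = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
HoodieInstant latest = completedCommits.lastInstant().get();
Option<HoodieCommitMetadata> commitMetadata = getCommitMetadataForInstant(metaClient, latest);
String writerSchema = commitMetadata.get().getMetadata(HoodieCommitMetadata.SCHEMA_KEY);   // assumed key under which the schema is stored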
Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.
The class TestHoodieSparkMergeOnReadTableInsertUpdateDelete, method testSimpleInsertAndUpdate.
@ParameterizedTest
@MethodSource
public void testSimpleInsertAndUpdate(HoodieFileFormat fileFormat, boolean populateMetaFields) throws Exception {
  Properties properties = populateMetaFields ? new Properties() : getPropertiesForKeyGen();
  properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), fileFormat.toString());
  HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
  HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true);
  addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
  HoodieWriteConfig cfg = cfgBuilder.build();
  try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();

    /*
     * Write 1 (only inserts)
     */
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
    Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records, client, cfg, newCommitTime);
    assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");

    /*
     * Write 2 (updates)
     */
    newCommitTime = "004";
    client.startCommitWithTime(newCommitTime);
    records = dataGen.generateUpdates(newCommitTime, 100);
    updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, false);

    // Compact the table and sync the file-system view before reading the latest base files
    String compactionCommitTime = client.scheduleCompaction(Option.empty()).get().toString();
    client.compact(compactionCommitTime);
    HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
    hoodieTable.getHoodieView().sync();
    FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
    HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
    Stream<HoodieBaseFile> dataFilesToRead = tableView.getLatestBaseFiles();
    assertTrue(dataFilesToRead.findAny().isPresent());

    // Verify that there is a commit
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTimeline timeline = metaClient.getCommitTimeline().filterCompletedInstants();
    assertEquals(1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting a single commit.");
    String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp();
    assertTrue(HoodieTimeline.compareTimestamps("000", HoodieTimeline.LESSER_THAN, latestCompactionCommitTime));
    if (cfg.populateMetaFields()) {
      assertEquals(200, HoodieClientTestUtils.countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline, Option.of("000")),
          "Must contain 200 records");
    } else {
      assertEquals(200, HoodieClientTestUtils.countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline, Option.empty()));
    }
  }
}
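The bare @MethodSource above resolves to a static factory method with the same name as the test. A plausible shape for that provider is sketched below; the exact parameter combinations in the real test class may differ.

// Hypothetical provider sketch for the @MethodSource annotation above:
// pair each base file format with both settings of populateMetaFields.
private static Stream<Arguments> testSimpleInsertAndUpdate() {
  return Stream.of(
      Arguments.of(HoodieFileFormat.PARQUET, true),
      Arguments.of(HoodieFileFormat.PARQUET, false),
      Arguments.of(HoodieFileFormat.HFILE, true),
      Arguments.of(HoodieFileFormat.HFILE, false));
}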
Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.
The class BaseHoodieTableFileIndex, method doRefresh.
private void doRefresh() {
  long startTime = System.currentTimeMillis();
  Map<PartitionPath, FileStatus[]> partitionFiles = loadPartitionPathFiles();
  FileStatus[] allFiles = partitionFiles.values().stream().flatMap(Arrays::stream).toArray(FileStatus[]::new);
  metaClient.reloadActiveTimeline();
  HoodieTimeline activeTimeline = getActiveTimeline();
  Option<HoodieInstant> latestInstant = activeTimeline.lastInstant();
  // TODO we can optimize the flow by:
  //  - First fetch list of files from instants of interest
  //  - Load FileStatus's
  fileSystemView = new HoodieTableFileSystemView(metaClient, activeTimeline, allFiles);
  Option<String> queryInstant = specifiedQueryInstant.or(() -> latestInstant.map(HoodieInstant::getTimestamp));
  validate(activeTimeline, queryInstant);
  if (tableType.equals(HoodieTableType.MERGE_ON_READ) && queryType.equals(HoodieTableQueryType.SNAPSHOT)) {
    cachedAllInputFileSlices = partitionFiles.keySet().stream()
        .collect(Collectors.toMap(
            Function.identity(),
            partitionPath -> queryInstant
                .map(instant -> fileSystemView.getLatestMergedFileSlicesBeforeOrOn(partitionPath.path, queryInstant.get())
                    .collect(Collectors.toList()))
                .orElse(Collections.emptyList())));
  } else {
    cachedAllInputFileSlices = partitionFiles.keySet().stream()
        .collect(Collectors.toMap(
            Function.identity(),
            partitionPath -> queryInstant
                .map(instant -> fileSystemView.getLatestFileSlicesBeforeOrOn(partitionPath.path, instant, true))
                .orElse(fileSystemView.getLatestFileSlices(partitionPath.path))
                .collect(Collectors.toList())));
  }
  cachedFileSize = cachedAllInputFileSlices.values().stream()
      .flatMap(Collection::stream)
      .mapToLong(BaseHoodieTableFileIndex::fileSliceSize)
      .sum();
  // If the partition value contains InternalRow.empty, we query it as a non-partitioned table.
  queryAsNonePartitionedTable = partitionFiles.keySet().stream().anyMatch(p -> p.values.length == 0);
  long duration = System.currentTimeMillis() - startTime;
  LOG.info(String.format("Refresh table %s, spent: %d ms", metaClient.getTableConfig().getTableName(), duration));
}
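The size accounting above relies on a fileSliceSize helper. A minimal sketch of what such a helper could look like is given below, assuming it sums the base file length and the sizes of any log files in the slice.

// Hypothetical sketch of the fileSliceSize helper referenced above:
// total bytes of a slice = base file length + sizes of all log files with a known size.
private static long fileSliceSize(FileSlice fileSlice) {
  long logFileSize = fileSlice.getLogFiles()
      .map(HoodieLogFile::getFileSize)
      .filter(size -> size > 0)
      .reduce(0L, Long::sum);
  return fileSlice.getBaseFile().map(BaseFile::getFileLen).orElse(0L) + logFileSize;
}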