Use of org.apache.hudi.table.HoodieTable in project hudi by apache.
The class CompactionTestBase, method runNextDeltaCommits:
protected List<HoodieRecord> runNextDeltaCommits(SparkRDDWriteClient client, final HoodieReadClient readClient, List<String> deltaInstants, List<HoodieRecord> records, HoodieWriteConfig cfg, boolean insertFirst, List<String> expPendingCompactionInstants) throws Exception {
  HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
  List<Pair<String, HoodieCompactionPlan>> pendingCompactions = readClient.getPendingCompactions();
  List<String> gotPendingCompactionInstants = pendingCompactions.stream().map(pc -> pc.getKey()).sorted().collect(Collectors.toList());
  assertEquals(expPendingCompactionInstants, gotPendingCompactionInstants);
  Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> fgIdToCompactionOperation = CompactionUtils.getAllPendingCompactionOperations(metaClient);
  if (insertFirst) {
    // Use first instant for inserting records
    String firstInstant = deltaInstants.get(0);
    deltaInstants = deltaInstants.subList(1, deltaInstants.size());
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    client.startCommitWithTime(firstInstant);
    JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, firstInstant);
    List<WriteStatus> statusList = statuses.collect();
    if (!cfg.shouldAutoCommit()) {
      client.commit(firstInstant, statuses);
    }
    assertNoWriteErrors(statusList);
    metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
    HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
    List<HoodieBaseFile> dataFilesToRead = getCurrentLatestBaseFiles(hoodieTable);
    assertTrue(dataFilesToRead.stream().findAny().isPresent(), "should list the base files we wrote in the delta commit");
    validateDeltaCommit(firstInstant, fgIdToCompactionOperation, cfg);
  }
  int numRecords = records.size();
  for (String instantTime : deltaInstants) {
    records = dataGen.generateUpdates(instantTime, numRecords);
    metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
    createNextDeltaCommit(instantTime, records, client, metaClient, cfg, false);
    validateDeltaCommit(instantTime, fgIdToCompactionOperation, cfg);
  }
  return records;
}
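To put the helper in context, here is a hedged sketch of how a merge-on-read compaction test might call it. The names getConfigBuilder, getHoodieWriteClient, getHoodieReadClient, and dataGen are assumed to come from the surrounding test harness, and the instant times are illustrative.

// Hypothetical usage, assuming the usual CompactionTestBase/HoodieClientTestHarness helpers.
HoodieWriteConfig cfg = getConfigBuilder().build();
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath());
List<HoodieRecord> records = dataGen.generateInserts("001", 100);
// Insert with "001", then apply updates as delta commits "002" and "003";
// no compaction is expected to be pending yet, hence the empty expectation list.
records = runNextDeltaCommits(client, readClient, Arrays.asList("001", "002", "003"), records,
    cfg, true, Collections.emptyList());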
Use of org.apache.hudi.table.HoodieTable in project hudi by apache.
The class TestUpgradeDowngrade, method testUpgradeZeroToOneInternal:
public void testUpgradeZeroToOneInternal(boolean induceResiduesFromPrevUpgrade, boolean deletePartialMarkerFiles, HoodieTableType tableType) throws IOException {
  // init config, table and client.
  Map<String, String> params = new HashMap<>();
  if (tableType == HoodieTableType.MERGE_ON_READ) {
    params.put(TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
    metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
  }
  HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withRollbackUsingMarkers(false).withProps(params).build();
  SparkRDDWriteClient client = getHoodieWriteClient(cfg);
  // prepare data. Make 2 commits, in which 2nd is not committed.
  List<FileSlice> firstPartitionCommit2FileSlices = new ArrayList<>();
  List<FileSlice> secondPartitionCommit2FileSlices = new ArrayList<>();
  Pair<List<HoodieRecord>, List<HoodieRecord>> inputRecords = twoUpsertCommitDataWithTwoPartitions(firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices, cfg, client, false);
  HoodieTable table = this.getHoodieTable(metaClient, cfg);
  HoodieInstant commitInstant = table.getPendingCommitTimeline().lastInstant().get();
  // delete one of the marker files in 2nd commit if need be.
  WriteMarkers writeMarkers = WriteMarkersFactory.get(getConfig().getMarkersType(), table, commitInstant.getTimestamp());
  List<String> markerPaths = new ArrayList<>(writeMarkers.allMarkerFilePaths());
  if (deletePartialMarkerFiles) {
    String toDeleteMarkerFile = markerPaths.get(0);
    table.getMetaClient().getFs().delete(new Path(table.getMetaClient().getTempFolderPath() + "/" + commitInstant.getTimestamp() + "/" + toDeleteMarkerFile));
    markerPaths.remove(toDeleteMarkerFile);
  }
  // set hoodie.table.version to 0 in hoodie.properties file
  metaClient.getTableConfig().setTableVersion(HoodieTableVersion.ZERO);
  if (induceResiduesFromPrevUpgrade) {
    createResidualFile();
  }
  // should re-create marker files for 2nd commit since it is pending.
  new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()).run(HoodieTableVersion.ONE, null);
  // assert marker files
  assertMarkerFilesForUpgrade(table, commitInstant, firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices);
  // verify hoodie.table.version got upgraded
  metaClient = HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(cfg.getBasePath()).setLayoutVersion(Option.of(new TimelineLayoutVersion(cfg.getTimelineLayoutVersion()))).build();
  assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), HoodieTableVersion.ONE.versionCode());
  assertTableVersionFromPropertyFile(HoodieTableVersion.ONE);
  // trigger 3rd commit with marker based rollback enabled.
  /* HUDI-2310
  List<HoodieRecord> thirdBatch = triggerCommit("003", tableType, true);
  // Check the entire dataset has all records only from 1st commit and 3rd commit since 2nd is expected to be rolled back.
  assertRows(inputRecords.getKey(), thirdBatch);
  if (induceResiduesFromPrevUpgrade) {
    assertFalse(dfs.exists(new Path(metaClient.getMetaPath(), SparkUpgradeDowngrade.HOODIE_UPDATED_PROPERTY_FILE)));
  }*/
}
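The test closes by checking the version persisted to hoodie.properties. A minimal sketch of what assertTableVersionFromPropertyFile could look like follows; it is not the verbatim helper, and it assumes HoodieTableConfig.HOODIE_PROPERTIES_FILE names the properties file and HoodieTableConfig.VERSION carries the hoodie.table.version key in this Hudi version.

// Hedged sketch: read hoodie.properties from the .hoodie folder and compare the stored
// version code against the expected HoodieTableVersion.
private void assertTableVersionFromPropertyFile(HoodieTableVersion expectedVersion) throws IOException {
  Path propertyFile = new Path(metaClient.getMetaPath(), HoodieTableConfig.HOODIE_PROPERTIES_FILE);
  try (FSDataInputStream in = metaClient.getFs().open(propertyFile)) {
    Properties props = new Properties();
    props.load(in);
    assertEquals(Integer.toString(expectedVersion.versionCode()), props.getProperty(HoodieTableConfig.VERSION.key()));
  }
}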
Use of org.apache.hudi.table.HoodieTable in project hudi by apache.
The class TestUpgradeDowngrade, method testDowngrade:
@ParameterizedTest(name = TEST_NAME_WITH_DOWNGRADE_PARAMS)
@MethodSource("downGradeConfigParams")
public void testDowngrade(boolean deletePartialMarkerFiles, HoodieTableType tableType, HoodieTableVersion fromVersion) throws IOException {
  MarkerType markerType = fromVersion == HoodieTableVersion.TWO ? MarkerType.TIMELINE_SERVER_BASED : MarkerType.DIRECT;
  // init config, table and client.
  Map<String, String> params = new HashMap<>();
  if (fromVersion == HoodieTableVersion.TWO) {
    addNewTableParamsToProps(params);
  }
  if (tableType == HoodieTableType.MERGE_ON_READ) {
    params.put(TYPE.key(), HoodieTableType.MERGE_ON_READ.name());
    metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
  }
  HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withRollbackUsingMarkers(true).withMarkersType(markerType.name()).withProps(params).build();
  SparkRDDWriteClient client = getHoodieWriteClient(cfg);
  if (fromVersion == HoodieTableVersion.TWO) {
    // set table configs
    HoodieTableConfig tableConfig = metaClient.getTableConfig();
    tableConfig.setValue(HoodieTableConfig.NAME, cfg.getTableName());
    tableConfig.setValue(HoodieTableConfig.PARTITION_FIELDS, cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()));
    tableConfig.setValue(HoodieTableConfig.RECORDKEY_FIELDS, cfg.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key()));
    tableConfig.setValue(BASE_FILE_FORMAT, cfg.getString(BASE_FILE_FORMAT));
  }
  // prepare data. Make 2 commits, in which 2nd is not committed.
  List<FileSlice> firstPartitionCommit2FileSlices = new ArrayList<>();
  List<FileSlice> secondPartitionCommit2FileSlices = new ArrayList<>();
  Pair<List<HoodieRecord>, List<HoodieRecord>> inputRecords = twoUpsertCommitDataWithTwoPartitions(firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices, cfg, client, false);
  HoodieTable table = this.getHoodieTable(metaClient, cfg);
  HoodieInstant commitInstant = table.getPendingCommitTimeline().lastInstant().get();
  // delete one of the marker files in 2nd commit if need be.
  WriteMarkers writeMarkers = WriteMarkersFactory.get(markerType, table, commitInstant.getTimestamp());
  List<String> markerPaths = new ArrayList<>(writeMarkers.allMarkerFilePaths());
  if (deletePartialMarkerFiles) {
    String toDeleteMarkerFile = markerPaths.get(0);
    table.getMetaClient().getFs().delete(new Path(table.getMetaClient().getTempFolderPath() + "/" + commitInstant.getTimestamp() + "/" + toDeleteMarkerFile));
    markerPaths.remove(toDeleteMarkerFile);
  }
  // set hoodie.table.version to fromVersion in hoodie.properties file
  HoodieTableVersion toVersion = HoodieTableVersion.ZERO;
  if (fromVersion == HoodieTableVersion.TWO) {
    prepForDowngradeFromTwoToOne();
    toVersion = HoodieTableVersion.ONE;
  } else {
    prepForDowngradeFromOneToZero();
  }
  // downgrade should be performed. all marker files should be deleted
  new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance()).run(toVersion, null);
  if (fromVersion == HoodieTableVersion.TWO) {
    // assert marker files
    assertMarkerFilesForDowngrade(table, commitInstant, toVersion == HoodieTableVersion.ONE);
  }
  // verify hoodie.table.version got downgraded
  metaClient = HoodieTableMetaClient.builder().setConf(context.getHadoopConf().get()).setBasePath(cfg.getBasePath()).setLayoutVersion(Option.of(new TimelineLayoutVersion(cfg.getTimelineLayoutVersion()))).build();
  assertEquals(metaClient.getTableConfig().getTableVersion().versionCode(), toVersion.versionCode());
  assertTableVersionFromPropertyFile(toVersion);
  // trigger 3rd commit with marker based rollback disabled.
  /* HUDI-2310
  List<HoodieRecord> thirdBatch = triggerCommit("003", tableType, false);
  // Check the entire dataset has all records only from 1st commit and 3rd commit since 2nd is expected to be rolled back.
  assertRows(inputRecords.getKey(), thirdBatch);
  */
}
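Distilled from the two tests above, the upgrade and downgrade paths share the same driver call; only the target HoodieTableVersion differs. A minimal sketch using only calls that appear in the snippets above (the target version here is illustrative):

// Run the upgrade/downgrade handler towards a target version, then rebuild the meta client
// and verify the table version that was persisted in hoodie.properties.
new UpgradeDowngrade(metaClient, cfg, context, SparkUpgradeDowngradeHelper.getInstance())
    .run(HoodieTableVersion.ONE, null);
metaClient = HoodieTableMetaClient.builder()
    .setConf(context.getHadoopConf().get())
    .setBasePath(cfg.getBasePath())
    .build();
assertEquals(HoodieTableVersion.ONE.versionCode(), metaClient.getTableConfig().getTableVersion().versionCode());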
Use of org.apache.hudi.table.HoodieTable in project hudi by apache.
The class HoodieClientTestHarness, method validateMetadata:
/**
 * Validate the metadata table's contents to ensure they match what is on the file system.
 */
public void validateMetadata(HoodieTestTable testTable, List<String> inflightCommits, HoodieWriteConfig writeConfig, String metadataTableBasePath, boolean doFullValidation) throws IOException {
  HoodieTableMetadata tableMetadata = metadata(writeConfig, context);
  assertNotNull(tableMetadata, "MetadataReader should have been initialized");
  if (!writeConfig.isMetadataTableEnabled()) {
    return;
  }
  if (!tableMetadata.getSyncedInstantTime().isPresent() || tableMetadata instanceof FileSystemBackedTableMetadata) {
    throw new IllegalStateException("Metadata should have synced some commits or tableMetadata should not be an instance of FileSystemBackedTableMetadata");
  }
  assertEquals(inflightCommits, testTable.inflightCommits());
  HoodieTimer timer = new HoodieTimer().startTimer();
  HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
  // Partitions should match
  List<java.nio.file.Path> fsPartitionPaths = testTable.getAllPartitionPaths();
  List<String> fsPartitions = new ArrayList<>();
  fsPartitionPaths.forEach(entry -> fsPartitions.add(entry.getFileName().toString()));
  if (fsPartitions.isEmpty()) {
    fsPartitions.add("");
  }
  List<String> metadataPartitions = tableMetadata.getAllPartitionPaths();
  Collections.sort(fsPartitions);
  Collections.sort(metadataPartitions);
  assertEquals(fsPartitions.size(), metadataPartitions.size(), "Partitions should match");
  assertEquals(fsPartitions, metadataPartitions, "Partitions should match");
  // Files within each partition should match
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTable table = HoodieSparkTable.create(writeConfig, engineContext, true);
  TableFileSystemView tableView = table.getHoodieView();
  List<String> fullPartitionPaths = fsPartitions.stream().map(partition -> basePath + "/" + partition).collect(Collectors.toList());
  Map<String, FileStatus[]> partitionToFilesMap = tableMetadata.getAllFilesInPartitions(fullPartitionPaths);
  assertEquals(fsPartitions.size(), partitionToFilesMap.size());
  fsPartitions.forEach(partition -> {
    try {
      validateFilesPerPartition(testTable, tableMetadata, tableView, partitionToFilesMap, partition);
    } catch (IOException e) {
      fail("Exception should not be raised: " + e);
    }
  });
  if (doFullValidation) {
    runFullValidation(writeConfig, metadataTableBasePath, engineContext);
  }
  LOG.info("Validation time=" + timer.endTimer());
}
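A hedged usage sketch: after a few commits on a metadata-enabled table, the check might be invoked as below. HoodieTableMetadata.getMetadataTableBasePath is assumed to be the helper that derives the metadata table's base path in this Hudi version, and the empty inflight-commit list is illustrative.

// Hypothetical call from a test, assuming testTable and a metadata-enabled writeConfig
// have been set up by the harness; no commits are expected to still be inflight here.
validateMetadata(testTable, Collections.emptyList(), writeConfig,
    HoodieTableMetadata.getMetadataTableBasePath(basePath), false);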
Use of org.apache.hudi.table.HoodieTable in project hudi by apache.
The class TestHoodieSparkMergeOnReadTableClustering, method doClusteringAndValidate:
private void doClusteringAndValidate(SparkRDDWriteClient client, String clusteringCommitTime, HoodieTableMetaClient metaClient, HoodieWriteConfig cfg, HoodieTestDataGenerator dataGen) {
  client.cluster(clusteringCommitTime, true);
  metaClient = HoodieTableMetaClient.reload(metaClient);
  final HoodieTable clusteredTable = HoodieSparkTable.create(cfg, context(), metaClient);
  clusteredTable.getHoodieView().sync();
  Stream<HoodieBaseFile> dataFilesToRead = Arrays.stream(dataGen.getPartitionPaths()).flatMap(p -> clusteredTable.getBaseFileOnlyView().getLatestBaseFiles(p));
  assertEquals(dataGen.getPartitionPaths().length, dataFilesToRead.count());
  HoodieTimeline timeline = metaClient.getCommitTimeline().filterCompletedInstants();
  assertEquals(1, timeline.findInstantsAfter("003", Integer.MAX_VALUE).countInstants(), "Expecting a single commit.");
  assertEquals(clusteringCommitTime, timeline.lastInstant().get().getTimestamp());
  assertEquals(HoodieTimeline.REPLACE_COMMIT_ACTION, timeline.lastInstant().get().getAction());
  if (cfg.populateMetaFields()) {
    assertEquals(400, HoodieClientTestUtils.countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline, Option.of("000")), "Must contain 400 records");
  } else {
    assertEquals(400, HoodieClientTestUtils.countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline, Option.empty()));
  }
}
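For context, a hedged sketch of how this validator might be driven from a test: schedule a clustering plan first and, if one was produced, execute and validate it. scheduleClustering(..) is assumed to be the scheduling entry point on the write client in this Hudi version.

// Hypothetical driver, assuming client, metaClient, cfg and dataGen are already set up.
Option<String> clusteringInstant = client.scheduleClustering(Option.empty());
if (clusteringInstant.isPresent()) {
  doClusteringAndValidate(client, clusteringInstant.get(), metaClient, cfg, dataGen);
}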