Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class TestHoodieSparkMergeOnReadTableClustering, method testClustering.
@ParameterizedTest
@MethodSource
void testClustering(boolean doUpdates, boolean populateMetaFields, boolean preserveCommitMetadata) throws Exception {
  // set low compaction small File Size to generate more file groups.
  HoodieWriteConfig.Builder cfgBuilder = HoodieWriteConfig.newBuilder()
      .forTable("test-trip-table")
      .withPath(basePath())
      .withSchema(TRIP_EXAMPLE_SCHEMA)
      .withParallelism(2, 2)
      .withDeleteParallelism(2)
      .withAutoCommit(true)
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(10L)
          .withInlineCompaction(false)
          .withMaxNumDeltaCommitsBeforeCompaction(1)
          .build())
      .withStorageConfig(HoodieStorageConfig.newBuilder()
          .hfileMaxFileSize(1024 * 1024 * 1024)
          .parquetMaxFileSize(1024 * 1024 * 1024)
          .build())
      .withEmbeddedTimelineServerEnabled(true)
      .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder()
          .withEnableBackupForRemoteFileSystemView(false)
          .build())
      .withIndexConfig(HoodieIndexConfig.newBuilder()
          .withIndexType(HoodieIndex.IndexType.BLOOM)
          .build())
      .withClusteringConfig(HoodieClusteringConfig.newBuilder()
          .withClusteringMaxNumGroups(10)
          .withClusteringTargetPartitions(0)
          .withInlineClustering(true)
          .withInlineClusteringNumCommits(1)
          .withPreserveHoodieCommitMetadata(preserveCommitMetadata)
          .build())
      .withRollbackUsingMarkers(false);
  addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
  HoodieWriteConfig cfg = cfgBuilder.build();
  HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, cfg.getProps());
  HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
  try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
    /*
     * Write 1 (only inserts)
     */
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 400);
    Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records.subList(0, 200), client, cfg, newCommitTime);
    assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");

    /*
     * Write 2 (more inserts to create new files)
     */
    // we already set small file size to small number to force inserts to go into new file.
    newCommitTime = "002";
    client.startCommitWithTime(newCommitTime);
    dataFiles = insertRecordsToMORTable(metaClient, records.subList(200, 400), client, cfg, newCommitTime);
    assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");

    if (doUpdates) {
      /*
       * Write 3 (updates)
       */
      newCommitTime = "003";
      client.startCommitWithTime(newCommitTime);
      records = dataGen.generateUpdates(newCommitTime, 100);
      updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, false);
    }

    HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
    hoodieTable.getHoodieView().sync();
    FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
    // expect 2 base files for each partition
    assertEquals(dataGen.getPartitionPaths().length * 2, allFiles.length);

    String clusteringCommitTime = client.scheduleClustering(Option.empty()).get().toString();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
    // verify all files are included in clustering plan.
    assertEquals(allFiles.length, hoodieTable.getFileSystemView().getFileGroupsInPendingClustering().map(Pair::getLeft).count());

    // Do the clustering and validate
    doClusteringAndValidate(client, clusteringCommitTime, metaClient, cfg, dataGen);
  }
}
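The Pair usage to note here is the pending-clustering assertion: the file system view exposes the pending file groups as Pair entries, and the test projects each entry onto its left element with Pair::getLeft before counting. Below is a minimal sketch of that projection, with hypothetical string stand-ins for the file group IDs and clustering instants; only org.apache.hudi.common.util.collection.Pair is assumed on the classpath.

import org.apache.hudi.common.util.collection.Pair;

import java.util.Arrays;
import java.util.List;

public class PendingClusteringCountSketch {
  public static void main(String[] args) {
    // Hypothetical (fileGroupId, clusteringInstant) entries standing in for
    // getFileSystemView().getFileGroupsInPendingClustering().
    List<Pair<String, String>> pendingFileGroups = Arrays.asList(
        Pair.of("file-group-1", "20220101000000"),
        Pair.of("file-group-2", "20220101000000"));

    // Same projection as the assertion above: map each Pair to its left element and count.
    long fileGroupCount = pendingFileGroups.stream().map(Pair::getLeft).count();
    System.out.println("File groups pending clustering: " + fileGroupCount);
  }
}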
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class TestHoodieKeyLocationFetchHandle, method testFetchHandle.
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testFetchHandle(boolean populateMetaFields) throws Exception {
  metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE,
      populateMetaFields ? new Properties() : getPropertiesForKeyGen());
  config = getConfigBuilder()
      .withProperties(getPropertiesForKeyGen())
      .withIndexConfig(HoodieIndexConfig.newBuilder().build())
      .build();
  List<HoodieRecord> records = dataGen.generateInserts(makeNewCommitTime(), 100);
  Map<String, List<HoodieRecord>> partitionRecordsMap = recordsToPartitionRecordsMap(records);
  HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
  HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, AVRO_SCHEMA_WITH_METADATA_FIELDS);
  Map<Tuple2<String, String>, List<Tuple2<HoodieKey, HoodieRecordLocation>>> expectedList =
      writeToParquetAndGetExpectedRecordLocations(partitionRecordsMap, testTable);
  List<Tuple2<String, HoodieBaseFile>> partitionPathFileIdPairs =
      loadAllFilesForPartitions(new ArrayList<>(partitionRecordsMap.keySet()), context, hoodieTable);
  BaseKeyGenerator keyGenerator =
      (BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(getPropertiesForKeyGen()));
  for (Tuple2<String, HoodieBaseFile> entry : partitionPathFileIdPairs) {
    HoodieKeyLocationFetchHandle fetcherHandle = new HoodieKeyLocationFetchHandle(config, hoodieTable,
        Pair.of(entry._1, entry._2), populateMetaFields ? Option.empty() : Option.of(keyGenerator));
    Iterator<Pair<HoodieKey, HoodieRecordLocation>> result = fetcherHandle.locations().iterator();
    List<Tuple2<HoodieKey, HoodieRecordLocation>> actualList = new ArrayList<>();
    result.forEachRemaining(x -> actualList.add(new Tuple2<>(x.getLeft(), x.getRight())));
    assertEquals(expectedList.get(new Tuple2<>(entry._1, entry._2.getFileId())), actualList);
  }
}
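Two Pair usages appear in this test: the fetch handle is constructed with Pair.of(entry._1, entry._2) (partition path and base file), and locations() yields Pair<HoodieKey, HoodieRecordLocation> entries that the test repacks into Scala Tuple2 objects via getLeft()/getRight(). A minimal sketch of that repacking follows, with hypothetical string values in place of the Hudi key and location types; hudi-common and scala-library are assumed on the classpath.

import org.apache.hudi.common.util.collection.Pair;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class PairToTupleSketch {
  public static void main(String[] args) {
    // Hypothetical (recordKey, fileId) entries standing in for fetcherHandle.locations().
    Iterator<Pair<String, String>> locations = Arrays.asList(
        Pair.of("key-1", "file-id-1"),
        Pair.of("key-2", "file-id-1")).iterator();

    // Same repacking as the test: each Pair becomes a Scala Tuple2 for comparison against the expected list.
    List<Tuple2<String, String>> actualList = new ArrayList<>();
    locations.forEachRemaining(x -> actualList.add(new Tuple2<>(x.getLeft(), x.getRight())));
    System.out.println(actualList);
  }
}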
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class DeltaSync, method readFromSource.
/**
 * Read from the upstream source and apply transformations if needed.
 *
 * @param commitTimelineOpt Timeline with completed commits
 * @return Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> Input data read from the upstream source,
 *         consisting of the schemaProvider, the checkpoint string, and the hoodieRecords
 * @throws IOException in case of any error while reading from the source
 */
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> readFromSource(Option<HoodieTimeline> commitTimelineOpt) throws IOException {
  // Retrieve the previous round checkpoints, if any
  Option<String> resumeCheckpointStr = Option.empty();
  if (commitTimelineOpt.isPresent()) {
    resumeCheckpointStr = getCheckpointToResume(commitTimelineOpt);
  } else {
    // initialize the table for the first time.
    String partitionColumns = HoodieSparkUtils.getPartitionColumns(keyGenerator, props);
    HoodieTableMetaClient.withPropertyBuilder()
        .setTableType(cfg.tableType)
        .setTableName(cfg.targetTableName)
        .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue())
        .setPayloadClassName(cfg.payloadClassName)
        .setBaseFileFormat(cfg.baseFileFormat)
        .setPartitionFields(partitionColumns)
        .setRecordKeyFields(props.getProperty(DataSourceWriteOptions.RECORDKEY_FIELD().key()))
        .setPopulateMetaFields(props.getBoolean(HoodieTableConfig.POPULATE_META_FIELDS.key(),
            HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()))
        .setKeyGeneratorClassProp(props.getProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(),
            SimpleKeyGenerator.class.getName()))
        .initTable(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath);
  }
  LOG.debug("Checkpoint from config: " + cfg.checkpoint);
  if (!resumeCheckpointStr.isPresent() && cfg.checkpoint != null) {
    resumeCheckpointStr = Option.of(cfg.checkpoint);
  }
  LOG.info("Checkpoint to resume from : " + resumeCheckpointStr);

  int maxRetryCount = cfg.retryOnSourceFailures ? cfg.maxRetryCount : 1;
  int curRetryCount = 0;
  Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> sourceDataToSync = null;
  while (curRetryCount++ < maxRetryCount && sourceDataToSync == null) {
    try {
      sourceDataToSync = fetchFromSource(resumeCheckpointStr);
    } catch (HoodieSourceTimeoutException e) {
      if (curRetryCount >= maxRetryCount) {
        throw e;
      }
      try {
        LOG.error("Exception thrown while fetching data from source. Msg : " + e.getMessage()
            + ", class : " + e.getClass() + ", cause : " + e.getCause());
        LOG.error("Sleeping for " + (cfg.retryIntervalSecs) + " before retrying again. Current retry count "
            + curRetryCount + ", max retry count " + cfg.maxRetryCount);
        Thread.sleep(cfg.retryIntervalSecs * 1000);
      } catch (InterruptedException ex) {
        LOG.error("Ignoring InterruptedException while waiting to retry on source failure " + e.getMessage());
      }
    }
  }
  return sourceDataToSync;
}
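readFromSource returns a nested Pair: the SchemaProvider on the left and a (checkpoint, records) Pair on the right. A minimal sketch of building and unpacking that shape, using hypothetical strings in place of the schema provider, checkpoint, and record batch:

import org.apache.hudi.common.util.collection.Pair;

public class NestedPairSketch {
  public static void main(String[] args) {
    // Hypothetical stand-ins for the schema provider, checkpoint string, and record batch.
    Pair<String, Pair<String, String>> sourceDataToSync =
        Pair.of("schemaProvider", Pair.of("checkpoint-001", "recordBatch"));

    // Callers (see syncOnce below) unpack the nested Pair with getKey()/getRight().
    System.out.println("Schema provider: " + sourceDataToSync.getKey());
    System.out.println("Checkpoint     : " + sourceDataToSync.getRight().getLeft());
    System.out.println("Records        : " + sourceDataToSync.getRight().getRight());
  }
}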
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class DeltaSync, method syncOnce.
/**
* Run one round of delta sync and return new compaction instant if one got scheduled.
*/
public Pair<Option<String>, JavaRDD<WriteStatus>> syncOnce() throws IOException {
  Pair<Option<String>, JavaRDD<WriteStatus>> result = null;
  Timer.Context overallTimerContext = metrics.getOverallTimerContext();

  // Refresh Timeline
  refreshTimeline();

  Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> srcRecordsWithCkpt = readFromSource(commitTimelineOpt);
  if (null != srcRecordsWithCkpt) {
    // compactor
    if (null == writeClient) {
      this.schemaProvider = srcRecordsWithCkpt.getKey();
      // Setup HoodieWriteClient and compaction now that we decided on schema
      setupWriteClient();
    } else {
      Schema newSourceSchema = srcRecordsWithCkpt.getKey().getSourceSchema();
      Schema newTargetSchema = srcRecordsWithCkpt.getKey().getTargetSchema();
      if (!(processedSchema.isSchemaPresent(newSourceSchema)) || !(processedSchema.isSchemaPresent(newTargetSchema))) {
        LOG.info("Seeing new schema. Source :" + newSourceSchema.toString(true)
            + ", Target :" + newTargetSchema.toString(true));
        // We need to recreate write client with new schema and register them.
        reInitWriteClient(newSourceSchema, newTargetSchema);
        processedSchema.addSchema(newSourceSchema);
        processedSchema.addSchema(newTargetSchema);
      }
    }

    // complete the pending clustering before writing to sink
    if (cfg.retryLastPendingInlineClusteringJob && getHoodieClientConfig(this.schemaProvider).inlineClusteringEnabled()) {
      Option<String> pendingClusteringInstant = getLastPendingClusteringInstant(allCommitsTimelineOpt);
      if (pendingClusteringInstant.isPresent()) {
        writeClient.cluster(pendingClusteringInstant.get(), true);
      }
    }

    result = writeToSink(srcRecordsWithCkpt.getRight().getRight(), srcRecordsWithCkpt.getRight().getLeft(),
        metrics, overallTimerContext);
  }

  metrics.updateDeltaStreamerSyncMetrics(System.currentTimeMillis());

  // Clear persistent RDDs
  jssc.getPersistentRDDs().values().forEach(JavaRDD::unpersist);
  return result;
}
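syncOnce returns Pair<Option<String>, JavaRDD<WriteStatus>>: an optional scheduled-compaction instant paired with the write result. A minimal sketch of that return shape, using a hypothetical string in place of the JavaRDD<WriteStatus> and a hypothetical instant time:

import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;

public class SyncResultSketch {
  public static void main(String[] args) {
    // Hypothetical stand-in for the JavaRDD<WriteStatus> produced by writeToSink.
    String writeStatuses = "writeStatuses";

    // No compaction scheduled in this round vs. a compaction scheduled at a hypothetical instant.
    Pair<Option<String>, String> noCompaction = Pair.of(Option.empty(), writeStatuses);
    Pair<Option<String>, String> withCompaction = Pair.of(Option.of("20220101000000"), writeStatuses);

    System.out.println("Compaction scheduled: " + noCompaction.getLeft().isPresent());
    System.out.println("Compaction scheduled: " + withCompaction.getLeft().isPresent());
  }
}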
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class S3EventsHoodieIncrSource, method fetchNextBatch.
@Override
public Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) {
  DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(HOODIE_SRC_BASE_PATH));
  String srcPath = props.getString(HOODIE_SRC_BASE_PATH);
  int numInstantsPerFetch = props.getInteger(NUM_INSTANTS_PER_FETCH, DEFAULT_NUM_INSTANTS_PER_FETCH);
  boolean readLatestOnMissingCkpt = props.getBoolean(READ_LATEST_INSTANT_ON_MISSING_CKPT,
      DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT);
  IncrSourceHelper.MissingCheckpointStrategy missingCheckpointStrategy =
      (props.containsKey(HoodieIncrSource.Config.MISSING_CHECKPOINT_STRATEGY))
          ? IncrSourceHelper.MissingCheckpointStrategy.valueOf(props.getString(HoodieIncrSource.Config.MISSING_CHECKPOINT_STRATEGY))
          : null;
  if (readLatestOnMissingCkpt) {
    missingCheckpointStrategy = IncrSourceHelper.MissingCheckpointStrategy.READ_LATEST;
  }
  String fileFormat = props.getString(SOURCE_FILE_FORMAT, DEFAULT_SOURCE_FILE_FORMAT);

  // Use begin Instant if set and non-empty
  Option<String> beginInstant = lastCkptStr.isPresent()
      ? lastCkptStr.get().isEmpty() ? Option.empty() : lastCkptStr
      : Option.empty();
  Pair<String, Pair<String, String>> queryTypeAndInstantEndpts = IncrSourceHelper.calculateBeginAndEndInstants(
      sparkContext, srcPath, numInstantsPerFetch, beginInstant, missingCheckpointStrategy);
  if (queryTypeAndInstantEndpts.getValue().getKey().equals(queryTypeAndInstantEndpts.getValue().getValue())) {
    LOG.warn("Already caught up. Begin Checkpoint was :" + queryTypeAndInstantEndpts.getValue().getKey());
    return Pair.of(Option.empty(), queryTypeAndInstantEndpts.getValue().getKey());
  }

  Dataset<Row> source = null;
  // Do incremental pull. Set end instant if available.
  if (queryTypeAndInstantEndpts.getKey().equals(DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())) {
    source = sparkSession.read().format("org.apache.hudi")
        .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
        .option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), queryTypeAndInstantEndpts.getRight().getLeft())
        .option(DataSourceReadOptions.END_INSTANTTIME().key(), queryTypeAndInstantEndpts.getRight().getRight())
        .load(srcPath);
  } else {
    // if checkpoint is missing from source table, and if strategy is set to READ_UPTO_LATEST_COMMIT, we have to issue snapshot query
    source = sparkSession.read().format("org.apache.hudi")
        .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL())
        .load(srcPath)
        .filter(String.format("%s > '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD,
            queryTypeAndInstantEndpts.getRight().getLeft()));
  }

  if (source.isEmpty()) {
    return Pair.of(Option.empty(), queryTypeAndInstantEndpts.getRight().getRight());
  }

  String filter = "s3.object.size > 0";
  if (!StringUtils.isNullOrEmpty(props.getString(Config.S3_KEY_PREFIX, null))) {
    filter = filter + " and s3.object.key like '" + props.getString(Config.S3_KEY_PREFIX) + "%'";
  }
  if (!StringUtils.isNullOrEmpty(props.getString(Config.S3_IGNORE_KEY_PREFIX, null))) {
    filter = filter + " and s3.object.key not like '" + props.getString(Config.S3_IGNORE_KEY_PREFIX) + "%'";
  }
  if (!StringUtils.isNullOrEmpty(props.getString(Config.S3_IGNORE_KEY_SUBSTRING, null))) {
    filter = filter + " and s3.object.key not like '%" + props.getString(Config.S3_IGNORE_KEY_SUBSTRING) + "%'";
  }
  // add file format filtering by default
  filter = filter + " and s3.object.key like '%" + fileFormat + "%'";

  String s3FS = props.getString(Config.S3_FS_PREFIX, "s3").toLowerCase();
  String s3Prefix = s3FS + "://";

  // Extract distinct file keys from s3 meta hoodie table
  final List<Row> cloudMetaDf = source.filter(filter)
      .select("s3.bucket.name", "s3.object.key")
      .distinct()
      .collectAsList();

  // Create S3 paths
  final boolean checkExists = props.getBoolean(Config.ENABLE_EXISTS_CHECK, Config.DEFAULT_ENABLE_EXISTS_CHECK);
  List<String> cloudFiles = new ArrayList<>();
  for (Row row : cloudMetaDf) {
    // construct file path, row index 0 refers to bucket and 1 refers to key
    String bucket = row.getString(0);
    String filePath = s3Prefix + bucket + "/" + row.getString(1);
    if (checkExists) {
      FileSystem fs = FSUtils.getFs(s3Prefix + bucket, sparkSession.sparkContext().hadoopConfiguration());
      try {
        if (fs.exists(new Path(filePath))) {
          cloudFiles.add(filePath);
        }
      } catch (IOException e) {
        LOG.error(String.format("Error while checking path exists for %s ", filePath), e);
      }
    } else {
      cloudFiles.add(filePath);
    }
  }
  Option<Dataset<Row>> dataset = Option.empty();
  if (!cloudFiles.isEmpty()) {
    dataset = Option.of(sparkSession.read().format(fileFormat).load(cloudFiles.toArray(new String[0])));
  }
  return Pair.of(dataset, queryTypeAndInstantEndpts.getRight().getRight());
}
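Here the Pair usages are the return value, Pair<Option<Dataset<Row>>, String> (the batch plus the checkpoint to persist), and queryTypeAndInstantEndpts, whose left element is the query type and whose right element is a (beginInstant, endInstant) Pair; the "already caught up" branch simply compares those two instants. A minimal sketch of that access pattern, with hypothetical query type and instant values:

import org.apache.hudi.common.util.collection.Pair;

public class QueryEndpointsSketch {
  public static void main(String[] args) {
    // Hypothetical result shape of IncrSourceHelper.calculateBeginAndEndInstants:
    // (queryType, (beginInstant, endInstant)).
    Pair<String, Pair<String, String>> queryTypeAndInstantEndpts =
        Pair.of("incremental", Pair.of("20220101000000", "20220102000000"));

    String queryType = queryTypeAndInstantEndpts.getKey();
    String beginInstant = queryTypeAndInstantEndpts.getValue().getKey();
    String endInstant = queryTypeAndInstantEndpts.getValue().getValue();

    // Same comparison as the "already caught up" check above.
    boolean caughtUp = beginInstant.equals(endInstant);
    System.out.println(queryType + " pull from " + beginInstant + " to " + endInstant + ", caught up: " + caughtUp);
  }
}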