Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class SparkHoodieBackedTableMetadataWriter, method commit:
@Override
protected void commit(String instantTime, Map<MetadataPartitionType, HoodieData<HoodieRecord>> partitionRecordsMap, boolean canTriggerTableService) {
  ValidationUtils.checkState(metadataMetaClient != null, "Metadata table is not fully initialized yet.");
  ValidationUtils.checkState(enabled, "Metadata table cannot be committed to as it is not enabled");
  HoodieData<HoodieRecord> preppedRecords = prepRecords(partitionRecordsMap);
  JavaRDD<HoodieRecord> preppedRecordRDD = HoodieJavaRDD.getJavaRDD(preppedRecords);
  try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, metadataWriteConfig, true)) {
    if (canTriggerTableService) {
      // Trigger compaction before applying the delta commit. This ensures that if this delta commit succeeds in the
      // metadata table but the corresponding commit fails in the data table, the compaction will not have baked the
      // uncommitted data into base files, where readers could no longer ignore it (readers only filter out
      // uncommitted data from delta commits).
      compactIfNecessary(writeClient, instantTime);
    }
    if (!metadataMetaClient.getActiveTimeline().filterCompletedInstants().containsInstant(instantTime)) {
      // This is a new commit being applied to the metadata table for the first time.
      writeClient.startCommitWithTime(instantTime);
    } else {
      // This code path handles a re-attempted commit that was committed to the metadata table but failed in the data table.
      // For example, say compaction c1 succeeded in the metadata table on its first attempt but failed before committing
      // to the data table. When retried, the data table first rolls back the pending compaction; that rollback is applied
      // to the metadata table, but since all changes to the metadata table are upserts, only a new delta commit is created.
      // Once the rollback completes, the compaction is retried and eventually reaches this block, where the instant is
      // already part of a completed commit. So we manually remove the completed instant and proceed; this is also why
      // withAllowMultiWriteOnSameInstant is enabled for the metadata table.
      HoodieInstant alreadyCompletedInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants()
          .filter(entry -> entry.getTimestamp().equals(instantTime)).lastInstant().get();
      HoodieActiveTimeline.deleteInstantFile(metadataMetaClient.getFs(), metadataMetaClient.getMetaPath(), alreadyCompletedInstant);
      metadataMetaClient.reloadActiveTimeline();
    }
    List<WriteStatus> statuses = writeClient.upsertPreppedRecords(preppedRecordRDD, instantTime).collect();
    statuses.forEach(writeStatus -> {
      if (writeStatus.hasErrors()) {
        throw new HoodieMetadataException("Failed to commit metadata table records at instant " + instantTime);
      }
    });
    // Reload the timeline.
    metadataMetaClient.reloadActiveTimeline();
    if (canTriggerTableService) {
      cleanIfNecessary(writeClient, instantTime);
      writeClient.archive();
    }
  }
  // Update the total size of the metadata and the count of base/log files.
  metrics.ifPresent(m -> m.updateSizeMetrics(metadataMetaClient, metadata));
}
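Stripped of the table-service and retry handling, the SparkRDDWriteClient interaction above reduces to a small write-and-verify pattern. The following is a condensed sketch, not Hudi source code; it assumes a pre-built HoodieWriteConfig (writeConfig), a HoodieSparkEngineContext (engineContext), a JavaRDD of prepped records (preppedRecordRDD) and an instant time are already available.
// Condensed sketch of the pattern used above (not from the Hudi sources).
// writeConfig, engineContext, preppedRecordRDD and instantTime are assumed to exist.
try (SparkRDDWriteClient writeClient = new SparkRDDWriteClient(engineContext, writeConfig)) {
  writeClient.startCommitWithTime(instantTime);
  List<WriteStatus> statuses = writeClient.upsertPreppedRecords(preppedRecordRDD, instantTime).collect();
  for (WriteStatus status : statuses) {
    if (status.hasErrors()) {
      // Fail the commit if any record could not be written.
      throw new HoodieMetadataException("Failed to commit records at instant " + instantTime);
    }
  }
}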
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class TestDFSHoodieDatasetInputReader, method testSimpleHoodieDatasetReader:
@Test
public void testSimpleHoodieDatasetReader() throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfig();
  SparkRDDWriteClient client = new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), config);
  String commitTime = client.startCommit();
  HoodieTestDataGenerator generator = new HoodieTestDataGenerator();
  // Insert 100 records across 3 partitions
  List<HoodieRecord> inserts = generator.generateInserts(commitTime, 100);
  JavaRDD<WriteStatus> writeStatuses = client.upsert(jsc.parallelize(inserts), commitTime);
  writeStatuses.count();
  DFSHoodieDatasetInputReader reader = new DFSHoodieDatasetInputReader(jsc, config.getBasePath(),
      HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())).toString());
  // Try to read 100 records for the same partition path and the same file ID
  JavaRDD<GenericRecord> records = reader.read(1, 1, 100L);
  assertTrue(records.count() <= 100);
  assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(), 1);
  assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(), 1);
  // Try to read 100 records for 3 partition paths and 3 different file IDs
  records = reader.read(3, 3, 100L);
  assertTrue(records.count() <= 100);
  assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(), 3);
  assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(), 3);
  // Try to read 100 records for 3 partition paths and 50% of the records from each file
  records = reader.read(3, 3, 0.5);
  assertTrue(records.count() <= 100);
  assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(), 3);
  assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(), 3);
}
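The makeHoodieClientConfig() helper above is defined elsewhere in the test class and is not shown here. Purely as a rough sketch, a write config sufficient to construct a SparkRDDWriteClient in a test like this could be assembled with the standard builder; the path and table name below are placeholders, not the values the real helper uses.
// Hypothetical sketch of a test write config; the actual makeHoodieClientConfig() may differ.
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
    .withPath(basePath)                                       // placeholder: the test's base path
    .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)  // schema matching the generated records
    .withParallelism(2, 2)
    .forTable("test_table")                                   // placeholder table name
    .build();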
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class HoodieWriteClientExample, method main:
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: HoodieWriteClientExample <tablePath> <tableName>");
    System.exit(1);
  }
  String tablePath = args[0];
  String tableName = args[1];
  SparkConf sparkConf = HoodieExampleSparkUtils.defaultSparkConf("hoodie-client-example");
  try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {
    // Generator of some records to be loaded in.
    HoodieExampleDataGenerator<HoodieAvroPayload> dataGen = new HoodieExampleDataGenerator<>();
    // Initialize the table, if not done already.
    Path path = new Path(tablePath);
    FileSystem fs = FSUtils.getFs(tablePath, jsc.hadoopConfiguration());
    if (!fs.exists(path)) {
      HoodieTableMetaClient.withPropertyBuilder()
          .setTableType(tableType)
          .setTableName(tableName)
          .setPayloadClass(HoodieAvroPayload.class)
          .initTable(jsc.hadoopConfiguration(), tablePath);
    }
    // Create the write client to write some records in
    HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
        .withPath(tablePath)
        .withSchema(HoodieExampleDataGenerator.TRIP_EXAMPLE_SCHEMA)
        .withParallelism(2, 2)
        .withDeleteParallelism(2)
        .forTable(tableName)
        .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(20, 30).build())
        .build();
    SparkRDDWriteClient<HoodieAvroPayload> client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), cfg);
    // Inserts
    String newCommitTime = client.startCommit();
    LOG.info("Starting commit " + newCommitTime);
    List<HoodieRecord<HoodieAvroPayload>> records = dataGen.generateInserts(newCommitTime, 10);
    List<HoodieRecord<HoodieAvroPayload>> recordsSoFar = new ArrayList<>(records);
    JavaRDD<HoodieRecord<HoodieAvroPayload>> writeRecords = jsc.parallelize(records, 1);
    client.insert(writeRecords, newCommitTime);
    // Updates
    newCommitTime = client.startCommit();
    LOG.info("Starting commit " + newCommitTime);
    List<HoodieRecord<HoodieAvroPayload>> toBeUpdated = dataGen.generateUpdates(newCommitTime, 2);
    records.addAll(toBeUpdated);
    recordsSoFar.addAll(toBeUpdated);
    writeRecords = jsc.parallelize(records, 1);
    client.upsert(writeRecords, newCommitTime);
    // Delete
    newCommitTime = client.startCommit();
    LOG.info("Starting commit " + newCommitTime);
    // Just delete half of the records
    int numToDelete = recordsSoFar.size() / 2;
    List<HoodieKey> toBeDeleted = recordsSoFar.stream().map(HoodieRecord::getKey).limit(numToDelete).collect(Collectors.toList());
    JavaRDD<HoodieKey> deleteRecords = jsc.parallelize(toBeDeleted, 1);
    client.delete(deleteRecords, newCommitTime);
    // Delete by partition
    newCommitTime = client.startCommit();
    client.startCommitWithTime(newCommitTime, HoodieTimeline.REPLACE_COMMIT_ACTION);
    LOG.info("Starting commit " + newCommitTime);
    // The partitions where the data needs to be deleted
    List<String> partitionList = toBeDeleted.stream().map(s -> s.getPartitionPath()).distinct().collect(Collectors.toList());
    List<String> deleteList = recordsSoFar.stream().filter(f -> !partitionList.contains(f.getPartitionPath()))
        .map(m -> m.getKey().getPartitionPath()).distinct().collect(Collectors.toList());
    client.deletePartitions(deleteList, newCommitTime);
    // Compaction
    if (HoodieTableType.valueOf(tableType) == HoodieTableType.MERGE_ON_READ) {
      Option<String> instant = client.scheduleCompaction(Option.empty());
      HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client.compact(instant.get());
      client.commitCompaction(instant.get(), compactionMetadata.getCommitMetadata().get(), Option.empty());
    }
  }
}
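HoodieWriteClientExample only writes; a quick way to verify the results is to read the table back through the Hudi Spark datasource. The sketch below is not part of the example: it assumes a SparkSession built from the same sparkConf, the tablePath from the example above, and a Hudi Spark bundle on the classpath.
// Sketch: read back the table written by the example (not part of HoodieWriteClientExample).
SparkSession spark = SparkSession.builder().config(sparkConf).getOrCreate();
Dataset<Row> trips = spark.read().format("hudi").load(tablePath);
trips.select(HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieRecord.PARTITION_PATH_METADATA_FIELD).show(false);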
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class TestHoodieSnapshotExporter, method init:
@BeforeEach
public void init() throws Exception {
  // Initialize test data dirs
  sourcePath = Paths.get(basePath(), "source").toString();
  targetPath = Paths.get(basePath(), "target").toString();
  lfs = (LocalFileSystem) FSUtils.getFs(basePath(), jsc().hadoopConfiguration());
  HoodieTableMetaClient.withPropertyBuilder()
      .setTableType(HoodieTableType.COPY_ON_WRITE)
      .setTableName(TABLE_NAME)
      .setPayloadClass(HoodieAvroPayload.class)
      .initTable(jsc().hadoopConfiguration(), sourcePath);
  // Prepare data as the source Hudi dataset
  HoodieWriteConfig cfg = getHoodieWriteConfig(sourcePath);
  SparkRDDWriteClient writeClient = getHoodieWriteClient(cfg);
  writeClient.startCommitWithTime(COMMIT_TIME);
  HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(new String[] { PARTITION_PATH });
  List<HoodieRecord> records = dataGen.generateInserts(COMMIT_TIME, NUM_RECORDS);
  JavaRDD<HoodieRecord> recordsRDD = jsc().parallelize(records, 1);
  writeClient.bulkInsert(recordsRDD, COMMIT_TIME);
  writeClient.close();
  RemoteIterator<LocatedFileStatus> itr = lfs.listFiles(new Path(sourcePath), true);
  while (itr.hasNext()) {
    LOG.info(">>> Prepared test file: " + itr.next().getPath());
  }
}
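The getHoodieWriteConfig and getHoodieWriteClient calls are helpers from the test base class. Since SparkRDDWriteClient is used in try-with-resources in the metadata-writer example above, the explicit close() here could equivalently be written as in this sketch, which reuses the cfg, records and COMMIT_TIME values from the setup above.
// Sketch: equivalent setup with try-with-resources instead of an explicit close().
try (SparkRDDWriteClient writeClient = getHoodieWriteClient(cfg)) {
  writeClient.startCommitWithTime(COMMIT_TIME);
  writeClient.bulkInsert(jsc().parallelize(records, 1), COMMIT_TIME);
}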
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class TestHoodieIncrSource, method testHoodieIncrSource:
@Test
public void testHoodieIncrSource() throws IOException {
  HoodieWriteConfig writeConfig = getConfigBuilder(basePath)
      .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).retainCommits(1).build())
      .withMetadataConfig(HoodieMetadataConfig.newBuilder().withMaxNumDeltaCommitsBeforeCompaction(1).build())
      .build();
  SparkRDDWriteClient writeClient = new SparkRDDWriteClient(context, writeConfig);
  Pair<String, List<HoodieRecord>> inserts = writeRecords(writeClient, true, null, "100");
  Pair<String, List<HoodieRecord>> inserts2 = writeRecords(writeClient, true, null, "200");
  Pair<String, List<HoodieRecord>> inserts3 = writeRecords(writeClient, true, null, "300");
  Pair<String, List<HoodieRecord>> inserts4 = writeRecords(writeClient, true, null, "400");
  Pair<String, List<HoodieRecord>> inserts5 = writeRecords(writeClient, true, null, "500");
  // Read everything up to the latest commit
  readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_UPTO_LATEST_COMMIT, Option.empty(), 500, inserts5.getKey());
  // Even if the begin timestamp is archived (100), a full table scan should kick in, but it should filter for records with commit time > 100
  readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_UPTO_LATEST_COMMIT, Option.of("100"), 400, inserts5.getKey());
  // Even if read-up-to-latest is set, if the begin timestamp is in the active timeline, only an incremental read should kick in
  readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_UPTO_LATEST_COMMIT, Option.of("400"), 100, inserts5.getKey());
  // Read just the latest commit
  readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_LATEST, Option.empty(), 100, inserts5.getKey());
  // Ensure the checkpoint does not move
  readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_LATEST, Option.of(inserts5.getKey()), 0, inserts5.getKey());
  Pair<String, List<HoodieRecord>> inserts6 = writeRecords(writeClient, true, null, "600");
  // Insert a new batch and ensure the checkpoint moves
  readAndAssert(IncrSourceHelper.MissingCheckpointStrategy.READ_LATEST, Option.of(inserts5.getKey()), 100, inserts6.getKey());
}
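writeRecords and readAndAssert are test-local helpers whose bodies are not included here. Purely as an illustration of how such a batch-writing helper might look, the sketch below uses only SparkRDDWriteClient APIs already shown on this page; the signature, the batch size of 100 (inferred from the asserted counts) and the jsc field are assumptions, not the actual implementation.
// Hypothetical sketch of a writeRecords(...) style helper; the real test helper may differ.
private Pair<String, List<HoodieRecord>> writeRecords(SparkRDDWriteClient writeClient, boolean insert,
    List<HoodieRecord> baseRecords, String commitTime) throws IOException {
  writeClient.startCommitWithTime(commitTime);
  HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
  List<HoodieRecord> records = insert
      ? dataGen.generateInserts(commitTime, 100)          // 100 per batch, matching the asserted counts
      : dataGen.generateUpdates(commitTime, baseRecords);
  JavaRDD<WriteStatus> statuses = writeClient.upsert(jsc.parallelize(records, 1), commitTime);
  statuses.collect();                                     // force the write to execute
  return Pair.of(commitTime, records);
}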