Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class TestHoodieMergeHandle, method testHoodieMergeHandleWriteStatMetrics.
@ParameterizedTest
@MethodSource("testArguments")
public void testHoodieMergeHandleWriteStatMetrics(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws Exception {
  // Insert 100 records
  // Build a write config carrying the parameterized disk map type and compression settings
  Properties properties = new Properties();
  properties.setProperty(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType.name());
  properties.setProperty(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), String.valueOf(isCompressionEnabled));
  HoodieWriteConfig config = getConfigBuilder().withProperties(properties).build();
  try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
    String newCommitTime = "100";
    writeClient.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
    JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
    List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect();

    // All records should be inserts into new parquet
    assertTrue(statuses.stream().filter(status -> status.getStat().getPrevCommit() != HoodieWriteStat.NULL_COMMIT).count() > 0);
    // Num writes should be equal to the number of records inserted
    assertEquals(100, (long) statuses.stream().map(status -> status.getStat().getNumWrites()).reduce((a, b) -> a + b).get());
    // Num update writes should be equal to the number of records updated
    assertEquals(0, (long) statuses.stream().map(status -> status.getStat().getNumUpdateWrites()).reduce((a, b) -> a + b).get());
    // Num inserts should be equal to the number of records inserted
    assertEquals(100, (long) statuses.stream().map(status -> status.getStat().getNumInserts()).reduce((a, b) -> a + b).get());

    // Update all the 100 records
    metaClient = HoodieTableMetaClient.reload(metaClient);
    newCommitTime = "101";
    writeClient.startCommitWithTime(newCommitTime);
    List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
    JavaRDD<HoodieRecord> updatedRecordsRDD = jsc.parallelize(updatedRecords, 1);
    statuses = writeClient.upsert(updatedRecordsRDD, newCommitTime).collect();

    // All records should be upserts into the existing parquet file
    assertEquals(0, statuses.stream().filter(status -> status.getStat().getPrevCommit() == HoodieWriteStat.NULL_COMMIT).count());
    // Num writes should be equal to the number of records written
    assertEquals(100, (long) statuses.stream().map(status -> status.getStat().getNumWrites()).reduce((a, b) -> a + b).get());
    // Num update writes should be equal to the number of records updated
    assertEquals(100, (long) statuses.stream().map(status -> status.getStat().getNumUpdateWrites()).reduce((a, b) -> a + b).get());
    // Num inserts should be zero, since every record in this commit was an update
    assertEquals(0, (long) statuses.stream().map(status -> status.getStat().getNumInserts()).reduce((a, b) -> a + b).get());

    newCommitTime = "102";
    writeClient.startCommitWithTime(newCommitTime);
    List<HoodieRecord> allRecords = dataGen.generateInserts(newCommitTime, 100);
    allRecords.addAll(updatedRecords);
    JavaRDD<HoodieRecord> allRecordsRDD = jsc.parallelize(allRecords, 1);
    statuses = writeClient.upsert(allRecordsRDD, newCommitTime).collect();

    // All records should be upserts into the existing parquet file (the new inserts are routed there by small file handling)
    assertEquals(0, (long) statuses.stream().filter(status -> status.getStat().getPrevCommit() == HoodieWriteStat.NULL_COMMIT).count());
    // Num writes should be equal to the total number of records written
    assertEquals(200, (long) statuses.stream().map(status -> status.getStat().getNumWrites()).reduce((a, b) -> a + b).get());
    // Num update writes should be equal to the number of existing records updated
    assertEquals(100, (long) statuses.stream().map(status -> status.getStat().getNumUpdateWrites()).reduce((a, b) -> a + b).get());
    // Num inserts should be equal to the number of new records routed to an existing file as part of small file handling
    assertEquals(100, (long) statuses.stream().map(status -> status.getStat().getNumInserts()).reduce((a, b) -> a + b).get());

    // Verify all written records have a new location set
    statuses.forEach(writeStatus -> {
      writeStatus.getWrittenRecords().forEach(r -> {
        // Ensure New Location is set
        assertTrue(r.getNewLocation().isPresent());
      });
    });
  }
}
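The snippet below is a minimal, self-contained sketch (not taken from the Hudi test suite) of the same configuration pattern: the spillable disk map type and Bitcask compression flag are passed through a Properties object into the HoodieWriteConfig builder, and a single insert is issued so the per-file write stats can be summed. The base path, table name, schema, and record list are placeholder assumptions, and the client is constructed directly from a HoodieSparkEngineContext rather than the test harness helper getHoodieWriteClient().

// WriteStatMetricsSketch.java: a hypothetical sketch, not part of the test suite.
import java.util.List;
import java.util.Properties;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.HoodieCommonConfig;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.collection.ExternalSpillableMap;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.spark.api.java.JavaSparkContext;

public class WriteStatMetricsSketch {

  public static void insertAndReport(JavaSparkContext jsc, String basePath, String schema,
                                     List<HoodieRecord> records) {
    // Carry the merge-handle related settings the test parameterizes over.
    Properties props = new Properties();
    props.setProperty(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(),
        ExternalSpillableMap.DiskMapType.BITCASK.name());
    props.setProperty(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), "true");

    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withSchema(schema)
        .forTable("write_stat_sketch_table")   // placeholder table name
        .withProperties(props)
        .build();

    try (SparkRDDWriteClient writeClient =
             new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), config)) {
      // Start a commit, insert the records, and sum the write stats per file.
      String instant = writeClient.startCommit();
      List<WriteStatus> statuses = writeClient.insert(jsc.parallelize(records, 1), instant).collect();
      long numWrites = statuses.stream().mapToLong(s -> s.getStat().getNumWrites()).sum();
      long numInserts = statuses.stream().mapToLong(s -> s.getStat().getNumInserts()).sum();
      System.out.println("numWrites=" + numWrites + ", numInserts=" + numInserts);
    }
  }
}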
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class TestSparkHoodieHBaseIndex, method testSimpleTagLocationWithInvalidCommit.
/*
 * Test case to verify that for tagLocation entries present in HBase, if the corresponding commit instant is missing
 * from the timeline and the commit is not archived, tagLocation resets the current record location to null.
 */
@Test
public void testSimpleTagLocationWithInvalidCommit() throws Exception {
  // Load to memory
  HoodieWriteConfig config = getConfigBuilder(100, false, false).withRollbackUsingMarkers(false).build();
  SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
  SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
  String newCommitTime = writeClient.startCommit();
  // make a commit with 199 records
  JavaRDD<HoodieRecord> writeRecords = generateAndCommitRecords(writeClient, 199, newCommitTime);

  // make a second commit with a single record
  String invalidCommit = writeClient.startCommit();
  JavaRDD<HoodieRecord> invalidWriteRecords = generateAndCommitRecords(writeClient, 1, invalidCommit);

  // verify location is tagged.
  HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
  JavaRDD<HoodieRecord> javaRDD0 = tagLocation(index, invalidWriteRecords, hoodieTable);
  // one record present
  assert (javaRDD0.collect().size() == 1);
  // it is tagged
  assert (javaRDD0.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 1);
  assert (javaRDD0.collect().get(0).getCurrentLocation().getInstantTime().equals(invalidCommit));

  // rollback the invalid commit, so that hbase will be left with a stale entry.
  writeClient.rollback(invalidCommit);

  // Now tagLocation for the valid records, hbaseIndex should tag them
  metaClient = HoodieTableMetaClient.reload(metaClient);
  hoodieTable = HoodieSparkTable.create(config, context, metaClient);
  JavaRDD<HoodieRecord> javaRDD1 = tagLocation(index, writeRecords, hoodieTable);
  assert (javaRDD1.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 199);

  // tagLocation for the invalid record - commit is not present in timeline due to rollback.
  JavaRDD<HoodieRecord> javaRDD2 = tagLocation(index, invalidWriteRecords, hoodieTable);
  // one record present
  assert (javaRDD2.collect().size() == 1);
  // it is not tagged
  assert (javaRDD2.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 0);
}
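As a companion to the test above, here is a hypothetical sketch (not part of the test class) of the write-then-rollback sequence that leaves a stale index entry behind. generateAndCommitRecords() is a test-harness helper, so a plain upsert plus an explicit commit stands in for it here, under the assumption that the supplied write config disables auto-commit (the test itself commits explicitly, so its harness config evidently does).

// StaleIndexEntrySketch.java: a hypothetical sketch, not part of the test class.
import java.util.List;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class StaleIndexEntrySketch {

  public static String commitThenRollback(JavaSparkContext jsc, SparkRDDWriteClient writeClient,
                                          List<HoodieRecord> records) {
    // Write and commit one instant; the HBase index now maps these record keys to this instant.
    String instant = writeClient.startCommit();
    JavaRDD<WriteStatus> statuses = writeClient.upsert(jsc.parallelize(records, 1), instant);
    writeClient.commit(instant, statuses);

    // Roll the instant back: it disappears from the active timeline, but the index entries
    // written for it remain, so a later tagLocation() must detect that the commit is invalid.
    writeClient.rollback(instant);
    return instant;
  }
}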
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class TestSparkHoodieHBaseIndex, method testEnsureTagLocationUsesCommitTimeline.
/*
 * Test case to verify that tagLocation() uses the commit timeline to validate the commitTS stored in HBase.
 * If checkIfValidCommit() in HBaseIndex used incorrect timeline filtering, this test would fail.
 */
@Test
public void testEnsureTagLocationUsesCommitTimeline() throws Exception {
  // Load to memory
  HoodieWriteConfig config = getConfigBuilder(100, false, false).withRollbackUsingMarkers(false).build();
  SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
  SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
  String commitTime1 = writeClient.startCommit();
  JavaRDD<HoodieRecord> writeRecords1 = generateAndCommitRecords(writeClient, 20, commitTime1);

  // rollback the commit - leaves a clean file in timeline.
  writeClient.rollback(commitTime1);

  // create a second commit with 20 records
  metaClient = HoodieTableMetaClient.reload(metaClient);
  generateAndCommitRecords(writeClient, 20);

  // Now tagLocation for the first set of rolled-back records, hbaseIndex should tag them
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
  JavaRDD<HoodieRecord> javaRDD1 = tagLocation(index, writeRecords1, hoodieTable);
  assert (javaRDD1.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 20);
}
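The tests above repeatedly reload the HoodieTableMetaClient and rebuild the HoodieSparkTable before tagging, so that rollbacks and new commits become visible to the index. A small hypothetical helper (not part of the test class) capturing that pattern could look like the following; the write config and engine context are assumed to come from the caller.

// TimelineRefreshSketch.java: a hypothetical helper, not part of the test class.
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;

public class TimelineRefreshSketch {

  // Returns a HoodieTable built against a freshly reloaded timeline; commits, rollbacks,
  // and archival performed since the old metaClient was created become visible this way.
  public static HoodieTable refreshedTable(HoodieWriteConfig config,
                                           HoodieSparkEngineContext context,
                                           HoodieTableMetaClient staleMetaClient) {
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.reload(staleMetaClient);
    return HoodieSparkTable.create(config, context, metaClient);
  }
}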
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class TestSparkHoodieHBaseIndex, method testSimpleTagLocationAndUpdate.
@ParameterizedTest
@EnumSource(HoodieTableType.class)
public void testSimpleTagLocationAndUpdate(HoodieTableType tableType) throws Exception {
  metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType);
  final String newCommitTime = "001";
  final int numRecords = 10;
  List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, numRecords);
  JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);

  // Load to memory
  HoodieWriteConfig config = getConfig();
  SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
  try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);

    // Test tagLocation without any entries in the index
    JavaRDD<HoodieRecord> records1 = tagLocation(index, writeRecords, hoodieTable);
    assertEquals(0, records1.filter(record -> record.isCurrentLocationKnown()).count());

    // Insert the records
    writeClient.startCommitWithTime(newCommitTime);
    JavaRDD<WriteStatus> writeStatuses = writeClient.upsert(writeRecords, newCommitTime);
    assertNoWriteErrors(writeStatuses.collect());

    // Now tagLocation for these records; hbaseIndex should not tag them since the commit has not happened yet
    JavaRDD<HoodieRecord> records2 = tagLocation(index, writeRecords, hoodieTable);
    assertEquals(0, records2.filter(record -> record.isCurrentLocationKnown()).count());

    // Now commit this, update the location of the inserted records, and validate there are no errors
    writeClient.commit(newCommitTime, writeStatuses);

    // Now tagLocation for these records; hbaseIndex should tag them correctly
    metaClient = HoodieTableMetaClient.reload(metaClient);
    hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    List<HoodieRecord> records3 = tagLocation(index, writeRecords, hoodieTable).collect();
    assertEquals(numRecords, records3.stream().filter(record -> record.isCurrentLocationKnown()).count());
    assertEquals(numRecords, records3.stream().map(record -> record.getKey().getRecordKey()).distinct().count());
    assertEquals(numRecords, records3.stream().filter(record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count());
  }
}
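The comments in this test hinge on the write being split into two phases: the data is written against a caller-chosen instant time, and only the explicit commit makes it visible to the index. A hypothetical sketch of that two-phase pattern (not part of the test class) is shown below; it assumes the supplied config disables auto-commit, as the harness config evidently does since the test commits explicitly.

// ExplicitCommitSketch.java: a hypothetical sketch, not part of the test class.
import java.util.List;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ExplicitCommitSketch {

  public static void writeWithInstant(JavaSparkContext jsc, HoodieWriteConfig config,
                                      String instantTime, List<HoodieRecord> records) {
    try (SparkRDDWriteClient writeClient =
             new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), config)) {
      // Register the instant on the timeline before writing, as the test does with "001".
      writeClient.startCommitWithTime(instantTime);
      JavaRDD<WriteStatus> writeStatuses = writeClient.upsert(jsc.parallelize(records, 1), instantTime);

      // Until this commit completes, an index lookup will not tag the records with a location.
      writeClient.commit(instantTime, writeStatuses);
    }
  }
}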
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
The class TestSparkHoodieHBaseIndex, method testHbaseTagLocationForArchivedCommits.
// Verify that HBase tags records belonging to an archived commit as valid.
@Test
public void testHbaseTagLocationForArchivedCommits() throws Exception {
  // Load to memory
  Map<String, String> params = new HashMap<String, String>();
  params.put(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED.key(), "1");
  params.put(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP.key(), "3");
  params.put(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP.key(), "2");
  HoodieWriteConfig config = getConfigBuilder(100, false, false).withProps(params).build();
  SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
  SparkRDDWriteClient writeClient = getHoodieWriteClient(config);

  // make first commit with 20 records
  JavaRDD<HoodieRecord> writeRecords1 = generateAndCommitRecords(writeClient, 20);

  // Make 3 additional commits, so that first commit is archived
  for (int nCommit = 0; nCommit < 3; nCommit++) {
    generateAndCommitRecords(writeClient, 20);
  }

  // tagLocation for the first set of records (for the archived commit), hbaseIndex should tag them as valid
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
  JavaRDD<HoodieRecord> javaRDD1 = tagLocation(index, writeRecords1, hoodieTable);
  assertEquals(20, javaRDD1.filter(HoodieRecord::isCurrentLocationKnown).collect().size());
}
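For reference, here is a hypothetical sketch (not from the test class) of a write config using the same archival-related settings: with these values a fourth commit pushes the first one off the active timeline and into the archive, which is the situation the test checks. The base path, schema, and table name are placeholder assumptions.

// ArchivalConfigSketch.java: a hypothetical sketch, not part of the test class.
import java.util.HashMap;
import java.util.Map;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;

public class ArchivalConfigSketch {

  public static HoodieWriteConfig aggressiveArchivalConfig(String basePath, String schema) {
    Map<String, String> params = new HashMap<>();
    // Retain only one commit for the cleaner and keep the active timeline between 2 and 3 commits,
    // so that a fourth commit causes the first one to be archived.
    params.put(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED.key(), "1");
    params.put(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP.key(), "3");
    params.put(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP.key(), "2");

    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withSchema(schema)
        .forTable("archival_sketch_table")   // placeholder table name
        .withProps(params)
        .build();
  }
}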