Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
From the class TestSparkHoodieHBaseIndex, method testSimpleTagLocationAndUpdateWithRollback.
@Test
public void testSimpleTagLocationAndUpdateWithRollback() throws Exception {
  // Build a write config that rolls back using the timeline listing (not markers)
  HoodieWriteConfig config = getConfigBuilder(100, false, false).withRollbackUsingMarkers(false).build();
  SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
  SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
  final String newCommitTime = writeClient.startCommit();
  final int numRecords = 10;
  List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, numRecords);
  JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
  metaClient = HoodieTableMetaClient.reload(metaClient);
  // Upsert the generated records
  JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
  assertNoWriteErrors(writeStatues.collect());
  // Commit this upsert
  writeClient.commit(newCommitTime, writeStatues);
  HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
  // Now tagLocation for these records; the HBase index should tag them
  List<HoodieRecord> records2 = tagLocation(index, writeRecords, hoodieTable).collect();
  assertEquals(numRecords, records2.stream().filter(HoodieRecord::isCurrentLocationKnown).count());
  // Check that the tagged records carry the correct fileIds
  List<String> fileIds = writeStatues.map(WriteStatus::getFileId).collect();
  assertEquals(0, records2.stream().filter(record -> record.getCurrentLocation().getFileId() == null).count());
  List<String> taggedFileIds = records2.stream().map(record -> record.getCurrentLocation().getFileId()).distinct().collect(Collectors.toList());
  // Both lists should match
  assertTrue(taggedFileIds.containsAll(fileIds) && fileIds.containsAll(taggedFileIds));
  // Rollback the last commit
  writeClient.rollback(newCommitTime);
  hoodieTable = HoodieSparkTable.create(config, context, metaClient);
  // Now tagLocation for these records; the HBase index should not tag them, since the commit was rolled back
  List<HoodieRecord> records3 = tagLocation(index, writeRecords, hoodieTable).collect();
  assertEquals(0, records3.stream().filter(HoodieRecord::isCurrentLocationKnown).count());
  assertEquals(0, records3.stream().filter(record -> record.getCurrentLocation() != null).count());
}
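The test leans on helpers from the surrounding test harness. As a rough orientation, assertNoWriteErrors can be assumed to behave like the minimal sketch below; the project's actual helper may differ in details.

  // Minimal sketch (assumption, not the project's exact helper): fail the test
  // if any WriteStatus returned by the Spark write reported an error.
  // Requires java.util.List, org.apache.hudi.client.WriteStatus, and a static
  // import of org.junit.jupiter.api.Assertions.assertFalse.
  private static void assertNoWriteErrors(List<WriteStatus> statuses) {
    for (WriteStatus status : statuses) {
      assertFalse(status.hasErrors(), "Errors found in write of " + status.getFileId());
    }
  }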
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
From the class TestSparkHoodieHBaseIndex, method testSmallBatchSize.
@Test
public void testSmallBatchSize() throws Exception {
  final String newCommitTime = "001";
  final int numRecords = 10;
  List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, numRecords);
  JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
  // Build a write config with a small HBase index batch size (2)
  HoodieWriteConfig config = getConfig(2);
  SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
  try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    // Test tagLocation without any entries in the index
    JavaRDD<HoodieRecord> records1 = tagLocation(index, writeRecords, hoodieTable);
    assertEquals(0, records1.filter(record -> record.isCurrentLocationKnown()).count());
    // Upsert the records
    writeClient.startCommitWithTime(newCommitTime);
    JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
    assertNoWriteErrors(writeStatues.collect());
    // Now tagLocation for these records; the HBase index should not tag them, since the commit has not completed yet
    JavaRDD<HoodieRecord> records2 = tagLocation(index, writeRecords, hoodieTable);
    assertEquals(0, records2.filter(record -> record.isCurrentLocationKnown()).count());
    // Now commit, which updates the location of the inserted records, and validate there are no errors
    writeClient.commit(newCommitTime, writeStatues);
    // Now tagLocation for these records; the HBase index should tag them correctly
    metaClient = HoodieTableMetaClient.reload(metaClient);
    hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    List<HoodieRecord> records3 = tagLocation(index, writeRecords, hoodieTable).collect();
    assertEquals(numRecords, records3.stream().filter(record -> record.isCurrentLocationKnown()).count());
    assertEquals(numRecords, records3.stream().map(record -> record.getKey().getRecordKey()).distinct().count());
    assertEquals(numRecords, records3.stream().filter(record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count());
  }
}
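getConfig(2) builds a HoodieWriteConfig whose HBase index uses a batch size of 2. A hedged sketch of what such a config could look like is shown below; the builder methods come from HoodieHBaseIndexConfig and HoodieIndexConfig, but the concrete values (basePath, ZooKeeper quorum and port) and any other defaults set by the test harness are assumptions.

  // Sketch (assumptions marked): a write config backed by the HBase index
  // with a small get batch size, similar to what getConfig(2) presumably returns.
  HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
      .withPath(basePath)                              // assumed table base path
      .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
      .withIndexConfig(HoodieIndexConfig.newBuilder()
          .withIndexType(HoodieIndex.IndexType.HBASE)
          .withHBaseIndexConfig(HoodieHBaseIndexConfig.newBuilder()
              .hbaseZkQuorum("localhost")              // assumed ZK quorum
              .hbaseZkPort(2181)                       // assumed ZK port
              .hbaseTableName(TABLE_NAME)
              .hbaseIndexGetBatchSize(2)               // the small batch size under test
              .build())
          .build())
      .build();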
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
From the class TestSparkHoodieHBaseIndex, method testTotalPutsBatching.
@Test
public void testTotalPutsBatching() throws Exception {
  HoodieWriteConfig config = getConfig();
  SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
  SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
  // Start a commit and generate test data
  String newCommitTime = writeClient.startCommit();
  List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 250);
  JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
  // Upsert the 250 generated records
  JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
  // Commit this upsert
  writeClient.commit(newCommitTime, writeStatues);
  // Mock the hbaseConnection and related entities
  Connection hbaseConnection = mock(Connection.class);
  HTable table = mock(HTable.class);
  when(hbaseConnection.getTable(TableName.valueOf(TABLE_NAME))).thenReturn(table);
  when(table.get((List<Get>) any())).thenReturn(new Result[0]);
  // Only for the test: point the index at the mocked connection
  index.setHbaseConnection(hbaseConnection);
  // Count the file groups written to
  int numberOfDataFileIds = (int) writeStatues.map(status -> status.getFileId()).distinct().count();
  updateLocation(index, writeStatues, hoodieTable);
  // Given batchSize = 100 and <= numberOfDataFileIds file groups getting updated,
  // at most numberOfDataFileIds put batches should be executed, so each fileId gets its updates
  verify(table, atMost(numberOfDataFileIds)).put((List<Put>) any());
}
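The tagLocation and updateLocation calls in these tests are thin harness wrappers around the HoodieIndex API. As a sketch only, and noting that the index signatures changed across Hudi releases (older releases take JavaRDDs directly, newer ones wrap them in HoodieData), the RDD-based variant is assumed below.

  // Sketch (assumed RDD-based HoodieIndex API; newer Hudi versions pass
  // HoodieJavaRDD wrappers instead of raw JavaRDDs).
  JavaRDD<HoodieRecord> tagged = index.tagLocation(writeRecords, context, hoodieTable);
  JavaRDD<WriteStatus> updated = index.updateLocation(writeStatues, context, hoodieTable);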
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
From the class BootstrapExecutor, method execute.
/**
 * Executes Bootstrap.
 */
public void execute() throws IOException {
  initializeTable();
  SparkRDDWriteClient bootstrapClient = new SparkRDDWriteClient(new HoodieSparkEngineContext(jssc), bootstrapConfig);
  try {
    HashMap<String, String> checkpointCommitMetadata = new HashMap<>();
    checkpointCommitMetadata.put(HoodieDeltaStreamer.CHECKPOINT_KEY, cfg.checkpoint);
    if (cfg.checkpoint != null) {
      checkpointCommitMetadata.put(HoodieDeltaStreamer.CHECKPOINT_RESET_KEY, cfg.checkpoint);
    }
    bootstrapClient.bootstrap(Option.of(checkpointCommitMetadata));
    syncHive();
  } finally {
    bootstrapClient.close();
  }
}
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
From the class HoodieCleaner, method run.
public void run() {
  HoodieWriteConfig hoodieCfg = getHoodieClientConfig();
  SparkRDDWriteClient client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jssc), hoodieCfg);
  client.clean();
}
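Note that this snippet does not close the client, whereas BootstrapExecutor.execute above closes its client in a finally block. A hedged variant of run() that does the same could look like the sketch below.

  public void run() {
    HoodieWriteConfig hoodieCfg = getHoodieClientConfig();
    SparkRDDWriteClient client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jssc), hoodieCfg);
    try {
      client.clean();
    } finally {
      // Release resources held by the write client, mirroring BootstrapExecutor above
      client.close();
    }
  }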