Use of org.apache.hudi.common.testutils.Transformations in project hudi by apache.
The class TestCopyOnWriteActionExecutor, method testInsertUpsertWithHoodieAvroPayload.
@Test
public void testInsertUpsertWithHoodieAvroPayload() throws Exception {
  Schema schema = getSchemaFromResource(TestCopyOnWriteActionExecutor.class, "/testDataGeneratorSchema.txt");
  HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
      .withPath(basePath)
      .withSchema(schema.toString())
      .withFileSystemViewConfig(
          FileSystemViewStorageConfig.newBuilder().withRemoteServerPort(timelineServicePort).build())
      .withStorageConfig(
          HoodieStorageConfig.newBuilder().parquetMaxFileSize(1000 * 1024).hfileMaxFileSize(1000 * 1024).build())
      .build();
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
  String instantTime = "000";

  // Perform inserts of 100 records to test CreateHandle and BufferedExecutor
  final List<HoodieRecord> inserts = dataGen.generateInsertsWithHoodieAvroPayload(instantTime, 100);
  BaseSparkCommitActionExecutor actionExecutor = new SparkInsertCommitActionExecutor(context, config, table,
      instantTime, context.parallelize(inserts));
  final List<List<WriteStatus>> ws = jsc.parallelize(Arrays.asList(1)).map(x -> {
    return actionExecutor.handleInsert(UUID.randomUUID().toString(), inserts.iterator());
  }).map(Transformations::flatten).collect();

  WriteStatus writeStatus = ws.get(0).get(0);
  String fileId = writeStatus.getFileId();
  // Mark the insert as committed so the written file slice is visible to the subsequent upsert
  metaClient.getFs().create(new Path(Paths.get(basePath, ".hoodie", "000.commit").toString())).close();

  // Generate updates for the inserted records and apply them against the single file written above
  final List<HoodieRecord> updates = dataGen.generateUpdatesWithHoodieAvroPayload(instantTime, inserts);
  String partitionPath = writeStatus.getPartitionPath();
  long numRecordsInPartition = updates.stream().filter(u -> u.getPartitionPath().equals(partitionPath)).count();
  table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, HoodieTableMetaClient.reload(metaClient));
  BaseSparkCommitActionExecutor newActionExecutor = new SparkUpsertCommitActionExecutor(context, config, table,
      instantTime, context.parallelize(updates));
  final List<List<WriteStatus>> updateStatus = jsc.parallelize(Arrays.asList(1)).map(x -> {
    return newActionExecutor.handleUpdate(partitionPath, fileId, updates.iterator());
  }).map(Transformations::flatten).collect();

  // Updates targeting other partitions cannot be applied to this file and surface as error records
  assertEquals(updates.size() - numRecordsInPartition, updateStatus.get(0).get(0).getTotalErrorRecords());
}
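Both handleInsert and handleUpdate above return an Iterator<List<WriteStatus>>, and the map(Transformations::flatten) step collapses each iterator into one flat list before collect(). As an illustration only (not the actual Hudi test utility), a minimal flatten helper under that assumption could look like this:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

// Illustrative sketch only: flatten an iterator of lists into a single list, mirroring how the
// test turns the handleInsert/handleUpdate output into one List<WriteStatus> per Spark task.
public final class FlattenSketch {

  public static <T> List<T> flatten(Iterator<List<T>> iteratorOfLists) {
    List<T> flattened = new ArrayList<>();
    while (iteratorOfLists.hasNext()) {
      flattened.addAll(iteratorOfLists.next());
    }
    return flattened;
  }
}

Used as a method reference (FlattenSketch::flatten), a helper of this shape slots into the same map(...) position shown in the test.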
Use of org.apache.hudi.common.testutils.Transformations in project hudi by apache.
The class TestHoodieMergeOnReadTable, method testHandleUpdateWithMultiplePartitions.
/**
 * Test to validate invoking table.handleUpdate() with input records from multiple partitions will fail.
 */
@Test
public void testHandleUpdateWithMultiplePartitions() throws Exception {
  HoodieWriteConfig cfg = getConfig(true);
  try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {

    /**
     * Write 1 (only inserts, written as base file)
     */
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
    JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
    List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
    assertNoWriteErrors(statuses);

    HoodieSparkMergeOnReadTable hoodieTable = (HoodieSparkMergeOnReadTable) HoodieSparkTable.create(cfg, context(), metaClient);

    Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
    assertTrue(deltaCommit.isPresent());
    assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001");

    Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
    assertFalse(commit.isPresent());

    FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
    BaseFileOnlyView roView =
        getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
    Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
    assertFalse(dataFilesToRead.findAny().isPresent());

    roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
    dataFilesToRead = roView.getLatestBaseFiles();
    assertTrue(dataFilesToRead.findAny().isPresent(), "should list the base files we wrote in the delta commit");

    /**
     * Write 2 (only updates, written to .log file)
     */
    newCommitTime = "002";
    client.startCommitWithTime(newCommitTime);
    metaClient.reloadActiveTimeline();
    records = dataGen.generateUpdates(newCommitTime, records);
    writeRecords = jsc().parallelize(records, 1);
    statuses = client.upsert(writeRecords, newCommitTime).collect();
    assertNoWriteErrors(statuses);

    /**
     * Write 3 (only deletes, written to .log file)
     */
    final String newDeleteTime = "004";
    final String partitionPath = records.get(0).getPartitionPath();
    final String fileId = statuses.get(0).getFileId();
    client.startCommitWithTime(newDeleteTime);
    metaClient.reloadActiveTimeline();

    List<HoodieRecord> fewRecordsForDelete = dataGen.generateDeletesFromExistingRecords(records);
    JavaRDD<HoodieRecord> deleteRDD = jsc().parallelize(fewRecordsForDelete, 1);

    // initialize partitioner
    hoodieTable.getHoodieView().sync();
    BaseSparkDeltaCommitActionExecutor actionExecutor = new SparkDeleteDeltaCommitActionExecutor(context(), cfg,
        hoodieTable, newDeleteTime, HoodieJavaRDD.of(deleteRDD));
    actionExecutor.getUpsertPartitioner(new WorkloadProfile(buildProfile(deleteRDD)));
    final List<List<WriteStatus>> deleteStatus = jsc().parallelize(Arrays.asList(1)).map(x -> {
      return actionExecutor.handleUpdate(partitionPath, fileId, fewRecordsForDelete.iterator());
    }).map(Transformations::flatten).collect();

    // Verify there are errors because records are from multiple partitions (but handleUpdate is invoked for
    // a specific partition)
    WriteStatus status = deleteStatus.get(0).get(0);
    assertTrue(status.hasErrors());

    long numRecordsInPartition = fewRecordsForDelete.stream().filter(u -> u.getPartitionPath().equals(partitionPath)).count();
    assertEquals(fewRecordsForDelete.size() - numRecordsInPartition, status.getTotalErrorRecords());
  }
}
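The final assertion rests on a simple counting argument: handleUpdate is bound to a single (partitionPath, fileId), so every delete record that belongs to a different partition is expected to come back as an error record. A self-contained sketch of that expected-error arithmetic, using a hypothetical record type rather than Hudi's HoodieRecord, is shown below:

import java.util.Arrays;
import java.util.List;

// Illustrative sketch only: expected errors = total records minus the records that belong
// to the partition handleUpdate was invoked for.
public final class ExpectedErrorCountSketch {

  // Hypothetical stand-in for a record key plus its partition path.
  record Rec(String key, String partitionPath) {}

  static long expectedErrors(List<Rec> records, String targetPartition) {
    long inTargetPartition = records.stream()
        .filter(r -> r.partitionPath().equals(targetPartition))
        .count();
    return records.size() - inTargetPartition;
  }

  public static void main(String[] args) {
    List<Rec> deletes = Arrays.asList(
        new Rec("k1", "2016/01/31"),
        new Rec("k2", "2016/02/01"),
        new Rec("k3", "2016/01/31"));
    // handleUpdate targets "2016/01/31", so the single "2016/02/01" record should error out.
    System.out.println(expectedErrors(deletes, "2016/01/31")); // prints 1
  }
}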
Use of org.apache.hudi.common.testutils.Transformations in project hudi by apache.
The class TestCopyOnWriteActionExecutor, method testInsertRecords.
@Test
public void testInsertRecords() throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfig();
  String instantTime = makeNewCommitTime();
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);

  // Case 1:
  // 10 records for partition 1, 1 record for partition 2.
  List<HoodieRecord> records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
  records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));

  // Insert new records
  final List<HoodieRecord> recs2 = records;
  BaseSparkCommitActionExecutor actionExecutor = new SparkInsertPreppedCommitActionExecutor(context, config, table,
      instantTime, context.parallelize(recs2));
  List<WriteStatus> returnedStatuses = jsc.parallelize(Arrays.asList(1)).map(x -> {
    return actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), recs2.iterator());
  }).flatMap(Transformations::flattenAsIterator).collect();

  // TODO: check the actual files and make sure 11 records total were written.
  assertEquals(2, returnedStatuses.size());
  Map<String, Long> expectedPartitionNumRecords = new HashMap<>();
  expectedPartitionNumRecords.put("2016/01/31", 10L);
  expectedPartitionNumRecords.put("2016/02/01", 1L);
  verifyStatusResult(returnedStatuses, expectedPartitionNumRecords);

  // Case 2:
  // 1 record for partition 1, 5 records for partition 2, 1 record for partition 3.
  records = newHoodieRecords(1, "2016-01-31T03:16:41.415Z");
  records.addAll(newHoodieRecords(5, "2016-02-01T03:16:41.415Z"));
  records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z"));

  // Insert new records
  final List<HoodieRecord> recs3 = records;
  BaseSparkCommitActionExecutor newActionExecutor = new SparkUpsertPreppedCommitActionExecutor(context, config, table,
      instantTime, context.parallelize(recs3));
  returnedStatuses = jsc.parallelize(Arrays.asList(1)).map(x -> {
    return newActionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), recs3.iterator());
  }).flatMap(Transformations::flattenAsIterator).collect();

  assertEquals(3, returnedStatuses.size());
  expectedPartitionNumRecords.clear();
  expectedPartitionNumRecords.put("2016/01/31", 1L);
  expectedPartitionNumRecords.put("2016/02/01", 5L);
  expectedPartitionNumRecords.put("2016/02/02", 1L);
  verifyStatusResult(returnedStatuses, expectedPartitionNumRecords);
}
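verifyStatusResult is not shown in this excerpt; its check can be sketched as grouping the returned write statuses by partition path, summing the record counts, and comparing the result against the expected map. The version below is an assumption about that behavior and uses a minimal stand-in for WriteStatus:

import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Illustrative sketch only: verify per-partition record counts from a list of write statuses.
public final class StatusVerificationSketch {

  // Hypothetical stand-in carrying just the two fields the check needs.
  static final class Status {
    final String partitionPath;
    final long writtenRecords;

    Status(String partitionPath, long writtenRecords) {
      this.partitionPath = partitionPath;
      this.writtenRecords = writtenRecords;
    }
  }

  static void verifyStatusResult(List<Status> statuses, Map<String, Long> expectedPartitionNumRecords) {
    Map<String, Long> actual = new HashMap<>();
    for (Status s : statuses) {
      actual.merge(s.partitionPath, s.writtenRecords, Long::sum);
    }
    if (!actual.equals(expectedPartitionNumRecords)) {
      throw new AssertionError("Expected " + expectedPartitionNumRecords + " but wrote " + actual);
    }
  }
}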