Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestHoodieBackedMetadata, method testNonPartitioned.
@Test
public void testNonPartitioned() throws Exception {
init(HoodieTableType.COPY_ON_WRITE, false);
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
HoodieTestDataGenerator nonPartitionedGenerator = new HoodieTestDataGenerator(new String[] { "" });
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
// Write 1 (Bulk insert)
String newCommitTime = "0000001";
List<HoodieRecord> records = nonPartitionedGenerator.generateInserts(newCommitTime, 10);
client.startCommitWithTime(newCommitTime);
List<WriteStatus> writeStatuses = client.bulkInsert(jsc.parallelize(records, 1), newCommitTime).collect();
validateMetadata(client);
List<String> metadataPartitions = metadata(client).getAllPartitionPaths();
assertTrue(metadataPartitions.contains(""), "Must contain empty partition");
}
}
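The getWriteConfig(true, true) helper comes from the test base class and is not shown here; since this test validates the metadata table, a minimal sketch of a comparable write config with the metadata table enabled might look like the following (basePath is assumed to come from the test harness, and the required classes are org.apache.hudi.config.HoodieWriteConfig and org.apache.hudi.common.config.HoodieMetadataConfig):
// Hedged sketch: a write config with the metadata table enabled, in the spirit of getWriteConfig(true, true) above.
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
    .withPath(basePath) // assumed test base path
    .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
    .withMetadataConfig(HoodieMetadataConfig.newBuilder()
        .enable(true) // maintain the Hudi metadata table alongside the data table
        .build())
    .build();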
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method testSmallInsertHandlingForUpserts.
/**
* Test scenario of new file-group getting added during upsert().
*/
@Test
public void testSmallInsertHandlingForUpserts() throws Exception {
final String testPartitionPath = "2016/09/26";
final int insertSplitLimit = 100;
// set up the small file handling params
// hold up to 200 records max
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150));
dataGen = new HoodieTestDataGenerator(new String[] { testPartitionPath });
SparkRDDWriteClient client = getHoodieWriteClient(config);
BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
// Inserts => will write file1
String commitTime1 = "001";
client.startCommitWithTime(commitTime1);
// this writes ~500kb
List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, insertSplitLimit);
Set<String> keys1 = recordsToRecordKeySet(inserts1);
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime1).collect();
assertNoWriteErrors(statuses);
assertEquals(1, statuses.size(), "Just 1 file needs to be added.");
String file1 = statuses.get(0).getFileId();
assertEquals(100, fileUtils.readRowKeys(hadoopConf, new Path(basePath, statuses.get(0).getStat().getPath())).size(), "file should contain 100 records");
// Update + Inserts such that they just expand file1
String commitTime2 = "002";
client.startCommitWithTime(commitTime2);
List<HoodieRecord> inserts2 = dataGen.generateInserts(commitTime2, 40);
Set<String> keys2 = recordsToRecordKeySet(inserts2);
List<HoodieRecord> insertsAndUpdates2 = new ArrayList<>();
insertsAndUpdates2.addAll(inserts2);
insertsAndUpdates2.addAll(dataGen.generateUpdates(commitTime2, inserts1));
JavaRDD<HoodieRecord> insertAndUpdatesRDD2 = jsc.parallelize(insertsAndUpdates2, 1);
statuses = client.upsert(insertAndUpdatesRDD2, commitTime2).collect();
assertNoWriteErrors(statuses);
assertEquals(1, statuses.size(), "Just 1 file needs to be updated.");
assertEquals(file1, statuses.get(0).getFileId(), "Existing file should be expanded");
assertEquals(commitTime1, statuses.get(0).getStat().getPrevCommit(), "Existing file should be expanded");
Path newFile = new Path(basePath, statuses.get(0).getStat().getPath());
assertEquals(140, fileUtils.readRowKeys(hadoopConf, newFile).size(), "file should contain 140 records");
List<GenericRecord> records = fileUtils.readAvroRecords(hadoopConf, newFile);
for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
assertEquals(commitTime2, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(), "only expect commit2");
assertTrue(keys2.contains(recordKey) || keys1.contains(recordKey), "key expected to be part of commit 1 or commit 2");
}
// Update + inserts such that file1 is updated and expanded, and a new file2 is created.
String commitTime3 = "003";
client.startCommitWithTime(commitTime3);
List<HoodieRecord> insertsAndUpdates3 = dataGen.generateInserts(commitTime3, 200);
Set<String> keys3 = recordsToRecordKeySet(insertsAndUpdates3);
List<HoodieRecord> updates3 = dataGen.generateUpdates(commitTime3, inserts2);
insertsAndUpdates3.addAll(updates3);
JavaRDD<HoodieRecord> insertAndUpdatesRDD3 = jsc.parallelize(insertsAndUpdates3, 1);
statuses = client.upsert(insertAndUpdatesRDD3, commitTime3).collect();
assertNoWriteErrors(statuses);
assertEquals(2, statuses.size(), "2 files need to be committed.");
HoodieTableMetaClient metadata = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
HoodieTable table = getHoodieTable(metadata, config);
BaseFileOnlyView fileSystemView = table.getBaseFileOnlyView();
List<HoodieBaseFile> files = fileSystemView.getLatestBaseFilesBeforeOrOn(testPartitionPath, commitTime3).collect(Collectors.toList());
int numTotalInsertsInCommit3 = 0;
int numTotalUpdatesInCommit3 = 0;
for (HoodieBaseFile file : files) {
if (file.getFileName().contains(file1)) {
assertEquals(commitTime3, file.getCommitTime(), "Existing file should be expanded");
records = fileUtils.readAvroRecords(hadoopConf, new Path(file.getPath()));
for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
if (recordCommitTime.equals(commitTime3)) {
if (keys2.contains(recordKey)) {
keys2.remove(recordKey);
numTotalUpdatesInCommit3++;
} else {
numTotalInsertsInCommit3++;
}
}
}
assertEquals(0, keys2.size(), "All keys added in commit 2 must be updated in commit3 correctly");
} else {
assertEquals(commitTime3, file.getCommitTime(), "New file must be written for commit 3");
records = fileUtils.readAvroRecords(hadoopConf, new Path(file.getPath()));
for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
assertEquals(commitTime3, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(), "only expect commit3");
assertTrue(keys3.contains(recordKey), "key expected to be part of commit3");
}
numTotalInsertsInCommit3 += records.size();
}
}
assertEquals(numTotalUpdatesInCommit3, inserts2.size(), "Total updates in commit3 must add up");
assertEquals(numTotalInsertsInCommit3, keys3.size(), "Total inserts in commit3 must add up");
}
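The getSmallInsertWriteConfig helper is also defined in the test base class; a hedged sketch of how a small-file handling config of this kind can be assembled is shown below (basePath, dataGen, and insertSplitLimit are assumed from the test above, and the classes come from org.apache.hudi.config.HoodieWriteConfig and org.apache.hudi.config.HoodieCompactionConfig):
// Hedged sketch, in the spirit of getSmallInsertWriteConfig(insertSplitLimit, ...):
// route new inserts into existing file groups whose base file is below a size threshold.
HoodieWriteConfig smallFileConfig = HoodieWriteConfig.newBuilder()
    .withPath(basePath)
    .withSchema(TRIP_EXAMPLE_SCHEMA)
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .compactionSmallFileSize(dataGen.getEstimatedFileSizeInBytes(150)) // files below ~150 records are treated as "small" and expanded
        .insertSplitSize(insertSplitLimit) // at most 100 inserts per file, matching the test
        .build())
    .build();
This is why commits 1 and 2 keep expanding file1 (100, then 140 records), while the 200 new inserts in commit 3 spill into a second file group.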
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method testParallelInsertAndCleanPreviousFailedCommits.
@Test
public void testParallelInsertAndCleanPreviousFailedCommits() throws Exception {
HoodieFailedWritesCleaningPolicy cleaningPolicy = HoodieFailedWritesCleaningPolicy.LAZY;
ExecutorService service = Executors.newFixedThreadPool(2);
HoodieTestUtils.init(hadoopConf, basePath);
SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true));
// perform 1 successful write
writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true);
// Perform 2 failed writes to the table
writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "200", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, false);
client.close();
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true));
writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, false);
client.close();
// refresh the data generator to discard records generated by the failed commits
dataGen = new HoodieTestDataGenerator();
// Create a successful commit
Future<JavaRDD<WriteStatus>> commit3 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)), "400", "300", Option.of(Arrays.asList("400")), "300", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true));
commit3.get();
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
assertTrue(metaClient.getActiveTimeline().getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 0);
assertTrue(metaClient.getActiveTimeline().filterInflights().countInstants() == 2);
assertTrue(metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 2);
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true));
// Wait until enough time passes that the heartbeats of the first 2 failed commits expire
boolean conditionMet = false;
while (!conditionMet) {
conditionMet = client.getHeartbeatClient().isHeartbeatExpired("300");
Thread.sleep(2000);
}
Future<JavaRDD<WriteStatus>> commit4 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)), "500", "400", Option.of(Arrays.asList("500")), "500", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true));
Future<HoodieCleanMetadata> clean1 = service.submit(() -> new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)).clean());
commit4.get();
clean1.get();
HoodieActiveTimeline timeline = metaClient.getActiveTimeline().reload();
assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 2);
// Since rollbacks are written instead of cleans, there should be no clean action on the timeline
assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(CLEAN_ACTION)).countInstants() == 0);
assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 3);
}
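The getParallelWritingWriteConfig(cleaningPolicy, true) helper is not shown either; a hedged sketch of the ingredients such a config needs is given below: lazy cleaning of failed writes plus client heartbeats, so that failed commits are only rolled back once their heartbeats expire. The heartbeat interval is an illustrative value, and a real multi-writer setup would presumably also configure a lock provider and optimistic concurrency control. Required classes: org.apache.hudi.config.HoodieWriteConfig, org.apache.hudi.config.HoodieCompactionConfig, org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy.
// Hedged sketch of a lazy-cleaning, heartbeat-enabled write config.
HoodieWriteConfig parallelWritingConfig = HoodieWriteConfig.newBuilder()
    .withPath(basePath) // assumed test base path
    .withSchema(TRIP_EXAMPLE_SCHEMA)
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY) // do not roll back failed writes eagerly
        .build())
    .withHeartbeatIntervalInMs(60 * 1000) // illustrative; an expired heartbeat marks a commit as failed
    .build();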
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method verifyDeletePartitionsHandling.
/**
* 1) Do write1 (upsert) with 'batch1RecordsCount' number of records for first partition.
* 2) Do write2 (upsert) with 'batch2RecordsCount' number of records for second partition.
* 3) Do write3 (upsert) with 'batch3RecordsCount' number of records for third partition.
* 4) delete first partition and check result.
* 5) delete second and third partition and check result.
*/
private void verifyDeletePartitionsHandling(int batch1RecordsCount, int batch2RecordsCount, int batch3RecordsCount, boolean populateMetaFields) throws Exception {
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields ? new Properties() : getPropertiesForKeyGen());
SparkRDDWriteClient client = getHoodieWriteClient(config);
dataGen = new HoodieTestDataGenerator();
// Do Inserts for DEFAULT_FIRST_PARTITION_PATH
String commitTime1 = "001";
Set<String> batch1Buckets = this.insertPartitionRecordsWithCommit(client, batch1RecordsCount, commitTime1, DEFAULT_FIRST_PARTITION_PATH);
// Do Inserts for DEFAULT_SECOND_PARTITION_PATH
String commitTime2 = "002";
Set<String> batch2Buckets = this.insertPartitionRecordsWithCommit(client, batch2RecordsCount, commitTime2, DEFAULT_SECOND_PARTITION_PATH);
// Do Inserts for DEFAULT_THIRD_PARTITION_PATH
String commitTime3 = "003";
Set<String> batch3Buckets = this.insertPartitionRecordsWithCommit(client, batch3RecordsCount, commitTime3, DEFAULT_THIRD_PARTITION_PATH);
// delete DEFAULT_FIRST_PARTITION_PATH
String commitTime4 = "004";
Set<String> deletePartitionReplaceFileIds1 = deletePartitionWithCommit(client, commitTime4, Arrays.asList(DEFAULT_FIRST_PARTITION_PATH));
assertEquals(batch1Buckets, deletePartitionReplaceFileIds1);
List<HoodieBaseFile> baseFiles = HoodieClientTestUtils.getLatestBaseFiles(basePath, fs, String.format("%s/%s/*", basePath, DEFAULT_FIRST_PARTITION_PATH));
assertEquals(0, baseFiles.size());
baseFiles = HoodieClientTestUtils.getLatestBaseFiles(basePath, fs, String.format("%s/%s/*", basePath, DEFAULT_SECOND_PARTITION_PATH));
assertTrue(baseFiles.size() > 0);
baseFiles = HoodieClientTestUtils.getLatestBaseFiles(basePath, fs, String.format("%s/%s/*", basePath, DEFAULT_THIRD_PARTITION_PATH));
assertTrue(baseFiles.size() > 0);
// delete DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH
String commitTime5 = "005";
Set<String> deletePartitionReplaceFileIds2 = deletePartitionWithCommit(client, commitTime5, Arrays.asList(DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH));
Set<String> expectedFileId = new HashSet<>();
expectedFileId.addAll(batch2Buckets);
expectedFileId.addAll(batch3Buckets);
assertEquals(expectedFileId, deletePartitionReplaceFileIds2);
baseFiles = HoodieClientTestUtils.getLatestBaseFiles(basePath, fs, String.format("%s/%s/*", basePath, DEFAULT_FIRST_PARTITION_PATH), String.format("%s/%s/*", basePath, DEFAULT_SECOND_PARTITION_PATH), String.format("%s/%s/*", basePath, DEFAULT_THIRD_PARTITION_PATH));
assertEquals(0, baseFiles.size());
}
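insertPartitionRecordsWithCommit and deletePartitionWithCommit are private helpers not shown here. A hedged sketch of what the delete-partition step presumably boils down to follows, assuming the SparkRDDWriteClient.deletePartitions API and its HoodieWriteResult return type; partitionsToDelete and commitTime stand in for the helper's parameters. Required classes: org.apache.hudi.client.HoodieWriteResult, org.apache.hudi.common.table.timeline.HoodieTimeline.
// Hedged sketch: issue a delete_partitions replacecommit and collect the replaced file IDs.
client.startCommitWithTime(commitTime, HoodieTimeline.REPLACE_COMMIT_ACTION); // delete_partitions is a replacecommit
HoodieWriteResult result = client.deletePartitions(partitionsToDelete, commitTime);
// The replaced file IDs are what the assertions above compare against the insert buckets.
Set<String> replacedFileIds = result.getPartitionToReplaceFileIds().values().stream()
    .flatMap(List::stream)
    .collect(Collectors.toSet());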
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method testPendingClusteringRollback.
@Test
public void testPendingClusteringRollback() throws Exception {
boolean populateMetaFields = true;
// setup clustering config.
HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10).withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).withInlineClustering(true).build();
// start clustering, but don't commit
List<HoodieRecord> allRecords = testInsertAndClustering(clusteringConfig, populateMetaFields, false);
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
List<Pair<HoodieInstant, HoodieClusteringPlan>> pendingClusteringPlans = ClusteringUtils.getAllPendingClusteringPlans(metaClient).collect(Collectors.toList());
assertEquals(1, pendingClusteringPlans.size());
HoodieInstant pendingClusteringInstant = pendingClusteringPlans.get(0).getLeft();
// complete another commit after pending clustering
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(EAGER);
addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
HoodieWriteConfig config = cfgBuilder.build();
SparkRDDWriteClient client = getHoodieWriteClient(config);
dataGen = new HoodieTestDataGenerator();
String commitTime = HoodieActiveTimeline.createNewInstantTime();
allRecords.addAll(dataGen.generateInserts(commitTime, 200));
assertThrows(HoodieUpsertException.class, () -> writeAndVerifyBatch(client, allRecords, commitTime, populateMetaFields));
// verify pending clustering can be rolled back (even though there is a completed commit greater than pending clustering)
client.rollback(pendingClusteringInstant.getTimestamp());
metaClient.reloadActiveTimeline();
// verify there are no pending clustering instants
assertEquals(0, ClusteringUtils.getAllPendingClusteringPlans(metaClient).count());
// Delete the rollback.completed instant to mimic a failed rollback of clustering, then trigger the rollback of clustering again; the same rollback instant should be reused.
HoodieInstant rollbackInstant = metaClient.getActiveTimeline().getRollbackTimeline().lastInstant().get();
FileCreateUtils.deleteRollbackCommit(metaClient.getBasePath(), rollbackInstant.getTimestamp());
metaClient.reloadActiveTimeline();
// create the requested replace commit meta file so that rollback will not throw FileNotFoundException
// create a file slice with instantTime 001 and build a clustering plan that includes this 001 file slice
HoodieClusteringPlan clusteringPlan = ClusteringTestUtils.createClusteringPlan(metaClient, pendingClusteringInstant.getTimestamp(), "1");
// create requested replace commit
HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder().setClusteringPlan(clusteringPlan).setOperationType(WriteOperationType.CLUSTER.name()).build();
FileCreateUtils.createRequestedReplaceCommit(metaClient.getBasePath(), pendingClusteringInstant.getTimestamp(), Option.of(requestedReplaceMetadata));
// trigger clustering again. no new rollback instants should be generated.
try {
client.cluster(pendingClusteringInstant.getTimestamp(), false);
// The replace commit metadata generated here is a fake one, so clustering will fail; the intent of the test is to check for duplicate rollback instants.
} catch (Exception e) {
// ignore.
}
metaClient.reloadActiveTimeline();
// verify that there is no new rollback instant generated
HoodieInstant newRollbackInstant = metaClient.getActiveTimeline().getRollbackTimeline().lastInstant().get();
assertEquals(rollbackInstant.getTimestamp(), newRollbackInstant.getTimestamp());
}
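The testInsertAndClustering helper wires the clustering config into a write config; a hedged sketch of how such a config is composed is shown below. Only the clustering builder calls are taken from the test above; the rest of the builder chain is an assumption. Required classes: org.apache.hudi.config.HoodieWriteConfig, org.apache.hudi.config.HoodieClusteringConfig.
// Hedged sketch: compose the inline clustering settings into a full write config.
HoodieWriteConfig clusteringWriteConfig = HoodieWriteConfig.newBuilder()
    .withPath(basePath) // assumed test base path
    .withSchema(TRIP_EXAMPLE_SCHEMA)
    .withClusteringConfig(HoodieClusteringConfig.newBuilder()
        .withClusteringMaxNumGroups(10)
        .withClusteringTargetPartitions(0)
        .withInlineClusteringNumCommits(1) // schedule clustering inline after every commit
        .withInlineClustering(true)
        .build())
    .build();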