Use of org.apache.hudi.table.HoodieSparkCopyOnWriteTable in project hudi by apache.
Example from the class TestCopyOnWriteActionExecutor, method testInsertUpsertWithHoodieAvroPayload.
@Test
public void testInsertUpsertWithHoodieAvroPayload() throws Exception {
  Schema schema = getSchemaFromResource(TestCopyOnWriteActionExecutor.class, "/testDataGeneratorSchema.txt");
  HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schema.toString())
      .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder().withRemoteServerPort(timelineServicePort).build())
      .withStorageConfig(HoodieStorageConfig.newBuilder().parquetMaxFileSize(1000 * 1024).hfileMaxFileSize(1000 * 1024).build())
      .build();
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
  String instantTime = "000";
  // Perform inserts of 100 records to test CreateHandle and BufferedExecutor
  final List<HoodieRecord> inserts = dataGen.generateInsertsWithHoodieAvroPayload(instantTime, 100);
  BaseSparkCommitActionExecutor actionExecutor =
      new SparkInsertCommitActionExecutor(context, config, table, instantTime, context.parallelize(inserts));
  final List<List<WriteStatus>> ws = jsc.parallelize(Arrays.asList(1)).map(x -> {
    return actionExecutor.handleInsert(UUID.randomUUID().toString(), inserts.iterator());
  }).map(Transformations::flatten).collect();
  WriteStatus writeStatus = ws.get(0).get(0);
  String fileId = writeStatus.getFileId();
  metaClient.getFs().create(new Path(Paths.get(basePath, ".hoodie", "000.commit").toString())).close();
  final List<HoodieRecord> updates = dataGen.generateUpdatesWithHoodieAvroPayload(instantTime, inserts);
  String partitionPath = writeStatus.getPartitionPath();
  long numRecordsInPartition = updates.stream().filter(u -> u.getPartitionPath().equals(partitionPath)).count();
  table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, HoodieTableMetaClient.reload(metaClient));
  BaseSparkCommitActionExecutor newActionExecutor =
      new SparkUpsertCommitActionExecutor(context, config, table, instantTime, context.parallelize(updates));
  final List<List<WriteStatus>> updateStatus = jsc.parallelize(Arrays.asList(1)).map(x -> {
    return newActionExecutor.handleUpdate(partitionPath, fileId, updates.iterator());
  }).map(Transformations::flatten).collect();
  assertEquals(updates.size() - numRecordsInPartition, updateStatus.get(0).get(0).getTotalErrorRecords());
}
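The setup pattern shared by all of these examples is: build a HoodieWriteConfig, reload the HoodieTableMetaClient so the latest timeline is visible, create the table through HoodieSparkTable.create, and cast the result to HoodieSparkCopyOnWriteTable for a COPY_ON_WRITE table. Below is a minimal sketch of just that step, assuming the surrounding harness already provides a base path, a HoodieSparkEngineContext and a HoodieTableMetaClient; the helper class and method names are illustrative, not part of the Hudi API.

import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieSparkCopyOnWriteTable;
import org.apache.hudi.table.HoodieSparkTable;

final class CowTableFactory {
  // Builds a write config and materializes the COPY_ON_WRITE table for it.
  static HoodieSparkCopyOnWriteTable createCowTable(String basePath, String schemaStr,
      HoodieSparkEngineContext context, HoodieTableMetaClient metaClient) {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withSchema(schemaStr)
        .build();
    // Reload so the table object sees the latest timeline before any action executor runs.
    HoodieTableMetaClient freshMetaClient = HoodieTableMetaClient.reload(metaClient);
    // As in the test above, the result is cast to HoodieSparkCopyOnWriteTable for a COW table.
    return (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, freshMetaClient);
  }
}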
Use of org.apache.hudi.table.HoodieSparkCopyOnWriteTable in project hudi by apache.
Example from the class TestCopyOnWriteActionExecutor, method testUpdateRecords.
// TODO (weiy): Add testcases for crossing file writing.
@ParameterizedTest
@MethodSource("indexType")
public void testUpdateRecords(HoodieIndex.IndexType indexType) throws Exception {
  // Prepare the AvroParquetIO
  HoodieWriteConfig config = makeHoodieClientConfigBuilder().withProps(makeIndexConfig(indexType)).build();
  String firstCommitTime = makeNewCommitTime();
  SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
  writeClient.startCommitWithTime(firstCommitTime);
  metaClient = HoodieTableMetaClient.reload(metaClient);
  String partitionPath = "2016/01/31";
  HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
  // Get some records belonging to the same partition (2016/01/31)
  String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
  String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
  String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
  String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}";
  List<HoodieRecord> records = new ArrayList<>();
  RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
  records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
  RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
  records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
  RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
  records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
  // Insert new records
  final HoodieSparkCopyOnWriteTable cowTable = table;
  writeClient.insert(jsc.parallelize(records, 1), firstCommitTime);
  FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
  assertEquals(1, allFiles.length);
  // Read out the bloom filter and make sure the filter can answer whether a record exists or not
  Path filePath = allFiles[0].getPath();
  BloomFilter filter = BaseFileUtils.getInstance(table.getBaseFileFormat()).readBloomFilterFromMetadata(hadoopConf, filePath);
  for (HoodieRecord record : records) {
    assertTrue(filter.mightContain(record.getRecordKey()));
  }
  // Read the base file and check the record content
  List<GenericRecord> fileRecords = BaseFileUtils.getInstance(table.getBaseFileFormat()).readAvroRecords(hadoopConf, filePath);
  GenericRecord newRecord;
  int index = 0;
  for (GenericRecord record : fileRecords) {
    assertEquals(records.get(index).getRecordKey(), record.get("_row_key").toString());
    index++;
  }
  // We update the 1st record & add a new record
  String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
  RawTripTestPayload updateRowChanges1 = new RawTripTestPayload(updateRecordStr1);
  HoodieRecord updatedRecord1 = new HoodieAvroRecord(new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
  RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
  HoodieRecord insertedRecord1 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
  List<HoodieRecord> updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1);
  Thread.sleep(1000);
  String newCommitTime = makeNewCommitTime();
  metaClient = HoodieTableMetaClient.reload(metaClient);
  writeClient.startCommitWithTime(newCommitTime);
  List<WriteStatus> statuses = writeClient.upsert(jsc.parallelize(updatedRecords), newCommitTime).collect();
  allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
  assertEquals(1, allFiles.length);
  // Verify the new incremental file group is the same as the previous one
  assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));
  // Check whether the record has been updated
  Path updatedFilePath = allFiles[0].getPath();
  BloomFilter updatedFilter = BaseFileUtils.getInstance(metaClient).readBloomFilterFromMetadata(hadoopConf, updatedFilePath);
  for (HoodieRecord record : records) {
    // No change to the _row_key
    assertTrue(updatedFilter.mightContain(record.getRecordKey()));
  }
  assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
  // Add the new record so it is covered by the checks below
  records.add(insertedRecord1);
  ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedFilePath).build();
  index = 0;
  while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
    assertEquals(newRecord.get("_row_key").toString(), records.get(index).getRecordKey());
    if (index == 0) {
      assertEquals("15", newRecord.get("number").toString());
    }
    index++;
  }
  updatedReader.close();
  // Also check the numRecordsWritten
  WriteStatus writeStatus = statuses.get(0);
  assertEquals(1, statuses.size(), "Should be only one file generated");
  // 3 rewritten records + 1 new record
  assertEquals(4, writeStatus.getStat().getNumWrites());
}
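The middle of this test shows the usual way to inspect a Hudi base file: read the bloom filter out of the file footer via BaseFileUtils and then read the Avro records themselves. Below is a small, self-contained sketch of that inspection, under the assumption that the base file is Parquet and that the record key lives in the _row_key field as in the test schema above; the helper class and method names are illustrative.

import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.util.BaseFileUtils;

final class BaseFileInspector {
  // Returns true if the file's bloom filter reports the key as possibly present
  // and the key is actually found among the file's records.
  static boolean containsKey(Configuration hadoopConf, Path baseFilePath, String recordKey) {
    BaseFileUtils utils = BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
    BloomFilter filter = utils.readBloomFilterFromMetadata(hadoopConf, baseFilePath);
    if (!filter.mightContain(recordKey)) {
      return false; // definitely absent: bloom filters have no false negatives
    }
    // Confirm against the actual records, since mightContain can return false positives.
    List<GenericRecord> records = utils.readAvroRecords(hadoopConf, baseFilePath);
    return records.stream().anyMatch(r -> recordKey.equals(String.valueOf(r.get("_row_key"))));
  }
}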
Use of org.apache.hudi.table.HoodieSparkCopyOnWriteTable in project hudi by apache.
Example from the class TestUpsertPartitioner, method testPartitionWeight.
@Test
public void testPartitionWeight() throws Exception {
  final String testPartitionPath = "2016/09/26";
  int totalInsertNum = 2000;
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(0).insertSplitSize(totalInsertNum / 2).autoTuneInsertSplits(false).build())
      .build();
  FileCreateUtils.createCommit(basePath, "001");
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { testPartitionPath });
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", totalInsertNum);
  WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
  UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config);
  List<InsertBucketCumulativeWeightPair> insertBuckets = partitioner.getInsertBuckets(testPartitionPath);
  float bucket0Weight = 0.2f;
  InsertBucketCumulativeWeightPair pair = insertBuckets.remove(0);
  pair.getKey().weight = bucket0Weight;
  pair.setValue(new Double(bucket0Weight));
  insertBuckets.add(0, pair);
  InsertBucketCumulativeWeightPair pair1 = insertBuckets.remove(1);
  pair1.getKey().weight = 1 - bucket0Weight;
  pair1.setValue(new Double(1));
  insertBuckets.add(1, pair1);
  Map<Integer, Integer> partition2numRecords = new HashMap<Integer, Integer>();
  for (HoodieRecord hoodieRecord : insertRecords) {
    int partition = partitioner.getPartition(new Tuple2<>(hoodieRecord.getKey(), Option.ofNullable(hoodieRecord.getCurrentLocation())));
    if (!partition2numRecords.containsKey(partition)) {
      partition2numRecords.put(partition, 0);
    }
    partition2numRecords.put(partition, partition2numRecords.get(partition) + 1);
  }
  assertTrue(partition2numRecords.get(0) < partition2numRecords.get(1), "The insert num of bucket1 should more than bucket0");
  assertTrue(partition2numRecords.get(0) + partition2numRecords.get(1) == totalInsertNum, "The total insert records should be " + totalInsertNum);
  assertEquals(String.valueOf(bucket0Weight), String.format("%.1f", (partition2numRecords.get(0) * 1.0f / totalInsertNum)), "The weight of bucket0 should be " + bucket0Weight);
  assertEquals(String.valueOf(1 - bucket0Weight), String.format("%.1f", (partition2numRecords.get(1) * 1.0f / totalInsertNum)), "The weight of bucket1 should be " + (1 - bucket0Weight));
}
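The closing assertions rest on simple arithmetic: an insert bucket's weight is the fraction of the partition's inserts routed to it, so with 2000 inserts and a bucket0 weight of 0.2, roughly 400 records should land in bucket 0 and 1600 in bucket 1. A tiny sketch of that expectation, mirroring the numbers used in the test:

final class BucketWeightMath {
  public static void main(String[] args) {
    int totalInsertNum = 2000;
    float bucket0Weight = 0.2f;
    long expectedBucket0 = Math.round(totalInsertNum * bucket0Weight); // ~400 records
    long expectedBucket1 = totalInsertNum - expectedBucket0;           // ~1600 records
    // The test checks that the observed fractions, rounded to one decimal, equal the weights.
    System.out.printf("bucket0: %d (%.1f), bucket1: %d (%.1f)%n",
        expectedBucket0, expectedBucket0 * 1.0f / totalInsertNum,
        expectedBucket1, expectedBucket1 * 1.0f / totalInsertNum);
  }
}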
Use of org.apache.hudi.table.HoodieSparkCopyOnWriteTable in project hudi by apache.
Example from the class TestUpsertPartitioner, method getUpsertPartitioner.
private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, int numUpdates, int fileSize, String testPartitionPath, boolean autoSplitInserts) throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(smallFileSize).insertSplitSize(100).autoTuneInsertSplits(autoSplitInserts).build())
      .withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1000 * 1024).parquetMaxFileSize(1000 * 1024).orcMaxFileSize(1000 * 1024).build())
      .build();
  FileCreateUtils.createCommit(basePath, "001");
  FileCreateUtils.createBaseFile(basePath, testPartitionPath, "001", "file1", fileSize);
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { testPartitionPath });
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
  List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
  for (HoodieRecord updateRec : updateRecords) {
    updateRec.unseal();
    updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
    updateRec.seal();
  }
  List<HoodieRecord> records = new ArrayList<>();
  records.addAll(insertRecords);
  records.addAll(updateRecords);
  WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(records)));
  UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config);
  assertEquals(0, partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(), Option.ofNullable(updateRecords.get(0).getCurrentLocation()))), "Update record should have gone to the 1 update partition");
  return partitioner;
}
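The loop over updateRecords is what turns generated records into updates: setting a record's current location to an existing file group is how UpsertPartitioner is steered toward that file's update bucket. A minimal sketch of that tagging step, with the instant time and file id as illustrative values taken from the helper above:

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;

final class RecordTagging {
  // Marks the given record as an update of base file "file1" written at instant "001".
  static void tagAsUpdate(HoodieRecord record) {
    record.unseal();                                                 // allow mutation
    record.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
    record.seal();                                                   // make it immutable again
  }
}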
Use of org.apache.hudi.table.HoodieSparkCopyOnWriteTable in project hudi by apache.
Example from the class TestHoodieClientOnCopyOnWriteStorage, method testUpdateRejectForClustering.
@Test
public void testUpdateRejectForClustering() throws IOException {
  final String testPartitionPath = "2016/09/26";
  dataGen = new HoodieTestDataGenerator(new String[] { testPartitionPath });
  Properties props = new Properties();
  props.setProperty(ASYNC_CLUSTERING_ENABLE.key(), "true");
  HoodieWriteConfig config = getSmallInsertWriteConfig(100, TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), true, props);
  SparkRDDWriteClient client = getHoodieWriteClient(config);
  HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
  // 1. insert to generate 2 file groups
  String commitTime1 = "001";
  Pair<List<WriteStatus>, List<HoodieRecord>> upsertResult = insertBatchRecords(client, commitTime1, 600, 2);
  List<HoodieRecord> inserts1 = upsertResult.getValue();
  List<String> fileGroupIds1 = table.getFileSystemView().getAllFileGroups(testPartitionPath).map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
  assertEquals(2, fileGroupIds1.size());
  // 2. generate a clustering plan for the fileGroupIds1 file groups
  String commitTime2 = "002";
  List<List<FileSlice>> firstInsertFileSlicesList = table.getFileSystemView().getAllFileGroups(testPartitionPath).map(fileGroup -> fileGroup.getAllFileSlices().collect(Collectors.toList())).collect(Collectors.toList());
  List<FileSlice>[] fileSlices = (List<FileSlice>[]) firstInsertFileSlicesList.toArray(new List[firstInsertFileSlicesList.size()]);
  createRequestedReplaceInstant(this.metaClient, commitTime2, fileSlices);
  // 3. insert one record; no update-reject exception is thrown, and instead of merging into the pending-clustering small files a new file group is generated
  String commitTime3 = "003";
  insertBatchRecords(client, commitTime3, 1, 1).getKey();
  List<String> fileGroupIds2 = table.getFileSystemView().getAllFileGroups(testPartitionPath).map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
  assertEquals(3, fileGroupIds2.size());
  // 4. update one record in the two file groups pending clustering; a reject-update exception is thrown
  String commitTime4 = "004";
  client.startCommitWithTime(commitTime4);
  List<HoodieRecord> insertsAndUpdates3 = new ArrayList<>();
  insertsAndUpdates3.addAll(dataGen.generateUpdates(commitTime4, inserts1));
  String assertMsg = String.format("Not allowed to update the clustering files in partition: %s " + "For pending clustering operations, we are not going to support update for now.", testPartitionPath);
  assertThrows(HoodieUpsertException.class, () -> {
    writeClient.upsert(jsc.parallelize(insertsAndUpdates3, 1), commitTime3).collect();
  }, assertMsg);
  // 5. insert one record; no update-reject exception is thrown, and it merges into the small file group created in step 3
  String commitTime5 = "005";
  List<WriteStatus> statuses = insertBatchRecords(client, commitTime5, 1, 1).getKey();
  fileGroupIds2.removeAll(fileGroupIds1);
  assertEquals(fileGroupIds2.get(0), statuses.get(0).getFileId());
  List<String> firstInsertFileGroupIds4 = table.getFileSystemView().getAllFileGroups(testPartitionPath).map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
  assertEquals(3, firstInsertFileGroupIds4.size());
}
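For application code, rather than a test, the behaviour exercised here means an upsert that touches file groups with a pending clustering plan fails with HoodieUpsertException. Below is a hedged sketch of how a caller might guard against that, assuming the SparkRDDWriteClient, the JavaSparkContext and the update records come from the surrounding application; the wrapper class and its fallback behaviour are illustrative only.

import java.util.Collections;
import java.util.List;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

final class ClusteringAwareUpsert {
  static List<WriteStatus> upsertOrSkip(SparkRDDWriteClient client, JavaSparkContext jsc,
      List<HoodieRecord> updates, String commitTime) {
    client.startCommitWithTime(commitTime);
    try {
      JavaRDD<WriteStatus> result = client.upsert(jsc.parallelize(updates, 1), commitTime);
      return result.collect();
    } catch (HoodieUpsertException e) {
      // Updates against file groups under a pending clustering plan are rejected by default;
      // a real application would retry after clustering completes or route these records elsewhere.
      return Collections.emptyList();
    }
  }
}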