Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
The class TestJavaCopyOnWriteActionExecutor, method testUpdateRecords.
@Test
public void testUpdateRecords() throws Exception {
  // Prepare the AvroParquetIO
  HoodieWriteConfig config = makeHoodieClientConfig();
  int startInstant = 1;
  String firstCommitTime = makeNewCommitTime(startInstant++);
  HoodieJavaWriteClient writeClient = getHoodieWriteClient(config);
  writeClient.startCommitWithTime(firstCommitTime);
  metaClient = HoodieTableMetaClient.reload(metaClient);
  BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
  String partitionPath = "2016/01/31";

  // Get some records belonging to the same partition (2016/01/31)
  String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
  String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
  String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
  String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}";
  List<HoodieRecord> records = new ArrayList<>();
  RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
  records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
  RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
  records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
  RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
  records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));

  // Insert the new records
  writeClient.insert(records, firstCommitTime);
  FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
  assertEquals(1, allFiles.length);

  // Read out the bloom filter and make sure the filter can answer whether a record exists
  Path filePath = allFiles[0].getPath();
  BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath);
  for (HoodieRecord record : records) {
    assertTrue(filter.mightContain(record.getRecordKey()));
  }

  // Read the base file and check the record content
  List<GenericRecord> fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath);
  GenericRecord newRecord;
  int index = 0;
  for (GenericRecord record : fileRecords) {
    assertEquals(records.get(index).getRecordKey(), record.get("_row_key").toString());
    index++;
  }

  // Update the first record and insert a new record
  String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
  RawTripTestPayload updateRowChanges1 = new RawTripTestPayload(updateRecordStr1);
  HoodieRecord updatedRecord1 = new HoodieAvroRecord(new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
  RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
  HoodieRecord insertedRecord1 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
  List<HoodieRecord> updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1);
  String newCommitTime = makeNewCommitTime(startInstant++);
  metaClient = HoodieTableMetaClient.reload(metaClient);
  writeClient.startCommitWithTime(newCommitTime);
  List<WriteStatus> statuses = writeClient.upsert(updatedRecords, newCommitTime);
  allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
  assertEquals(1, allFiles.length);

  // Verify the new base file belongs to the same file group as the previous one
  assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));

  // Check whether the record has been updated
  Path updatedFilePath = allFiles[0].getPath();
  BloomFilter updatedFilter = fileUtils.readBloomFilterFromMetadata(hadoopConf, updatedFilePath);
  for (HoodieRecord record : records) {
    // No change to the _row_key
    assertTrue(updatedFilter.mightContain(record.getRecordKey()));
  }
  assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
  // Add the inserted record so the content checks below cover it as well
  records.add(insertedRecord1);

  ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedFilePath).build();
  index = 0;
  while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
    assertEquals(newRecord.get("_row_key").toString(), records.get(index).getRecordKey());
    if (index == 0) {
      // The updated record should now carry the new value
      assertEquals("15", newRecord.get("number").toString());
    }
    index++;
  }
  updatedReader.close();

  // Also check the numRecordsWritten
  WriteStatus writeStatus = statuses.get(0);
  assertEquals(1, statuses.size(), "Should be only one file generated");
  // 3 rewritten records + 1 new record
  assertEquals(4, writeStatus.getStat().getNumWrites());
}
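Every record in this test follows the same construction pattern: a HoodieKey built from a record key plus a partition path, wrapped together with a payload into a HoodieAvroRecord. A minimal sketch of that pattern in isolation, reusing the RawTripTestPayload test helper from the snippet above (the JSON values here are illustrative):

// Minimal sketch of the key/payload pattern used throughout this test.
String json = "{\"_row_key\":\"some-uuid\","
    + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":42}";
RawTripTestPayload payload = new RawTripTestPayload(json);
// The key pairs the record key with its partition path
HoodieKey key = new HoodieKey(payload.getRowKey(), payload.getPartitionPath());
HoodieRecord record = new HoodieAvroRecord(key, payload);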
Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
The class TestJavaCopyOnWriteActionExecutor, method testMetadataAggregateFromWriteStatus.
// Check that record-level metadata is aggregated properly at the end of a write.
@Test
public void testMetadataAggregateFromWriteStatus() throws Exception {
  // Prepare the AvroParquetIO
  HoodieWriteConfig config = makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class).build();
  String firstCommitTime = makeNewCommitTime();
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient);

  // Get some records belonging to the same partition (2016/01/31)
  String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
  String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
  String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
  List<HoodieRecord> records = new ArrayList<>();
  RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
  records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
  RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
  records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
  RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
  records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));

  // Insert the new records
  BaseJavaCommitActionExecutor actionExecutor = new JavaInsertCommitActionExecutor(context, config, table, firstCommitTime, records);
  List<WriteStatus> writeStatuses = new ArrayList<>();
  actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), records.iterator()).forEachRemaining(x -> writeStatuses.addAll((List<WriteStatus>) x));

  Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus.mergeMetadataForWriteStatuses(writeStatuses);
  assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000"));
  // For the metadata key InputRecordCount_1506582000, each record contributes a value
  // of 2, so the merged sum should be 2 * 3 = 6
  assertEquals("6", allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000"));
}
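MetadataMergeWriteStatus is a Hudi test utility that attaches per-record metadata to each WriteStatus and merges it across all statuses at the end of the write. A hedged sketch of the summing merge the assertion relies on; the helper name sumCounters is hypothetical, and the real logic lives inside MetadataMergeWriteStatus:

// Hypothetical illustration only: per-status metadata maps are combined by
// summing the numeric values of matching keys, which is what the test asserts.
static Map<String, String> sumCounters(List<Map<String, String>> perStatusMetadata) {
  Map<String, Long> totals = new HashMap<>();
  for (Map<String, String> metadata : perStatusMetadata) {
    metadata.forEach((k, v) -> totals.merge(k, Long.parseLong(v), Long::sum));
  }
  Map<String, String> merged = new HashMap<>();
  totals.forEach((k, v) -> merged.put(k, String.valueOf(v)));
  return merged;
}
// With three records each contributing "InputRecordCount_1506582000" -> "2",
// the merged value is "6", matching the assertion above.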
Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
The class FlinkWriteHelper, method deduplicateRecords.
@Override
public List<HoodieRecord<T>> deduplicateRecords(List<HoodieRecord<T>> records, HoodieIndex<?, ?> index, int parallelism) {
  Map<Object, List<Pair<Object, HoodieRecord<T>>>> keyedRecords = records.stream().map(record -> {
    // If the index used is global, records are expected to differ in their partitionPath
    final Object key = record.getKey().getRecordKey();
    return Pair.of(key, record);
  }).collect(Collectors.groupingBy(Pair::getLeft));
  return keyedRecords.values().stream().map(x -> x.stream().map(Pair::getRight).reduce((rec1, rec2) -> {
    final T data1 = rec1.getData();
    final T data2 = rec2.getData();
    @SuppressWarnings("unchecked") final T reducedData = (T) data2.preCombine(data1);
    // We cannot allow the user to change the key or partitionPath, since that affects
    // everything downstream, so pick them from one of the two records.
    boolean choosePrev = data1.equals(reducedData);
    HoodieKey reducedKey = choosePrev ? rec1.getKey() : rec2.getKey();
    HoodieOperation operation = choosePrev ? rec1.getOperation() : rec2.getOperation();
    HoodieRecord<T> hoodieRecord = new HoodieAvroRecord<>(reducedKey, reducedData, operation);
    // Reuse the location from the first record.
    hoodieRecord.setCurrentLocation(rec1.getCurrentLocation());
    return hoodieRecord;
  }).orElse(null)).filter(Objects::nonNull).collect(Collectors.toList());
}
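The reduction above delegates conflict resolution to the payload's preCombine. A minimal sketch of that semantics using Hudi's OverwriteWithLatestAvroPayload, where the record with the higher ordering value survives; the "Trip" Avro schema below is invented purely for illustration:

// Minimal sketch: preCombine keeps the payload with the higher ordering value.
Schema schema = new Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"Trip\",\"fields\":["
        + "{\"name\":\"_row_key\",\"type\":\"string\"},"
        + "{\"name\":\"number\",\"type\":\"int\"}]}");
GenericRecord older = new GenericData.Record(schema);
older.put("_row_key", "key1");
older.put("number", 12);
GenericRecord newer = new GenericData.Record(schema);
newer.put("_row_key", "key1");
newer.put("number", 15);
OverwriteWithLatestAvroPayload first = new OverwriteWithLatestAvroPayload(older, 1L);
OverwriteWithLatestAvroPayload second = new OverwriteWithLatestAvroPayload(newer, 2L);
// second carries the higher ordering value, so it wins the merge
OverwriteWithLatestAvroPayload winner = second.preCombine(first);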
Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
The class HoodieMetadataPayload, method createPartitionFilesRecord.
/**
 * Create and return a {@code HoodieMetadataPayload} to save the list of files within a partition.
 *
 * @param partition    The name of the partition
 * @param filesAdded   Mapping of files to their sizes for files which have been added to this partition
 * @param filesDeleted List of files which have been deleted from this partition
 */
public static HoodieRecord<HoodieMetadataPayload> createPartitionFilesRecord(String partition, Option<Map<String, Long>> filesAdded, Option<List<String>> filesDeleted) {
  Map<String, HoodieMetadataFileInfo> fileInfo = new HashMap<>();
  filesAdded.ifPresent(filesMap -> fileInfo.putAll(filesMap.entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, (entry) -> {
    long fileSize = entry.getValue();
    // Assert that the file-size of the file being added is positive, since Hudi
    // should not be creating empty files
    checkState(fileSize > 0);
    return new HoodieMetadataFileInfo(fileSize, false);
  }))));
  filesDeleted.ifPresent(filesList -> fileInfo.putAll(filesList.stream().collect(Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true)))));

  HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath());
  HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo);
  return new HoodieAvroRecord<>(key, payload);
}
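A hedged usage sketch of the factory above; the partition name, file names, and sizes are invented for illustration:

// Hypothetical call: two files added with their sizes, one file deleted.
Map<String, Long> added = new HashMap<>();
added.put("file1.parquet", 4096L);
added.put("file2.parquet", 1024L);
List<String> deleted = Collections.singletonList("file0.parquet");
HoodieRecord<HoodieMetadataPayload> filesRecord =
    HoodieMetadataPayload.createPartitionFilesRecord(
        "2016/01/31", Option.of(added), Option.of(deleted));

If a commit only adds files, Option.empty() can be passed for the deleted side (and vice versa), since both inputs are optional.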
Use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.
The class HoodieMetadataPayload, method createBloomFilterMetadataRecord.
/**
 * Create a bloom filter metadata record.
 *
 * @param partitionName   - Partition name
 * @param baseFileName    - Base file name for which the bloom filter needs to be persisted
 * @param timestamp       - Instant timestamp responsible for this record
 * @param bloomFilterType - Type code of the bloom filter
 * @param bloomFilter     - Bloom filter for the file
 * @param isDeleted       - Whether the bloom filter is no longer valid
 * @return Metadata payload containing the fileID and its bloom filter record
 */
public static HoodieRecord<HoodieMetadataPayload> createBloomFilterMetadataRecord(final String partitionName, final String baseFileName, final String timestamp, final String bloomFilterType, final ByteBuffer bloomFilter, final boolean isDeleted) {
  checkArgument(!baseFileName.contains(Path.SEPARATOR) && FSUtils.isBaseFile(new Path(baseFileName)),
      "Invalid base file '" + baseFileName + "' for MetaIndexBloomFilter!");
  final String bloomFilterIndexKey = new PartitionIndexID(partitionName).asBase64EncodedString().concat(new FileIndexID(baseFileName).asBase64EncodedString());
  HoodieKey key = new HoodieKey(bloomFilterIndexKey, MetadataPartitionType.BLOOM_FILTERS.getPartitionPath());
  HoodieMetadataBloomFilter metadataBloomFilter = new HoodieMetadataBloomFilter(bloomFilterType, timestamp, bloomFilter, isDeleted);
  HoodieMetadataPayload metadataPayload = new HoodieMetadataPayload(key.getRecordKey(), metadataBloomFilter);
  return new HoodieAvroRecord<>(key, metadataPayload);
}
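A hedged usage sketch for this factory; the partition, file name, timestamp, type string, and filter bytes are all invented for illustration:

// Hypothetical call: persist a bloom filter record for a single base file.
byte[] bloomFilterBytes = new byte[] {0x01, 0x02, 0x03}; // placeholder, not a real serialized filter
HoodieRecord<HoodieMetadataPayload> bloomRecord =
    HoodieMetadataPayload.createBloomFilterMetadataRecord(
        "2016/01/31",        // partition name
        "file1.parquet",     // base file name, must contain no path separators
        "20220101000000",    // instant timestamp
        "DYNAMIC_V0",        // bloom filter type code (assumed value)
        ByteBuffer.wrap(bloomFilterBytes),
        false);              // the filter is still valid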