Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.
The class HoodieBackedTableMetadataWriter, method prepRecords.
/**
 * Tag each record with a location in its target partition.
 * Each record is tagged with the location of the file slice it maps to, based on its record key.
 */
protected HoodieData<HoodieRecord> prepRecords(Map<MetadataPartitionType, HoodieData<HoodieRecord>> partitionRecordsMap) {
  // The result set
  HoodieData<HoodieRecord> allPartitionRecords = engineContext.emptyHoodieData();
  HoodieTableFileSystemView fsView = HoodieTableMetadataUtil.getFileSystemView(metadataMetaClient);
  for (Map.Entry<MetadataPartitionType, HoodieData<HoodieRecord>> entry : partitionRecordsMap.entrySet()) {
    final String partitionName = entry.getKey().getPartitionPath();
    final int fileGroupCount = entry.getKey().getFileGroupCount();
    HoodieData<HoodieRecord> records = entry.getValue();
    // Resolve the latest file slices for this metadata partition and verify the expected layout
    List<FileSlice> fileSlices = HoodieTableMetadataUtil.getPartitionLatestFileSlices(metadataMetaClient, Option.ofNullable(fsView), partitionName);
    ValidationUtils.checkArgument(fileSlices.size() == fileGroupCount,
        String.format("Invalid number of file groups for partition:%s, found=%d, required=%d",
            partitionName, fileSlices.size(), fileGroupCount));
    // Hash each record key to a file group index and tag the record with that slice's location
    HoodieData<HoodieRecord> rddSinglePartitionRecords = records.map(r -> {
      FileSlice slice = fileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(r.getRecordKey(), fileGroupCount));
      r.setCurrentLocation(new HoodieRecordLocation(slice.getBaseInstantTime(), slice.getFileId()));
      return r;
    });
    allPartitionRecords = allPartitionRecords.union(rddSinglePartitionRecords);
  }
  return allPartitionRecords;
}
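The routing step above delegates to HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex. A minimal sketch of the idea, assuming a plain hash-modulo scheme; the helper name and hash below are illustrative, not the actual Hudi implementation:

static int fileGroupIndexFor(String recordKey, int fileGroupCount) {
  // Illustrative stable hash of the key; the sign bit is masked so the modulo is non-negative
  int h = 0;
  for (int i = 0; i < recordKey.length(); i++) {
    h = 31 * h + recordKey.charAt(i);
  }
  return (h & Integer.MAX_VALUE) % fileGroupCount;
}

Because the hash is stable, the same record key resolves to the same file group on every commit, which is what makes tagging against the latest file slices safe.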
Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.
The class HoodieBucketIndex, method tagLocation.
@Override
public <R> HoodieData<HoodieRecord<R>> tagLocation(HoodieData<HoodieRecord<R>> records, HoodieEngineContext context, HoodieTable hoodieTable) throws HoodieIndexException {
  HoodieData<HoodieRecord<R>> taggedRecords = records.mapPartitions(recordIter -> {
    // partitionPath -> bucketId -> (fileId, instantTime)
    Map<String, Map<Integer, Pair<String, String>>> partitionPathFileIDList = new HashMap<>();
    return new LazyIterableIterator<HoodieRecord<R>, HoodieRecord<R>>(recordIter) {
      @Override
      protected void start() {
      }

      @Override
      protected HoodieRecord<R> computeNext() {
        HoodieRecord record = recordIter.next();
        int bucketId = BucketIdentifier.getBucketId(record, config.getBucketIndexHashField(), numBuckets);
        String partitionPath = record.getPartitionPath();
        // Lazily load the bucketId -> file mapping the first time a partition is seen
        if (!partitionPathFileIDList.containsKey(partitionPath)) {
          partitionPathFileIDList.put(partitionPath, loadPartitionBucketIdFileIdMapping(hoodieTable, partitionPath));
        }
        if (partitionPathFileIDList.get(partitionPath).containsKey(bucketId)) {
          Pair<String, String> fileInfo = partitionPathFileIDList.get(partitionPath).get(bucketId);
          return HoodieIndexUtils.getTaggedRecord(record,
              Option.of(new HoodieRecordLocation(fileInfo.getRight(), fileInfo.getLeft())));
        }
        // No file group exists for this bucket yet; leave the record untagged so it is treated as an insert
        return record;
      }

      @Override
      protected void end() {
      }
    };
  }, true);
  return taggedRecords;
}
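For reference, the bucket id returned by BucketIdentifier.getBucketId is, in essence, a hash of the configured index key field(s) modulo the fixed bucket count. A hedged sketch of that idea; the helper below is illustrative and ignores composite key fields, which the real org.apache.hudi.index.bucket.BucketIdentifier handles:

static int bucketIdFor(String indexKeyValue, int numBuckets) {
  // Mask the sign bit so the modulo result is always non-negative
  return (indexKeyValue.hashCode() & Integer.MAX_VALUE) % numBuckets;
}

Since numBuckets is fixed, a record with the same key fields always resolves to the same bucket, which is what makes the upfront bucketId -> fileId mapping valid.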
Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.
The class HoodieGlobalSimpleIndex, method getTaggedRecords.
/**
 * Tag records with the right {@link HoodieRecordLocation}.
 *
 * @param incomingRecords incoming {@link HoodieRecord}s
 * @param existingRecords existing records with {@link HoodieRecordLocation}s
 * @return {@link HoodieData} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s
 */
private <R> HoodieData<HoodieRecord<R>> getTaggedRecords(HoodiePairData<String, HoodieRecord<R>> incomingRecords, HoodiePairData<HoodieKey, HoodieRecordLocation> existingRecords) {
  // Re-key the existing records by record key: recordKey -> (partitionPath, location)
  HoodiePairData<String, Pair<String, HoodieRecordLocation>> existingRecordByRecordKey = existingRecords.mapToPair(
      entry -> new ImmutablePair<>(entry.getLeft().getRecordKey(), Pair.of(entry.getLeft().getPartitionPath(), entry.getRight())));
  return incomingRecords.leftOuterJoin(existingRecordByRecordKey).values().flatMap(entry -> {
    HoodieRecord<R> inputRecord = entry.getLeft();
    Option<Pair<String, HoodieRecordLocation>> partitionPathLocationPair = Option.ofNullable(entry.getRight().orElse(null));
    List<HoodieRecord<R>> taggedRecords;
    if (partitionPathLocationPair.isPresent()) {
      String partitionPath = partitionPathLocationPair.get().getKey();
      HoodieRecordLocation location = partitionPathLocationPair.get().getRight();
      if (config.getGlobalSimpleIndexUpdatePartitionPath() && !(inputRecord.getPartitionPath().equals(partitionPath))) {
        // Create an empty record to delete the record in the old partition
        HoodieRecord<R> deleteRecord = new HoodieAvroRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), new EmptyHoodieRecordPayload());
        deleteRecord.setCurrentLocation(location);
        deleteRecord.seal();
        // Tag the incoming record for inserting to the new partition
        HoodieRecord<R> insertRecord = (HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty());
        taggedRecords = Arrays.asList(deleteRecord, insertRecord);
      } else {
        // Ignore the incoming record's partition, regardless of whether it differs from its old partition or not.
        // When it differs, the record will still be updated in its old partition.
        HoodieRecord<R> newRecord = new HoodieAvroRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), (HoodieRecordPayload) inputRecord.getData());
        taggedRecords = Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(newRecord, Option.ofNullable(location)));
      }
    } else {
      // No existing record with this key: treat the incoming record as a new insert
      taggedRecords = Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty()));
    }
    return taggedRecords.iterator();
  });
}
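To make the update-partition-path branch concrete, here is a hedged sketch of the two records it emits when a key has moved between partitions; the key, partition values, and lookup result below are made up for illustration:

// Illustrative only: a record previously stored in "2021/01" arrives with partition "2021/02".
HoodieRecordLocation existingLocation = new HoodieRecordLocation("100", "old-file-group"); // hypothetical index lookup result
HoodieKey oldKey = new HoodieKey("key-1", "2021/01"); // hypothetical key, keyed to the OLD partition
// The delete record carries an empty payload and is routed to the old file group
HoodieRecord deleteRecord = new HoodieAvroRecord(oldKey, new EmptyHoodieRecordPayload());
deleteRecord.setCurrentLocation(existingLocation);
deleteRecord.seal();
// The incoming record itself is re-tagged with an empty location, so the writer
// treats it as a fresh insert into "2021/02".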
Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.
The class TestHoodieKeyLocationFetchHandle, method writeToParquetAndGetExpectedRecordLocations.
private Map<Tuple2<String, String>, List<Tuple2<HoodieKey, HoodieRecordLocation>>> writeToParquetAndGetExpectedRecordLocations(
    Map<String, List<HoodieRecord>> partitionRecordsMap, HoodieSparkWriteableTestTable testTable) throws Exception {
  Map<Tuple2<String, String>, List<Tuple2<HoodieKey, HoodieRecordLocation>>> expectedList = new HashMap<>();
  for (Map.Entry<String, List<HoodieRecord>> entry : partitionRecordsMap.entrySet()) {
    int totalRecordsPerPartition = entry.getValue().size();
    // Spread larger partitions across multiple file slices
    int totalSlices = 1;
    if (totalRecordsPerPartition > 5) {
      totalSlices = totalRecordsPerPartition / 3;
    }
    int recordsPerFileSlice = totalRecordsPerPartition / totalSlices;
    List<List<HoodieRecord>> recordsForFileSlices = new ArrayList<>();
    recordsForFileSlices.add(new ArrayList<>());
    int index = 0;
    int count = 0;
    for (HoodieRecord record : entry.getValue()) {
      if (count < recordsPerFileSlice) {
        recordsForFileSlices.get(index).add(record);
        count++;
      } else {
        // Start the next slice; note that the record triggering the rollover is not added anywhere
        recordsForFileSlices.add(new ArrayList<>());
        index++;
        count = 0;
      }
    }
    // Write each slice as its own commit and collect the expected (key, location) pairs
    for (List<HoodieRecord> recordsPerSlice : recordsForFileSlices) {
      String instantTime = makeNewCommitTime();
      String fileId = testTable.addCommit(instantTime).getFileIdWithInserts(entry.getKey(), recordsPerSlice.toArray(new HoodieRecord[0]));
      Tuple2<String, String> fileIdInstantTimePair = new Tuple2<>(fileId, instantTime);
      List<Tuple2<HoodieKey, HoodieRecordLocation>> expectedEntries = new ArrayList<>();
      for (HoodieRecord record : recordsPerSlice) {
        expectedEntries.add(new Tuple2<>(record.getKey(), new HoodieRecordLocation(fileIdInstantTimePair._2, fileIdInstantTimePair._1)));
      }
      expectedList.put(new Tuple2<>(entry.getKey(), fileIdInstantTimePair._1), expectedEntries);
    }
  }
  return expectedList;
}
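As a worked example of the slicing arithmetic: a partition with 10 records gives totalSlices = 10 / 3 = 3 and recordsPerFileSlice = 10 / 3 = 3, so the loop fills slices of sizes 3, 3, and 2. The record that triggers each of the two rollovers is skipped, meaning 8 of the 10 records are written; the expected-location map is built from the same slice lists, so the test stays self-consistent.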
Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.
The class TestUpdateSchemaEvolution, method buildUpdateRecords.
private List<HoodieRecord> buildUpdateRecords(String recordStr, String insertFileId) throws IOException {
  List<HoodieRecord> updateRecords = new ArrayList<>();
  RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
  HoodieRecord record = new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange);
  // Tag the record as an update of the file group written by instant "101"
  record.setCurrentLocation(new HoodieRecordLocation("101", insertFileId));
  record.seal();
  updateRecords.add(record);
  return updateRecords;
}
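As a small usage note, a HoodieRecordLocation simply pairs the commit instant that wrote a file slice with the id of the file group holding the record, and both parts are retrievable through getters; the file id below is made up for illustration:

HoodieRecordLocation loc = new HoodieRecordLocation("101", "file-group-1"); // hypothetical fileId
loc.getInstantTime(); // "101"
loc.getFileId();      // "file-group-1"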