Search in sources:

Example 26 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The method prepRecords of the class HoodieBackedTableMetadataWriter.

/**
 * Assigns every metadata record to the file slice that owns its record key.
 *
 * <p>For each metadata partition in the map, the latest file slices of that partition are
 * looked up and their number is validated against the partition type's file group count.
 * Each record's key is then mapped to a file group index, and the record's current location
 * is set to the base instant time and file id of the slice at that index.
 *
 * @param partitionRecordsMap records to tag, grouped by metadata partition type
 * @return the union of the tagged records of all partitions
 */
protected HoodieData<HoodieRecord> prepRecords(Map<MetadataPartitionType, HoodieData<HoodieRecord>> partitionRecordsMap) {
    // Accumulates the tagged records of every partition.
    HoodieData<HoodieRecord> taggedRecords = engineContext.emptyHoodieData();
    HoodieTableFileSystemView fileSystemView = HoodieTableMetadataUtil.getFileSystemView(metadataMetaClient);
    for (Map.Entry<MetadataPartitionType, HoodieData<HoodieRecord>> partitionEntry : partitionRecordsMap.entrySet()) {
        final MetadataPartitionType partitionType = partitionEntry.getKey();
        final String partitionPath = partitionType.getPartitionPath();
        final int expectedFileGroups = partitionType.getFileGroupCount();
        final List<FileSlice> latestSlices = HoodieTableMetadataUtil.getPartitionLatestFileSlices(metadataMetaClient, Option.ofNullable(fileSystemView), partitionPath);
        final int actualFileGroups = latestSlices.size();
        ValidationUtils.checkArgument(actualFileGroups == expectedFileGroups, String.format("Invalid number of file groups for partition:%s, found=%d, required=%d", partitionPath, actualFileGroups, expectedFileGroups));
        // Only the slice list and the expected count are captured by the mapping function.
        HoodieData<HoodieRecord> taggedForPartition = partitionEntry.getValue().map(record -> {
            int fileGroupIndex = HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(record.getRecordKey(), expectedFileGroups);
            FileSlice owningSlice = latestSlices.get(fileGroupIndex);
            record.setCurrentLocation(new HoodieRecordLocation(owningSlice.getBaseInstantTime(), owningSlice.getFileId()));
            return record;
        });
        taggedRecords = taggedRecords.union(taggedForPartition);
    }
    return taggedRecords;
}
Also used : HoodieData(org.apache.hudi.common.data.HoodieData) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) Map(java.util.Map) HashMap(java.util.HashMap)

Example 27 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The method tagLocation of the class HoodieBucketIndex.

/**
 * Tags each incoming record with the location of the file that backs its bucket.
 *
 * <p>The bucket id of a record is computed from the configured hash field and the number of
 * buckets. If the record's partition has a file registered for that bucket, the record is
 * returned tagged with that file's location; otherwise it is returned unchanged.
 *
 * @param records     records to tag
 * @param context     engine context (not used by this implementation)
 * @param hoodieTable table whose partitions are consulted for the bucket-to-file mapping
 * @return the records, tagged where a file exists for their bucket
 * @throws HoodieIndexException declared by the overridden method
 */
@Override
public <R> HoodieData<HoodieRecord<R>> tagLocation(HoodieData<HoodieRecord<R>> records, HoodieEngineContext context, HoodieTable hoodieTable) throws HoodieIndexException {
    return records.mapPartitions(recordIter -> {
        // partitionPath -> bucketId -> fileInfo; filled lazily, once per partition path seen by this iterator
        Map<String, Map<Integer, Pair<String, String>>> partitionPathFileIDList = new HashMap<>();
        return new LazyIterableIterator<HoodieRecord<R>, HoodieRecord<R>>(recordIter) {

            @Override
            protected void start() {
            }

            @Override
            protected HoodieRecord<R> computeNext() {
                HoodieRecord<R> record = recordIter.next();
                int bucketId = BucketIdentifier.getBucketId(record, config.getBucketIndexHashField(), numBuckets);
                // Load the partition's mapping on first use and reuse it for later records.
                Map<Integer, Pair<String, String>> bucketIdToFileInfo = partitionPathFileIDList.computeIfAbsent(
                    record.getPartitionPath(), partitionPath -> loadPartitionBucketIdFileIdMapping(hoodieTable, partitionPath));
                if (bucketIdToFileInfo.containsKey(bucketId)) {
                    Pair<String, String> fileInfo = bucketIdToFileInfo.get(bucketId);
                    // The right element is passed as the location's first argument and the left as its second.
                    return HoodieIndexUtils.getTaggedRecord(record, Option.of(new HoodieRecordLocation(fileInfo.getRight(), fileInfo.getLeft())));
                }
                // No file exists for this bucket yet: leave the record untagged.
                return record;
            }

            @Override
            protected void end() {
            }
        };
    }, true);
}
Also used : LazyIterableIterator(org.apache.hudi.client.utils.LazyIterableIterator) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HashMap(java.util.HashMap) Map(java.util.Map)

Example 28 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The method getTaggedRecords of the class HoodieGlobalSimpleIndex.

/**
 * Joins the incoming records with the existing ones by record key and tags each incoming
 * record with the matching {@link HoodieRecordLocation}, if any.
 *
 * <p>An incoming record without a match is tagged with an empty location. For a match, when
 * {@code config.getGlobalSimpleIndexUpdatePartitionPath()} is set and the partition paths
 * differ, two records are emitted: an {@link EmptyHoodieRecordPayload} record placed at the
 * existing location, followed by the incoming record tagged with an empty location. In every
 * other matching case a single record, re-keyed to the existing partition path and tagged with
 * the existing location, is emitted.
 *
 * @param incomingRecords incoming {@link HoodieRecord}s, keyed by the string they are joined on
 * @param existingRecords existing {@link HoodieKey}s with their {@link HoodieRecordLocation}s
 * @return {@link HoodieData} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s
 */
private <R> HoodieData<HoodieRecord<R>> getTaggedRecords(HoodiePairData<String, HoodieRecord<R>> incomingRecords, HoodiePairData<HoodieKey, HoodieRecordLocation> existingRecords) {
    // Re-key the existing entries by record key; partition path and location become the value.
    HoodiePairData<String, Pair<String, HoodieRecordLocation>> existingByRecordKey = existingRecords.mapToPair(existing -> new ImmutablePair<>(existing.getLeft().getRecordKey(), Pair.of(existing.getLeft().getPartitionPath(), existing.getRight())));
    return incomingRecords.leftOuterJoin(existingByRecordKey).values().flatMap(joined -> {
        HoodieRecord<R> incoming = joined.getLeft();
        Option<Pair<String, HoodieRecordLocation>> match = Option.ofNullable(joined.getRight().orElse(null));
        if (!match.isPresent()) {
            // Nothing known under this record key.
            return Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(incoming, Option.empty())).iterator();
        }
        String existingPartitionPath = match.get().getKey();
        HoodieRecordLocation existingLocation = match.get().getRight();
        boolean movesPartition = config.getGlobalSimpleIndexUpdatePartitionPath() && !(incoming.getPartitionPath().equals(existingPartitionPath));
        if (!movesPartition) {
            // The incoming partition path is ignored, whether or not it differs from the existing one;
            // the record is updated where it already lives.
            HoodieRecord<R> rekeyed = new HoodieAvroRecord(new HoodieKey(incoming.getRecordKey(), existingPartitionPath), (HoodieRecordPayload) incoming.getData());
            return Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(rekeyed, Option.ofNullable(existingLocation))).iterator();
        }
        // Empty-payload record that deletes the entry from its old partition.
        HoodieRecord<R> removal = new HoodieAvroRecord(new HoodieKey(incoming.getRecordKey(), existingPartitionPath), new EmptyHoodieRecordPayload());
        removal.setCurrentLocation(existingLocation);
        removal.seal();
        // The incoming record itself is tagged for insertion into its new partition.
        HoodieRecord<R> insertion = (HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(incoming, Option.empty());
        return Arrays.asList(removal, insertion).iterator();
    });
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair)

Example 29 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The method writeToParquetAndGetExpectedRecordLocations of the class TestHoodieKeyLocationFetchHandle.

/**
 * Writes the given records through {@code testTable}, one commit per file slice, and returns
 * the record locations expected for everything that was written.
 *
 * <p>A partition with more than five records is split into consecutive slices of
 * {@code size / (size / 3)} records each; smaller partitions are written as a single slice.
 * Every input record ends up in exactly one slice.
 *
 * @param partitionRecordsMap records to write, grouped by partition path
 * @param testTable           test table that receives one commit per file slice
 * @return expected (key, location) pairs, keyed by (partition path, file id)
 * @throws Exception propagated from creating the commits or writing the records
 */
private Map<Tuple2<String, String>, List<Tuple2<HoodieKey, HoodieRecordLocation>>> writeToParquetAndGetExpectedRecordLocations(Map<String, List<HoodieRecord>> partitionRecordsMap, HoodieSparkWriteableTestTable testTable) throws Exception {
    Map<Tuple2<String, String>, List<Tuple2<HoodieKey, HoodieRecordLocation>>> expectedList = new HashMap<>();
    for (Map.Entry<String, List<HoodieRecord>> entry : partitionRecordsMap.entrySet()) {
        int totalRecordsPerPartition = entry.getValue().size();
        int totalSlices = 1;
        if (totalRecordsPerPartition > 5) {
            totalSlices = totalRecordsPerPartition / 3;
        }
        int recordsPerFileSlice = totalRecordsPerPartition / totalSlices;
        // Always start with one slice so that an empty partition still produces a commit.
        List<List<HoodieRecord>> recordsForFileSlices = new ArrayList<>();
        List<HoodieRecord> currentSlice = new ArrayList<>();
        recordsForFileSlices.add(currentSlice);
        for (HoodieRecord record : entry.getValue()) {
            if (currentSlice.size() >= recordsPerFileSlice) {
                // Current slice is full: roll over, and keep the record that triggered the
                // rollover instead of dropping it.
                currentSlice = new ArrayList<>();
                recordsForFileSlices.add(currentSlice);
            }
            currentSlice.add(record);
        }
        for (List<HoodieRecord> recordsPerSlice : recordsForFileSlices) {
            String instantTime = makeNewCommitTime();
            String fileId = testTable.addCommit(instantTime).getFileIdWithInserts(entry.getKey(), recordsPerSlice.toArray(new HoodieRecord[0]));
            List<Tuple2<HoodieKey, HoodieRecordLocation>> expectedEntries = new ArrayList<>();
            for (HoodieRecord record : recordsPerSlice) {
                expectedEntries.add(new Tuple2<>(record.getKey(), new HoodieRecordLocation(instantTime, fileId)));
            }
            expectedList.put(new Tuple2<>(entry.getKey(), fileId), expectedEntries);
        }
    }
    return expectedList;
}
Also used : HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Tuple2(scala.Tuple2) HoodieKey(org.apache.hudi.common.model.HoodieKey) ArrayList(java.util.ArrayList) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HashMap(java.util.HashMap) Map(java.util.Map) Transformations.recordsToPartitionRecordsMap(org.apache.hudi.common.testutils.Transformations.recordsToPartitionRecordsMap)

Example 30 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The method buildUpdateRecords of the class TestUpdateSchemaEvolution.

/**
 * Builds a one-element list holding an update record created from {@code recordStr}.
 *
 * <p>The record is keyed by the payload's row key and partition path, its current location
 * is set to instant {@code "101"} in file {@code insertFileId}, and it is sealed before
 * being returned.
 *
 * @param recordStr    string from which the {@link RawTripTestPayload} is built
 * @param insertFileId file id used for the record's current location
 * @return a mutable list containing the single update record
 * @throws IOException if building the record from {@code recordStr} fails
 */
private List<HoodieRecord> buildUpdateRecords(String recordStr, String insertFileId) throws IOException {
    RawTripTestPayload payload = new RawTripTestPayload(recordStr);
    HoodieKey key = new HoodieKey(payload.getRowKey(), payload.getPartitionPath());
    HoodieRecord updateRecord = new HoodieAvroRecord(key, payload);
    updateRecord.setCurrentLocation(new HoodieRecordLocation("101", insertFileId));
    updateRecord.seal();

    List<HoodieRecord> result = new ArrayList<>();
    result.add(updateRecord);
    return result;
}
Also used : RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)

Aggregations

HoodieRecordLocation (org.apache.hudi.common.model.HoodieRecordLocation): 43; ArrayList (java.util.ArrayList): 18; HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 17; HashMap (java.util.HashMap): 16; List (java.util.List): 16; HoodieKey (org.apache.hudi.common.model.HoodieKey): 16; Map (java.util.Map): 13; Pair (org.apache.hudi.common.util.collection.Pair): 12; HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 9; Option (org.apache.hudi.common.util.Option): 9; IOException (java.io.IOException): 8; WorkloadStat (org.apache.hudi.table.WorkloadStat): 8; SmallFile (org.apache.hudi.table.action.commit.SmallFile): 8; Tuple2 (scala.Tuple2): 8; HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 7; HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 7; HoodieTable (org.apache.hudi.table.HoodieTable): 7; LogManager (org.apache.log4j.LogManager): 7; Logger (org.apache.log4j.Logger): 7; Collectors (java.util.stream.Collectors): 6