
Example 21 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The assignUpdates method of the UpsertPartitioner class:

private void assignUpdates(WorkloadProfile profile) {
    // each update location gets a partition
    Set<Entry<String, WorkloadStat>> partitionStatEntries = profile.getInputPartitionPathStatMap().entrySet();
    for (Map.Entry<String, WorkloadStat> partitionStat : partitionStatEntries) {
        WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionStat.getKey(), new WorkloadStat());
        for (Map.Entry<String, Pair<String, Long>> updateLocEntry : partitionStat.getValue().getUpdateLocationToCount().entrySet()) {
            addUpdateBucket(partitionStat.getKey(), updateLocEntry.getKey());
            if (profile.hasOutputWorkLoadStats()) {
                HoodieRecordLocation hoodieRecordLocation = new HoodieRecordLocation(updateLocEntry.getValue().getKey(), updateLocEntry.getKey());
                outputWorkloadStats.addUpdates(hoodieRecordLocation, updateLocEntry.getValue().getValue());
            }
        }
        if (profile.hasOutputWorkLoadStats()) {
            profile.updateOutputPartitionPathStatMap(partitionStat.getKey(), outputWorkloadStats);
        }
    }
}
Also used : Entry(java.util.Map.Entry) WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HashMap(java.util.HashMap) Map(java.util.Map) Pair(org.apache.hudi.common.util.collection.Pair)
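
To make the accumulation step concrete, here is a minimal sketch (not part of the Hudi source) that feeds a single update location into a WorkloadStat with the same calls used above; the instant time, file ID, and record count are hypothetical placeholders.

// Hypothetical values: "001" is the instant that last wrote the file group,
// "file-1" is the file ID being updated, and 25 is the number of update records.
WorkloadStat outputWorkloadStats = new WorkloadStat();
HoodieRecordLocation updateLocation = new HoodieRecordLocation("001", "file-1");
outputWorkloadStats.addUpdates(updateLocation, 25L);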

Example 22 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The assignInserts method of the UpsertPartitioner class:

private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) {
    // for new inserts, compute buckets depending on how many records we have for each partition
    Set<String> partitionPaths = profile.getPartitionPaths();
    long averageRecordSize = averageBytesPerRecord(table.getMetaClient().getActiveTimeline().getCommitTimeline().filterCompletedInstants(), config);
    LOG.info("AvgRecordSize => " + averageRecordSize);
    Map<String, List<SmallFile>> partitionSmallFilesMap = getSmallFilesForPartitions(new ArrayList<String>(partitionPaths), context);
    Map<String, Set<String>> partitionPathToPendingClusteringFileGroupsId = getPartitionPathToPendingClusteringFileGroupsId();
    for (String partitionPath : partitionPaths) {
        WorkloadStat pStat = profile.getWorkloadStat(partitionPath);
        WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionPath, new WorkloadStat());
        if (pStat.getNumInserts() > 0) {
            List<SmallFile> smallFiles = filterSmallFilesInClustering(partitionPathToPendingClusteringFileGroupsId.getOrDefault(partitionPath, Collections.emptySet()), partitionSmallFilesMap.getOrDefault(partitionPath, new ArrayList<>()));
            this.smallFiles.addAll(smallFiles);
            LOG.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles);
            long totalUnassignedInserts = pStat.getNumInserts();
            List<Integer> bucketNumbers = new ArrayList<>();
            List<Long> recordsPerBucket = new ArrayList<>();
            // first try packing this into one of the smallFiles
            for (SmallFile smallFile : smallFiles) {
                long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize, totalUnassignedInserts);
                if (recordsToAppend > 0) {
                    // create a new bucket or re-use an existing bucket
                    int bucket;
                    if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) {
                        bucket = updateLocationToBucket.get(smallFile.location.getFileId());
                        LOG.info("Assigning " + recordsToAppend + " inserts to existing update bucket " + bucket);
                    } else {
                        bucket = addUpdateBucket(partitionPath, smallFile.location.getFileId());
                        LOG.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
                    }
                    if (profile.hasOutputWorkLoadStats()) {
                        outputWorkloadStats.addInserts(smallFile.location, recordsToAppend);
                    }
                    bucketNumbers.add(bucket);
                    recordsPerBucket.add(recordsToAppend);
                    totalUnassignedInserts -= recordsToAppend;
                    if (totalUnassignedInserts <= 0) {
                        // stop the loop when all the inserts are assigned
                        break;
                    }
                }
            }
            // if we have anything more, create new insert buckets, like normal
            if (totalUnassignedInserts > 0) {
                long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize();
                if (config.shouldAutoTuneInsertSplits()) {
                    insertRecordsPerBucket = config.getParquetMaxFileSize() / averageRecordSize;
                }
                int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket);
                LOG.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts + ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => " + insertRecordsPerBucket);
                for (int b = 0; b < insertBuckets; b++) {
                    bucketNumbers.add(totalBuckets);
                    if (b < insertBuckets - 1) {
                        recordsPerBucket.add(insertRecordsPerBucket);
                    } else {
                        recordsPerBucket.add(totalUnassignedInserts - (insertBuckets - 1) * insertRecordsPerBucket);
                    }
                    BucketInfo bucketInfo = new BucketInfo(BucketType.INSERT, FSUtils.createNewFileIdPfx(), partitionPath);
                    bucketInfoMap.put(totalBuckets, bucketInfo);
                    if (profile.hasOutputWorkLoadStats()) {
                        outputWorkloadStats.addInserts(new HoodieRecordLocation(HoodieWriteStat.NULL_COMMIT, bucketInfo.getFileIdPrefix()), recordsPerBucket.get(recordsPerBucket.size() - 1));
                    }
                    totalBuckets++;
                }
            }
            // Go over all such buckets, and assign weights as per amount of incoming inserts.
            List<InsertBucketCumulativeWeightPair> insertBuckets = new ArrayList<>();
            double currentCumulativeWeight = 0;
            for (int i = 0; i < bucketNumbers.size(); i++) {
                InsertBucket bkt = new InsertBucket();
                bkt.bucketNumber = bucketNumbers.get(i);
                bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts();
                currentCumulativeWeight += bkt.weight;
                insertBuckets.add(new InsertBucketCumulativeWeightPair(bkt, currentCumulativeWeight));
            }
            LOG.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
            partitionPathToInsertBucketInfos.put(partitionPath, insertBuckets);
        }
        if (profile.hasOutputWorkLoadStats()) {
            profile.updateOutputPartitionPathStatMap(partitionPath, outputWorkloadStats);
        }
    }
}
Also used : Set(java.util.Set) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) WorkloadStat(org.apache.hudi.table.WorkloadStat) ArrayList(java.util.ArrayList) List(java.util.List)
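
The small-file packing above is driven by one piece of arithmetic: how many more records fit into an existing small file before it reaches the configured Parquet target size. The sketch below reproduces that calculation with hypothetical sizes; in the method above the real values come from config.getParquetMaxFileSize(), smallFile.sizeBytes, and averageBytesPerRecord.

// Hypothetical sizing values, for illustration only.
long parquetMaxFileSize = 120 * 1024 * 1024;  // target max base file size
long smallFileSizeBytes = 40 * 1024 * 1024;   // current size of the small file
long averageRecordSize = 1024;                // estimated bytes per record
long totalUnassignedInserts = 200_000;        // inserts not yet placed in a bucket

// Records that can still be appended to this small file, capped by the remaining inserts.
long recordsToAppend = Math.min(
        (parquetMaxFileSize - smallFileSizeBytes) / averageRecordSize,
        totalUnassignedInserts);
// => min(81_920, 200_000) = 81_920 records are routed to this file's update bucket.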

Example 23 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The writeRecord method of the AbstractConnectWriter class:

@Override
public void writeRecord(SinkRecord record) throws IOException {
    AvroConvertor convertor = new AvroConvertor(schemaProvider.getSourceSchema());
    Option<GenericRecord> avroRecord;
    switch(connectConfigs.getKafkaValueConverter()) {
        case KAFKA_AVRO_CONVERTER:
            avroRecord = Option.of((GenericRecord) record.value());
            break;
        case KAFKA_STRING_CONVERTER:
            avroRecord = Option.of(convertor.fromJson((String) record.value()));
            break;
        case KAFKA_JSON_CONVERTER:
            throw new UnsupportedEncodingException("Currently JSON objects are not supported");
        default:
            throw new IOException("Unsupported Kafka Format type (" + connectConfigs.getKafkaValueConverter() + ")");
    }
    // Tag records with a file ID based on kafka partition and hudi partition.
    HoodieRecord<?> hoodieRecord = new HoodieAvroRecord<>(keyGenerator.getKey(avroRecord.get()), new HoodieAvroPayload(avroRecord));
    String fileId = KafkaConnectUtils.hashDigest(String.format("%s-%s", record.kafkaPartition(), hoodieRecord.getPartitionPath()));
    hoodieRecord.unseal();
    hoodieRecord.setCurrentLocation(new HoodieRecordLocation(instantTime, fileId));
    hoodieRecord.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
    hoodieRecord.seal();
    writeHudiRecord(hoodieRecord);
}
Also used : AvroConvertor(org.apache.hudi.utilities.sources.helpers.AvroConvertor) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) UnsupportedEncodingException(java.io.UnsupportedEncodingException) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) IOException(java.io.IOException) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload)
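
The tagging step above follows a fixed pattern: derive a deterministic file ID from the Kafka partition and the Hudi partition path, then unseal the record, set the same HoodieRecordLocation as its current and new location, and seal it again. A minimal sketch follows; the instant time and partition values are hypothetical, and hoodieRecord is assumed to be an already-constructed HoodieRecord as in the method above.

// Hypothetical inputs; in the writer, instantTime comes from the ongoing write transaction.
String instantTime = "20220101000000";
int kafkaPartition = 3;
String hudiPartitionPath = "2022/01/01";

// Same derivation as above: hash of "<kafkaPartition>-<hudiPartitionPath>".
String fileId = KafkaConnectUtils.hashDigest(
        String.format("%s-%s", kafkaPartition, hudiPartitionPath));

HoodieRecordLocation location = new HoodieRecordLocation(instantTime, fileId);
// Records are sealed against mutation; unseal before setting the location, then re-seal.
hoodieRecord.unseal();
hoodieRecord.setCurrentLocation(location);
hoodieRecord.setNewLocation(location);
hoodieRecord.seal();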

Example 24 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The appendRecordsToLogFile method of the HoodieWriteableTestTable class:

private Pair<String, HoodieLogFile> appendRecordsToLogFile(List<HoodieRecord> groupedRecords) throws Exception {
    String partitionPath = groupedRecords.get(0).getPartitionPath();
    HoodieRecordLocation location = groupedRecords.get(0).getCurrentLocation();
    try (HoodieLogFormat.Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath)).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(location.getFileId()).overBaseCommit(location.getInstantTime()).withFs(fs).build()) {
        Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
        header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, location.getInstantTime());
        header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
        logWriter.appendBlock(new HoodieAvroDataBlock(groupedRecords.stream().map(r -> {
            try {
                GenericRecord val = (GenericRecord) ((HoodieRecordPayload) r.getData()).getInsertValue(schema).get();
                HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
                return (IndexedRecord) val;
            } catch (IOException e) {
                LOG.warn("Failed to convert record " + r.toString(), e);
                return null;
            }
        }).collect(Collectors.toList()), header, HoodieRecord.RECORD_KEY_METADATA_FIELD));
        return Pair.of(partitionPath, logWriter.getLogFile());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) FileSystem(org.apache.hadoop.fs.FileSystem) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieAvroParquetConfig(org.apache.hudi.io.storage.HoodieAvroParquetConfig) FileCreateUtils.baseFileName(org.apache.hudi.common.testutils.FileCreateUtils.baseFileName) HoodieMetadataTestTable(org.apache.hudi.common.testutils.HoodieMetadataTestTable) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieParquetWriter(org.apache.hudi.io.storage.HoodieParquetWriter) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) IndexedRecord(org.apache.avro.generic.IndexedRecord) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) CompressionKind(org.apache.orc.CompressionKind) TaskContextSupplier(org.apache.hudi.common.engine.TaskContextSupplier) HoodieOrcWriter(org.apache.hudi.io.storage.HoodieOrcWriter) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) FileCreateUtils(org.apache.hudi.common.testutils.FileCreateUtils) HoodieOrcConfig(org.apache.hudi.io.storage.HoodieOrcConfig) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) List(java.util.List) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Paths(java.nio.file.Paths) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieAvroWriteSupport(org.apache.hudi.avro.HoodieAvroWriteSupport) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) LogManager(org.apache.log4j.LogManager) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) Pair(org.apache.hudi.common.util.collection.Pair) HashMap(java.util.HashMap) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) IOException(java.io.IOException) GenericRecord(org.apache.avro.generic.GenericRecord)
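
In the log-writer example, the HoodieRecordLocation carried by the first record in the group supplies both pieces of naming information: getFileId() ties the log file to its base file group, and getInstantTime() becomes the base commit and the INSTANT_TIME block header. A small sketch with hypothetical values:

// Hypothetical location; above it comes from groupedRecords.get(0).getCurrentLocation().
HoodieRecordLocation location = new HoodieRecordLocation("20220101000000", "file-group-1");

Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
// The block header records the instant time taken from the location.
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, location.getInstantTime());
// The file ID from the location is what the log writer builder receives via withFileId(...).
String fileId = location.getFileId();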

Example 25 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The processElement method of the BucketStreamWriteFunction class:

@Override
public void processElement(I i, ProcessFunction<I, Object>.Context context, Collector<Object> collector) throws Exception {
    HoodieRecord<?> record = (HoodieRecord<?>) i;
    final HoodieKey hoodieKey = record.getKey();
    final HoodieRecordLocation location;
    final int bucketNum = BucketIdentifier.getBucketId(hoodieKey, indexKeyFields, this.bucketNum);
    final String partitionBucketId = BucketIdentifier.partitionBucketIdStr(hoodieKey.getPartitionPath(), bucketNum);
    if (bucketToFileIDMap.containsKey(partitionBucketId)) {
        location = new HoodieRecordLocation("U", bucketToFileIDMap.get(partitionBucketId));
    } else {
        String newFileId = BucketIdentifier.newBucketFileIdPrefix(bucketNum);
        location = new HoodieRecordLocation("I", newFileId);
        bucketToFileIDMap.put(partitionBucketId, newFileId);
    }
    record.unseal();
    record.setCurrentLocation(location);
    record.seal();
    bufferRecord(record);
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)
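
The bucket-index routing above can be read in isolation: if the bucket already maps to a file group, the record is tagged as an update with the sentinel instant "U"; otherwise a new file ID is allocated and the record is tagged as an insert with "I". Below is a minimal sketch of that decision with hypothetical partition and bucket values, using a local map in place of the function's bucketToFileIDMap state.

// Hypothetical partition path and bucket number; the map stands in for operator state.
Map<String, String> bucketToFileIDMap = new HashMap<>();
String partitionBucketId = BucketIdentifier.partitionBucketIdStr("2022/01/01", 7);

final HoodieRecordLocation location;
if (bucketToFileIDMap.containsKey(partitionBucketId)) {
    // The bucket already has a file group: tag the record as an update ("U") to that file ID.
    location = new HoodieRecordLocation("U", bucketToFileIDMap.get(partitionBucketId));
} else {
    // First record for this bucket: allocate a new file ID and tag it as an insert ("I").
    String newFileId = BucketIdentifier.newBucketFileIdPrefix(7);
    location = new HoodieRecordLocation("I", newFileId);
    bucketToFileIDMap.put(partitionBucketId, newFileId);
}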

Aggregations

HoodieRecordLocation (org.apache.hudi.common.model.HoodieRecordLocation): 43
ArrayList (java.util.ArrayList): 18
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 17
HashMap (java.util.HashMap): 16
List (java.util.List): 16
HoodieKey (org.apache.hudi.common.model.HoodieKey): 16
Map (java.util.Map): 13
Pair (org.apache.hudi.common.util.collection.Pair): 12
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 9
Option (org.apache.hudi.common.util.Option): 9
IOException (java.io.IOException): 8
WorkloadStat (org.apache.hudi.table.WorkloadStat): 8
SmallFile (org.apache.hudi.table.action.commit.SmallFile): 8
Tuple2 (scala.Tuple2): 8
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 7
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 7
HoodieTable (org.apache.hudi.table.HoodieTable): 7
LogManager (org.apache.log4j.LogManager): 7
Logger (org.apache.log4j.Logger): 7
Collectors (java.util.stream.Collectors): 6