
Example 76 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class HoodieMetadataPayload method createPartitionListRecord.

/**
 * Create and return a {@code HoodieMetadataPayload} to save list of partitions.
 *
 * @param partitions The list of partitions
 */
public static HoodieRecord<HoodieMetadataPayload> createPartitionListRecord(List<String> partitions) {
    Map<String, HoodieMetadataFileInfo> fileInfo = new HashMap<>();
    partitions.forEach(partition -> fileInfo.put(partition, new HoodieMetadataFileInfo(0L, false)));
    HoodieKey key = new HoodieKey(RECORDKEY_PARTITION_LIST, MetadataPartitionType.FILES.getPartitionPath());
    HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_PARTITION_LIST, fileInfo);
    return new HoodieAvroRecord<>(key, payload);
}
Also used : HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HashMap(java.util.HashMap) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieMetadataFileInfo(org.apache.hudi.avro.model.HoodieMetadataFileInfo)
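
As a rough usage sketch, the factory above can be invoked directly to build the single partition-list record for the metadata table. The class name PartitionListRecordExample and the sample partition paths are illustrative, and HoodieMetadataPayload is assumed to be importable from org.apache.hudi.metadata:

import java.util.Arrays;
import java.util.List;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.metadata.HoodieMetadataPayload;

public class PartitionListRecordExample {
    public static void main(String[] args) {
        // two sample partition paths; as shown above, each maps to a HoodieMetadataFileInfo(0L, false) entry
        List<String> partitions = Arrays.asList("2020/01/01", "2020/01/02");
        HoodieRecord<HoodieMetadataPayload> record =
                HoodieMetadataPayload.createPartitionListRecord(partitions);
        // the key pairs the well-known partition-list record key with the FILES metadata partition path
        System.out.println(record.getKey());
    }
}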

Example 77 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class HoodieBackedTableMetadata method readFromBaseAndMergeWithLogRecords.

private List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> readFromBaseAndMergeWithLogRecords(HoodieFileReader baseFileReader, List<String> keys, Map<String, Option<HoodieRecord<HoodieMetadataPayload>>> logRecords, List<Long> timings, String partitionName) throws IOException {
    List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> result = new ArrayList<>();
    // merge with base records
    HoodieTimer timer = new HoodieTimer().startTimer();
    HoodieRecord<HoodieMetadataPayload> hoodieRecord = null;
    // Retrieve record from base file
    if (baseFileReader != null) {
        HoodieTimer readTimer = new HoodieTimer();
        Map<String, GenericRecord> baseFileRecords = baseFileReader.getRecordsByKeys(keys);
        for (String key : keys) {
            readTimer.startTimer();
            if (baseFileRecords.containsKey(key)) {
                hoodieRecord = getRecord(Option.of(baseFileRecords.get(key)), partitionName);
                metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BASEFILE_READ_STR, readTimer.endTimer()));
                // merge base file record w/ log record if present
                if (logRecords.containsKey(key) && logRecords.get(key).isPresent()) {
                    HoodieRecordPayload mergedPayload = logRecords.get(key).get().getData().preCombine(hoodieRecord.getData());
                    result.add(Pair.of(key, Option.of(new HoodieAvroRecord(hoodieRecord.getKey(), mergedPayload))));
                } else {
                    // only base record
                    result.add(Pair.of(key, Option.of(hoodieRecord)));
                }
            } else {
                // only log record
                result.add(Pair.of(key, logRecords.get(key)));
            }
        }
        timings.add(timer.endTimer());
    } else {
        // no base file at all
        timings.add(timer.endTimer());
        for (Map.Entry<String, Option<HoodieRecord<HoodieMetadataPayload>>> entry : logRecords.entrySet()) {
            result.add(Pair.of(entry.getKey(), entry.getValue()));
        }
    }
    return result;
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) Option(org.apache.hudi.common.util.Option) GenericRecord(org.apache.avro.generic.GenericRecord) HashMap(java.util.HashMap) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Pair(org.apache.hudi.common.util.collection.Pair)
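
The heart of this method is a three-way choice between a base-only record, a log-only record, and a merged record whose log payload is pre-combined with the base payload. The following condensed sketch shows just that rule; the class and method names are illustrative and not part of the Hudi API, and raw payload types are assumed for brevity:

import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;

final class MergeRuleSketch {

    @SuppressWarnings({"unchecked", "rawtypes"})
    static Option<HoodieRecord> merge(Option<HoodieRecord> baseRecord, Option<HoodieRecord> logRecord) {
        if (baseRecord.isPresent() && logRecord.isPresent()) {
            // the log record wins conflicts: its payload is pre-combined with the base payload
            HoodieRecordPayload merged = ((HoodieRecordPayload) logRecord.get().getData())
                    .preCombine((HoodieRecordPayload) baseRecord.get().getData());
            return Option.of(new HoodieAvroRecord(baseRecord.get().getKey(), merged));
        }
        // otherwise whichever side is present (possibly empty) is returned unchanged
        return baseRecord.isPresent() ? baseRecord : logRecord;
    }
}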

Example 78 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class MergeOnReadInputFormat method getLogFileIterator.

private ClosableIterator<RowData> getLogFileIterator(MergeOnReadInputSplit split) {
    final Schema tableSchema = new Schema.Parser().parse(tableState.getAvroSchema());
    final Schema requiredSchema = new Schema.Parser().parse(tableState.getRequiredAvroSchema());
    final GenericRecordBuilder recordBuilder = new GenericRecordBuilder(requiredSchema);
    final AvroToRowDataConverters.AvroToRowDataConverter avroToRowDataConverter = AvroToRowDataConverters.createRowConverter(tableState.getRequiredRowType());
    final HoodieMergedLogRecordScanner scanner = FormatUtils.logScanner(split, tableSchema, hadoopConf, conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED));
    final Iterator<String> logRecordsKeyIterator = scanner.getRecords().keySet().iterator();
    final int[] pkOffset = tableState.getPkOffsetsInRequired();
    // flag saying whether the pk semantics have been dropped by user-specified
    // projections. For example, if the pk fields are [a, b] but the user only selects a,
    // then the pk semantics are lost.
    final boolean pkSemanticLost = Arrays.stream(pkOffset).anyMatch(offset -> offset == -1);
    final LogicalType[] pkTypes = pkSemanticLost ? null : tableState.getPkTypes(pkOffset);
    final StringToRowDataConverter converter = pkSemanticLost ? null : new StringToRowDataConverter(pkTypes);
    return new ClosableIterator<RowData>() {

        private RowData currentRecord;

        @Override
        public boolean hasNext() {
            while (logRecordsKeyIterator.hasNext()) {
                String curAvroKey = logRecordsKeyIterator.next();
                Option<IndexedRecord> curAvroRecord = null;
                final HoodieAvroRecord<?> hoodieRecord = (HoodieAvroRecord) scanner.getRecords().get(curAvroKey);
                try {
                    curAvroRecord = hoodieRecord.getData().getInsertValue(tableSchema);
                } catch (IOException e) {
                    throw new HoodieException("Get avro insert value error for key: " + curAvroKey, e);
                }
                if (!curAvroRecord.isPresent()) {
                    // delete record found
                    if (emitDelete && !pkSemanticLost) {
                        GenericRowData delete = new GenericRowData(tableState.getRequiredRowType().getFieldCount());
                        final String recordKey = hoodieRecord.getRecordKey();
                        final String[] pkFields = KeyGenUtils.extractRecordKeys(recordKey);
                        final Object[] converted = converter.convert(pkFields);
                        for (int i = 0; i < pkOffset.length; i++) {
                            delete.setField(pkOffset[i], converted[i]);
                        }
                        delete.setRowKind(RowKind.DELETE);
                        this.currentRecord = delete;
                        return true;
                    }
                // otherwise skip this record: the delete either should not be emitted
                // or cannot be reconstructed because the pk semantics were lost
                } else {
                    final IndexedRecord avroRecord = curAvroRecord.get();
                    final RowKind rowKind = FormatUtils.getRowKindSafely(avroRecord, tableState.getOperationPos());
                    if (rowKind == RowKind.DELETE && !emitDelete) {
                        // skip the delete record
                        continue;
                    }
                    GenericRecord requiredAvroRecord = buildAvroRecordBySchema(avroRecord, requiredSchema, requiredPos, recordBuilder);
                    currentRecord = (RowData) avroToRowDataConverter.convert(requiredAvroRecord);
                    currentRecord.setRowKind(rowKind);
                    return true;
                }
            }
            return false;
        }

        @Override
        public RowData next() {
            return currentRecord;
        }

        @Override
        public void close() {
            scanner.close();
        }
    };
}
Also used : IndexedRecord(org.apache.avro.generic.IndexedRecord) FormatUtils.buildAvroRecordBySchema(org.apache.hudi.table.format.FormatUtils.buildAvroRecordBySchema) Schema(org.apache.avro.Schema) LogicalType(org.apache.flink.table.types.logical.LogicalType) StringToRowDataConverter(org.apache.hudi.util.StringToRowDataConverter) HoodieException(org.apache.hudi.exception.HoodieException) AvroToRowDataConverters(org.apache.hudi.util.AvroToRowDataConverters) GenericRowData(org.apache.flink.table.data.GenericRowData) RowData(org.apache.flink.table.data.RowData) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder) GenericRowData(org.apache.flink.table.data.GenericRowData) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) IOException(java.io.IOException) RowKind(org.apache.flink.types.RowKind)
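
The pkSemanticLost flag above controls whether a DELETE row can still be rebuilt from the record key. A minimal, self-contained sketch of that check (PkSemanticsCheck is an illustrative name, not part of Hudi): a primary-key field dropped by the projection is marked with offset -1, and any such offset means the key fields can no longer be mapped back into the produced rows.

import java.util.Arrays;

final class PkSemanticsCheck {

    // returns true when at least one primary-key field was not selected by the projection
    static boolean pkSemanticLost(int[] pkOffsetsInRequired) {
        return Arrays.stream(pkOffsetsInRequired).anyMatch(offset -> offset == -1);
    }

    public static void main(String[] args) {
        System.out.println(pkSemanticLost(new int[] {0, 2}));   // false: both pk fields are projected
        System.out.println(pkSemanticLost(new int[] {0, -1}));  // true: the second pk field was dropped
    }
}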

Example 79 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class MergeOnReadInputFormat method getUnMergedLogFileIterator.

private ClosableIterator<RowData> getUnMergedLogFileIterator(MergeOnReadInputSplit split) {
    final Schema tableSchema = new Schema.Parser().parse(tableState.getAvroSchema());
    final Schema requiredSchema = new Schema.Parser().parse(tableState.getRequiredAvroSchema());
    final GenericRecordBuilder recordBuilder = new GenericRecordBuilder(requiredSchema);
    final AvroToRowDataConverters.AvroToRowDataConverter avroToRowDataConverter = AvroToRowDataConverters.createRowConverter(tableState.getRequiredRowType());
    final FormatUtils.BoundedMemoryRecords records = new FormatUtils.BoundedMemoryRecords(split, tableSchema, hadoopConf, conf);
    final Iterator<HoodieRecord<?>> recordsIterator = records.getRecordsIterator();
    return new ClosableIterator<RowData>() {

        private RowData currentRecord;

        @Override
        public boolean hasNext() {
            while (recordsIterator.hasNext()) {
                Option<IndexedRecord> curAvroRecord = null;
                final HoodieAvroRecord<?> hoodieRecord = (HoodieAvroRecord) recordsIterator.next();
                try {
                    curAvroRecord = hoodieRecord.getData().getInsertValue(tableSchema);
                } catch (IOException e) {
                    throw new HoodieException("Get avro insert value error for key: " + hoodieRecord.getRecordKey(), e);
                }
                if (curAvroRecord.isPresent()) {
                    final IndexedRecord avroRecord = curAvroRecord.get();
                    GenericRecord requiredAvroRecord = buildAvroRecordBySchema(avroRecord, requiredSchema, requiredPos, recordBuilder);
                    currentRecord = (RowData) avroToRowDataConverter.convert(requiredAvroRecord);
                    FormatUtils.setRowKind(currentRecord, avroRecord, tableState.getOperationPos());
                    return true;
                }
            }
            return false;
        }

        @Override
        public RowData next() {
            return currentRecord;
        }

        @Override
        public void close() {
            records.close();
        }
    };
}
Also used : IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) FormatUtils.buildAvroRecordBySchema(org.apache.hudi.table.format.FormatUtils.buildAvroRecordBySchema) Schema(org.apache.avro.Schema) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) AvroToRowDataConverters(org.apache.hudi.util.AvroToRowDataConverters) GenericRowData(org.apache.flink.table.data.GenericRowData) RowData(org.apache.flink.table.data.RowData) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) FormatUtils(org.apache.hudi.table.format.FormatUtils) GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder) GenericRecord(org.apache.avro.generic.GenericRecord)
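
Both log-file iterators return a ClosableIterator<RowData> whose close() must be invoked to release the underlying scanner or record buffer. A minimal consumption sketch, assuming an iterator obtained from either factory method (the drain helper itself is illustrative):

import org.apache.flink.table.data.RowData;
import org.apache.hudi.common.util.ClosableIterator;

final class IteratorDrainSketch {

    static long drain(ClosableIterator<RowData> rows) {
        long count = 0;
        try {
            // hasNext() advances to the next emittable row, next() returns it
            while (rows.hasNext()) {
                RowData row = rows.next();
                count++;
                // hand the row to a downstream collector here
            }
        } finally {
            // always close to release the underlying resources
            rows.close();
        }
        return count;
    }
}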

Example 80 with HoodieAvroRecord

use of org.apache.hudi.common.model.HoodieAvroRecord in project hudi by apache.

the class BucketAssignFunction method processRecord.

@SuppressWarnings("unchecked")
private void processRecord(HoodieRecord<?> record, Collector<O> out) throws Exception {
    // 1. put the record into the BucketAssigner;
    // 2. look up the state for the location; if the record already has a location, just send it out;
    // 3. if it is an INSERT, decide the location using the BucketAssigner, then send it out.
    final HoodieKey hoodieKey = record.getKey();
    final String recordKey = hoodieKey.getRecordKey();
    final String partitionPath = hoodieKey.getPartitionPath();
    final HoodieRecordLocation location;
    // Only changing records need to look up the index for the location;
    // append-only records are always recognized as INSERTs.
    HoodieRecordGlobalLocation oldLoc = indexState.value();
    if (isChangingRecords && oldLoc != null) {
        // Set up the instant time as "U" to mark the bucket as an update bucket.
        if (!Objects.equals(oldLoc.getPartitionPath(), partitionPath)) {
            if (globalIndex) {
                // if partition path changes, emit a delete record for old partition path,
                // then update the index state using location with new partition path.
                HoodieRecord<?> deleteRecord = new HoodieAvroRecord<>(new HoodieKey(recordKey, oldLoc.getPartitionPath()), payloadCreation.createDeletePayload((BaseAvroPayload) record.getData()));
                deleteRecord.setCurrentLocation(oldLoc.toLocal("U"));
                deleteRecord.seal();
                out.collect((O) deleteRecord);
            }
            location = getNewRecordLocation(partitionPath);
            updateIndexState(partitionPath, location);
        } else {
            location = oldLoc.toLocal("U");
            this.bucketAssigner.addUpdate(partitionPath, location.getFileId());
        }
    } else {
        location = getNewRecordLocation(partitionPath);
    }
    // always refresh the index
    if (isChangingRecords) {
        updateIndexState(partitionPath, location);
    }
    record.setCurrentLocation(location);
    out.collect((O) record);
}
Also used : BaseAvroPayload(org.apache.hudi.common.model.BaseAvroPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecordGlobalLocation(org.apache.hudi.common.model.HoodieRecordGlobalLocation) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)
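
The branching above reduces to a small routing decision on the partition path last seen by the index. A distilled sketch of that decision (the Route enum and the routing method are illustrative, not part of Hudi): an update in place, a plain insert into a new bucket, or an insert that additionally emits a delete for the old partition when a global index is used and the partition path changed.

import java.util.Objects;

final class BucketRoutingSketch {

    enum Route { UPDATE_OLD_BUCKET, INSERT_NEW_BUCKET, INSERT_WITH_DELETE_FOR_OLD_PARTITION }

    static Route route(boolean isChangingRecord, String oldPartitionPath,
                       String newPartitionPath, boolean globalIndex) {
        if (!isChangingRecord || oldPartitionPath == null) {
            // append-only record, or the index has never seen this key: plain insert
            return Route.INSERT_NEW_BUCKET;
        }
        if (Objects.equals(oldPartitionPath, newPartitionPath)) {
            // same partition as before: route into the existing bucket as an update
            return Route.UPDATE_OLD_BUCKET;
        }
        // partition path changed: a global index must also delete the row in the old partition
        return globalIndex ? Route.INSERT_WITH_DELETE_FOR_OLD_PARTITION : Route.INSERT_NEW_BUCKET;
    }
}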

Aggregations

HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord) 84
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 72
HoodieKey (org.apache.hudi.common.model.HoodieKey) 68
ArrayList (java.util.ArrayList) 38
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) 37
RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload) 31
Test (org.junit.jupiter.api.Test) 30
GenericRecord (org.apache.avro.generic.GenericRecord) 29
Path (org.apache.hadoop.fs.Path) 26
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest) 25
IOException (java.io.IOException) 24
HoodieTable (org.apache.hudi.table.HoodieTable) 24
List (java.util.List) 23
Schema (org.apache.avro.Schema) 23
HashMap (java.util.HashMap) 22
Pair (org.apache.hudi.common.util.collection.Pair) 21
Map (java.util.Map) 20
Collectors (java.util.stream.Collectors) 20
Arrays (java.util.Arrays) 17
Option (org.apache.hudi.common.util.Option) 16