Search in sources :

Example 1 with EmptyHoodieRecordPayload

use of org.apache.hudi.common.model.EmptyHoodieRecordPayload in project hudi by apache.

the class FlinkDeleteHelper method execute.

@Override
public HoodieWriteMetadata<List<WriteStatus>> execute(String instantTime, List<HoodieKey> keys, HoodieEngineContext context, HoodieWriteConfig config, HoodieTable<EmptyHoodieRecordPayload, List<HoodieRecord<EmptyHoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>> table, BaseCommitActionExecutor<EmptyHoodieRecordPayload, List<HoodieRecord<EmptyHoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>, R> deleteExecutor) {
    try {
        HoodieWriteMetadata<List<WriteStatus>> result = null;
        List<HoodieKey> dedupedKeys = keys;
        final int parallelism = config.getDeleteShuffleParallelism();
        if (config.shouldCombineBeforeDelete()) {
            // De-dupe/merge if needed
            dedupedKeys = deduplicateKeys(keys, table, parallelism);
        }
        List<HoodieRecord<EmptyHoodieRecordPayload>> dedupedRecords = dedupedKeys.stream().map(key -> new HoodieAvroRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList());
        Instant beginTag = Instant.now();
        // perform index look up to get existing location of records
        List<HoodieRecord<EmptyHoodieRecordPayload>> taggedRecords = HoodieList.getList(table.getIndex().tagLocation(HoodieList.of(dedupedRecords), context, table));
        Duration tagLocationDuration = Duration.between(beginTag, Instant.now());
        // filter out non existent keys/records
        List<HoodieRecord<EmptyHoodieRecordPayload>> taggedValidRecords = taggedRecords.stream().filter(HoodieRecord::isCurrentLocationKnown).collect(Collectors.toList());
        if (!taggedValidRecords.isEmpty()) {
            result = deleteExecutor.execute(taggedValidRecords);
            result.setIndexLookupDuration(tagLocationDuration);
        } else {
            // if entire set of keys are non existent
            deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime);
            result = new HoodieWriteMetadata<>();
            result.setWriteStatuses(Collections.EMPTY_LIST);
            deleteExecutor.commitOnAutoCommit(result);
        }
        return result;
    } catch (Throwable e) {
        if (e instanceof HoodieUpsertException) {
            throw (HoodieUpsertException) e;
        }
        throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e);
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieList(org.apache.hudi.common.data.HoodieList) HashSet(java.util.HashSet) WriteStatus(org.apache.hudi.client.WriteStatus) List(java.util.List) Duration(java.time.Duration) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieKey(org.apache.hudi.common.model.HoodieKey) WorkloadStat(org.apache.hudi.table.WorkloadStat) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) LinkedList(java.util.LinkedList) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Instant(java.time.Instant) Duration(java.time.Duration) WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) HoodieList(org.apache.hudi.common.data.HoodieList) List(java.util.List) LinkedList(java.util.LinkedList)

Example 2 with EmptyHoodieRecordPayload

use of org.apache.hudi.common.model.EmptyHoodieRecordPayload in project hudi by apache.

the class HoodieGlobalBloomIndex method tagLocationBacktoRecords.

/**
 * Tagging for global index should only consider the record key.
 */
@Override
protected <R> HoodieData<HoodieRecord<R>> tagLocationBacktoRecords(HoodiePairData<HoodieKey, HoodieRecordLocation> keyLocationPairs, HoodieData<HoodieRecord<R>> records) {
    HoodiePairData<String, HoodieRecord<R>> incomingRowKeyRecordPairs = records.mapToPair(record -> new ImmutablePair<>(record.getRecordKey(), record));
    HoodiePairData<String, Pair<HoodieRecordLocation, HoodieKey>> existingRecordKeyToRecordLocationHoodieKeyMap = keyLocationPairs.mapToPair(p -> new ImmutablePair<>(p.getKey().getRecordKey(), new ImmutablePair<>(p.getValue(), p.getKey())));
    // Here as the records might have more data than rowKeys (some rowKeys' fileId is null), so we do left outer join.
    return incomingRowKeyRecordPairs.leftOuterJoin(existingRecordKeyToRecordLocationHoodieKeyMap).values().flatMap(record -> {
        final HoodieRecord<R> hoodieRecord = record.getLeft();
        final Option<Pair<HoodieRecordLocation, HoodieKey>> recordLocationHoodieKeyPair = record.getRight();
        if (recordLocationHoodieKeyPair.isPresent()) {
            // Record key matched to file
            if (config.getBloomIndexUpdatePartitionPath() && !recordLocationHoodieKeyPair.get().getRight().getPartitionPath().equals(hoodieRecord.getPartitionPath())) {
                // Create an empty record to delete the record in the old partition
                HoodieRecord<R> deleteRecord = new HoodieAvroRecord(recordLocationHoodieKeyPair.get().getRight(), new EmptyHoodieRecordPayload());
                deleteRecord.setCurrentLocation(recordLocationHoodieKeyPair.get().getLeft());
                deleteRecord.seal();
                // Tag the incoming record for inserting to the new partition
                HoodieRecord<R> insertRecord = HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty());
                return Arrays.asList(deleteRecord, insertRecord).iterator();
            } else {
                // When it differs, the record will still be updated at its old partition.
                return Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(new HoodieAvroRecord(recordLocationHoodieKeyPair.get().getRight(), (HoodieRecordPayload) hoodieRecord.getData()), Option.ofNullable(recordLocationHoodieKeyPair.get().getLeft()))).iterator();
            }
        } else {
            return Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty())).iterator();
        }
    });
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair)

Example 3 with EmptyHoodieRecordPayload

use of org.apache.hudi.common.model.EmptyHoodieRecordPayload in project hudi by apache.

the class JavaDeleteHelper method execute.

@Override
public HoodieWriteMetadata<List<WriteStatus>> execute(String instantTime, List<HoodieKey> keys, HoodieEngineContext context, HoodieWriteConfig config, HoodieTable<EmptyHoodieRecordPayload, List<HoodieRecord<EmptyHoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>> table, BaseCommitActionExecutor<EmptyHoodieRecordPayload, List<HoodieRecord<EmptyHoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>, R> deleteExecutor) {
    try {
        HoodieWriteMetadata<List<WriteStatus>> result = null;
        List<HoodieKey> dedupedKeys = keys;
        final int parallelism = config.getDeleteShuffleParallelism();
        if (config.shouldCombineBeforeDelete()) {
            // De-dupe/merge if needed
            dedupedKeys = deduplicateKeys(keys, table, parallelism);
        }
        List<HoodieRecord<EmptyHoodieRecordPayload>> dedupedRecords = dedupedKeys.stream().map(key -> new HoodieAvroRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList());
        Instant beginTag = Instant.now();
        // perform index look up to get existing location of records
        List<HoodieRecord<EmptyHoodieRecordPayload>> taggedRecords = HoodieList.getList(table.getIndex().tagLocation(HoodieList.of(dedupedRecords), context, table));
        Duration tagLocationDuration = Duration.between(beginTag, Instant.now());
        // filter out non existent keys/records
        List<HoodieRecord<EmptyHoodieRecordPayload>> taggedValidRecords = taggedRecords.stream().filter(HoodieRecord::isCurrentLocationKnown).collect(Collectors.toList());
        if (!taggedValidRecords.isEmpty()) {
            result = deleteExecutor.execute(taggedValidRecords);
            result.setIndexLookupDuration(tagLocationDuration);
        } else {
            // if entire set of keys are non existent
            deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime);
            result = new HoodieWriteMetadata<>();
            result.setWriteStatuses(Collections.EMPTY_LIST);
            deleteExecutor.commitOnAutoCommit(result);
        }
        return result;
    } catch (Throwable e) {
        if (e instanceof HoodieUpsertException) {
            throw (HoodieUpsertException) e;
        }
        throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e);
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieList(org.apache.hudi.common.data.HoodieList) HashSet(java.util.HashSet) WriteStatus(org.apache.hudi.client.WriteStatus) List(java.util.List) Duration(java.time.Duration) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieKey(org.apache.hudi.common.model.HoodieKey) WorkloadStat(org.apache.hudi.table.WorkloadStat) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) LinkedList(java.util.LinkedList) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Instant(java.time.Instant) Duration(java.time.Duration) WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) HoodieList(org.apache.hudi.common.data.HoodieList) List(java.util.List) LinkedList(java.util.LinkedList)

Example 4 with EmptyHoodieRecordPayload

use of org.apache.hudi.common.model.EmptyHoodieRecordPayload in project hudi by apache.

the class HoodieDeleteHelper method execute.

@Override
public HoodieWriteMetadata<HoodieData<WriteStatus>> execute(String instantTime, HoodieData<HoodieKey> keys, HoodieEngineContext context, HoodieWriteConfig config, HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table, BaseCommitActionExecutor<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>, R> deleteExecutor) {
    try {
        HoodieData<HoodieKey> dedupedKeys = keys;
        final int parallelism = config.getDeleteShuffleParallelism();
        if (config.shouldCombineBeforeDelete()) {
            // De-dupe/merge if needed
            dedupedKeys = deduplicateKeys(keys, table, parallelism);
        } else if (!keys.isEmpty()) {
            dedupedKeys = keys.repartition(parallelism);
        }
        HoodieData<HoodieRecord<T>> dedupedRecords = dedupedKeys.map(key -> new HoodieAvroRecord(key, new EmptyHoodieRecordPayload()));
        Instant beginTag = Instant.now();
        // perform index loop up to get existing location of records
        HoodieData<HoodieRecord<T>> taggedRecords = table.getIndex().tagLocation(dedupedRecords, context, table);
        Duration tagLocationDuration = Duration.between(beginTag, Instant.now());
        // filter out non existent keys/records
        HoodieData<HoodieRecord<T>> taggedValidRecords = taggedRecords.filter(HoodieRecord::isCurrentLocationKnown);
        HoodieWriteMetadata<HoodieData<WriteStatus>> result;
        if (!taggedValidRecords.isEmpty()) {
            result = deleteExecutor.execute(taggedValidRecords);
            result.setIndexLookupDuration(tagLocationDuration);
        } else {
            // if entire set of keys are non existent
            deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime);
            result = new HoodieWriteMetadata<>();
            result.setWriteStatuses(context.emptyHoodieData());
            deleteExecutor.commitOnAutoCommit(result);
        }
        return result;
    } catch (Throwable e) {
        if (e instanceof HoodieUpsertException) {
            throw (HoodieUpsertException) e;
        }
        throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e);
    }
}
Also used : HoodieData(org.apache.hudi.common.data.HoodieData) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Instant(java.time.Instant) Duration(java.time.Duration) WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload)

Example 5 with EmptyHoodieRecordPayload

use of org.apache.hudi.common.model.EmptyHoodieRecordPayload in project hudi by apache.

the class TestHoodieIndex method testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath.

@Test
public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath() throws Exception {
    setUp(IndexType.GLOBAL_SIMPLE, true, true);
    config = getConfigBuilder().withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType).withGlobalSimpleIndexUpdatePartitionPath(true).withBloomIndexUpdatePartitionPath(true).build()).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).withMetadataIndexBloomFilter(true).withMetadataIndexColumnStats(true).build()).build();
    writeClient = getHoodieWriteClient(config);
    index = writeClient.getIndex();
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(writeClient.getEngineContext().getHadoopConf().get(), config, writeClient.getEngineContext());
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable.getMetaClient(), SCHEMA, metadataWriter);
    final String p1 = "2016/01/31";
    final String p2 = "2016/02/28";
    // Create the original partition, and put a record, along with the meta file
    // "2016/01/31": 1 file (1_0_20160131101010.parquet)
    // this record will be saved in table and will be tagged to an empty record
    RawTripTestPayload originalPayload = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord originalRecord = new HoodieAvroRecord(new HoodieKey(originalPayload.getRowKey(), originalPayload.getPartitionPath()), originalPayload);
    /*
    This record has the same record key as originalRecord but different time so different partition
    Because GLOBAL_BLOOM_INDEX_SHOULD_UPDATE_PARTITION_PATH = true,
    globalBloomIndex should
    - tag the original partition of the originalRecord to an empty record for deletion, and
    - tag the new partition of the incomingRecord
    */
    RawTripTestPayload incomingPayload = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-02-28T03:16:41.415Z\",\"number\":12}");
    HoodieRecord incomingRecord = new HoodieAvroRecord(new HoodieKey(incomingPayload.getRowKey(), incomingPayload.getPartitionPath()), incomingPayload);
    /*
    This record has the same record key as originalRecord and the same partition
    Though GLOBAL_BLOOM_INDEX_SHOULD_UPDATE_PARTITION_PATH = true,
    globalBloomIndex should just tag the original partition
    */
    RawTripTestPayload incomingPayloadSamePartition = new RawTripTestPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T04:16:41.415Z\",\"number\":15}");
    HoodieRecord incomingRecordSamePartition = new HoodieAvroRecord(new HoodieKey(incomingPayloadSamePartition.getRowKey(), incomingPayloadSamePartition.getPartitionPath()), incomingPayloadSamePartition);
    final String file1P1C0 = UUID.randomUUID().toString();
    Map<String, List<Pair<String, Integer>>> c1PartitionToFilesNameLengthMap = new HashMap<>();
    // We have some records to be tagged (two different partitions)
    Path baseFilePath = testTable.forCommit("1000").withInserts(p1, file1P1C0, Collections.singletonList(originalRecord));
    long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    c1PartitionToFilesNameLengthMap.put(p1, Collections.singletonList(Pair.of(file1P1C0, Integer.valueOf((int) baseFileLength))));
    testTable.doWriteOperation("1000", WriteOperationType.INSERT, Arrays.asList(p1), c1PartitionToFilesNameLengthMap, false, false);
    // We have some records to be tagged (two different partitions)
    testTable.withInserts(p1, file1P1C0, originalRecord);
    // test against incoming record with a different partition
    JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Collections.singletonList(incomingRecord));
    JavaRDD<HoodieRecord> taggedRecordRDD = tagLocation(index, recordRDD, hoodieTable);
    assertEquals(2, taggedRecordRDD.count());
    for (HoodieRecord record : taggedRecordRDD.collect()) {
        switch(record.getPartitionPath()) {
            case p1:
                assertEquals("000", record.getRecordKey());
                assertTrue(record.getData() instanceof EmptyHoodieRecordPayload);
                break;
            case p2:
                assertEquals("000", record.getRecordKey());
                assertEquals(incomingPayload.getJsonData(), ((RawTripTestPayload) record.getData()).getJsonData());
                break;
            default:
                fail(String.format("Should not get partition path: %s", record.getPartitionPath()));
        }
    }
    // test against incoming record with the same partition
    JavaRDD<HoodieRecord> recordRDDSamePartition = jsc.parallelize(Collections.singletonList(incomingRecordSamePartition));
    JavaRDD<HoodieRecord> taggedRecordRDDSamePartition = tagLocation(index, recordRDDSamePartition, hoodieTable);
    assertEquals(1, taggedRecordRDDSamePartition.count());
    HoodieRecord record = taggedRecordRDDSamePartition.first();
    assertEquals("000", record.getRecordKey());
    assertEquals(p1, record.getPartitionPath());
    assertEquals(incomingPayloadSamePartition.getJsonData(), ((RawTripTestPayload) record.getData()).getJsonData());
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieKey(org.apache.hudi.common.model.HoodieKey) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) List(java.util.List) ArrayList(java.util.ArrayList) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Aggregations

EmptyHoodieRecordPayload (org.apache.hudi.common.model.EmptyHoodieRecordPayload)8 HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord)8 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)8 HoodieKey (org.apache.hudi.common.model.HoodieKey)7 HashMap (java.util.HashMap)5 Pair (org.apache.hudi.common.util.collection.Pair)5 List (java.util.List)4 HoodieTable (org.apache.hudi.table.HoodieTable)4 Duration (java.time.Duration)3 Instant (java.time.Instant)3 ArrayList (java.util.ArrayList)3 Collections (java.util.Collections)3 HashSet (java.util.HashSet)3 Collectors (java.util.stream.Collectors)3 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)3 HoodieUpsertException (org.apache.hudi.exception.HoodieUpsertException)3 WorkloadProfile (org.apache.hudi.table.WorkloadProfile)3 WorkloadStat (org.apache.hudi.table.WorkloadStat)3 LinkedList (java.util.LinkedList)2 Schema (org.apache.avro.Schema)2