
Example 1 with WorkloadProfile

Use of org.apache.hudi.table.WorkloadProfile in project hudi by apache.

From class FlinkDeleteHelper, method execute:

@Override
public HoodieWriteMetadata<List<WriteStatus>> execute(
        String instantTime,
        List<HoodieKey> keys,
        HoodieEngineContext context,
        HoodieWriteConfig config,
        HoodieTable<EmptyHoodieRecordPayload, List<HoodieRecord<EmptyHoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>> table,
        BaseCommitActionExecutor<EmptyHoodieRecordPayload, List<HoodieRecord<EmptyHoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>, R> deleteExecutor) {
    try {
        HoodieWriteMetadata<List<WriteStatus>> result = null;
        List<HoodieKey> dedupedKeys = keys;
        final int parallelism = config.getDeleteShuffleParallelism();
        if (config.shouldCombineBeforeDelete()) {
            // De-dupe/merge if needed
            dedupedKeys = deduplicateKeys(keys, table, parallelism);
        }
        List<HoodieRecord<EmptyHoodieRecordPayload>> dedupedRecords = dedupedKeys.stream().map(key -> new HoodieAvroRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList());
        Instant beginTag = Instant.now();
        // perform index look up to get existing location of records
        List<HoodieRecord<EmptyHoodieRecordPayload>> taggedRecords = HoodieList.getList(table.getIndex().tagLocation(HoodieList.of(dedupedRecords), context, table));
        Duration tagLocationDuration = Duration.between(beginTag, Instant.now());
        // filter out non existent keys/records
        List<HoodieRecord<EmptyHoodieRecordPayload>> taggedValidRecords = taggedRecords.stream().filter(HoodieRecord::isCurrentLocationKnown).collect(Collectors.toList());
        if (!taggedValidRecords.isEmpty()) {
            result = deleteExecutor.execute(taggedValidRecords);
            result.setIndexLookupDuration(tagLocationDuration);
        } else {
            // if entire set of keys are non existent
            deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime);
            result = new HoodieWriteMetadata<>();
            result.setWriteStatuses(Collections.emptyList());
            deleteExecutor.commitOnAutoCommit(result);
        }
        return result;
    } catch (Throwable e) {
        if (e instanceof HoodieUpsertException) {
            throw (HoodieUpsertException) e;
        }
        throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e);
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieList(org.apache.hudi.common.data.HoodieList) HashSet(java.util.HashSet) WriteStatus(org.apache.hudi.client.WriteStatus) List(java.util.List) Duration(java.time.Duration) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieKey(org.apache.hudi.common.model.HoodieKey) WorkloadStat(org.apache.hudi.table.WorkloadStat) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) LinkedList(java.util.LinkedList) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair)
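
The helper above delegates to deduplicateKeys, which the snippet does not show. Below is a minimal sketch of what such a helper can look like, assuming HoodieKey provides value-based equals/hashCode; the real helper in Hudi also handles global indexes (where only the record key is compared) and runs with the configured parallelism.

import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.common.model.HoodieKey;

public class DeleteKeyDedupSketch {

    // Collapse duplicate delete keys so each (partitionPath, recordKey) pair is
    // issued exactly once; distinct() relies on HoodieKey's equals/hashCode.
    public static List<HoodieKey> deduplicateKeys(List<HoodieKey> keys) {
        return keys.stream().distinct().collect(Collectors.toList());
    }
}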

Example 2 with WorkloadProfile

Use of org.apache.hudi.table.WorkloadProfile in project hudi by apache.

From class TestUpsertPartitioner, method testPartitionWeight:

@Test
public void testPartitionWeight() throws Exception {
    final String testPartitionPath = "2016/09/26";
    int totalInsertNum = 2000;
    HoodieWriteConfig config = makeHoodieClientConfigBuilder()
            .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                    .compactionSmallFileSize(0)
                    .insertSplitSize(totalInsertNum / 2)
                    .autoTuneInsertSplits(false)
                    .build())
            .build();
    FileCreateUtils.createCommit(basePath, "001");
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { testPartitionPath });
    List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", totalInsertNum);
    WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
    UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config);
    List<InsertBucketCumulativeWeightPair> insertBuckets = partitioner.getInsertBuckets(testPartitionPath);
    float bucket0Weight = 0.2f;
    InsertBucketCumulativeWeightPair pair = insertBuckets.remove(0);
    pair.getKey().weight = bucket0Weight;
    pair.setValue(Double.valueOf(bucket0Weight));
    insertBuckets.add(0, pair);
    InsertBucketCumulativeWeightPair pair1 = insertBuckets.remove(1);
    pair1.getKey().weight = 1 - bucket0Weight;
    pair1.setValue(Double.valueOf(1.0));
    insertBuckets.add(1, pair1);
    Map<Integer, Integer> partition2numRecords = new HashMap<>();
    for (HoodieRecord hoodieRecord : insertRecords) {
        int partition = partitioner.getPartition(new Tuple2<>(hoodieRecord.getKey(), Option.ofNullable(hoodieRecord.getCurrentLocation())));
        partition2numRecords.merge(partition, 1, Integer::sum);
    }
    assertTrue(partition2numRecords.get(0) < partition2numRecords.get(1), "Bucket 1 should receive more inserts than bucket 0");
    assertEquals(totalInsertNum, partition2numRecords.get(0) + partition2numRecords.get(1), "The total insert records should be " + totalInsertNum);
    assertEquals(String.valueOf(bucket0Weight), String.format("%.1f", (partition2numRecords.get(0) * 1.0f / totalInsertNum)), "The weight of bucket0 should be " + bucket0Weight);
    assertEquals(String.valueOf(1 - bucket0Weight), String.format("%.1f", (partition2numRecords.get(1) * 1.0f / totalInsertNum)), "The weight of bucket1 should be " + (1 - bucket0Weight));
}
Also used : WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieSparkCopyOnWriteTable(org.apache.hudi.table.HoodieSparkCopyOnWriteTable) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Test(org.junit.jupiter.api.Test)
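
The assertions above only hold if insert buckets are chosen by cumulative weight: bucket 0 is pinned to a cumulative weight of 0.2 and bucket 1 to 1.0, so roughly 20% of the 2000 inserts should land in bucket 0. The sketch below illustrates that lookup with a plain probe value; it is not Hudi's implementation, which (to my understanding) derives a comparable deterministic probe from a hash of the record key.

import java.util.List;

public class CumulativeWeightSketch {

    // Return the first bucket whose cumulative weight covers the probe r in [0, 1).
    // With cumulative weights [0.2, 1.0], any r below 0.2 lands in bucket 0 and
    // everything else lands in bucket 1, matching the 20/80 split asserted above.
    static int pickBucket(List<Double> cumulativeWeights, double r) {
        for (int i = 0; i < cumulativeWeights.size(); i++) {
            if (r <= cumulativeWeights.get(i)) {
                return i;
            }
        }
        return cumulativeWeights.size() - 1;
    }

    public static void main(String[] args) {
        List<Double> cumulativeWeights = List.of(0.2, 1.0);
        System.out.println(pickBucket(cumulativeWeights, 0.05)); // 0
        System.out.println(pickBucket(cumulativeWeights, 0.73)); // 1
    }
}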

Example 3 with WorkloadProfile

Use of org.apache.hudi.table.WorkloadProfile in project hudi by apache.

From class TestUpsertPartitioner, method getUpsertPartitioner:

private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, int numUpdates, int fileSize, String testPartitionPath, boolean autoSplitInserts) throws Exception {
    HoodieWriteConfig config = makeHoodieClientConfigBuilder()
            .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                    .compactionSmallFileSize(smallFileSize)
                    .insertSplitSize(100)
                    .autoTuneInsertSplits(autoSplitInserts)
                    .build())
            .withStorageConfig(HoodieStorageConfig.newBuilder()
                    .hfileMaxFileSize(1000 * 1024)
                    .parquetMaxFileSize(1000 * 1024)
                    .orcMaxFileSize(1000 * 1024)
                    .build())
            .build();
    FileCreateUtils.createCommit(basePath, "001");
    FileCreateUtils.createBaseFile(basePath, testPartitionPath, "001", "file1", fileSize);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { testPartitionPath });
    List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
    List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
    for (HoodieRecord updateRec : updateRecords) {
        updateRec.unseal();
        updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
        updateRec.seal();
    }
    List<HoodieRecord> records = new ArrayList<>();
    records.addAll(insertRecords);
    records.addAll(updateRecords);
    WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(records)));
    UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config);
    assertEquals(0, partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(), Option.ofNullable(updateRecords.get(0).getCurrentLocation()))), "Update record should have gone to the single update partition");
    return partitioner;
}
Also used : WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieSparkCopyOnWriteTable(org.apache.hudi.table.HoodieSparkCopyOnWriteTable) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Tuple2(scala.Tuple2) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator)
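
This helper mixes fresh inserts with updates pinned to the existing base file file1, so the partitioner has to estimate how many inserts still fit into that small file before spilling into new file groups. The estimate is essentially (maxFileSize - currentSize) / averageRecordSize; the sketch below illustrates the arithmetic with hypothetical names (Hudi derives the average record size from earlier commit metadata).

public class SmallFileCapacitySketch {

    // Rough capacity estimate for topping up a small base file: how many new
    // records fit before the file reaches the configured maximum size.
    static long recordsThatFit(long maxFileSizeBytes, long currentFileSizeBytes, long avgRecordSizeBytes) {
        long remainingBytes = maxFileSizeBytes - currentFileSizeBytes;
        return remainingBytes <= 0 ? 0 : remainingBytes / avgRecordSizeBytes;
    }

    public static void main(String[] args) {
        // With the 1000 KB max file size configured above, a 600 KB base file
        // and ~1 KB records leave room for roughly 400 more records.
        System.out.println(recordsThatFit(1000 * 1024, 600 * 1024, 1024)); // 400
    }
}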

Example 4 with WorkloadProfile

Use of org.apache.hudi.table.WorkloadProfile in project hudi by apache.

From class TestUpsertPartitioner, method testUpsertPartitionerWithSmallFileHandlingWithInflightCompactionWithCanIndexLogFiles:

@Test
public void testUpsertPartitionerWithSmallFileHandlingWithInflightCompactionWithCanIndexLogFiles() throws Exception {
    // Note: this partition path matches the one used in CompactionTestUtils.createCompactionPlan()
    final String testPartitionPath = DEFAULT_PARTITION_PATHS[0];
    HoodieWriteConfig config = makeHoodieClientConfigBuilder()
            .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024).build())
            .withIndexConfig(HoodieIndexConfig.newBuilder()
                    .withIndexType(HoodieIndex.IndexType.HBASE)
                    .withHBaseIndexConfig(HoodieHBaseIndexConfig.newBuilder().build())
                    .build())
            .build();
    // This will generate initial commits and create a compaction plan which includes file groups created as part of this
    HoodieCompactionPlan plan = CompactionTestUtils.createCompactionPlan(metaClient, "001", "002", 1, true, false);
    FileCreateUtils.createRequestedCompactionCommit(basePath, "002", plan);
    // Simulate one more commit so that inflight compaction is considered when building file groups in file system view
    FileCreateUtils.createBaseFile(basePath, testPartitionPath, "003", "2", 1);
    FileCreateUtils.createCommit(basePath, "003");
    // Partitioner will attempt to assign inserts to file groups including base file created by inflight compaction
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { testPartitionPath });
    List<HoodieRecord> insertRecords = dataGenerator.generateInserts("004", 100);
    WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
    HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient);
    SparkUpsertDeltaCommitPartitioner partitioner = new SparkUpsertDeltaCommitPartitioner(profile, context, table, config);
    assertEquals(1, partitioner.numPartitions(), "Should have 1 partition");
    assertEquals(BucketType.UPDATE, partitioner.getBucketInfo(0).bucketType, "Bucket 0 is UPDATE");
    assertEquals("2", partitioner.getBucketInfo(0).fileIdPrefix, "Should be assigned to only file id not pending compaction which is 2");
}
Also used : WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) SparkUpsertDeltaCommitPartitioner(org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitPartitioner) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) Test(org.junit.jupiter.api.Test)

Example 5 with WorkloadProfile

Use of org.apache.hudi.table.WorkloadProfile in project hudi by apache.

From class TestUpsertPartitioner, method testUpsertPartitionerWithSmallFileHandlingAndClusteringPlan:

@Test
public void testUpsertPartitionerWithSmallFileHandlingAndClusteringPlan() throws Exception {
    final String testPartitionPath = DEFAULT_PARTITION_PATHS[0];
    // create HoodieWriteConfig with inline and async clustering disabled
    HoodieWriteConfig config = makeHoodieClientConfigBuilder()
            .withCompactionConfig(HoodieCompactionConfig.newBuilder().build())
            .withClusteringConfig(HoodieClusteringConfig.newBuilder()
                    .withInlineClustering(false)
                    .withAsyncClustering(false)
                    .build())
            .withStorageConfig(HoodieStorageConfig.newBuilder()
                    .hfileMaxFileSize(1000 * 1024)
                    .parquetMaxFileSize(1000 * 1024)
                    .build())
            .build();
    // create file slice with instantTime 001 and build clustering plan including this created 001 file slice.
    HoodieClusteringPlan clusteringPlan = ClusteringTestUtils.createClusteringPlan(metaClient, "001", "1");
    // create requested replace commit
    HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder().setClusteringPlan(clusteringPlan).setOperationType(WriteOperationType.CLUSTER.name()).build();
    FileCreateUtils.createRequestedReplaceCommit(basePath, "002", Option.of(requestedReplaceMetadata));
    // create file slice 003
    FileCreateUtils.createBaseFile(basePath, testPartitionPath, "003", "3", 1);
    FileCreateUtils.createCommit(basePath, "003");
    metaClient = HoodieTableMetaClient.reload(metaClient);
    // generate new data to be ingested
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { testPartitionPath });
    List<HoodieRecord> insertRecords = dataGenerator.generateInserts("004", 100);
    WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
    HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient);
    // create UpsertPartitioner
    UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config);
    // File slice 1 and file slice 3 exist, but file slice 1 is part of the pending clustering plan,
    // so only file slice 3 can be used for ingestion.
    assertEquals(1, partitioner.smallFiles.size(), "Should have 1 small file to be ingested.");
}
Also used : WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) Test(org.junit.jupiter.api.Test)
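
This test and the previous one hinge on the same rule: file groups already claimed by a pending compaction or clustering plan must be excluded from small-file assignment. A simplified sketch of that filtering follows; SmallFile here is a hypothetical stand-in for Hudi's small-file bookkeeping, not its actual type.

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class PendingFileGroupFilterSketch {

    // Hypothetical stand-in for a candidate small file (file id plus current size).
    record SmallFile(String fileId, long sizeBytes) {}

    // Keep only candidates whose file group is not referenced by a pending plan,
    // so new inserts never collide with an in-flight compaction or clustering.
    static List<SmallFile> excludePending(List<SmallFile> candidates, Set<String> pendingFileIds) {
        return candidates.stream()
                .filter(f -> !pendingFileIds.contains(f.fileId()))
                .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        List<SmallFile> candidates = List.of(new SmallFile("1", 512), new SmallFile("3", 256));
        // File group "1" is part of the pending clustering plan created above,
        // so only file group "3" remains eligible: one small file, as asserted.
        System.out.println(excludePending(candidates, Set.of("1")));
    }
}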

Aggregations

WorkloadProfile (org.apache.hudi.table.WorkloadProfile): 12
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 9
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 8
HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 6
HoodieWriteMetadata (org.apache.hudi.table.action.HoodieWriteMetadata): 5
Test (org.junit.jupiter.api.Test): 5
List (java.util.List): 4
WriteStatus (org.apache.hudi.client.WriteStatus): 4
HoodieUpsertException (org.apache.hudi.exception.HoodieUpsertException): 4
WorkloadStat (org.apache.hudi.table.WorkloadStat): 4
Duration (java.time.Duration): 3
Instant (java.time.Instant): 3
HashMap (java.util.HashMap): 3
LinkedList (java.util.LinkedList): 3
HoodieData (org.apache.hudi.common.data.HoodieData): 3
HoodieList (org.apache.hudi.common.data.HoodieList): 3
EmptyHoodieRecordPayload (org.apache.hudi.common.model.EmptyHoodieRecordPayload): 3
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 3
HoodieKey (org.apache.hudi.common.model.HoodieKey): 3
HoodieSparkTable (org.apache.hudi.table.HoodieSparkTable): 3