Search in sources:

Example 1 with SparkUpsertDeltaCommitPartitioner

Use of org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitPartitioner in project hudi by apache.

From the class TestUpsertPartitioner, method testUpsertPartitionerWithSmallFileHandlingWithInflightCompactionWithCanIndexLogFiles.

@Test
public void testUpsertPartitionerWithSmallFileHandlingWithInflightCompactionWithCanIndexLogFiles() throws Exception {
    // Note: this partition path is used because it is the same one used in CompactionTestUtils.createCompactionPlan()
    final String testPartitionPath = DEFAULT_PARTITION_PATHS[0];
    HoodieWriteConfig config = makeHoodieClientConfigBuilder()
        .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024).build())
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(HoodieIndex.IndexType.HBASE)
            .withHBaseIndexConfig(HoodieHBaseIndexConfig.newBuilder().build())
            .build())
        .build();
    // This generates the initial commits and creates a compaction plan that includes the file groups created as part of it
    HoodieCompactionPlan plan = CompactionTestUtils.createCompactionPlan(metaClient, "001", "002", 1, true, false);
    FileCreateUtils.createRequestedCompactionCommit(basePath, "002", plan);
    // Simulate one more commit so that the inflight compaction is considered when building file groups in the file system view
    FileCreateUtils.createBaseFile(basePath, testPartitionPath, "003", "2", 1);
    FileCreateUtils.createCommit(basePath, "003");
    // The partitioner will attempt to assign inserts to file groups, including the base file created by the inflight compaction
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { testPartitionPath });
    List<HoodieRecord> insertRecords = dataGenerator.generateInserts("004", 100);
    WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
    HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient);
    SparkUpsertDeltaCommitPartitioner partitioner = new SparkUpsertDeltaCommitPartitioner(profile, context, table, config);
    assertEquals(1, partitioner.numPartitions(), "Should have 1 partition");
    assertEquals(BucketType.UPDATE, partitioner.getBucketInfo(0).bucketType, "Bucket 0 should be UPDATE");
    assertEquals("2", partitioner.getBucketInfo(0).fileIdPrefix, "Insert should be assigned to the only file id not pending compaction, which is 2");
}
Also used: WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) SparkUpsertDeltaCommitPartitioner(org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitPartitioner) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) Test(org.junit.jupiter.api.Test)
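
For quick inspection while iterating on a test like this, the same accessors the assertions use (numPartitions() and getBucketInfo(int), whose public bucketType and fileIdPrefix fields appear throughout these examples) can be looped over directly. A minimal sketch; the dumpBuckets helper is hypothetical and not part of TestUpsertPartitioner:

// Hypothetical debugging helper, not part of the Hudi test class: prints every
// bucket the partitioner planned, using only accessors shown in the examples above.
private static void dumpBuckets(SparkUpsertDeltaCommitPartitioner<?> partitioner) {
    for (int i = 0; i < partitioner.numPartitions(); i++) {
        BucketInfo bucket = partitioner.getBucketInfo(i);
        System.out.println("bucket " + i + ": type=" + bucket.bucketType + ", fileIdPrefix=" + bucket.fileIdPrefix);
    }
}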

Example 2 with SparkUpsertDeltaCommitPartitioner

Use of org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitPartitioner in project hudi by apache.

From the class TestUpsertPartitioner, method testUpsertPartitionerWithSmallFileHandlingWithCanIndexLogFiles.

@Test
public void testUpsertPartitionerWithSmallFileHandlingWithCanIndexLogFiles() throws Exception {
    // Note: this partition path is used because it is the same one used in CompactionTestUtils.createCompactionPlan()
    final String testPartitionPath = DEFAULT_PARTITION_PATHS[0];
    HoodieWriteConfig config = makeHoodieClientConfigBuilder()
        .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024).build())
        .withStorageConfig(HoodieStorageConfig.newBuilder().parquetMaxFileSize(1024).build())
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(HoodieIndex.IndexType.HBASE)
            .withHBaseIndexConfig(HoodieHBaseIndexConfig.newBuilder().build())
            .build())
        .build();
    // Create a file group with only one log file
    FileCreateUtils.createLogFile(basePath, testPartitionPath, "001", "fg1", 1);
    FileCreateUtils.createDeltaCommit(basePath, "001");
    // Create another file group whose base file size equals the max parquet file size, so it is not considered during small file sizing
    FileCreateUtils.createBaseFile(basePath, testPartitionPath, "002", "fg2", 1024);
    FileCreateUtils.createCommit(basePath, "002");
    FileCreateUtils.createLogFile(basePath, testPartitionPath, "003", "fg2", 1);
    FileCreateUtils.createDeltaCommit(basePath, "003");
    // The partitioner should treat the log-file-only file group (fg1) as a small-file candidate, since the HBase index can index log files
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { testPartitionPath });
    // The default estimated record size will be 1024, based on the last file group created; only 1 record can be added to the small file
    List<HoodieRecord> insertRecords = dataGenerator.generateInserts("004", 1);
    WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
    HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient);
    SparkUpsertDeltaCommitPartitioner partitioner = new SparkUpsertDeltaCommitPartitioner(profile, context, table, config);
    assertEquals(1, partitioner.numPartitions(), "Should have 1 partition");
    assertEquals(BucketType.UPDATE, partitioner.getBucketInfo(0).bucketType, "Bucket 0 should be UPDATE");
    assertEquals("fg1", partitioner.getBucketInfo(0).fileIdPrefix, "Insert should be assigned to fg1");
}
Also used: WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) SparkUpsertDeltaCommitPartitioner(org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitPartitioner) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) Test(org.junit.jupiter.api.Test)
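
The single-record insert in this test follows from simple capacity arithmetic. A back-of-the-envelope sketch of that reasoning (illustrative only; the actual sizing logic lives inside the partitioner and accounts for more than this):

// Illustrative arithmetic, not the Hudi implementation: fg2's base file already
// sits at the 1024-byte parquetMaxFileSize, so it has no spare capacity, while
// fg1 (log file only, so base file size 0) can absorb exactly one record.
long parquetMaxFileSize = 1024;  // from the HoodieStorageConfig above
long estimatedRecordSize = 1024; // derived from the last file group created
long fg1BaseFileSize = 0;        // fg1 has only a log file
long recordsThatFitInFg1 = (parquetMaxFileSize - fg1BaseFileSize) / estimatedRecordSize; // = 1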

Example 3 with SparkUpsertDeltaCommitPartitioner

Use of org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitPartitioner in project hudi by apache.

From the class TestUpsertPartitioner, method testUpsertPartitionerWithSmallFileHandlingPickingMultipleCandidates.

@Test
public void testUpsertPartitionerWithSmallFileHandlingPickingMultipleCandidates() throws Exception {
    final String partitionPath = DEFAULT_PARTITION_PATHS[0];
    HoodieWriteConfig config = makeHoodieClientConfigBuilder()
        .withMergeSmallFileGroupCandidatesLimit(3)
        .withStorageConfig(HoodieStorageConfig.newBuilder().parquetMaxFileSize(2048).build())
        .build();
    // Bootstrap base files ("small-file targets")
    FileCreateUtils.createBaseFile(basePath, partitionPath, "002", "fg-1", 1024);
    FileCreateUtils.createBaseFile(basePath, partitionPath, "002", "fg-2", 1024);
    FileCreateUtils.createBaseFile(basePath, partitionPath, "002", "fg-3", 1024);
    FileCreateUtils.createCommit(basePath, "002");
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { partitionPath });
    // Default estimated record size will be 1024, based on the last file group created;
    // only 1 record fits in each small file, so the 3 inserts below spread across the 3 candidates
    WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(dataGenerator.generateInserts("003", 3))));
    HoodieTableMetaClient reloadedMetaClient = HoodieTableMetaClient.reload(this.metaClient);
    HoodieSparkTable<?> table = HoodieSparkTable.create(config, context, reloadedMetaClient);
    SparkUpsertDeltaCommitPartitioner<?> partitioner = new SparkUpsertDeltaCommitPartitioner<>(profile, context, table, config);
    assertEquals(3, partitioner.numPartitions());
    assertEquals(Arrays.asList(
        new BucketInfo(BucketType.UPDATE, "fg-1", partitionPath),
        new BucketInfo(BucketType.UPDATE, "fg-2", partitionPath),
        new BucketInfo(BucketType.UPDATE, "fg-3", partitionPath)),
        partitioner.getBucketInfos());
}
Also used: WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) SparkUpsertDeltaCommitPartitioner(org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitPartitioner) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Test(org.junit.jupiter.api.Test)
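
The list comparison above can also be unrolled into per-bucket checks, which pinpoints the offending file group when an assertion fails. An equivalent phrasing (a sketch that uses only accessors already shown in these examples):

// Same checks as the single list assertion above, one bucket at a time.
List<String> expectedFileIds = Arrays.asList("fg-1", "fg-2", "fg-3");
for (int i = 0; i < expectedFileIds.size(); i++) {
    BucketInfo bucket = partitioner.getBucketInfo(i);
    assertEquals(BucketType.UPDATE, bucket.bucketType, "Bucket " + i + " should be UPDATE");
    assertEquals(expectedFileIds.get(i), bucket.fileIdPrefix, "Bucket " + i + " should target " + expectedFileIds.get(i));
}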

Aggregations

HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 3
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 3
WorkloadProfile (org.apache.hudi.table.WorkloadProfile): 3
SparkUpsertDeltaCommitPartitioner (org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitPartitioner): 3
Test (org.junit.jupiter.api.Test): 3
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 2
HoodieSparkTable (org.apache.hudi.table.HoodieSparkTable): 2
HoodieCompactionPlan (org.apache.hudi.avro.model.HoodieCompactionPlan): 1
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 1