Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From the class TestHoodieMergeOnReadTable, method setUp:
@BeforeEach
void setUp() throws IOException {
  Properties properties = new Properties();
  properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString());
  metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
  dataGen = new HoodieTestDataGenerator();
}
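A minimal usage sketch (not part of the original test) of how the dataGen field initialized above is typically exercised inside a test body of this class: generate inserts for an instant and write them through a Spark write client. The getConfigBuilder, getHoodieWriteClient, jsc() and assertNoWriteErrors helpers are assumptions about the surrounding test harness, not confirmed members of this class.

// Sketch only: helper names are assumed from common Hudi test scaffolding.
HoodieWriteConfig cfg = getConfigBuilder(true).build();
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
  String newCommitTime = "001";
  client.startCommitWithTime(newCommitTime);
  // Generate 200 inserts for instant "001" with the data generator created in setUp.
  List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
  List<WriteStatus> statuses = client.insert(jsc().parallelize(records, 1), newCommitTime).collect();
  assertNoWriteErrors(statuses);
}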
From the class TestUpsertPartitioner, method testPartitionWeight:
@Test
public void testPartitionWeight() throws Exception {
  final String testPartitionPath = "2016/09/26";
  int totalInsertNum = 2000;
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(0)
          .insertSplitSize(totalInsertNum / 2)
          .autoTuneInsertSplits(false)
          .build())
      .build();
  FileCreateUtils.createCommit(basePath, "001");
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath});
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", totalInsertNum);
  WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
  UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config);
  List<InsertBucketCumulativeWeightPair> insertBuckets = partitioner.getInsertBuckets(testPartitionPath);

  // Rewrite the bucket weights so bucket 0 carries 20% of the inserts and bucket 1 the remaining 80%.
  float bucket0Weight = 0.2f;
  InsertBucketCumulativeWeightPair pair = insertBuckets.remove(0);
  pair.getKey().weight = bucket0Weight;
  pair.setValue((double) bucket0Weight);
  insertBuckets.add(0, pair);
  InsertBucketCumulativeWeightPair pair1 = insertBuckets.remove(1);
  pair1.getKey().weight = 1 - bucket0Weight;
  pair1.setValue(1.0);
  insertBuckets.add(1, pair1);

  // Route every record through the partitioner and count how many land in each bucket.
  Map<Integer, Integer> partition2numRecords = new HashMap<>();
  for (HoodieRecord hoodieRecord : insertRecords) {
    int partition = partitioner.getPartition(new Tuple2<>(hoodieRecord.getKey(), Option.ofNullable(hoodieRecord.getCurrentLocation())));
    partition2numRecords.merge(partition, 1, Integer::sum);
  }
  assertTrue(partition2numRecords.get(0) < partition2numRecords.get(1), "The insert num of bucket1 should be larger than that of bucket0");
  assertEquals(totalInsertNum, partition2numRecords.get(0) + partition2numRecords.get(1), "The total insert records should be " + totalInsertNum);
  assertEquals(String.valueOf(bucket0Weight), String.format("%.1f", partition2numRecords.get(0) * 1.0f / totalInsertNum), "The weight of bucket0 should be " + bucket0Weight);
  assertEquals(String.valueOf(1 - bucket0Weight), String.format("%.1f", partition2numRecords.get(1) * 1.0f / totalInsertNum), "The weight of bucket1 should be " + (1 - bucket0Weight));
}
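Worked arithmetic behind the assertions above (a note, not part of the original test): with totalInsertNum = 2000 and bucket0Weight = 0.2f, roughly 400 records are expected in bucket 0 and roughly 1600 in bucket 1, so the formatted ratios compare as "0.2" and "0.8".

// Expected distribution implied by the weights set in the test.
int expectedBucket0 = Math.round(totalInsertNum * bucket0Weight); // 2000 * 0.2 = 400
int expectedBucket1 = totalInsertNum - expectedBucket0;           // 2000 - 400 = 1600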
From the class TestUpsertPartitioner, method getUpsertPartitioner:
private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, int numUpdates, int fileSize, String testPartitionPath, boolean autoSplitInserts) throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(smallFileSize)
          .insertSplitSize(100)
          .autoTuneInsertSplits(autoSplitInserts)
          .build())
      .withStorageConfig(HoodieStorageConfig.newBuilder()
          .hfileMaxFileSize(1000 * 1024)
          .parquetMaxFileSize(1000 * 1024)
          .orcMaxFileSize(1000 * 1024)
          .build())
      .build();
  FileCreateUtils.createCommit(basePath, "001");
  FileCreateUtils.createBaseFile(basePath, testPartitionPath, "001", "file1", fileSize);
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath});
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
  List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
  // Tag every update with the location of the existing base file so the partitioner treats it as an update.
  for (HoodieRecord updateRec : updateRecords) {
    updateRec.unseal();
    updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
    updateRec.seal();
  }
  List<HoodieRecord> records = new ArrayList<>();
  records.addAll(insertRecords);
  records.addAll(updateRecords);
  WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(records)));
  UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config);
  assertEquals(0, partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(), Option.ofNullable(updateRecords.get(0).getCurrentLocation()))), "Update record should have gone to the single update partition");
  return partitioner;
}
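A hedged usage sketch for the helper above; the argument values are illustrative only and not taken from the original tests. It shows the typical pattern: build a partitioner around a single existing base file and then inspect how inserts are bin-packed into it.

// Illustrative values: 1000 KB small-file threshold, 400 inserts, 100 updates, an 800 KB base file.
UpsertPartitioner partitioner =
    getUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024, "2016/09/26", false);
List<InsertBucketCumulativeWeightPair> insertBuckets = partitioner.getInsertBuckets("2016/09/26");
// With small-file handling on, part of the 400 inserts is expected to be routed into the
// existing under-sized file group instead of opening new insert buckets.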
From the class TestUpsertPartitioner, method testUpsertPartitionerWithSmallFileHandlingWithInflightCompactionWithCanIndexLogFiles:
@Test
public void testUpsertPartitionerWithSmallFileHandlingWithInflightCompactionWithCanIndexLogFiles() throws Exception {
  // Note: this partition path is used because it is the same one used in CompactionTestUtils.createCompactionPlan().
  final String testPartitionPath = DEFAULT_PARTITION_PATHS[0];
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024).build())
      .withIndexConfig(HoodieIndexConfig.newBuilder()
          .withIndexType(HoodieIndex.IndexType.HBASE)
          .withHBaseIndexConfig(HoodieHBaseIndexConfig.newBuilder().build())
          .build())
      .build();
  // This generates the initial commits and creates a compaction plan that includes the file groups created as part of it.
  HoodieCompactionPlan plan = CompactionTestUtils.createCompactionPlan(metaClient, "001", "002", 1, true, false);
  FileCreateUtils.createRequestedCompactionCommit(basePath, "002", plan);
  // Simulate one more commit so that the inflight compaction is considered when building file groups in the file system view.
  FileCreateUtils.createBaseFile(basePath, testPartitionPath, "003", "2", 1);
  FileCreateUtils.createCommit(basePath, "003");
  // The partitioner will attempt to assign inserts to file groups, including the base file created by the inflight compaction.
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath});
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("004", 100);
  WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
  HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient);
  SparkUpsertDeltaCommitPartitioner partitioner = new SparkUpsertDeltaCommitPartitioner(profile, context, table, config);
  assertEquals(1, partitioner.numPartitions(), "Should have 1 partition");
  assertEquals(BucketType.UPDATE, partitioner.getBucketInfo(0).bucketType, "Bucket 0 should be an UPDATE bucket");
  assertEquals("2", partitioner.getBucketInfo(0).fileIdPrefix, "Should be assigned to the only file id not pending compaction, which is 2");
}
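A possible follow-up inside the same test (an assumption, not part of the original snippet): since the partitioner exposes a single UPDATE bucket targeting file id "2", every generated record should map to partition 0, which can be checked the same way testPartitionWeight routes records.

// Sketch: route the generated inserts through the single bucket and confirm the assignment.
for (HoodieRecord rec : insertRecords) {
  int bucket = partitioner.getPartition(
      new Tuple2<>(rec.getKey(), Option.ofNullable(rec.getCurrentLocation())));
  assertEquals(0, bucket, "All records should land in the single UPDATE bucket");
}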
From the class TestUpsertPartitioner, method testUpsertPartitionerWithSmallFileHandlingAndClusteringPlan:
@Test
public void testUpsertPartitionerWithSmallFileHandlingAndClusteringPlan() throws Exception {
  final String testPartitionPath = DEFAULT_PARTITION_PATHS[0];
  // Create a HoodieWriteConfig with both inline and async clustering disabled.
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder().build())
      .withClusteringConfig(HoodieClusteringConfig.newBuilder()
          .withInlineClustering(false)
          .withAsyncClustering(false)
          .build())
      .withStorageConfig(HoodieStorageConfig.newBuilder()
          .hfileMaxFileSize(1000 * 1024)
          .parquetMaxFileSize(1000 * 1024)
          .build())
      .build();
  // Create a file slice at instant 001 and build a clustering plan that includes this 001 file slice.
  HoodieClusteringPlan clusteringPlan = ClusteringTestUtils.createClusteringPlan(metaClient, "001", "1");
  // Create the requested replace commit for the clustering plan.
  HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder()
      .setClusteringPlan(clusteringPlan)
      .setOperationType(WriteOperationType.CLUSTER.name())
      .build();
  FileCreateUtils.createRequestedReplaceCommit(basePath, "002", Option.of(requestedReplaceMetadata));
  // Create another file slice at instant 003.
  FileCreateUtils.createBaseFile(basePath, testPartitionPath, "003", "3", 1);
  FileCreateUtils.createCommit(basePath, "003");
  metaClient = HoodieTableMetaClient.reload(metaClient);
  // Generate new data to be ingested.
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath});
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("004", 100);
  WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(insertRecords)));
  HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient);
  // Create the UpsertPartitioner under test.
  UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config);
  // At this point there are two file slices (001 and 003); the 001 slice is part of the pending
  // clustering plan, so only the 003 slice is eligible to absorb new inserts.
  assertEquals(1, partitioner.smallFiles.size(), "Should have 1 small file to be ingested.");
}
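A possible follow-up check (an assumption, not in the original snippet): the single small file picked up should be the base file "3" written at instant 003, since the 001 slice is pending clustering. This assumes UpsertPartitioner.SmallFile exposes its HoodieRecordLocation through a public location field.

// Sketch: verify the small file chosen for ingestion is the 003 base file, not the clustering one.
assertEquals("3", partitioner.smallFiles.get(0).location.getFileId(),
    "Only the file slice not under pending clustering should be treated as a small file");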