
Example 41 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

From the class TestKafkaOffsetGen, method testGetNextOffsetRangesFromEarliest.

@Test
public void testGetNextOffsetRangesFromEarliest() {
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    testUtils.createTopic(TEST_TOPIC_NAME, 1);
    testUtils.sendMessages(TEST_TOPIC_NAME, Helpers.jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
    KafkaOffsetGen kafkaOffsetGen = new KafkaOffsetGen(getConsumerConfigs("earliest", "string"));
    OffsetRange[] nextOffsetRanges = kafkaOffsetGen.getNextOffsetRanges(Option.empty(), 500, metrics);
    assertEquals(1, nextOffsetRanges.length);
    assertEquals(0, nextOffsetRanges[0].fromOffset());
    assertEquals(500, nextOffsetRanges[0].untilOffset());
    nextOffsetRanges = kafkaOffsetGen.getNextOffsetRanges(Option.empty(), 5000, metrics);
    assertEquals(1, nextOffsetRanges.length);
    assertEquals(0, nextOffsetRanges[0].fromOffset());
    assertEquals(1000, nextOffsetRanges[0].untilOffset());
}
Also used: OffsetRange (org.apache.spark.streaming.kafka010.OffsetRange), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), Test (org.junit.jupiter.api.Test)
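
The getConsumerConfigs("earliest", "string") helper is not shown in the example. The sketch below is a hypothetical reconstruction of the kind of TypedProperties such a helper assembles: the Kafka property names are standard consumer settings and hoodie.deltastreamer.source.kafka.topic is Hudi's source-topic key, but the helper's actual body, the group id, and the checkpoint-type key are assumptions.

// Hypothetical sketch of a getConsumerConfigs-style helper; not the actual test code.
private TypedProperties getConsumerConfigsSketch(String autoOffsetReset, String kafkaCheckpointType) {
    TypedProperties props = new TypedProperties();
    // standard Kafka consumer settings; testUtils is assumed to expose the embedded broker address
    props.setProperty("auto.offset.reset", autoOffsetReset);
    props.setProperty("bootstrap.servers", testUtils.brokerAddress());
    props.setProperty("group.id", "test-kafka-offset-gen");
    props.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    props.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    // Hudi source topic; the second argument presumably selects the checkpoint type (assumed key)
    props.setProperty("hoodie.deltastreamer.source.kafka.topic", TEST_TOPIC_NAME);
    props.setProperty("hoodie.deltastreamer.source.kafka.checkpoint.type", kafkaCheckpointType);
    return props;
}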

Example 42 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

From the class TestKafkaOffsetGen, method testGetNextOffsetRangesFromMultiplePartitions.

@Test
public void testGetNextOffsetRangesFromMultiplePartitions() {
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    testUtils.createTopic(TEST_TOPIC_NAME, 2);
    testUtils.sendMessages(TEST_TOPIC_NAME, Helpers.jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
    KafkaOffsetGen kafkaOffsetGen = new KafkaOffsetGen(getConsumerConfigs("earliest", "string"));
    OffsetRange[] nextOffsetRanges = kafkaOffsetGen.getNextOffsetRanges(Option.empty(), 499, metrics);
    assertEquals(2, nextOffsetRanges.length);
    assertEquals(0, nextOffsetRanges[0].fromOffset());
    assertEquals(250, nextOffsetRanges[0].untilOffset());
    assertEquals(0, nextOffsetRanges[1].fromOffset());
    assertEquals(249, nextOffsetRanges[1].untilOffset());
}
Also used: OffsetRange (org.apache.spark.streaming.kafka010.OffsetRange), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), Test (org.junit.jupiter.api.Test)
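
The two assertions above imply that KafkaOffsetGen spreads the 499-event limit roughly evenly across the two partitions. The arithmetic sketch below only shows a split consistent with those assertions; it is not the actual KafkaOffsetGen allocation code.

// Minimal arithmetic sketch: spread sourceLimit events over numPartitions so that
// the per-partition until-offsets come out as [250, 249] for (499, 2).
static long[] splitEvenlySketch(long sourceLimit, int numPartitions) {
    long base = sourceLimit / numPartitions;        // 249 for (499, 2)
    long remainder = sourceLimit % numPartitions;   // 1 for (499, 2)
    long[] untilOffsets = new long[numPartitions];
    for (int p = 0; p < numPartitions; p++) {
        // the first 'remainder' partitions absorb one extra event each
        untilOffsets[p] = base + (p < remainder ? 1 : 0);
    }
    return untilOffsets;                            // [250, 249] for (499, 2)
}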

Example 43 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

From the class TestHoodieClientOnCopyOnWriteStorage, method testDeletesWithoutInserts.

/**
 * Test delete (via the delete API) when no records have been inserted yet.
 */
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testDeletesWithoutInserts(boolean populateMetaFields) {
    final String testPartitionPath = "2016/09/26";
    final int insertSplitLimit = 100;
    // setup the small file handling params
    HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields ? new Properties() : getPropertiesForKeyGen());
    dataGen = new HoodieTestDataGenerator(new String[] { testPartitionPath });
    SparkRDDWriteClient client = getHoodieWriteClient(config);
    // delete non existent keys
    String commitTime1 = "001";
    client.startCommitWithTime(commitTime1);
    List<HoodieRecord> dummyInserts = dataGen.generateInserts(commitTime1, 20);
    List<HoodieKey> hoodieKeysToDelete = randomSelectAsHoodieKeys(dummyInserts, 20);
    JavaRDD<HoodieKey> deleteKeys = jsc.parallelize(hoodieKeysToDelete, 1);
    assertThrows(HoodieIOException.class, () -> {
        client.delete(deleteKeys, commitTime1).collect();
    }, "Should have thrown Exception");
}
Also used: SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), HoodieKey (org.apache.hudi.common.model.HoodieKey), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), Properties (java.util.Properties), TypedProperties (org.apache.hudi.common.config.TypedProperties), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), MethodSource (org.junit.jupiter.params.provider.MethodSource)
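
randomSelectAsHoodieKeys comes from Hudi's Transformations test utilities and its body is not reproduced on this page. A hypothetical equivalent, purely to illustrate what the call above consumes and produces:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;

// Hypothetical stand-in for Transformations.randomSelectAsHoodieKeys:
// shuffle the records, keep n of them, and project each to its HoodieKey.
static List<HoodieKey> randomSelectAsHoodieKeysSketch(List<HoodieRecord> records, int n) {
    List<HoodieRecord> copy = new ArrayList<>(records);
    Collections.shuffle(copy);
    return copy.stream().limit(n).map(HoodieRecord::getKey).collect(Collectors.toList());
}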

Example 44 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

From the class TestHoodieClientOnCopyOnWriteStorage, method testDeletesWithDeleteApi.

/**
 * Test delete with delete api.
 */
@Test
public void testDeletesWithDeleteApi() throws Exception {
    final String testPartitionPath = "2016/09/26";
    final int insertSplitLimit = 100;
    // setup the small file handling params
    // hold up to 200 records max
    HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150));
    dataGen = new HoodieTestDataGenerator(new String[] { testPartitionPath });
    SparkRDDWriteClient client = getHoodieWriteClient(config);
    // Inserts => will write file1
    String commitTime1 = "001";
    client.startCommitWithTime(commitTime1);
    // this writes ~500kb
    List<HoodieRecord> inserts1 = dataGen.generateInserts(commitTime1, insertSplitLimit);
    Set<String> keys1 = recordsToRecordKeySet(inserts1);
    List<String> keysSoFar = new ArrayList<>(keys1);
    JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 1);
    List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime1).collect();
    assertNoWriteErrors(statuses);
    assertEquals(1, statuses.size(), "Just 1 file needs to be added.");
    String file1 = statuses.get(0).getFileId();
    assertEquals(100, BaseFileUtils.getInstance(metaClient).readRowKeys(hadoopConf, new Path(basePath, statuses.get(0).getStat().getPath())).size(), "file should contain 100 records");
    // Delete 20 among 100 inserted
    testDeletes(client, inserts1, 20, file1, "002", 80, keysSoFar);
    // Insert and update 40 records
    Pair<Set<String>, List<HoodieRecord>> updateBatch2 = testUpdates("003", client, 40, 120);
    keysSoFar.addAll(updateBatch2.getLeft());
    // Delete 10 records among 40 updated
    testDeletes(client, updateBatch2.getRight(), 10, file1, "004", 110, keysSoFar);
    // do another batch of updates
    Pair<Set<String>, List<HoodieRecord>> updateBatch3 = testUpdates("005", client, 40, 150);
    keysSoFar.addAll(updateBatch3.getLeft());
    // delete non existent keys
    String commitTime6 = "006";
    client.startCommitWithTime(commitTime6);
    List<HoodieRecord> dummyInserts3 = dataGen.generateInserts(commitTime6, 20);
    List<HoodieKey> hoodieKeysToDelete3 = randomSelectAsHoodieKeys(dummyInserts3, 20);
    JavaRDD<HoodieKey> deleteKeys3 = jsc.parallelize(hoodieKeysToDelete3, 1);
    statuses = client.delete(deleteKeys3, commitTime6).collect();
    assertNoWriteErrors(statuses);
    assertEquals(0, statuses.size(), "Just 0 write status for delete.");
    // Check the entire dataset has all records still
    String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
    for (int i = 0; i < fullPartitionPaths.length; i++) {
        fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]);
    }
    assertEquals(150, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(), "Must contain " + 150 + " records");
    // delete another batch. previous delete commit should have persisted the schema. If not,
    // this will throw exception
    testDeletes(client, updateBatch3.getRight(), 10, file1, "007", 140, keysSoFar);
}
Also used: Path (org.apache.hadoop.fs.Path), SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient), Set (java.util.Set), Transformations.recordsToRecordKeySet (org.apache.hudi.common.testutils.Transformations.recordsToRecordKeySet), HashSet (java.util.HashSet), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), ArrayList (java.util.ArrayList), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), HoodieKey (org.apache.hudi.common.model.HoodieKey), List (java.util.List), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), WriteStatus (org.apache.hudi.client.WriteStatus), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), Test (org.junit.jupiter.api.Test)
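
The testDeletes and testUpdates helpers called above are private methods of the same test class and are not shown on this page. The outline below is only a hypothetical sketch of what a testDeletes-style helper does, inferred from how it is invoked; the real helper presumably also checks that the deletes land in the expected file (the file1 argument), which this sketch skips.

// Hypothetical outline only; not the actual TestHoodieClientOnCopyOnWriteStorage helper.
private void testDeletesSketch(SparkRDDWriteClient client, List<HoodieRecord> candidateRecords,
                               int numDeletes, String instantTime, long expectedRecordsAfter) {
    client.startCommitWithTime(instantTime);
    List<HoodieKey> keysToDelete = randomSelectAsHoodieKeys(candidateRecords, numDeletes);
    List<WriteStatus> statuses = client.delete(jsc.parallelize(keysToDelete, 1), instantTime).collect();
    assertNoWriteErrors(statuses);
    // re-read the dataset (e.g. via HoodieClientTestUtils.read over the partition globs)
    // and assert that it now contains expectedRecordsAfter records
}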

Example 45 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

From the class TestHoodieClientOnCopyOnWriteStorage, method testRollbackFailedCommits.

@ParameterizedTest
@MethodSource("rollbackFailedCommitsParams")
public void testRollbackFailedCommits(HoodieFailedWritesCleaningPolicy cleaningPolicy, boolean populateMetaFields) throws Exception {
    HoodieTestUtils.init(hadoopConf, basePath);
    SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
    // perform 1 successful commit
    writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, true);
    // Perform 2 failed writes to table
    writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "100", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, false);
    client.close();
    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
    writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, false);
    client.close();
    // refresh the data generator so it no longer tracks records generated by the failed commits
    dataGen = new HoodieTestDataGenerator();
    // Perform 1 successful write
    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
    writeBatch(client, "400", "300", Option.of(Arrays.asList("400")), "400", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, true);
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
    assertTrue(metaClient.getActiveTimeline().getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 0);
    assertTrue(metaClient.getActiveTimeline().filterInflights().countInstants() == 2);
    assertTrue(metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 2);
    // Wait until enough time has passed for the heartbeats of the 2 failed commits to expire
    boolean conditionMet = false;
    while (!conditionMet) {
        conditionMet = client.getHeartbeatClient().isHeartbeatExpired("300");
        Thread.sleep(2000);
    }
    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
    // Perform 1 successful write
    writeBatch(client, "500", "400", Option.of(Arrays.asList("500")), "500", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, true);
    client.clean();
    HoodieActiveTimeline timeline = metaClient.getActiveTimeline().reload();
    if (cleaningPolicy.isLazy()) {
        assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 2);
        // Since we write rollbacks not clean, there should be no clean action on the timeline
        assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(CLEAN_ACTION)).countInstants() == 0);
        assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 3);
    } else if (cleaningPolicy.isNever()) {
        assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 0);
        // There should be no clean or rollback action on the timeline
        assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(CLEAN_ACTION)).countInstants() == 0);
        assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 3);
    }
}
Also used: HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient), SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient), HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), MethodSource (org.junit.jupiter.params.provider.MethodSource)
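
rollbackFailedCommitsParams is the @MethodSource feeding this test and is not shown above. Presumably it pairs each HoodieFailedWritesCleaningPolicy under test with both values of populateMetaFields; the sketch below is a hypothetical example of such a provider, not the actual one.

import java.util.stream.Stream;
import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
import org.junit.jupiter.params.provider.Arguments;

// Hypothetical sketch of a rollbackFailedCommitsParams-style MethodSource;
// the actual parameter combinations used by the test may differ.
private static Stream<Arguments> rollbackFailedCommitsParamsSketch() {
    return Stream.of(
        Arguments.of(HoodieFailedWritesCleaningPolicy.LAZY, true),
        Arguments.of(HoodieFailedWritesCleaningPolicy.LAZY, false),
        Arguments.of(HoodieFailedWritesCleaningPolicy.NEVER, true),
        Arguments.of(HoodieFailedWritesCleaningPolicy.NEVER, false));
}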

Aggregations

HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 97 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 52 usages
Test (org.junit.jupiter.api.Test): 51 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 44 usages
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 38 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 31 usages
TypedProperties (org.apache.hudi.common.config.TypedProperties): 29 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 26 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 25 usages
JavaRDD (org.apache.spark.api.java.JavaRDD): 25 usages
Path (org.apache.hadoop.fs.Path): 24 usages
WriteStatus (org.apache.hudi.client.WriteStatus): 22 usages
ArrayList (java.util.ArrayList): 21 usages
Properties (java.util.Properties): 21 usages
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 18 usages
HoodieTable (org.apache.hudi.table.HoodieTable): 18 usages
List (java.util.List): 17 usages
ValueSource (org.junit.jupiter.params.provider.ValueSource): 17 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 16 usages
IOException (java.io.IOException): 15 usages