Example 26 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache: class TestHoodieMultiTableDeltaStreamer, method testMultiTableExecutionWithKafkaSource.

@Test
public void testMultiTableExecutionWithKafkaSource() throws IOException {
    // create topics for each table
    String topicName1 = "topic" + testNum++;
    String topicName2 = "topic" + testNum;
    testUtils.createTopic(topicName1, 2);
    testUtils.createTopic(topicName2, 2);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    testUtils.sendMessages(topicName1, Helpers.jsonifyRecords(dataGenerator.generateInsertsAsPerSchema("000", 5, HoodieTestDataGenerator.TRIP_SCHEMA)));
    testUtils.sendMessages(topicName2, Helpers.jsonifyRecords(dataGenerator.generateInsertsAsPerSchema("000", 10, HoodieTestDataGenerator.SHORT_TRIP_SCHEMA)));
    HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", JsonKafkaSource.class.getName(), false, false, null);
    HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc);
    List<TableExecutionContext> executionContexts = streamer.getTableExecutionContexts();
    TypedProperties properties = executionContexts.get(1).getProperties();
    properties.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source_uber.avsc");
    properties.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target_uber.avsc");
    properties.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName2);
    executionContexts.get(1).setProperties(properties);
    TypedProperties properties1 = executionContexts.get(0).getProperties();
    properties1.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source_short_trip_uber.avsc");
    properties1.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target_short_trip_uber.avsc");
    properties1.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName1);
    executionContexts.get(0).setProperties(properties1);
    String targetBasePath1 = executionContexts.get(0).getConfig().targetBasePath;
    String targetBasePath2 = executionContexts.get(1).getConfig().targetBasePath;
    streamer.sync();
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(5, targetBasePath1 + "/*/*.parquet", sqlContext);
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(10, targetBasePath2 + "/*/*.parquet", sqlContext);
    // send updates for the already existing records to the Kafka topics
    testUtils.sendMessages(topicName1, Helpers.jsonifyRecords(dataGenerator.generateUpdatesAsPerSchema("001", 5, HoodieTestDataGenerator.TRIP_SCHEMA)));
    testUtils.sendMessages(topicName2, Helpers.jsonifyRecords(dataGenerator.generateUpdatesAsPerSchema("001", 10, HoodieTestDataGenerator.SHORT_TRIP_SCHEMA)));
    streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc);
    streamer.getTableExecutionContexts().get(1).setProperties(properties);
    streamer.getTableExecutionContexts().get(0).setProperties(properties1);
    streamer.sync();
    assertEquals(2, streamer.getSuccessTables().size());
    assertTrue(streamer.getFailedTables().isEmpty());
    // record counts are unchanged: the second batch only updates existing keys
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(5, targetBasePath1 + "/*/*.parquet", sqlContext);
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(10, targetBasePath2 + "/*/*.parquet", sqlContext);
    testNum++;
}
Also used: JsonKafkaSource (org.apache.hudi.utilities.sources.JsonKafkaSource), HoodieMultiTableDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer), TableExecutionContext (org.apache.hudi.utilities.deltastreamer.TableExecutionContext), TypedProperties (org.apache.hudi.common.config.TypedProperties), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), Test (org.junit.jupiter.api.Test)
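
The per-table override pattern above is worth isolating. Below is a minimal sketch reusing the hoodie.deltastreamer.* keys that appear in the test; the schema paths and topic name are illustrative placeholders, not values from the Hudi test suite.

// Override the schema files and source topic for one table's execution context.
// All property keys below appear in the test above; the values are hypothetical.
TypedProperties overrides = new TypedProperties();
overrides.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", "/path/to/source.avsc");
overrides.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", "/path/to/target.avsc");
overrides.setProperty("hoodie.deltastreamer.source.kafka.topic", "my_topic");
executionContexts.get(0).setProperties(overrides);

Note that the test re-applies these overrides after constructing a second HoodieMultiTableDeltaStreamer for the update pass, which suggests that getTableExecutionContexts() rebuilds the contexts per instance, so overrides do not carry over.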

Example 27 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache: class HoodieDeltaStreamerTestBase, method prepareORCDFSFiles.

protected static void prepareORCDFSFiles(int numRecords, String baseORCPath, String fileName, boolean useCustomSchema, String schemaStr, Schema schema) throws IOException {
    String path = baseORCPath + "/" + fileName;
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    if (useCustomSchema) {
        Helpers.saveORCToDFS(Helpers.toGenericRecords(dataGenerator.generateInsertsAsPerSchema("000", numRecords, schemaStr), schema), new Path(path), HoodieTestDataGenerator.ORC_TRIP_SCHEMA);
    } else {
        Helpers.saveORCToDFS(Helpers.toGenericRecords(dataGenerator.generateInserts("000", numRecords)), new Path(path));
    }
}
Also used: Path (org.apache.hadoop.fs.Path), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator)
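
A hypothetical invocation of this helper, assuming a dfsBasePath field like the one in Example 26; the record count and file names are illustrative only.

// Default-schema path: generateInserts is used, so no Avro schema is needed.
prepareORCDFSFiles(100, dfsBasePath + "/orcFiles", "1.orc", false, null, null);
// Custom-schema path: pass the schema string plus its parsed Avro form
// (assuming TRIP_SCHEMA is a complete Avro schema string, as its use with
// generateInsertsAsPerSchema in Example 26 suggests).
prepareORCDFSFiles(100, dfsBasePath + "/orcFiles", "2.orc", true,
        HoodieTestDataGenerator.TRIP_SCHEMA, new Schema.Parser().parse(HoodieTestDataGenerator.TRIP_SCHEMA));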

Example 28 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache: class HoodieDeltaStreamerTestBase, method prepareParquetDFSFiles.

protected static void prepareParquetDFSFiles(int numRecords, String baseParquetPath, String fileName, boolean useCustomSchema, String schemaStr, Schema schema) throws IOException {
    String path = baseParquetPath + "/" + fileName;
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    if (useCustomSchema) {
        Helpers.saveParquetToDFS(Helpers.toGenericRecords(dataGenerator.generateInsertsAsPerSchema("000", numRecords, schemaStr), schema), new Path(path), HoodieTestDataGenerator.AVRO_TRIP_SCHEMA);
    } else {
        Helpers.saveParquetToDFS(Helpers.toGenericRecords(dataGenerator.generateInserts("000", numRecords)), new Path(path));
    }
}
Also used: Path (org.apache.hadoop.fs.Path), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator)
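
The Parquet variant mirrors the ORC helper; only the save method and the fallback schema constant differ (AVRO_TRIP_SCHEMA instead of ORC_TRIP_SCHEMA). A hypothetical call with illustrative arguments:

prepareParquetDFSFiles(100, dfsBasePath + "/parquetFiles", "1.parquet", false, null, null);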

Example 29 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache: class TestJsonKafkaSource, method testJsonKafkaSourceResetStrategy.

// test case for the Kafka offset reset strategy (earliest vs. latest)
@Test
public void testJsonKafkaSourceResetStrategy() {
    // topic setup.
    final String topic = TEST_TOPIC_PREFIX + "testJsonKafkaSourceResetStrategy";
    testUtils.createTopic(topic, 2);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    TypedProperties earliestProps = createPropsForJsonSource(topic, null, "earliest");
    Source earliestJsonSource = new JsonKafkaSource(earliestProps, jsc(), spark(), schemaProvider, metrics);
    SourceFormatAdapter earliestKafkaSource = new SourceFormatAdapter(earliestJsonSource);
    TypedProperties latestProps = createPropsForJsonSource(topic, null, "latest");
    Source latestJsonSource = new JsonKafkaSource(latestProps, jsc(), spark(), schemaProvider, metrics);
    SourceFormatAdapter latestKafkaSource = new SourceFormatAdapter(latestJsonSource);
    // 1. Extract while there is no data in the Kafka topic
    // => both sources get a checkpoint string like "hoodie_test,0:0,1:0"; the latest checkpoint should equal the earliest checkpoint
    InputBatch<JavaRDD<GenericRecord>> earFetch0 = earliestKafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
    InputBatch<JavaRDD<GenericRecord>> latFetch0 = latestKafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
    assertEquals(earFetch0.getBatch(), latFetch0.getBatch());
    assertEquals(earFetch0.getCheckpointForNextBatch(), latFetch0.getCheckpointForNextBatch());
    testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
    // 2. Extract a new checkpoint with a null / empty-string previous checkpoint
    // => an earliest fetch with the max source limit will get all of the data and an end-offset checkpoint
    InputBatch<JavaRDD<GenericRecord>> earFetch1 = earliestKafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
    // => with a null previous checkpoint, a latest-reset fetch will get the same end-offset checkpoint as earliest
    InputBatch<JavaRDD<GenericRecord>> latFetch1 = latestKafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
    assertEquals(earFetch1.getCheckpointForNextBatch(), latFetch1.getCheckpointForNextBatch());
}
Also used: TypedProperties (org.apache.hudi.common.config.TypedProperties), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), SourceFormatAdapter (org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter), JavaRDD (org.apache.spark.api.java.JavaRDD), Test (org.junit.jupiter.api.Test)
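
createPropsForJsonSource is a test-local factory that is not shown on this page. A hedged sketch of what such a factory could set, using standard Kafka consumer keys plus the topic key seen in Example 26; the real helper's keys and defaults may differ.

// Hypothetical stand-in for createPropsForJsonSource(topic, maxEvents, resetStrategy).
TypedProperties props = new TypedProperties();
props.setProperty("hoodie.deltastreamer.source.kafka.topic", topic);
props.setProperty("bootstrap.servers", testUtils.brokerAddress());
// "earliest" replays the topic from the beginning; "latest" starts at the current head.
props.setProperty("auto.offset.reset", resetStrategy);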

Example 30 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache: class TestJsonKafkaSource, method testJsonKafkaSource.

@Test
public void testJsonKafkaSource() {
    // topic setup.
    final String topic = TEST_TOPIC_PREFIX + "testJsonKafkaSource";
    testUtils.createTopic(topic, 2);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    TypedProperties props = createPropsForJsonSource(topic, null, "earliest");
    Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
    SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
    // 1. Extract without any checkpoint => get all the data, respecting sourceLimit
    assertEquals(Option.empty(), kafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE).getBatch());
    testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
    InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 900);
    assertEquals(900, fetch1.getBatch().get().count());
    // Test the Avro to DataFrame<Row> conversion path
    Dataset<Row> fetch1AsRows = AvroConversionUtils.createDataFrame(JavaRDD.toRDD(fetch1.getBatch().get()), schemaProvider.getSourceSchema().toString(), jsonSource.getSparkSession());
    assertEquals(900, fetch1AsRows.count());
    // 2. Produce new data, extract new data
    testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("001", 1000)));
    InputBatch<Dataset<Row>> fetch2 = kafkaSource.fetchNewDataInRowFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
    assertEquals(1100, fetch2.getBatch().get().count());
    // 3. Extract with previous checkpoint => gives same data back (idempotent)
    InputBatch<JavaRDD<GenericRecord>> fetch3 = kafkaSource.fetchNewDataInAvroFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
    assertEquals(fetch2.getBatch().get().count(), fetch3.getBatch().get().count());
    assertEquals(fetch2.getCheckpointForNextBatch(), fetch3.getCheckpointForNextBatch());
    // Same using Row API
    InputBatch<Dataset<Row>> fetch3AsRows = kafkaSource.fetchNewDataInRowFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
    assertEquals(fetch2.getBatch().get().count(), fetch3AsRows.getBatch().get().count());
    assertEquals(fetch2.getCheckpointForNextBatch(), fetch3AsRows.getCheckpointForNextBatch());
    // 4. Extract with latest checkpoint => no new data returned
    InputBatch<JavaRDD<GenericRecord>> fetch4 = kafkaSource.fetchNewDataInAvroFormat(Option.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE);
    assertEquals(Option.empty(), fetch4.getBatch());
    // Same using Row API
    InputBatch<Dataset<Row>> fetch4AsRows = kafkaSource.fetchNewDataInRowFormat(Option.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE);
    assertEquals(Option.empty(), fetch4AsRows.getBatch());
}
Also used: Dataset (org.apache.spark.sql.Dataset), Row (org.apache.spark.sql.Row), TypedProperties (org.apache.hudi.common.config.TypedProperties), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), SourceFormatAdapter (org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter), JavaRDD (org.apache.spark.api.java.JavaRDD), Test (org.junit.jupiter.api.Test)
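
The checkpoint strings passed between fetches above follow the "topic,partition:offset,partition:offset" shape noted in Example 29. A small, dependency-free sketch of reading that shape; the method name is hypothetical and this is not Hudi's internal parser.

import java.util.HashMap;
import java.util.Map;

// Parse a checkpoint like "hoodie_test,0:0,1:0" into a partition -> offset map.
static Map<Integer, Long> parseCheckpoint(String checkpoint) {
    String[] parts = checkpoint.split(",");
    Map<Integer, Long> offsets = new HashMap<>();
    // parts[0] is the topic name; the rest are partition:offset pairs.
    for (int i = 1; i < parts.length; i++) {
        String[] kv = parts[i].split(":");
        offsets.put(Integer.parseInt(kv[0]), Long.parseLong(kv[1]));
    }
    return offsets;
}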

Aggregations

Types that co-occur with HoodieTestDataGenerator across the indexed examples, with the number of examples each appears in:

HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 97
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 52
Test (org.junit.jupiter.api.Test): 51
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 44
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 38
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 31
TypedProperties (org.apache.hudi.common.config.TypedProperties): 29
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 26
GenericRecord (org.apache.avro.generic.GenericRecord): 25
JavaRDD (org.apache.spark.api.java.JavaRDD): 25
Path (org.apache.hadoop.fs.Path): 24
WriteStatus (org.apache.hudi.client.WriteStatus): 22
ArrayList (java.util.ArrayList): 21
Properties (java.util.Properties): 21
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 18
HoodieTable (org.apache.hudi.table.HoodieTable): 18
List (java.util.List): 17
ValueSource (org.junit.jupiter.params.provider.ValueSource): 17
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 16
IOException (java.io.IOException): 15
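
To tie the examples together, a minimal, self-contained sketch of the generator calls used throughout this page; exact generic signatures of the returned lists vary across Hudi versions.

import java.util.List;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;

public class DataGenSketch {
    public static void main(String[] args) throws java.io.IOException {
        HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
        // First batch: 100 inserts tagged with instant time "000".
        List<HoodieRecord> inserts = dataGenerator.generateInserts("000", 100);
        // Second batch: updates against the keys generated above, tagged "001"
        // (generateUpdates declares IOException, hence the throws clause).
        List<HoodieRecord> updates = dataGenerator.generateUpdates("001", 50);
        System.out.println(inserts.size() + " inserts, " + updates.size() + " updates");
    }
}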