Use of org.apache.hudi.sink.transform.Transformer in project hudi by apache.
Class HoodieFlinkStreamer, method main:
public static void main(String[] args) throws Exception {
  StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
  final FlinkStreamerConfig cfg = new FlinkStreamerConfig();
  JCommander cmd = new JCommander(cfg, null, args);
  if (cfg.help || args.length == 0) {
    cmd.usage();
    System.exit(1);
  }
  env.enableCheckpointing(cfg.checkpointInterval);
  env.getConfig().setGlobalJobParameters(cfg);
  // We use the checkpoint to trigger the write operation, including instant generation and committing.
  // There can only be one checkpoint at a time.
  env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
  env.setStateBackend(cfg.stateBackend);
  if (cfg.flinkCheckPointPath != null) {
    env.getCheckpointConfig().setCheckpointStorage(cfg.flinkCheckPointPath);
  }
  TypedProperties kafkaProps = DFSPropertiesConfiguration.getGlobalProps();
  kafkaProps.putAll(StreamerUtil.appendKafkaProps(cfg));
  // Read from the Kafka source.
  RowType rowType =
      (RowType) AvroSchemaConverter.convertToDataType(StreamerUtil.getSourceSchema(cfg)).getLogicalType();
  Configuration conf = FlinkStreamerConfig.toFlinkConfig(cfg);
  long ckpTimeout = env.getCheckpointConfig().getCheckpointTimeout();
  int parallelism = env.getParallelism();
  conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, ckpTimeout);
  DataStream<RowData> dataStream = env.addSource(new FlinkKafkaConsumer<>(
          cfg.kafkaTopic,
          new JsonRowDataDeserializationSchema(rowType, InternalTypeInfo.of(rowType), false, true, TimestampFormat.ISO_8601),
          kafkaProps))
      .name("kafka_source")
      .uid("uid_kafka_source");
  if (cfg.transformerClassNames != null && !cfg.transformerClassNames.isEmpty()) {
    Option<Transformer> transformer = StreamerUtil.createTransformer(cfg.transformerClassNames);
    if (transformer.isPresent()) {
      dataStream = transformer.get().apply(dataStream);
    }
  }
  DataStream<HoodieRecord> hoodieRecordDataStream = Pipelines.bootstrap(conf, rowType, parallelism, dataStream);
  DataStream<Object> pipeline = Pipelines.hoodieStreamWrite(conf, parallelism, hoodieRecordDataStream);
  if (StreamerUtil.needsAsyncCompaction(conf)) {
    Pipelines.compact(conf, pipeline);
  } else {
    Pipelines.clean(conf, pipeline);
  }
  env.execute(cfg.targetTableName);
}
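The streamer only applies transformers resolved from cfg.transformerClassNames, so a custom transformation has to live in a concrete class that can be instantiated by name rather than in a lambda. The following is a minimal sketch of such a class: the name FilterNullKeysTransformer and the assumption that field 0 holds the record key are hypothetical, and it relies only on the single apply(DataStream<RowData>) method that the code above invokes via transformer.get().apply(dataStream).

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.table.data.RowData;
import org.apache.hudi.sink.transform.Transformer;

/**
 * Hypothetical example transformer: drops rows whose assumed record-key field (index 0) is null.
 * A no-argument constructor is presumably required so the streamer can instantiate it by class name.
 */
public class FilterNullKeysTransformer implements Transformer {

  @Override
  public DataStream<RowData> apply(DataStream<RowData> source) {
    // Keep only the rows whose field 0 is not null.
    return source.filter(rowData -> !rowData.isNullAt(0));
  }
}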
Use of org.apache.hudi.sink.transform.Transformer in project hudi by apache.
Class ITTestDataStreamWrite, method testChainedTransformersBeforeWriting:
@Test
public void testChainedTransformersBeforeWriting() throws Exception {
  Transformer t1 = (ds) -> ds.map((rowdata) -> {
    if (rowdata instanceof GenericRowData) {
      GenericRowData genericRD = (GenericRowData) rowdata;
      // update age field to age + 1
      genericRD.setField(2, genericRD.getInt(2) + 1);
      return genericRD;
    } else {
      throw new RuntimeException("Unrecognized row type : " + rowdata.getClass().getSimpleName());
    }
  });
  ChainedTransformer chainedTransformer = new ChainedTransformer(Arrays.asList(t1, t1));
  testWriteToHoodie(chainedTransformer, EXPECTED_CHAINED_TRANSFORMER);
}
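The test chains the same transformer twice, so the age field is incremented by two in total. Conceptually, a chained transformer just folds the stream through its delegates in list order; a minimal sketch of that behavior (not the actual Hudi ChainedTransformer implementation) could look like this:

import java.util.List;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.table.data.RowData;
import org.apache.hudi.sink.transform.Transformer;

// Sketch only: applies each delegate transformer to the stream in list order.
public class SimpleChainedTransformer implements Transformer {

  private final List<Transformer> transformers;

  public SimpleChainedTransformer(List<Transformer> transformers) {
    this.transformers = transformers;
  }

  @Override
  public DataStream<RowData> apply(DataStream<RowData> source) {
    DataStream<RowData> dataStream = source;
    for (Transformer t : transformers) {
      dataStream = t.apply(dataStream);
    }
    return dataStream;
  }
}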
Use of org.apache.hudi.sink.transform.Transformer in project hudi by apache.
Class ITTestDataStreamWrite, method testTransformerBeforeWriting:
@Test
public void testTransformerBeforeWriting() throws Exception {
  Transformer transformer = (ds) -> ds.map((rowdata) -> {
    if (rowdata instanceof GenericRowData) {
      GenericRowData genericRD = (GenericRowData) rowdata;
      // update age field to age + 1
      genericRD.setField(2, genericRD.getInt(2) + 1);
      return genericRD;
    } else {
      throw new RuntimeException("Unrecognized row type information: " + rowdata.getClass().getSimpleName());
    }
  });
  testWriteToHoodie(transformer, EXPECTED_TRANSFORMER);
}
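Both tests embed the same row mutation inside a lambda. If that logic needs to be exercised without running a Flink job, it can be factored into a plain helper; the class and method names below (AgeIncrement, bumpAge, asTransformer) are hypothetical, and field index 2 is assumed to hold the integer age column as in the tests above.

import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.hudi.sink.transform.Transformer;

// Hypothetical helper: the age-increment logic factored out of the lambda so it can be
// unit tested on a single GenericRowData without a running pipeline.
public final class AgeIncrement {

  static RowData bumpAge(RowData rowData) {
    if (rowData instanceof GenericRowData) {
      GenericRowData genericRD = (GenericRowData) rowData;
      // update age field (assumed index 2) to age + 1
      genericRD.setField(2, genericRD.getInt(2) + 1);
      return genericRD;
    }
    throw new RuntimeException("Unrecognized row type information: " + rowData.getClass().getSimpleName());
  }

  // The Transformer then reduces to a method reference.
  static Transformer asTransformer() {
    return ds -> ds.map(AgeIncrement::bumpAge);
  }
}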
Use of org.apache.hudi.sink.transform.Transformer in project hudi by apache.
Class ITTestDataStreamWrite, method testWriteToHoodie:
private void testWriteToHoodie(Transformer transformer, Map<String, List<String>> expected) throws Exception {
  Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath());
  StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.getExecutionEnvironment();
  execEnv.getConfig().disableObjectReuse();
  execEnv.setParallelism(4);
  // set up checkpoint interval
  execEnv.enableCheckpointing(4000, CheckpointingMode.EXACTLY_ONCE);
  execEnv.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
  // Read from file source
  RowType rowType =
      (RowType) AvroSchemaConverter.convertToDataType(StreamerUtil.getSourceSchema(conf)).getLogicalType();
  JsonRowDataDeserializationSchema deserializationSchema =
      new JsonRowDataDeserializationSchema(rowType, InternalTypeInfo.of(rowType), false, true, TimestampFormat.ISO_8601);
  String sourcePath = Objects.requireNonNull(
      Thread.currentThread().getContextClassLoader().getResource("test_source.data")).toString();
  DataStream<RowData> dataStream = execEnv
      .addSource(new ContinuousFileSource.BoundedSourceFunction(new Path(sourcePath), 2))
      .name("continuous_file_source")
      .setParallelism(1)
      .map(record -> deserializationSchema.deserialize(record.getBytes(StandardCharsets.UTF_8)))
      .setParallelism(4);
  if (transformer != null) {
    dataStream = transformer.apply(dataStream);
  }
  int parallelism = execEnv.getParallelism();
  DataStream<HoodieRecord> hoodieRecordDataStream = Pipelines.bootstrap(conf, rowType, parallelism, dataStream);
  DataStream<Object> pipeline = Pipelines.hoodieStreamWrite(conf, parallelism, hoodieRecordDataStream);
  execEnv.addOperator(pipeline.getTransformation());
  JobClient client = execEnv.executeAsync(conf.getString(FlinkOptions.TABLE_NAME));
  // wait for the streaming job to finish
  client.getJobExecutionResult().get();
  TestData.checkWrittenFullData(tempFile, expected);
}
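Because testWriteToHoodie guards the transformer with a null check, the same helper can also exercise the plain write path. A sketch of such a companion test is shown below; it is not part of the source, and the constant EXPECTED (the untransformed rows for the test dataset) is assumed.

@Test
public void testWriteToHoodieWithoutTransformer() throws Exception {
  // Hypothetical companion test: no transformer, so the source rows are written as-is.
  testWriteToHoodie(null, EXPECTED);
}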