Search in sources:

Example 1 with ParameterTool

Use of org.apache.inlong.sort.util.ParameterTool in project incubator-inlong by Apache.

The class Entrance, method main.

/**
 * Entry point of the Flink job.
 *
 * <p>Parses the job configuration out of {@code args} and assembles the pipeline:
 * a TubeMQ or Pulsar source, a deserialization stage, and a ClickHouse, Doris or Hive sink.
 * When metrics output is enabled in the configuration, a windowed metrics pipeline is attached
 * to a side output of the deserialization stage. The job is finally executed under the
 * configured cluster id.
 *
 * @param args command-line arguments handed to {@code ParameterTool.fromArgs}
 * @throws IllegalArgumentException if the configured source type or sink type is not supported
 * @throws Exception propagated from building or executing the job
 */
public static void main(String[] args) throws Exception {
    final Configuration config = ParameterTool.fromArgs(args).getConfiguration();

    // String settings that must be present.
    final String jobName = checkNotNull(config.getString(Constants.CLUSTER_ID));
    final String configuredSource = checkNotNull(config.getString(Constants.SOURCE_TYPE));
    final String configuredSink = checkNotNull(config.getString(Constants.SINK_TYPE));

    // Parallelism of the three main stages.
    final int sourceTasks = config.getInteger(Constants.SOURCE_PARALLELISM);
    final int deserializationTasks = config.getInteger(Constants.DESERIALIZATION_PARALLELISM);
    final int sinkTasks = config.getInteger(Constants.SINK_PARALLELISM);

    // Execution environment and checkpointing.
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    final int checkpointIntervalMs = config.getInteger(Constants.CHECKPOINT_INTERVAL_MS);
    env.enableCheckpointing(checkpointIntervalMs);
    final int minPauseMs = config.getInteger(Constants.MIN_PAUSE_BETWEEN_CHECKPOINTS_MS);
    env.getCheckpointConfig().setMinPauseBetweenCheckpoints(minPauseMs);
    final int checkpointTimeoutMs = config.getInteger(Constants.CHECKPOINT_TIMEOUT_MS);
    env.getCheckpointConfig().setCheckpointTimeout(checkpointTimeoutMs);
    env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);

    // Data stream: source
    final boolean readFromTube = configuredSource.equals(Constants.SOURCE_TYPE_TUBE);
    if (!readFromTube && !configuredSource.equals(Constants.SOURCE_TYPE_PULSAR)) {
        throw new IllegalArgumentException("Unsupported source type " + configuredSource);
    }
    final DataStream<SerializedRecord> rawRecords;
    if (readFromTube) {
        rawRecords = env
                .addSource(new MultiTopicTubeSourceFunction(config))
                .setParallelism(sourceTasks)
                .uid(Constants.SOURCE_UID)
                .name("TubeMQ source")
                .rebalance();
    } else {
        rawRecords = env
                .addSource(new MultiTopicPulsarSourceFunction(config))
                .setParallelism(sourceTasks)
                .uid(Constants.SOURCE_UID)
                .name("Pulsar source")
                .rebalance();
    }

    // Data stream: deserialization
    final SingleOutputStreamOperator<SerializedRecord> deserialized = rawRecords
            .process(new DeserializationSchema(config))
            .setParallelism(deserializationTasks)
            .uid(Constants.DESERIALIZATION_SCHEMA_UID)
            .name("Deserialization");

    // Data stream: sink
    if (configuredSink.equals(Constants.SINK_TYPE_CLICKHOUSE)) {
        deserialized
                .process(new ClickHouseMultiSinkFunction(config))
                .setParallelism(sinkTasks)
                .uid(Constants.SINK_UID)
                .name("Clickhouse Sink");
    } else if (configuredSink.equals(SINK_TYPE_DORIS)) {
        deserialized
                .process(new DorisMultiSinkFunction(config))
                .uid(Constants.SINK_UID)
                .name("Doris Sink")
                .setParallelism(sinkTasks);
    } else if (configuredSink.equals(SINK_TYPE_HIVE)) {
        // The Hive sink is a writer stage followed by a committer stage.
        deserialized
                .process(new HiveMultiTenantWriter(config))
                .name("Hive Sink")
                .uid(Constants.SINK_UID)
                .setParallelism(sinkTasks)
                .process(new HiveMultiTenantCommitter(config))
                .name("hive Committer")
                .setParallelism(config.getInteger(Constants.COMMITTER_PARALLELISM));
    } else {
        throw new IllegalArgumentException("Unsupported sink type " + configuredSink);
    }

    // Metric stream (only built when enabled in the configuration)
    if (config.getBoolean(Constants.METRICS_ENABLE_OUTPUT)) {
        final int aggregatorTasks = config.getInteger(Constants.METRICS_AGGREGATOR_PARALLELISM);
        final int watermarkAssignerTasks =
                config.getInteger(Constants.METRICS_TIMESTAMP_WATERMARK_ASSIGNER_PARALLELISM);
        final int metricsSinkTasks = config.getInteger(Constants.METRICS_SINK_PARALLELISM);

        // Anonymous subclass so the element type of the tag is retained.
        final OutputTag<MetricData> metricsTag = new OutputTag<MetricData>(Constants.METRIC_DATA_OUTPUT_TAG_ID) {
        };

        final DataStream<MetricData> timestampedMetrics = deserialized
                .getSideOutput(metricsTag)
                .assignTimestampsAndWatermarks(new MetricsAssignerWithPeriodicWatermarks())
                .setParallelism(watermarkAssignerTasks)
                .uid(Constants.METRICS_TIMESTAMP_AND_WATERMARK_ASSIGNER_UID)
                .name("Metrics timestamp/watermark assigner");

        final DataStream<MetricData> aggregatedMetrics = timestampedMetrics
                .keyBy((KeySelector<MetricData, String>) MetricData::getKey)
                .window(TumblingEventTimeWindows.of(
                        Time.minutes(config.getInteger(Constants.METRICS_AGGREGATOR_WINDOW_SIZE))))
                .allowedLateness(Time.milliseconds(Long.MAX_VALUE))
                .aggregate(new MetricsAggregateFunction(), new MetricsProcessWindowFunction())
                .setParallelism(aggregatorTasks)
                .uid(Constants.METRICS_AGGREGATOR_UID)
                .name("Metrics aggregator");

        aggregatedMetrics
                .addSink(new MetricsLogSink())
                .setParallelism(metricsSinkTasks)
                .uid(Constants.METRICS_SINK_UID)
                .name("Metrics sink");
    }

    env.execute(jobName);
}
Also used : ParameterTool(org.apache.inlong.sort.util.ParameterTool) MetricsAssignerWithPeriodicWatermarks(org.apache.inlong.sort.flink.metrics.MetricsAssignerWithPeriodicWatermarks) Configuration(org.apache.inlong.sort.configuration.Configuration) MetricsLogSink(org.apache.inlong.sort.flink.metrics.MetricsLogSink) DeserializationSchema(org.apache.inlong.sort.flink.deserialization.DeserializationSchema) MetricsProcessWindowFunction(org.apache.inlong.sort.flink.metrics.MetricsAggregator.MetricsProcessWindowFunction) DorisMultiSinkFunction(org.apache.inlong.sort.flink.multitenant.doris.DorisMultiSinkFunction) HiveMultiTenantCommitter(org.apache.inlong.sort.flink.multitenant.hive.HiveMultiTenantCommitter) OutputTag(org.apache.flink.util.OutputTag) MultiTopicTubeSourceFunction(org.apache.inlong.sort.flink.multitenant.tubemq.MultiTopicTubeSourceFunction) HiveMultiTenantWriter(org.apache.inlong.sort.flink.multitenant.hive.HiveMultiTenantWriter) MultiTopicPulsarSourceFunction(org.apache.inlong.sort.flink.multitenant.pulsar.MultiTopicPulsarSourceFunction) MetricsAggregateFunction(org.apache.inlong.sort.flink.metrics.MetricsAggregator.MetricsAggregateFunction) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) ClickHouseMultiSinkFunction(org.apache.inlong.sort.flink.multitenant.clickhouse.ClickHouseMultiSinkFunction) MetricData(org.apache.inlong.sort.flink.metrics.MetricData)

Example 2 with ParameterTool

Use of org.apache.inlong.sort.util.ParameterTool in project incubator-inlong by Apache.

The class Entrance, method main.

/**
 * Entry point of the Flink job.
 *
 * <p>Parses the configuration out of {@code args}, loads the {@code DataFlowInfo} from the file
 * named in the configuration, and chains the source, deserialization, transformation and sink
 * stages through the corresponding {@code build*Stream} helpers. The job is executed under the
 * configured cluster id.
 *
 * @param args command-line arguments handed to {@code ParameterTool.fromArgs}
 * @throws Exception propagated from loading the data-flow info, building or executing the job
 */
public static void main(String[] args) throws Exception {
    final Configuration config = ParameterTool.fromArgs(args).getConfiguration();
    final String jobName = checkNotNull(config.getString(Constants.CLUSTER_ID));
    final String dataFlowInfoPath = config.getString(Constants.DATAFLOW_INFO_FILE);
    final DataFlowInfo flow = getDataflowInfoFromFile(dataFlowInfoPath);

    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    // Checkpoint related
    final int checkpointIntervalMs = config.getInteger(Constants.CHECKPOINT_INTERVAL_MS);
    env.enableCheckpointing(checkpointIntervalMs);
    final int minPauseMs = config.getInteger(Constants.MIN_PAUSE_BETWEEN_CHECKPOINTS_MS);
    env.getCheckpointConfig().setMinPauseBetweenCheckpoints(minPauseMs);
    final int checkpointTimeoutMs = config.getInteger(Constants.CHECKPOINT_TIMEOUT_MS);
    env.getCheckpointConfig().setCheckpointTimeout(checkpointTimeoutMs);
    env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);

    // Pipeline: source -> deserialization -> transformation -> sink
    final DataStream<SerializedRecord> rawRecords =
            buildSourceStream(env, config, flow.getSourceInfo(), flow.getProperties());
    final DataStream<Row> rows =
            buildDeserializationStream(rawRecords, flow.getSourceInfo(), config);
    final DataStream<Row> transformedRows =
            buildTransformationStream(rows, flow, config);
    buildSinkStream(transformedRows, config, flow.getSinkInfo(), flow.getProperties(), flow.getId());

    env.execute(jobName);
}
Also used : ParameterTool(org.apache.inlong.sort.util.ParameterTool) Configuration(org.apache.inlong.sort.configuration.Configuration) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Row(org.apache.flink.types.Row) DataFlowInfo(org.apache.inlong.sort.protocol.DataFlowInfo)

Aggregations

StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)2 Configuration (org.apache.inlong.sort.configuration.Configuration)2 ParameterTool (org.apache.inlong.sort.util.ParameterTool)2 Row (org.apache.flink.types.Row)1 OutputTag (org.apache.flink.util.OutputTag)1 DeserializationSchema (org.apache.inlong.sort.flink.deserialization.DeserializationSchema)1 MetricData (org.apache.inlong.sort.flink.metrics.MetricData)1 MetricsAggregateFunction (org.apache.inlong.sort.flink.metrics.MetricsAggregator.MetricsAggregateFunction)1 MetricsProcessWindowFunction (org.apache.inlong.sort.flink.metrics.MetricsAggregator.MetricsProcessWindowFunction)1 MetricsAssignerWithPeriodicWatermarks (org.apache.inlong.sort.flink.metrics.MetricsAssignerWithPeriodicWatermarks)1 MetricsLogSink (org.apache.inlong.sort.flink.metrics.MetricsLogSink)1 ClickHouseMultiSinkFunction (org.apache.inlong.sort.flink.multitenant.clickhouse.ClickHouseMultiSinkFunction)1 DorisMultiSinkFunction (org.apache.inlong.sort.flink.multitenant.doris.DorisMultiSinkFunction)1 HiveMultiTenantCommitter (org.apache.inlong.sort.flink.multitenant.hive.HiveMultiTenantCommitter)1 HiveMultiTenantWriter (org.apache.inlong.sort.flink.multitenant.hive.HiveMultiTenantWriter)1 MultiTopicPulsarSourceFunction (org.apache.inlong.sort.flink.multitenant.pulsar.MultiTopicPulsarSourceFunction)1 MultiTopicTubeSourceFunction (org.apache.inlong.sort.flink.multitenant.tubemq.MultiTopicTubeSourceFunction)1 DataFlowInfo (org.apache.inlong.sort.protocol.DataFlowInfo)1