Use of org.apache.iceberg.flink.TableLoader in project iceberg by apache.
From the class TestFlinkIcebergSink, method testTwoSinksInDisjointedDAG.
@Test
public void testTwoSinksInDisjointedDAG() throws Exception {
  Map<String, String> props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name());

  String leftTablePath = TEMPORARY_FOLDER.newFolder().getAbsolutePath().concat("/left");
  Assert.assertTrue("Should create the table path correctly.", new File(leftTablePath).mkdir());
  Table leftTable = SimpleDataUtil.createTable(leftTablePath, props, partitioned);
  TableLoader leftTableLoader = TableLoader.fromHadoopTable(leftTablePath);

  String rightTablePath = TEMPORARY_FOLDER.newFolder().getAbsolutePath().concat("/right");
  Assert.assertTrue("Should create the table path correctly.", new File(rightTablePath).mkdir());
  Table rightTable = SimpleDataUtil.createTable(rightTablePath, props, partitioned);
  TableLoader rightTableLoader = TableLoader.fromHadoopTable(rightTablePath);

  env = StreamExecutionEnvironment.getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG)
      .enableCheckpointing(100).setParallelism(parallelism).setMaxParallelism(parallelism);
  env.getConfig().disableAutoGeneratedUIDs();

  // Left sink: DistributionMode.NONE and no extra snapshot properties.
  List<Row> leftRows = createRows("left-");
  DataStream<Row> leftStream = env.fromCollection(leftRows, ROW_TYPE_INFO).name("leftCustomSource").uid("leftCustomSource");
  FlinkSink.forRow(leftStream, SimpleDataUtil.FLINK_SCHEMA)
      .table(leftTable)
      .tableLoader(leftTableLoader)
      .tableSchema(SimpleDataUtil.FLINK_SCHEMA)
      .distributionMode(DistributionMode.NONE)
      .uidPrefix("leftIcebergSink")
      .append();

  // Right sink: DistributionMode.HASH plus custom snapshot summary properties.
  List<Row> rightRows = createRows("right-");
  DataStream<Row> rightStream = env.fromCollection(rightRows, ROW_TYPE_INFO).name("rightCustomSource").uid("rightCustomSource");
  FlinkSink.forRow(rightStream, SimpleDataUtil.FLINK_SCHEMA)
      .table(rightTable)
      .tableLoader(rightTableLoader)
      .tableSchema(SimpleDataUtil.FLINK_SCHEMA)
      .writeParallelism(parallelism)
      .distributionMode(DistributionMode.HASH)
      .uidPrefix("rightIcebergSink")
      .setSnapshotProperty("flink.test", TestFlinkIcebergSink.class.getName())
      .setSnapshotProperties(Collections.singletonMap("direction", "rightTable"))
      .append();

  // Execute the program.
  env.execute("Test Iceberg DataStream.");

  SimpleDataUtil.assertTableRows(leftTablePath, convertToRowData(leftRows));
  SimpleDataUtil.assertTableRows(rightTablePath, convertToRowData(rightRows));

  // Only the right table's snapshot summary should carry the custom properties.
  leftTable.refresh();
  Assert.assertNull(leftTable.currentSnapshot().summary().get("flink.test"));
  Assert.assertNull(leftTable.currentSnapshot().summary().get("direction"));
  rightTable.refresh();
  Assert.assertEquals(TestFlinkIcebergSink.class.getName(), rightTable.currentSnapshot().summary().get("flink.test"));
  Assert.assertEquals("rightTable", rightTable.currentSnapshot().summary().get("direction"));
}
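As background for the examples on this page: a TableLoader is a small serializable factory that Flink ships to the task managers, where operators use it to open the Iceberg table at runtime. A minimal lifecycle sketch, assuming a Hadoop-catalog table at a hypothetical location /tmp/warehouse/db/tbl:

import org.apache.iceberg.Table;
import org.apache.iceberg.flink.TableLoader;

public class TableLoaderLifecycle {
  public static void main(String[] args) throws Exception {
    // Hypothetical location; replace with a real Hadoop table path.
    try (TableLoader loader = TableLoader.fromHadoopTable("/tmp/warehouse/db/tbl")) {
      loader.open();                    // initialize the underlying catalog and FileIO
      Table table = loader.loadTable(); // resolve the table from the current metadata
      System.out.println(table.schema());
    }
  }
}

TableLoader implements Closeable, so the try-with-resources block above releases catalog resources once the loader is no longer needed.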
Use of org.apache.iceberg.flink.TableLoader in project iceberg by apache.
From the class TestFlinkScanSql, method testInferedParallelism.
@Test
public void testInferedParallelism() throws IOException {
  Table table = catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC);
  TableLoader tableLoader = TableLoader.fromHadoopTable(table.location());
  FlinkInputFormat flinkInputFormat = FlinkSource.forRowData().tableLoader(tableLoader).table(table).buildFormat();
  ScanContext scanContext = ScanContext.builder().build();

  // Empty table: the inferred parallelism should be at least 1.
  int parallelism = FlinkSource.forRowData().inferParallelism(flinkInputFormat, scanContext);
  Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism);

  GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER);
  DataFile dataFile1 = helper.writeFile(TestHelpers.Row.of("2020-03-20", 0), RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L));
  DataFile dataFile2 = helper.writeFile(TestHelpers.Row.of("2020-03-21", 0), RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L));
  helper.appendToTable(dataFile1, dataFile2);

  // Make sure planning generates 2 CombinedScanTasks.
  long maxFileLen = Math.max(dataFile1.fileSizeInBytes(), dataFile2.fileSizeInBytes());
  sql("ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", maxFileLen);

  // 2 splits, max inferred parallelism is the default 100 (max > split count): the parallelism is the split count, 2.
  parallelism = FlinkSource.forRowData().inferParallelism(flinkInputFormat, scanContext);
  Assert.assertEquals("Should produce the expected parallelism.", 2, parallelism);

  // 2 splits with limit 1; the default max inferred parallelism (100) exceeds both
  // the split count and the limit, so the parallelism is the limit, 1.
  parallelism = FlinkSource.forRowData().inferParallelism(flinkInputFormat, ScanContext.builder().limit(1).build());
  Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism);

  // 2 splits, but max inferred parallelism is 1 (max < split count): the parallelism is 1.
  Configuration configuration = new Configuration();
  configuration.setInteger(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, 1);
  parallelism = FlinkSource.forRowData().flinkConf(configuration).inferParallelism(flinkInputFormat, ScanContext.builder().build());
  Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism);

  // 2 splits, max inferred parallelism is 1, limit is 3: the parallelism is the max inferred parallelism, 1.
  parallelism = FlinkSource.forRowData().flinkConf(configuration).inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build());
  Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism);

  // 2 splits with parallelism inference disabled: the parallelism falls back to Flink's default, 1.
  configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false);
  parallelism = FlinkSource.forRowData().flinkConf(configuration).inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build());
  Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism);
}
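Outside of this test, the same inference knobs can be passed to a source builder through a Flink Configuration. A minimal sketch, assuming an existing Hadoop table; the location and the cap of 4 are illustrative:

Configuration flinkConf = new Configuration();
flinkConf.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, true);
flinkConf.setInteger(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, 4); // illustrative cap

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
TableLoader loader = TableLoader.fromHadoopTable("/tmp/warehouse/db/tbl"); // hypothetical location
DataStream<RowData> stream = FlinkSource.forRowData()
    .env(env)
    .tableLoader(loader)
    .streaming(false)
    .flinkConf(flinkConf)
    .build();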
Use of org.apache.iceberg.flink.TableLoader in project incubator-inlong by apache.
From the class Entrance, method buildSinkStream.
private static void buildSinkStream(DataStream<Row> sourceStream, Configuration config, SinkInfo sinkInfo,
    Map<String, Object> properties, long dataflowId) throws IOException, ClassNotFoundException {
  final String sinkType = checkNotNull(config.getString(Constants.SINK_TYPE));
  final int sinkParallelism = config.getInteger(Constants.SINK_PARALLELISM);

  switch (sinkType) {
    case Constants.SINK_TYPE_CLICKHOUSE:
      checkState(sinkInfo instanceof ClickHouseSinkInfo);
      ClickHouseSinkInfo clickHouseSinkInfo = (ClickHouseSinkInfo) sinkInfo;
      sourceStream.addSink(new ClickhouseRowSinkFunction(clickHouseSinkInfo))
          .uid(Constants.SINK_UID)
          .name("Clickhouse Sink")
          .setParallelism(sinkParallelism);
      break;

    case Constants.SINK_TYPE_HIVE:
      checkState(sinkInfo instanceof HiveSinkInfo);
      HiveSinkInfo hiveSinkInfo = (HiveSinkInfo) sinkInfo;
      if (hiveSinkInfo.getPartitions().length == 0) {
        // The committer operator is not needed if the table has no partitions.
        sourceStream.process(new HiveWriter(config, dataflowId, hiveSinkInfo))
            .uid(Constants.SINK_UID)
            .name("Hive Sink")
            .setParallelism(sinkParallelism);
      } else {
        sourceStream.process(new HiveWriter(config, dataflowId, hiveSinkInfo))
            .uid(Constants.SINK_UID)
            .name("Hive Sink")
            .setParallelism(sinkParallelism)
            .addSink(new HiveCommitter(config, hiveSinkInfo))
            .name("Hive Committer")
            .setParallelism(1);
      }
      break;

    case Constants.SINK_TYPE_ICEBERG:
      checkState(sinkInfo instanceof IcebergSinkInfo);
      IcebergSinkInfo icebergSinkInfo = (IcebergSinkInfo) sinkInfo;
      TableLoader tableLoader = TableLoader.fromHadoopTable(icebergSinkInfo.getTableLocation(), new org.apache.hadoop.conf.Configuration());
      FlinkSink.forRow(sourceStream, CommonUtils.getTableSchema(sinkInfo.getFields()))
          .tableLoader(tableLoader)
          .writeParallelism(sinkParallelism)
          .build();
      break;

    case Constants.SINK_TYPE_KAFKA:
      checkState(sinkInfo instanceof KafkaSinkInfo);
      SerializationSchema<Row> schema = SerializationSchemaFactory.build(sinkInfo.getFields(), ((KafkaSinkInfo) sinkInfo).getSerializationInfo());
      sourceStream.addSink(buildKafkaSink((KafkaSinkInfo) sinkInfo, properties, schema, config))
          .uid(Constants.SINK_UID)
          .name("Kafka Sink")
          .setParallelism(sinkParallelism);
      break;

    default:
      throw new IllegalArgumentException("Unsupported sink type " + sinkType);
  }
}
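Every example on this page builds the loader with TableLoader.fromHadoopTable; for catalog-managed tables, TableLoader.fromCatalog is the counterpart. A minimal sketch for a Hive catalog, assuming a reachable metastore; the catalog name, URI, and table identifier are illustrative:

Map<String, String> catalogProps = ImmutableMap.of("uri", "thrift://metastore:9083"); // hypothetical metastore URI
CatalogLoader catalogLoader = CatalogLoader.hive("hive_catalog", new org.apache.hadoop.conf.Configuration(), catalogProps);
TableLoader tableLoader = TableLoader.fromCatalog(catalogLoader, TableIdentifier.of("db", "tbl"));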
Use of org.apache.iceberg.flink.TableLoader in project LarkMidTable by birdLark.
From the class FlinkIceBergStream, method appendingData.
public static void appendingData(StreamExecutionEnvironment env, TableLoader tableLoader) {
  // Batch-read the source table (streaming(false)) and append its rows to the target table.
  DataStream<RowData> batch = FlinkSource.forRowData().env(env).tableLoader(tableLoader).streaming(false).build();
  // The target table location is left empty in the original source; supply a real path.
  TableLoader table2 = TableLoader.fromHadoopTable("");
  FlinkSink.forRowData(batch).tableLoader(table2).build();
}
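The method above reads the table as a bounded batch via streaming(false); flipping the same builder to streaming(true) keeps the source running and picks up new snapshots as they are committed. A sketch under the same assumptions; the ten-second poll interval is illustrative:

DataStream<RowData> stream = FlinkSource.forRowData()
    .env(env)
    .tableLoader(tableLoader)
    .streaming(true)
    .monitorInterval(Duration.ofSeconds(10)) // java.time.Duration; hypothetical interval for discovering new snapshots
    .build();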
Use of org.apache.iceberg.flink.TableLoader in project LarkMidTable by birdLark.
From the class FlinkIceBergStream, method main.
public static void main(String[] args) throws Exception {
  StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
  // The table location is left empty in the original source; point this at a real Hadoop table path.
  TableLoader tableLoader = TableLoader.fromHadoopTable("");
  appendingData(env, tableLoader);
  env.execute();
}