Example 1 with TableLoader

Use of org.apache.iceberg.flink.TableLoader in the project iceberg by apache.

From the class TestFlinkIcebergSink, method testTwoSinksInDisjointedDAG.

@Test
public void testTwoSinksInDisjointedDAG() throws Exception {
    Map<String, String> props = ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name());
    String leftTablePath = TEMPORARY_FOLDER.newFolder().getAbsolutePath().concat("/left");
    Assert.assertTrue("Should create the table path correctly.", new File(leftTablePath).mkdir());
    Table leftTable = SimpleDataUtil.createTable(leftTablePath, props, partitioned);
    TableLoader leftTableLoader = TableLoader.fromHadoopTable(leftTablePath);
    String rightTablePath = TEMPORARY_FOLDER.newFolder().getAbsolutePath().concat("/right");
    Assert.assertTrue("Should create the table path correctly.", new File(rightTablePath).mkdir());
    Table rightTable = SimpleDataUtil.createTable(rightTablePath, props, partitioned);
    TableLoader rightTableLoader = TableLoader.fromHadoopTable(rightTablePath);
    env = StreamExecutionEnvironment.getExecutionEnvironment(MiniClusterResource.DISABLE_CLASSLOADER_CHECK_CONFIG)
        .enableCheckpointing(100)
        .setParallelism(parallelism)
        .setMaxParallelism(parallelism);
    env.getConfig().disableAutoGeneratedUIDs();
    List<Row> leftRows = createRows("left-");
    DataStream<Row> leftStream = env.fromCollection(leftRows, ROW_TYPE_INFO)
        .name("leftCustomSource")
        .uid("leftCustomSource");
    FlinkSink.forRow(leftStream, SimpleDataUtil.FLINK_SCHEMA)
        .table(leftTable)
        .tableLoader(leftTableLoader)
        .tableSchema(SimpleDataUtil.FLINK_SCHEMA)
        .distributionMode(DistributionMode.NONE)
        .uidPrefix("leftIcebergSink")
        .append();
    List<Row> rightRows = createRows("right-");
    DataStream<Row> rightStream = env.fromCollection(rightRows, ROW_TYPE_INFO)
        .name("rightCustomSource")
        .uid("rightCustomSource");
    FlinkSink.forRow(rightStream, SimpleDataUtil.FLINK_SCHEMA)
        .table(rightTable)
        .tableLoader(rightTableLoader)
        .tableSchema(SimpleDataUtil.FLINK_SCHEMA)
        .writeParallelism(parallelism)
        .distributionMode(DistributionMode.HASH)
        .uidPrefix("rightIcebergSink")
        .setSnapshotProperty("flink.test", TestFlinkIcebergSink.class.getName())
        .setSnapshotProperties(Collections.singletonMap("direction", "rightTable"))
        .append();
    // Execute the program.
    env.execute("Test Iceberg DataStream.");
    SimpleDataUtil.assertTableRows(leftTablePath, convertToRowData(leftRows));
    SimpleDataUtil.assertTableRows(rightTablePath, convertToRowData(rightRows));
    leftTable.refresh();
    Assert.assertNull(leftTable.currentSnapshot().summary().get("flink.test"));
    Assert.assertNull(leftTable.currentSnapshot().summary().get("direction"));
    rightTable.refresh();
    Assert.assertEquals(TestFlinkIcebergSink.class.getName(), rightTable.currentSnapshot().summary().get("flink.test"));
    Assert.assertEquals("rightTable", rightTable.currentSnapshot().summary().get("direction"));
}
Also used : Table(org.apache.iceberg.Table) Row(org.apache.flink.types.Row) File(java.io.File) TableLoader(org.apache.iceberg.flink.TableLoader) Test(org.junit.Test)
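
The test above hands each loader to FlinkSink, which is expected to manage the loader's lifecycle itself, so the test only constructs it. As a minimal standalone sketch of that lifecycle (the warehouse path below is a hypothetical placeholder, not taken from the example), a loader created with fromHadoopTable is opened before loading the table and closed afterwards:

import org.apache.iceberg.Table;
import org.apache.iceberg.flink.TableLoader;

public class TableLoaderLifecycleSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical table location used only for illustration.
        String tableLocation = "file:///tmp/warehouse/db/sample_table";
        // TableLoader is Closeable; try-with-resources ensures it is released.
        try (TableLoader loader = TableLoader.fromHadoopTable(tableLocation)) {
            // open() must be called before loadTable().
            loader.open();
            Table table = loader.loadTable();
            System.out.println("Loaded table: " + table.name());
        }
    }
}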

Example 2 with TableLoader

Use of org.apache.iceberg.flink.TableLoader in the project iceberg by apache.

From the class TestFlinkScanSql, method testInferedParallelism.

@Test
public void testInferedParallelism() throws IOException {
    Table table = catalog.createTable(TableIdentifier.of("default", "t"), TestFixtures.SCHEMA, TestFixtures.SPEC);
    TableLoader tableLoader = TableLoader.fromHadoopTable(table.location());
    FlinkInputFormat flinkInputFormat = FlinkSource.forRowData()
        .tableLoader(tableLoader)
        .table(table)
        .buildFormat();
    ScanContext scanContext = ScanContext.builder().build();
    // Empty table, infer parallelism should be at least 1
    int parallelism = FlinkSource.forRowData().inferParallelism(flinkInputFormat, scanContext);
    Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism);
    GenericAppenderHelper helper = new GenericAppenderHelper(table, fileFormat, TEMPORARY_FOLDER);
    DataFile dataFile1 = helper.writeFile(
        TestHelpers.Row.of("2020-03-20", 0), RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L));
    DataFile dataFile2 = helper.writeFile(
        TestHelpers.Row.of("2020-03-21", 0), RandomGenericData.generate(TestFixtures.SCHEMA, 2, 0L));
    helper.appendToTable(dataFile1, dataFile2);
    // Make sure to generate 2 CombinedScanTasks
    long maxFileLen = Math.max(dataFile1.fileSizeInBytes(), dataFile2.fileSizeInBytes());
    sql("ALTER TABLE t SET ('read.split.open-file-cost'='1', 'read.split.target-size'='%s')", maxFileLen);
    // 2 splits; the default max inferred parallelism (100) is greater than the number of splits,
    // so the parallelism equals the number of splits: 2
    parallelism = FlinkSource.forRowData().inferParallelism(flinkInputFormat, scanContext);
    Assert.assertEquals("Should produce the expected parallelism.", 2, parallelism);
    // 2 splits and a limit of 1; the default max inferred parallelism (100) exceeds both
    // the number of splits and the limit, so the parallelism equals the limit: 1
    parallelism = FlinkSource.forRowData()
        .inferParallelism(flinkInputFormat, ScanContext.builder().limit(1).build());
    Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism);
    // 2 splits and a max inferred parallelism of 1 (less than the number of splits), so the parallelism is 1
    Configuration configuration = new Configuration();
    configuration.setInteger(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, 1);
    parallelism = FlinkSource.forRowData()
        .flinkConf(configuration)
        .inferParallelism(flinkInputFormat, ScanContext.builder().build());
    Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism);
    // 2 splits, max inferred parallelism of 1, limit of 3; the parallelism equals the max inferred parallelism: 1
    parallelism = FlinkSource.forRowData()
        .flinkConf(configuration)
        .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build());
    Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism);
    // 2 splits with parallelism inference disabled; the parallelism falls back to Flink's default of 1
    configuration.setBoolean(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM, false);
    parallelism = FlinkSource.forRowData()
        .flinkConf(configuration)
        .inferParallelism(flinkInputFormat, ScanContext.builder().limit(3).build());
    Assert.assertEquals("Should produce the expected parallelism.", 1, parallelism);
}
Also used : DataFile(org.apache.iceberg.DataFile) Table(org.apache.iceberg.Table) GenericAppenderHelper(org.apache.iceberg.data.GenericAppenderHelper) Configuration(org.apache.flink.configuration.Configuration) TableLoader(org.apache.iceberg.flink.TableLoader) Test(org.junit.Test)
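
As a rough sketch of how the configuration options exercised above would be applied when actually building a bounded source stream (the table location is a hypothetical placeholder and the parallelism cap of 4 is arbitrary, neither taken from the test):

import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.FlinkConfigOptions;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.source.FlinkSource;

public class BoundedIcebergReadSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Hypothetical Hadoop-catalog table path; substitute a real location.
        TableLoader tableLoader = TableLoader.fromHadoopTable("file:///tmp/warehouse/db/sample_table");
        Configuration conf = new Configuration();
        // Cap the inferred source parallelism well below the default of 100.
        conf.setInteger(FlinkConfigOptions.TABLE_EXEC_ICEBERG_INFER_SOURCE_PARALLELISM_MAX, 4);
        DataStream<RowData> stream = FlinkSource.forRowData()
                .env(env)
                .tableLoader(tableLoader)
                .flinkConf(conf)
                .streaming(false)
                .build();
        stream.print();
        env.execute("Bounded Iceberg read sketch");
    }
}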

Example 3 with TableLoader

Use of org.apache.iceberg.flink.TableLoader in the project incubator-inlong by apache.

From the class Entrance, method buildSinkStream.

private static void buildSinkStream(DataStream<Row> sourceStream, Configuration config, SinkInfo sinkInfo,
        Map<String, Object> properties, long dataflowId) throws IOException, ClassNotFoundException {
    final String sinkType = checkNotNull(config.getString(Constants.SINK_TYPE));
    final int sinkParallelism = config.getInteger(Constants.SINK_PARALLELISM);
    switch(sinkType) {
        case Constants.SINK_TYPE_CLICKHOUSE:
            checkState(sinkInfo instanceof ClickHouseSinkInfo);
            ClickHouseSinkInfo clickHouseSinkInfo = (ClickHouseSinkInfo) sinkInfo;
            sourceStream.addSink(new ClickhouseRowSinkFunction(clickHouseSinkInfo))
                    .uid(Constants.SINK_UID)
                    .name("Clickhouse Sink")
                    .setParallelism(sinkParallelism);
            break;
        case Constants.SINK_TYPE_HIVE:
            checkState(sinkInfo instanceof HiveSinkInfo);
            HiveSinkInfo hiveSinkInfo = (HiveSinkInfo) sinkInfo;
            if (hiveSinkInfo.getPartitions().length == 0) {
                // The committer operator is not necessary if no partition exists.
                sourceStream.process(new HiveWriter(config, dataflowId, hiveSinkInfo))
                        .uid(Constants.SINK_UID)
                        .name("Hive Sink")
                        .setParallelism(sinkParallelism);
            } else {
                sourceStream.process(new HiveWriter(config, dataflowId, hiveSinkInfo))
                        .uid(Constants.SINK_UID)
                        .name("Hive Sink")
                        .setParallelism(sinkParallelism)
                        .addSink(new HiveCommitter(config, hiveSinkInfo))
                        .name("Hive Committer")
                        .setParallelism(1);
            }
            break;
        case Constants.SINK_TYPE_ICEBERG:
            checkState(sinkInfo instanceof IcebergSinkInfo);
            IcebergSinkInfo icebergSinkInfo = (IcebergSinkInfo) sinkInfo;
            TableLoader tableLoader = TableLoader.fromHadoopTable(
                    icebergSinkInfo.getTableLocation(), new org.apache.hadoop.conf.Configuration());
            FlinkSink.forRow(sourceStream, CommonUtils.getTableSchema(sinkInfo.getFields()))
                    .tableLoader(tableLoader)
                    .writeParallelism(sinkParallelism)
                    .build();
            break;
        case Constants.SINK_TYPE_KAFKA:
            checkState(sinkInfo instanceof KafkaSinkInfo);
            SerializationSchema<Row> schema = SerializationSchemaFactory.build(
                    sinkInfo.getFields(), ((KafkaSinkInfo) sinkInfo).getSerializationInfo());
            sourceStream.addSink(buildKafkaSink((KafkaSinkInfo) sinkInfo, properties, schema, config))
                    .uid(Constants.SINK_UID)
                    .name("Kafka Sink")
                    .setParallelism(sinkParallelism);
            break;
        default:
            throw new IllegalArgumentException("Unsupported sink type " + sinkType);
    }
}
Also used : HiveWriter(org.apache.inlong.sort.flink.hive.HiveWriter) IcebergSinkInfo(org.apache.inlong.sort.protocol.sink.IcebergSinkInfo) ClickhouseRowSinkFunction(org.apache.inlong.sort.singletenant.flink.clickhouse.ClickhouseRowSinkFunction) ClickHouseSinkInfo(org.apache.inlong.sort.protocol.sink.ClickHouseSinkInfo) HiveCommitter(org.apache.inlong.sort.flink.hive.HiveCommitter) HiveSinkInfo(org.apache.inlong.sort.protocol.sink.HiveSinkInfo) KafkaSinkInfo(org.apache.inlong.sort.protocol.sink.KafkaSinkInfo) Row(org.apache.flink.types.Row) TableLoader(org.apache.iceberg.flink.TableLoader)
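
The Iceberg branch of the switch above is easier to read in isolation. A minimal sketch of just that branch, assuming a hand-built TableSchema and an in-memory Row stream in place of CommonUtils.getTableSchema(sinkInfo.getFields()), the upstream sourceStream, and icebergSinkInfo.getTableLocation(); newer Iceberg releases terminate the builder with append() rather than the build() call used above:

import java.util.Arrays;

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.types.Row;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.sink.FlinkSink;

public class IcebergSinkBranchSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Hypothetical two-column schema standing in for the sink's field list.
        TableSchema schema = TableSchema.builder()
                .field("id", DataTypes.INT())
                .field("name", DataTypes.STRING())
                .build();
        DataStream<Row> rows = env.fromCollection(
                Arrays.asList(Row.of(1, "a"), Row.of(2, "b")),
                new RowTypeInfo(Types.INT, Types.STRING));
        // Hypothetical table location standing in for icebergSinkInfo.getTableLocation().
        TableLoader tableLoader = TableLoader.fromHadoopTable(
                "file:///tmp/warehouse/db/sample_table", new org.apache.hadoop.conf.Configuration());
        FlinkSink.forRow(rows, schema)
                .tableLoader(tableLoader)
                .writeParallelism(1)
                .append();
        env.execute("Iceberg sink branch sketch");
    }
}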

Example 4 with TableLoader

Use of org.apache.iceberg.flink.TableLoader in the project LarkMidTable by birdLark.

From the class FlinkIceBergStream, method appendingData.

public static void appendingData(StreamExecutionEnvironment env, TableLoader tableLoader) {
    DataStream<RowData> batch = FlinkSource.forRowData()
        .env(env)
        .tableLoader(tableLoader)
        .streaming(false)
        .build();
    TableLoader table2 = TableLoader.fromHadoopTable("");
    FlinkSink.forRowData(batch).tableLoader(table2).build();
}
Also used : RowData(org.apache.flink.table.data.RowData) TableLoader(org.apache.iceberg.flink.TableLoader)
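
Example 4 builds a bounded (batch-style) read. For contrast, a sketch of the streaming variant of the same builder, under the assumption that the empty table location is filled in with a real path; the monitor interval controls how often newly committed snapshots are picked up:

import java.time.Duration;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.source.FlinkSource;

public class StreamingIcebergReadSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(60_000L);
        // Hypothetical table location; the original example leaves it empty.
        TableLoader tableLoader = TableLoader.fromHadoopTable("file:///tmp/warehouse/db/sample_table");
        DataStream<RowData> stream = FlinkSource.forRowData()
                .env(env)
                .tableLoader(tableLoader)
                .streaming(true)
                // Poll the table for new snapshots every 10 seconds.
                .monitorInterval(Duration.ofSeconds(10))
                .build();
        stream.print();
        env.execute("Streaming Iceberg read sketch");
    }
}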

Example 5 with TableLoader

Use of org.apache.iceberg.flink.TableLoader in the project LarkMidTable by birdLark.

From the class FlinkIceBergStream, method main.

public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    TableLoader tableLoader = TableLoader.fromHadoopTable("");
    appendingData(env, tableLoader);
    env.execute();
}
Also used : StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) TableLoader(org.apache.iceberg.flink.TableLoader)
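
Each of the examples shown constructs its loader with TableLoader.fromHadoopTable. As a hypothetical alternative sketch, a loader can also be created from a catalog via CatalogLoader; the catalog name and warehouse path below are placeholders, not taken from any of the examples:

import java.util.Collections;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CatalogProperties;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.flink.CatalogLoader;
import org.apache.iceberg.flink.TableLoader;

public class CatalogTableLoaderSketch {
    public static void main(String[] args) {
        // Hypothetical warehouse location for a Hadoop catalog.
        Map<String, String> props =
                Collections.singletonMap(CatalogProperties.WAREHOUSE_LOCATION, "file:///tmp/warehouse");
        CatalogLoader catalogLoader = CatalogLoader.hadoop("my_hadoop_catalog", new Configuration(), props);
        TableLoader tableLoader = TableLoader.fromCatalog(catalogLoader, TableIdentifier.of("db", "sample_table"));
        System.out.println("Created table loader: " + tableLoader);
    }
}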

Aggregations

TableLoader (org.apache.iceberg.flink.TableLoader) 7
Table (org.apache.iceberg.Table) 4
Row (org.apache.flink.types.Row) 3
Test (org.junit.Test) 3
Configuration (org.apache.flink.configuration.Configuration) 2
DataFile (org.apache.iceberg.DataFile) 2
GenericAppenderHelper (org.apache.iceberg.data.GenericAppenderHelper) 2
File (java.io.File) 1
ExecutorService (java.util.concurrent.ExecutorService) 1
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) 1
RowData (org.apache.flink.table.data.RowData) 1
Record (org.apache.iceberg.data.Record) 1
HiveCommitter (org.apache.inlong.sort.flink.hive.HiveCommitter) 1
HiveWriter (org.apache.inlong.sort.flink.hive.HiveWriter) 1
ClickHouseSinkInfo (org.apache.inlong.sort.protocol.sink.ClickHouseSinkInfo) 1
HiveSinkInfo (org.apache.inlong.sort.protocol.sink.HiveSinkInfo) 1
IcebergSinkInfo (org.apache.inlong.sort.protocol.sink.IcebergSinkInfo) 1
KafkaSinkInfo (org.apache.inlong.sort.protocol.sink.KafkaSinkInfo) 1
ClickhouseRowSinkFunction (org.apache.inlong.sort.singletenant.flink.clickhouse.ClickhouseRowSinkFunction) 1