Example 11 with StreamTableEnvironment

Use of org.apache.flink.table.api.bridge.java.StreamTableEnvironment in project flink by apache.

From the class HiveTableSinkITCase, method testStreamingSinkWithoutCommitPolicy.

@Test
public void testStreamingSinkWithoutCommitPolicy() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    StreamTableEnvironment tableEnv = HiveTestUtils.createTableEnvInStreamingMode(env);
    tableEnv.registerCatalog(hiveCatalog.getName(), hiveCatalog);
    tableEnv.useCatalog(hiveCatalog.getName());
    tableEnv.executeSql("create database db1");
    try {
        tableEnv.useDatabase("db1");
        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
        tableEnv.executeSql("create table dest(x int) partitioned by (p string)");
        tableEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
        tableEnv.executeSql("create table src (i int, p string) with (" + "'connector'='datagen'," + "'number-of-rows'='5')");
        tableEnv.executeSql("insert into dest select * from src").await();
        fail("Streaming write partitioned table without commit policy should fail");
    } catch (FlinkHiveException e) {
        // expected
        assertTrue(e.getMessage().contains(String.format("Streaming write to partitioned hive table `%s`.`%s`.`%s` without providing a commit policy", hiveCatalog.getName(), "db1", "dest")));
    } finally {
        tableEnv.executeSql("drop database db1 cascade");
    }
}
Also used: StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) StreamTableEnvironment(org.apache.flink.table.api.bridge.java.StreamTableEnvironment) Test(org.junit.Test)
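
For contrast, here is a minimal sketch of the passing variant (not part of the test above): declaring a commit policy in the table properties lets a streaming insert into a partitioned Hive table complete instead of raising FlinkHiveException. The table name dest_with_policy is a placeholder; the property key is the same one used by the streaming sink tests below.

// Hedged sketch, assuming the same test harness as the test above.
tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
tableEnv.executeSql(
        "create table dest_with_policy (x int) partitioned by (p string)"
                + " tblproperties ('sink.partition-commit.policy.kind'='metastore,success-file')");
tableEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
// With a policy configured, this streaming insert should run to completion.
tableEnv.executeSql("insert into dest_with_policy select * from src").await();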

Example 12 with StreamTableEnvironment

Use of org.apache.flink.table.api.bridge.java.StreamTableEnvironment in project flink by apache.

From the class HiveTableSinkITCase, method testStreamingAppend.

@Test
public void testStreamingAppend() throws Exception {
    testStreamingWrite(false, false, "parquet", (p) -> {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tEnv = HiveTestUtils.createTableEnvInStreamingMode(env);
        tEnv.registerCatalog(hiveCatalog.getName(), hiveCatalog);
        tEnv.useCatalog(hiveCatalog.getName());
        try {
            tEnv.executeSql("insert into db1.sink_table select 6,'a','b','2020-05-03','12'").await();
        } catch (Exception e) {
            Assert.fail("Failed to execute sql: " + e.getMessage());
        }
        assertBatch("db1.sink_table", Arrays.asList("+I[1, a, b, 2020-05-03, 7]", "+I[1, a, b, 2020-05-03, 7]", "+I[2, p, q, 2020-05-03, 8]", "+I[2, p, q, 2020-05-03, 8]", "+I[3, x, y, 2020-05-03, 9]", "+I[3, x, y, 2020-05-03, 9]", "+I[4, x, y, 2020-05-03, 10]", "+I[4, x, y, 2020-05-03, 10]", "+I[5, x, y, 2020-05-03, 11]", "+I[5, x, y, 2020-05-03, 11]", "+I[6, a, b, 2020-05-03, 12]"));
    });
}
Also used: StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) StreamTableEnvironment(org.apache.flink.table.api.bridge.java.StreamTableEnvironment) IOException(java.io.IOException) Test(org.junit.Test)
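
Note that testStreamingWrite and assertBatch are helpers defined elsewhere in HiveTableSinkITCase. The doubled rows in the expected list come from FiniteTestSource, which the write helper uses: it emits its elements twice, with checkpoints in between, to exercise exactly-once behavior. A minimal sketch of what an assertBatch-style helper could look like, assuming a batch-mode counterpart of HiveTestUtils.createTableEnvInStreamingMode exists:

// Hedged sketch; HiveTestUtils.createTableEnvInBatchMode is assumed here.
private void assertBatch(String table, List<String> expected) {
    TableEnvironment batchEnv = HiveTestUtils.createTableEnvInBatchMode();
    batchEnv.registerCatalog(hiveCatalog.getName(), hiveCatalog);
    batchEnv.useCatalog(hiveCatalog.getName());
    List<String> results = new ArrayList<>();
    batchEnv.executeSql("select * from " + table)
            .collect()
            .forEachRemaining(row -> results.add(row.toString()));
    // Sort so the comparison is independent of read order.
    results.sort(String::compareTo);
    Assert.assertEquals(expected, results);
}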

Example 13 with StreamTableEnvironment

Use of org.apache.flink.table.api.bridge.java.StreamTableEnvironment in project flink by apache.

From the class HiveTableSinkITCase, method testStreamingSinkWithTimestampLtzWatermark.

@Test
public void testStreamingSinkWithTimestampLtzWatermark() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    env.enableCheckpointing(100);
    StreamTableEnvironment tEnv = HiveTestUtils.createTableEnvInStreamingMode(env);
    tEnv.getConfig().setLocalTimeZone(ZoneId.of("Asia/Shanghai"));
    tEnv.registerCatalog(hiveCatalog.getName(), hiveCatalog);
    tEnv.useCatalog(hiveCatalog.getName());
    tEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
    try {
        tEnv.executeSql("create database db1");
        tEnv.useDatabase("db1");
        // source table DDL
        tEnv.executeSql("create external table source_table (" + " a int," + " b string," + " c string," + " epoch_ts bigint)" + " partitioned by (" + " pt_day string, pt_hour string) TBLPROPERTIES(" + "'partition.time-extractor.timestamp-pattern'='$pt_day $pt_hour:00:00'," + "'streaming-source.enable'='true'," + "'streaming-source.monitor-interval'='1s'," + "'streaming-source.consume-order'='partition-time'" + ")");
        tEnv.executeSql("create external table sink_table (" + " a int," + " b string," + " c string)" + " partitioned by (" + " d string, e string) TBLPROPERTIES(" + " 'partition.time-extractor.timestamp-pattern' = '$d $e:00:00'," + " 'auto-compaction'='true'," + " 'compaction.file-size' = '128MB'," + " 'sink.partition-commit.trigger'='partition-time'," + " 'sink.partition-commit.delay'='30min'," + " 'sink.partition-commit.watermark-time-zone'='Asia/Shanghai'," + " 'sink.partition-commit.policy.kind'='metastore,success-file'," + " 'sink.partition-commit.success-file.name'='_MY_SUCCESS'," + " 'streaming-source.enable'='true'," + " 'streaming-source.monitor-interval'='1s'," + " 'streaming-source.consume-order'='partition-time'" + ")");
        tEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
        // Build a partitioned table source with a watermark, based on the streaming hive table
        DataStream<Row> dataStream = tEnv.toDataStream(tEnv.sqlQuery("select a, b, c, epoch_ts, pt_day, pt_hour from source_table"));
        Table table = tEnv.fromDataStream(dataStream, Schema.newBuilder().column("a", DataTypes.INT()).column("b", DataTypes.STRING()).column("c", DataTypes.STRING()).column("epoch_ts", DataTypes.BIGINT()).column("pt_day", DataTypes.STRING()).column("pt_hour", DataTypes.STRING()).columnByExpression("ts_ltz", Expressions.callSql("TO_TIMESTAMP_LTZ(epoch_ts, 3)")).watermark("ts_ltz", "ts_ltz - INTERVAL '1' SECOND").build());
        tEnv.createTemporaryView("my_table", table);
        /*
             * Prepare test data: we write two records into each partition of the source table.
             * The epoch millis define the watermark; the watermark value is the
             * maximum timestamp of all the partition data, i.e.
             * partition timestamp + 1 hour - 1 second in this case.
             *
             * <pre>
             * epoch millis 1588461300000L <=>  local timestamp 2020-05-03 07:15:00 in Shanghai
             * epoch millis 1588463100000L <=>  local timestamp 2020-05-03 07:45:00 in Shanghai
             * epoch millis 1588464300000L <=>  local timestamp 2020-05-03 08:05:00 in Shanghai
             * epoch millis 1588466400000L <=>  local timestamp 2020-05-03 08:40:00 in Shanghai
             * epoch millis 1588468800000L <=>  local timestamp 2020-05-03 09:20:00 in Shanghai
             * epoch millis 1588470900000L <=>  local timestamp 2020-05-03 09:55:00 in Shanghai
             * epoch millis 1588471800000L <=>  local timestamp 2020-05-03 10:10:00 in Shanghai
             * epoch millis 1588473300000L <=>  local timestamp 2020-05-03 10:35:00 in Shanghai
             * epoch millis 1588476300000L <=>  local timestamp 2020-05-03 11:25:00 in Shanghai
             * epoch millis 1588477800000L <=>  local timestamp 2020-05-03 11:50:00 in Shanghai
             * </pre>
             */
        Map<Integer, Object[]> testData = new HashMap<>();
        testData.put(1, new Object[] { 1, "a", "b", 1588461300000L });
        testData.put(2, new Object[] { 1, "a", "b", 1588463100000L });
        testData.put(3, new Object[] { 2, "p", "q", 1588464300000L });
        testData.put(4, new Object[] { 2, "p", "q", 1588466400000L });
        testData.put(5, new Object[] { 3, "x", "y", 1588468800000L });
        testData.put(6, new Object[] { 3, "x", "y", 1588470900000L });
        testData.put(7, new Object[] { 4, "x", "y", 1588471800000L });
        testData.put(8, new Object[] { 4, "x", "y", 1588473300000L });
        testData.put(9, new Object[] { 5, "x", "y", 1588476300000L });
        testData.put(10, new Object[] { 5, "x", "y", 1588477800000L });
        Map<Integer, String> testPartition = new HashMap<>();
        testPartition.put(1, "pt_day='2020-05-03',pt_hour='7'");
        testPartition.put(2, "pt_day='2020-05-03',pt_hour='8'");
        testPartition.put(3, "pt_day='2020-05-03',pt_hour='9'");
        testPartition.put(4, "pt_day='2020-05-03',pt_hour='10'");
        testPartition.put(5, "pt_day='2020-05-03',pt_hour='11'");
        Map<Integer, Object[]> expectedData = new HashMap<>();
        expectedData.put(1, new Object[] { 1, "a", "b", "2020-05-03", "7" });
        expectedData.put(2, new Object[] { 2, "p", "q", "2020-05-03", "8" });
        expectedData.put(3, new Object[] { 3, "x", "y", "2020-05-03", "9" });
        expectedData.put(4, new Object[] { 4, "x", "y", "2020-05-03", "10" });
        expectedData.put(5, new Object[] { 5, "x", "y", "2020-05-03", "11" });
        tEnv.executeSql("insert into sink_table select a, b, c, pt_day, pt_hour from my_table");
        CloseableIterator<Row> iter = tEnv.executeSql("select * from sink_table").collect();
        HiveTestUtils.createTextTableInserter(hiveCatalog, "db1", "source_table").addRow(testData.get(1)).addRow(testData.get(2)).commit(testPartition.get(1));
        for (int i = 2; i < 7; i++) {
            try {
                Thread.sleep(1_000);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
            Assert.assertEquals(Arrays.asList(Row.of(expectedData.get(i - 1)).toString(), Row.of(expectedData.get(i - 1)).toString()), fetchRows(iter, 2));
            if (i < 6) {
                HiveTestUtils.createTextTableInserter(hiveCatalog, "db1", "source_table").addRow(testData.get(2 * i - 1)).addRow(testData.get(2 * i)).commit(testPartition.get(i));
            }
        }
        this.checkSuccessFiles(URI.create(hiveCatalog.getHiveTable(ObjectPath.fromString("db1.sink_table")).getSd().getLocation()).getPath());
    } finally {
        tEnv.executeSql("drop database db1 cascade");
    }
}
Also used: Table(org.apache.flink.table.api.Table) HashMap(java.util.HashMap) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) StreamTableEnvironment(org.apache.flink.table.api.bridge.java.StreamTableEnvironment) Row(org.apache.flink.types.Row) Test(org.junit.Test)
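
The epoch-to-local-time pairs in the comment block above can be verified independently with java.time; a self-contained check using the first and last values from the test data:

import java.time.Instant;
import java.time.ZoneId;
import java.time.ZonedDateTime;

// Standalone check of the epoch/local-time mapping quoted in the test comment.
public class EpochMillisCheck {
    public static void main(String[] args) {
        ZoneId shanghai = ZoneId.of("Asia/Shanghai");
        // prints 2020-05-03T07:15+08:00[Asia/Shanghai]
        System.out.println(ZonedDateTime.ofInstant(Instant.ofEpochMilli(1588461300000L), shanghai));
        // prints 2020-05-03T11:50+08:00[Asia/Shanghai]
        System.out.println(ZonedDateTime.ofInstant(Instant.ofEpochMilli(1588477800000L), shanghai));
    }
}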

Example 14 with StreamTableEnvironment

Use of org.apache.flink.table.api.bridge.java.StreamTableEnvironment in project flink by apache.

From the class HiveTableSinkITCase, method testStreamingWriteWithCustomPartitionCommitPolicy.

private void testStreamingWriteWithCustomPartitionCommitPolicy(String customPartitionCommitPolicyClassName) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    env.enableCheckpointing(100);
    // avoid restarting the job infinitely
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, 1_000));
    StreamTableEnvironment tEnv = HiveTestUtils.createTableEnvInStreamingMode(env);
    tEnv.registerCatalog(hiveCatalog.getName(), hiveCatalog);
    tEnv.useCatalog(hiveCatalog.getName());
    tEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
    try {
        tEnv.executeSql("create database db1");
        tEnv.useDatabase("db1");
        // prepare source
        List<Row> data = Arrays.asList(Row.of(1, "a", "b", "2020-05-03", "7"), Row.of(2, "p", "q", "2020-05-03", "8"), Row.of(3, "x", "y", "2020-05-03", "9"), Row.of(4, "x", "y", "2020-05-03", "10"), Row.of(5, "x", "y", "2020-05-03", "11"));
        DataStream<Row> stream = env.addSource(new FiniteTestSource<>(data), new RowTypeInfo(Types.INT, Types.STRING, Types.STRING, Types.STRING, Types.STRING));
        tEnv.createTemporaryView("my_table", stream, $("a"), $("b"), $("c"), $("d"), $("e"));
        // DDL
        tEnv.executeSql("create external table sink_table (a int,b string,c string" + ") " + "partitioned by (d string,e string) " + " stored as textfile" + " TBLPROPERTIES (" + "'" + SINK_PARTITION_COMMIT_DELAY.key() + "'='1h'," + "'" + SINK_PARTITION_COMMIT_POLICY_KIND.key() + "'='metastore,custom'," + "'" + SINK_PARTITION_COMMIT_POLICY_CLASS.key() + "'='" + customPartitionCommitPolicyClassName + "'" + ")");
        // hive dialect only works with hive tables at the moment, switch to default dialect
        tEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
        tEnv.sqlQuery("select * from my_table").executeInsert("sink_table").await();
        // check the partitions committed by the custom commit policy
        Set<String> committedPaths = TestCustomCommitPolicy.getCommittedPartitionPathsAndReset();
        String base = URI.create(hiveCatalog.getHiveTable(ObjectPath.fromString("db1.sink_table")).getSd().getLocation()).getPath();
        List<String> partitionKVs = Lists.newArrayList("e=7", "e=8", "e=9", "e=10", "e=11");
        partitionKVs.forEach(partitionKV -> {
            String partitionPath = new Path(new Path(base, "d=2020-05-03"), partitionKV).toString();
            Assert.assertTrue("Partition(d=2020-05-03, " + partitionKV + ") is not committed successfully", committedPaths.contains(partitionPath));
        });
    } finally {
        tEnv.executeSql("drop database if exists db1 cascade");
    }
}
Also used: Path(org.apache.flink.core.fs.Path) ObjectPath(org.apache.flink.table.catalog.ObjectPath) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) StreamTableEnvironment(org.apache.flink.table.api.bridge.java.StreamTableEnvironment) Row(org.apache.flink.types.Row) RowTypeInfo(org.apache.flink.api.java.typeutils.RowTypeInfo)
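
TestCustomCommitPolicy is a test fixture not shown here. A hedged sketch of what a minimal custom policy could look like: implement PartitionCommitPolicy (the interface lives in org.apache.flink.table.filesystem in older Flink releases and org.apache.flink.connector.file.table in newer ones, so check your version) and record each committed partition path so the test can inspect it afterwards.

import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
// Package of PartitionCommitPolicy varies by Flink version; see note above.
import org.apache.flink.connector.file.table.PartitionCommitPolicy;

// Hypothetical minimal policy mirroring the record-and-reset pattern of
// TestCustomCommitPolicy.getCommittedPartitionPathsAndReset() used above.
public class RecordingCommitPolicy implements PartitionCommitPolicy {

    private static final Set<String> COMMITTED = ConcurrentHashMap.newKeySet();

    @Override
    public void commit(Context context) {
        // Record the filesystem path of the partition being committed.
        COMMITTED.add(context.partitionPath().toString());
    }

    public static Set<String> getCommittedPartitionPathsAndReset() {
        Set<String> paths = new HashSet<>(COMMITTED);
        COMMITTED.clear();
        return paths;
    }
}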

Example 15 with StreamTableEnvironment

Use of org.apache.flink.table.api.bridge.java.StreamTableEnvironment in project flink by apache.

From the class HiveCatalogUdfITCase, method testUdf.

private void testUdf(boolean batch) throws Exception {
    StreamExecutionEnvironment env = null;
    TableEnvironment tEnv;
    EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance();
    if (batch) {
        settingsBuilder.inBatchMode();
    } else {
        settingsBuilder.inStreamingMode();
    }
    if (batch) {
        tEnv = TableEnvironment.create(settingsBuilder.build());
    } else {
        env = StreamExecutionEnvironment.getExecutionEnvironment();
        tEnv = StreamTableEnvironment.create(env, settingsBuilder.build());
    }
    BatchTestBase.configForMiniCluster(tEnv.getConfig());
    tEnv.registerCatalog("myhive", hiveCatalog);
    tEnv.useCatalog("myhive");
    String innerSql = format("select mygenericudf(myudf(name), 1) as a, mygenericudf(myudf(age), 1) as b," + " s from %s, lateral table(myudtf(name, 1)) as T(s)", sourceTableName);
    String selectSql = format("select a, s, sum(b), myudaf(b) from (%s) group by a, s", innerSql);
    List<String> results;
    if (batch) {
        Path p = Paths.get(tempFolder.newFolder().getAbsolutePath(), "test.csv");
        final TableSchema sinkSchema = TableSchema.builder().field("name1", Types.STRING()).field("name2", Types.STRING()).field("sum1", Types.INT()).field("sum2", Types.LONG()).build();
        final Map<String, String> sinkOptions = new HashMap<>();
        sinkOptions.put("connector.type", "filesystem");
        sinkOptions.put("connector.path", p.toAbsolutePath().toString());
        sinkOptions.put("format.type", "csv");
        final CatalogTable sink = new CatalogTableImpl(sinkSchema, sinkOptions, "Comment.");
        hiveCatalog.createTable(new ObjectPath(HiveCatalog.DEFAULT_DB, sinkTableName), sink, false);
        tEnv.executeSql(format("insert into %s " + selectSql, sinkTableName)).await();
        // assert written result
        StringBuilder builder = new StringBuilder();
        try (Stream<Path> paths = Files.walk(Paths.get(p.toAbsolutePath().toString()))) {
            paths.filter(Files::isRegularFile).forEach(path -> {
                try {
                    String content = FileUtils.readFileUtf8(path.toFile());
                    if (content.isEmpty()) {
                        return;
                    }
                    builder.append(content);
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            });
        }
        results = Arrays.stream(builder.toString().split("\n")).filter(s -> !s.isEmpty()).collect(Collectors.toList());
    } else {
        StreamTableEnvironment streamTEnv = (StreamTableEnvironment) tEnv;
        TestingRetractSink sink = new TestingRetractSink();
        streamTEnv.toRetractStream(tEnv.sqlQuery(selectSql), Row.class).map(new JavaToScala()).addSink((SinkFunction) sink);
        env.execute("");
        results = JavaScalaConversionUtil.toJava(sink.getRetractResults());
    }
    results = new ArrayList<>(results);
    results.sort(String::compareTo);
    Assert.assertEquals(Arrays.asList("1,1,2,2", "2,2,4,4", "3,3,6,6"), results);
}
Also used: ObjectPath(org.apache.flink.table.catalog.ObjectPath) Path(java.nio.file.Path) EnvironmentSettings(org.apache.flink.table.api.EnvironmentSettings) TableSchema(org.apache.flink.table.api.TableSchema) HashMap(java.util.HashMap) TestingRetractSink(org.apache.flink.table.planner.runtime.utils.TestingRetractSink) StreamTableEnvironment(org.apache.flink.table.api.bridge.java.StreamTableEnvironment) TableEnvironment(org.apache.flink.table.api.TableEnvironment) CatalogTable(org.apache.flink.table.catalog.CatalogTable) IOException(java.io.IOException) CatalogTableImpl(org.apache.flink.table.catalog.CatalogTableImpl) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
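
The functions myudf, mygenericudf, myudtf, and myudaf are registered during test setup, which is not shown in this snippet. A hedged sketch of how such a Hive function might be registered through the catalog API; com.example.MyHiveUdf is a placeholder class name, not the test's actual UDF:

// Hypothetical registration sketch using the Catalog API.
hiveCatalog.createFunction(
        new ObjectPath(HiveCatalog.DEFAULT_DB, "myudf"),
        new CatalogFunctionImpl("com.example.MyHiveUdf"), // placeholder class
        false);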

Aggregations

StreamTableEnvironment (org.apache.flink.table.api.bridge.java.StreamTableEnvironment): 64 usages
Test (org.junit.Test): 53 usages
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 41 usages
Row (org.apache.flink.types.Row): 38 usages
Table (org.apache.flink.table.api.Table): 36 usages
ArrayList (java.util.ArrayList): 19 usages
TableResult (org.apache.flink.table.api.TableResult): 18 usages
List (java.util.List): 10 usages
TableDescriptor (org.apache.flink.table.api.TableDescriptor): 10 usages
Arrays (java.util.Arrays): 6 usages
Collections (java.util.Collections): 6 usages
AbstractTestBase (org.apache.flink.test.util.AbstractTestBase): 6 usages
IOException (java.io.IOException): 5 usages
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 5 usages
ResolvedSchema (org.apache.flink.table.catalog.ResolvedSchema): 5 usages
Either (org.apache.flink.types.Either): 5 usages
LocalDateTime (java.time.LocalDateTime): 4 usages
ZoneId (java.time.ZoneId): 4 usages
TypeHint (org.apache.flink.api.common.typeinfo.TypeHint): 4 usages
TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation): 4 usages