Use of org.apache.flink.table.api.bridge.java.StreamTableEnvironment in project flink by apache.
From the class HiveTableSinkITCase, method testStreamingSinkWithoutCommitPolicy.
@Test
public void testStreamingSinkWithoutCommitPolicy() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    StreamTableEnvironment tableEnv = HiveTestUtils.createTableEnvInStreamingMode(env);
    tableEnv.registerCatalog(hiveCatalog.getName(), hiveCatalog);
    tableEnv.useCatalog(hiveCatalog.getName());
    tableEnv.executeSql("create database db1");
    try {
        tableEnv.useDatabase("db1");
        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
        tableEnv.executeSql("create table dest(x int) partitioned by (p string)");
        tableEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
        tableEnv.executeSql(
                "create table src (i int, p string) with ("
                        + "'connector'='datagen',"
                        + "'number-of-rows'='5')");
        tableEnv.executeSql("insert into dest select * from src").await();
        fail("Streaming write partitioned table without commit policy should fail");
    } catch (FlinkHiveException e) {
        // expected
        assertTrue(
                e.getMessage()
                        .contains(
                                String.format(
                                        "Streaming write to partitioned hive table `%s`.`%s`.`%s` without providing a commit policy",
                                        hiveCatalog.getName(), "db1", "dest")));
    } finally {
        tableEnv.executeSql("drop database db1 cascade");
    }
}
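For contrast with the failure asserted above, a minimal sketch of the same streaming write succeeding once the destination table declares a partition commit policy. This is not part of the test class; it assumes the same hiveCatalog fixture and HiveTestUtils helpers, and enables checkpointing because the streaming Hive sink commits partitions on checkpoints.

// Hypothetical sketch: the same streaming insert succeeds once the destination table
// declares a partition commit policy in its TBLPROPERTIES.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(100);
StreamTableEnvironment tableEnv = HiveTestUtils.createTableEnvInStreamingMode(env);
tableEnv.registerCatalog(hiveCatalog.getName(), hiveCatalog);
tableEnv.useCatalog(hiveCatalog.getName());
tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
// commit partitions to the metastore and write a success file once they are complete
tableEnv.executeSql(
        "create table dest (x int) partitioned by (p string) TBLPROPERTIES ("
                + "'sink.partition-commit.policy.kind'='metastore,success-file')");
tableEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
tableEnv.executeSql(
        "create table src (i int, p string) with ("
                + "'connector'='datagen','number-of-rows'='5')");
tableEnv.executeSql("insert into dest select * from src").await();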
Use of org.apache.flink.table.api.bridge.java.StreamTableEnvironment in project flink by apache.
From the class HiveTableSinkITCase, method testStreamingAppend.
@Test
public void testStreamingAppend() throws Exception {
    testStreamingWrite(false, false, "parquet", (p) -> {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tEnv = HiveTestUtils.createTableEnvInStreamingMode(env);
        tEnv.registerCatalog(hiveCatalog.getName(), hiveCatalog);
        tEnv.useCatalog(hiveCatalog.getName());
        try {
            tEnv.executeSql("insert into db1.sink_table select 6,'a','b','2020-05-03','12'").await();
        } catch (Exception e) {
            Assert.fail("Failed to execute sql: " + e.getMessage());
        }
        assertBatch(
                "db1.sink_table",
                Arrays.asList(
                        "+I[1, a, b, 2020-05-03, 7]", "+I[1, a, b, 2020-05-03, 7]",
                        "+I[2, p, q, 2020-05-03, 8]", "+I[2, p, q, 2020-05-03, 8]",
                        "+I[3, x, y, 2020-05-03, 9]", "+I[3, x, y, 2020-05-03, 9]",
                        "+I[4, x, y, 2020-05-03, 10]", "+I[4, x, y, 2020-05-03, 10]",
                        "+I[5, x, y, 2020-05-03, 11]", "+I[5, x, y, 2020-05-03, 11]",
                        "+I[6, a, b, 2020-05-03, 12]"));
    });
}
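The testStreamingWrite wrapper and the assertBatch helper are defined elsewhere in HiveTableSinkITCase and are not shown in this excerpt. A minimal stand-in for the batch-side verification might look like the following sketch; the method name assertBatchRows and the batch-mode read-back are assumptions, not the actual helper.

// Hypothetical stand-in for the assertBatch(...) helper used above: read the table back
// with a batch TableEnvironment and compare the stringified rows.
private void assertBatchRows(String table, List<String> expected) throws Exception {
    TableEnvironment batchEnv =
            TableEnvironment.create(EnvironmentSettings.newInstance().inBatchMode().build());
    batchEnv.registerCatalog(hiveCatalog.getName(), hiveCatalog);
    batchEnv.useCatalog(hiveCatalog.getName());
    List<String> actual = new ArrayList<>();
    try (CloseableIterator<Row> it = batchEnv.executeSql("select * from " + table).collect()) {
        it.forEachRemaining(row -> actual.add(row.toString()));
    }
    actual.sort(String::compareTo);
    Assert.assertEquals(expected, actual);
}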
Use of org.apache.flink.table.api.bridge.java.StreamTableEnvironment in project flink by apache.
From the class HiveTableSinkITCase, method testStreamingSinkWithTimestampLtzWatermark.
@Test
public void testStreamingSinkWithTimestampLtzWatermark() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    env.enableCheckpointing(100);
    StreamTableEnvironment tEnv = HiveTestUtils.createTableEnvInStreamingMode(env);
    tEnv.getConfig().setLocalTimeZone(ZoneId.of("Asia/Shanghai"));
    tEnv.registerCatalog(hiveCatalog.getName(), hiveCatalog);
    tEnv.useCatalog(hiveCatalog.getName());
    tEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
    try {
        tEnv.executeSql("create database db1");
        tEnv.useDatabase("db1");
        // source table DDL
        tEnv.executeSql(
                "create external table source_table ("
                        + " a int,"
                        + " b string,"
                        + " c string,"
                        + " epoch_ts bigint)"
                        + " partitioned by ("
                        + " pt_day string, pt_hour string) TBLPROPERTIES("
                        + "'partition.time-extractor.timestamp-pattern'='$pt_day $pt_hour:00:00',"
                        + "'streaming-source.enable'='true',"
                        + "'streaming-source.monitor-interval'='1s',"
                        + "'streaming-source.consume-order'='partition-time'"
                        + ")");
        tEnv.executeSql(
                "create external table sink_table ("
                        + " a int,"
                        + " b string,"
                        + " c string)"
                        + " partitioned by ("
                        + " d string, e string) TBLPROPERTIES("
                        + " 'partition.time-extractor.timestamp-pattern' = '$d $e:00:00',"
                        + " 'auto-compaction'='true',"
                        + " 'compaction.file-size' = '128MB',"
                        + " 'sink.partition-commit.trigger'='partition-time',"
                        + " 'sink.partition-commit.delay'='30min',"
                        + " 'sink.partition-commit.watermark-time-zone'='Asia/Shanghai',"
                        + " 'sink.partition-commit.policy.kind'='metastore,success-file',"
                        + " 'sink.partition-commit.success-file.name'='_MY_SUCCESS',"
                        + " 'streaming-source.enable'='true',"
                        + " 'streaming-source.monitor-interval'='1s',"
                        + " 'streaming-source.consume-order'='partition-time'"
                        + ")");
        tEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
        // Build a partitioned table source with a watermark based on the streaming Hive table
        DataStream<Row> dataStream =
                tEnv.toDataStream(
                        tEnv.sqlQuery("select a, b, c, epoch_ts, pt_day, pt_hour from source_table"));
        Table table =
                tEnv.fromDataStream(
                        dataStream,
                        Schema.newBuilder()
                                .column("a", DataTypes.INT())
                                .column("b", DataTypes.STRING())
                                .column("c", DataTypes.STRING())
                                .column("epoch_ts", DataTypes.BIGINT())
                                .column("pt_day", DataTypes.STRING())
                                .column("pt_hour", DataTypes.STRING())
                                .columnByExpression("ts_ltz", Expressions.callSql("TO_TIMESTAMP_LTZ(epoch_ts, 3)"))
                                .watermark("ts_ltz", "ts_ltz - INTERVAL '1' SECOND")
                                .build());
        tEnv.createTemporaryView("my_table", table);
        /*
         * Prepare test data: two records are written into each partition of the source table.
         * The epoch millis define the watermark; the watermark value is the maximum timestamp
         * of all the partition data, i.e. partition timestamp + 1 hour - 1 second in this case.
         *
         * <pre>
         * epoch millis 1588461300000L <=> local timestamp 2020-05-03 07:15:00 in Shanghai
         * epoch millis 1588463100000L <=> local timestamp 2020-05-03 07:45:00 in Shanghai
         * epoch millis 1588464300000L <=> local timestamp 2020-05-03 08:05:00 in Shanghai
         * epoch millis 1588466400000L <=> local timestamp 2020-05-03 08:40:00 in Shanghai
         * epoch millis 1588468800000L <=> local timestamp 2020-05-03 09:20:00 in Shanghai
         * epoch millis 1588470900000L <=> local timestamp 2020-05-03 09:55:00 in Shanghai
         * epoch millis 1588471800000L <=> local timestamp 2020-05-03 10:10:00 in Shanghai
         * epoch millis 1588473300000L <=> local timestamp 2020-05-03 10:35:00 in Shanghai
         * epoch millis 1588476300000L <=> local timestamp 2020-05-03 11:25:00 in Shanghai
         * epoch millis 1588477800000L <=> local timestamp 2020-05-03 11:50:00 in Shanghai
         * </pre>
         */
        Map<Integer, Object[]> testData = new HashMap<>();
        testData.put(1, new Object[] {1, "a", "b", 1588461300000L});
        testData.put(2, new Object[] {1, "a", "b", 1588463100000L});
        testData.put(3, new Object[] {2, "p", "q", 1588464300000L});
        testData.put(4, new Object[] {2, "p", "q", 1588466400000L});
        testData.put(5, new Object[] {3, "x", "y", 1588468800000L});
        testData.put(6, new Object[] {3, "x", "y", 1588470900000L});
        testData.put(7, new Object[] {4, "x", "y", 1588471800000L});
        testData.put(8, new Object[] {4, "x", "y", 1588473300000L});
        testData.put(9, new Object[] {5, "x", "y", 1588476300000L});
        testData.put(10, new Object[] {5, "x", "y", 1588477800000L});
        Map<Integer, String> testPartition = new HashMap<>();
        testPartition.put(1, "pt_day='2020-05-03',pt_hour='7'");
        testPartition.put(2, "pt_day='2020-05-03',pt_hour='8'");
        testPartition.put(3, "pt_day='2020-05-03',pt_hour='9'");
        testPartition.put(4, "pt_day='2020-05-03',pt_hour='10'");
        testPartition.put(5, "pt_day='2020-05-03',pt_hour='11'");
        Map<Integer, Object[]> expectedData = new HashMap<>();
        expectedData.put(1, new Object[] {1, "a", "b", "2020-05-03", "7"});
        expectedData.put(2, new Object[] {2, "p", "q", "2020-05-03", "8"});
        expectedData.put(3, new Object[] {3, "x", "y", "2020-05-03", "9"});
        expectedData.put(4, new Object[] {4, "x", "y", "2020-05-03", "10"});
        expectedData.put(5, new Object[] {5, "x", "y", "2020-05-03", "11"});
        tEnv.executeSql("insert into sink_table select a, b, c, pt_day, pt_hour from my_table");
        CloseableIterator<Row> iter = tEnv.executeSql("select * from sink_table").collect();
        HiveTestUtils.createTextTableInserter(hiveCatalog, "db1", "source_table")
                .addRow(testData.get(1))
                .addRow(testData.get(2))
                .commit(testPartition.get(1));
        for (int i = 2; i < 7; i++) {
            try {
                Thread.sleep(1_000);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
            Assert.assertEquals(
                    Arrays.asList(
                            Row.of(expectedData.get(i - 1)).toString(),
                            Row.of(expectedData.get(i - 1)).toString()),
                    fetchRows(iter, 2));
            if (i < 6) {
                HiveTestUtils.createTextTableInserter(hiveCatalog, "db1", "source_table")
                        .addRow(testData.get(2 * i - 1))
                        .addRow(testData.get(2 * i))
                        .commit(testPartition.get(i));
            }
        }
        this.checkSuccessFiles(
                URI.create(
                                hiveCatalog
                                        .getHiveTable(ObjectPath.fromString("db1.sink_table"))
                                        .getSd()
                                        .getLocation())
                        .getPath());
    } finally {
        tEnv.executeSql("drop database db1 cascade");
    }
}
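The test above attaches the TIMESTAMP_LTZ event-time attribute through toDataStream/fromDataStream and a Schema builder. As an illustrative sketch (the events table, its datagen options, and the field names are assumptions, not part of the test), the same attribute and watermark can also be declared directly in default-dialect SQL DDL:

// Sketch: declare the TO_TIMESTAMP_LTZ computed column and watermark in DDL instead of
// going through fromDataStream + Schema.newBuilder().
StreamTableEnvironment tEnv = HiveTestUtils.createTableEnvInStreamingMode(
        StreamExecutionEnvironment.getExecutionEnvironment());
// the session time zone controls how the TIMESTAMP_LTZ values are rendered and compared
// against partition times for partition-time commits
tEnv.getConfig().setLocalTimeZone(ZoneId.of("Asia/Shanghai"));
tEnv.executeSql(
        "create table events ("
                + "  a int,"
                + "  epoch_ts bigint,"
                + "  ts_ltz as TO_TIMESTAMP_LTZ(epoch_ts, 3),"
                + "  watermark for ts_ltz as ts_ltz - interval '1' second"
                + ") with ('connector'='datagen','number-of-rows'='10')");
tEnv.executeSql("select a, ts_ltz from events").print();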
Use of org.apache.flink.table.api.bridge.java.StreamTableEnvironment in project flink by apache.
From the class HiveTableSinkITCase, method testStreamingWriteWithCustomPartitionCommitPolicy.
private void testStreamingWriteWithCustomPartitionCommitPolicy(String customPartitionCommitPolicyClassName)
        throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    env.enableCheckpointing(100);
    // avoid restarting the job infinitely
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, 1_000));
    StreamTableEnvironment tEnv = HiveTestUtils.createTableEnvInStreamingMode(env);
    tEnv.registerCatalog(hiveCatalog.getName(), hiveCatalog);
    tEnv.useCatalog(hiveCatalog.getName());
    tEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
    try {
        tEnv.executeSql("create database db1");
        tEnv.useDatabase("db1");
        // prepare source
        List<Row> data =
                Arrays.asList(
                        Row.of(1, "a", "b", "2020-05-03", "7"),
                        Row.of(2, "p", "q", "2020-05-03", "8"),
                        Row.of(3, "x", "y", "2020-05-03", "9"),
                        Row.of(4, "x", "y", "2020-05-03", "10"),
                        Row.of(5, "x", "y", "2020-05-03", "11"));
        DataStream<Row> stream =
                env.addSource(
                        new FiniteTestSource<>(data),
                        new RowTypeInfo(Types.INT, Types.STRING, Types.STRING, Types.STRING, Types.STRING));
        tEnv.createTemporaryView("my_table", stream, $("a"), $("b"), $("c"), $("d"), $("e"));
        // DDL
        tEnv.executeSql(
                "create external table sink_table (a int,b string,c string"
                        + ") "
                        + "partitioned by (d string,e string) "
                        + " stored as textfile"
                        + " TBLPROPERTIES ("
                        + "'" + SINK_PARTITION_COMMIT_DELAY.key() + "'='1h',"
                        + "'" + SINK_PARTITION_COMMIT_POLICY_KIND.key() + "'='metastore,custom',"
                        + "'" + SINK_PARTITION_COMMIT_POLICY_CLASS.key() + "'='" + customPartitionCommitPolicyClassName + "'"
                        + ")");
        // the hive dialect only works with hive tables at the moment, so switch back to the default dialect
        tEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
        tEnv.sqlQuery("select * from my_table").executeInsert("sink_table").await();
        // check committed partitions for CustomizedCommitPolicy
        Set<String> committedPaths = TestCustomCommitPolicy.getCommittedPartitionPathsAndReset();
        String base =
                URI.create(
                                hiveCatalog
                                        .getHiveTable(ObjectPath.fromString("db1.sink_table"))
                                        .getSd()
                                        .getLocation())
                        .getPath();
        List<String> partitionKVs = Lists.newArrayList("e=7", "e=8", "e=9", "e=10", "e=11");
        partitionKVs.forEach(
                partitionKV -> {
                    String partitionPath = new Path(new Path(base, "d=2020-05-03"), partitionKV).toString();
                    Assert.assertTrue(
                            "Partition(d=2020-05-03, " + partitionKV + ") is not committed successfully",
                            committedPaths.contains(partitionPath));
                });
    } finally {
        tEnv.executeSql("drop database if exists db1 cascade");
    }
}
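The SINK_PARTITION_COMMIT_* constants above resolve to the documented partition-commit option keys. As an illustrative sketch with those keys written out literally (the class name com.example.MyCommitPolicy is a placeholder for the custom policy, not a real class), the sink DDL is roughly equivalent to:

// Sketch: the same sink DDL with the option constants spelled out as literal keys.
tEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
tEnv.executeSql(
        "create external table sink_table (a int, b string, c string)"
                + " partitioned by (d string, e string)"
                + " stored as textfile"
                + " TBLPROPERTIES ("
                + "  'sink.partition-commit.delay'='1h',"
                + "  'sink.partition-commit.policy.kind'='metastore,custom',"
                + "  'sink.partition-commit.policy.class'='com.example.MyCommitPolicy'"
                + ")");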
Use of org.apache.flink.table.api.bridge.java.StreamTableEnvironment in project flink by apache.
From the class HiveCatalogUdfITCase, method testUdf.
private void testUdf(boolean batch) throws Exception {
    StreamExecutionEnvironment env = null;
    TableEnvironment tEnv;
    EnvironmentSettings.Builder settingsBuilder = EnvironmentSettings.newInstance();
    if (batch) {
        settingsBuilder.inBatchMode();
    } else {
        settingsBuilder.inStreamingMode();
    }
    if (batch) {
        tEnv = TableEnvironment.create(settingsBuilder.build());
    } else {
        env = StreamExecutionEnvironment.getExecutionEnvironment();
        tEnv = StreamTableEnvironment.create(env, settingsBuilder.build());
    }
    BatchTestBase.configForMiniCluster(tEnv.getConfig());
    tEnv.registerCatalog("myhive", hiveCatalog);
    tEnv.useCatalog("myhive");
    String innerSql =
            format(
                    "select mygenericudf(myudf(name), 1) as a, mygenericudf(myudf(age), 1) as b,"
                            + " s from %s, lateral table(myudtf(name, 1)) as T(s)",
                    sourceTableName);
    String selectSql = format("select a, s, sum(b), myudaf(b) from (%s) group by a, s", innerSql);
    List<String> results;
    if (batch) {
        Path p = Paths.get(tempFolder.newFolder().getAbsolutePath(), "test.csv");
        final TableSchema sinkSchema =
                TableSchema.builder()
                        .field("name1", Types.STRING())
                        .field("name2", Types.STRING())
                        .field("sum1", Types.INT())
                        .field("sum2", Types.LONG())
                        .build();
        final Map<String, String> sinkOptions = new HashMap<>();
        sinkOptions.put("connector.type", "filesystem");
        sinkOptions.put("connector.path", p.toAbsolutePath().toString());
        sinkOptions.put("format.type", "csv");
        final CatalogTable sink = new CatalogTableImpl(sinkSchema, sinkOptions, "Comment.");
        hiveCatalog.createTable(new ObjectPath(HiveCatalog.DEFAULT_DB, sinkTableName), sink, false);
        tEnv.executeSql(format("insert into %s " + selectSql, sinkTableName)).await();
        // assert written result
        StringBuilder builder = new StringBuilder();
        try (Stream<Path> paths = Files.walk(Paths.get(p.toAbsolutePath().toString()))) {
            paths.filter(Files::isRegularFile)
                    .forEach(
                            path -> {
                                try {
                                    String content = FileUtils.readFileUtf8(path.toFile());
                                    if (content.isEmpty()) {
                                        return;
                                    }
                                    builder.append(content);
                                } catch (IOException e) {
                                    throw new RuntimeException(e);
                                }
                            });
        }
        results =
                Arrays.stream(builder.toString().split("\n"))
                        .filter(s -> !s.isEmpty())
                        .collect(Collectors.toList());
    } else {
        StreamTableEnvironment streamTEnv = (StreamTableEnvironment) tEnv;
        TestingRetractSink sink = new TestingRetractSink();
        streamTEnv
                .toRetractStream(tEnv.sqlQuery(selectSql), Row.class)
                .map(new JavaToScala())
                .addSink((SinkFunction) sink);
        env.execute("");
        results = JavaScalaConversionUtil.toJava(sink.getRetractResults());
    }
    results = new ArrayList<>(results);
    results.sort(String::compareTo);
    Assert.assertEquals(Arrays.asList("1,1,2,2", "2,2,4,4", "3,3,6,6"), results);
}
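Outside of the test harness, the TestingRetractSink/JavaToScala scaffolding in the streaming branch is not needed. A hedged sketch of collecting the retract stream directly for a bounded input, reusing the selectSql query built above, could look like this:

// Sketch (not from the test): collect the retract stream directly; each element is a
// (isAccumulate, row) pair, so updates arrive as a retraction followed by a new row.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment streamTEnv =
        StreamTableEnvironment.create(env, EnvironmentSettings.newInstance().inStreamingMode().build());
streamTEnv.registerCatalog("myhive", hiveCatalog);
streamTEnv.useCatalog("myhive");
DataStream<Tuple2<Boolean, Row>> retractStream =
        streamTEnv.toRetractStream(streamTEnv.sqlQuery(selectSql), Row.class);
try (CloseableIterator<Tuple2<Boolean, Row>> it = retractStream.executeAndCollect()) {
    it.forEachRemaining(change -> System.out.println((change.f0 ? "+ " : "- ") + change.f1));
}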