Example usage of co.cask.cdap.api.dataset.lib.Partitioning in the project cdap (by caskdata):
From class CreateStatementBuilderTest, method testRowDelimitedCreate.
@Test
public void testRowDelimitedCreate() throws Exception {
  // Table partitioned on two fields, rows stored as comma-delimited text.
  String hiveSchema = "f1 string, f2 int, f3 double, f4 binary, f5 array<int>";
  Partitioning partitions = Partitioning.builder()
    .addStringField("f1")
    .addIntField("f2")
    .build();

  // Build the DDL through the fluent builder, then compare to the literal statement.
  String actualStatement = new CreateStatementBuilder("myfiles", "abc", "dataset_myfiles", false)
    .setSchema(hiveSchema)
    .setLocation("hdfs://namenode/my/path")
    .setTableComment("CDAP Dataset")
    .setPartitioning(partitions)
    .setRowFormatDelimited(",", null)
    .buildWithFileFormat("TEXTFILE");

  String expectedStatement =
    "CREATE EXTERNAL TABLE IF NOT EXISTS abc.dataset_myfiles "
      + "(f1 string, f2 int, f3 double, f4 binary, f5 array<int>) COMMENT 'CDAP Dataset' "
      + "PARTITIONED BY (f1 STRING, f2 INT) "
      + "ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' "
      + "STORED AS TEXTFILE LOCATION 'hdfs://namenode/my/path' "
      + "TBLPROPERTIES ('cdap.name'='myfiles', 'cdap.version'='"
      + ProjectInfo.getVersion().toString() + "')";
  Assert.assertEquals(expectedStatement, actualStatement);
}
Example usage of co.cask.cdap.api.dataset.lib.Partitioning in the project cdap (by caskdata):
From class CreateStatementBuilderTest, method testRowSerdeCreate.
@Test
public void testRowSerdeCreate() throws Exception {
  // No database prefix here (null database), rows deserialized by RegexSerDe.
  String hiveSchema = "f1 string, f2 int, f3 double, f4 binary, f5 array<int>";
  Partitioning partitions = Partitioning.builder()
    .addStringField("f1")
    .addIntField("f2")
    .build();

  // The serde property value contains a single quote; the builder must escape it in the DDL.
  String actualStatement = new CreateStatementBuilder("myfiles", null, "dataset_myfiles", false)
    .setSchema(hiveSchema)
    .setLocation("hdfs://namenode/my/path")
    .setTableComment("CDAP Dataset")
    .setPartitioning(partitions)
    .setRowFormatSerde("org.apache.hadoop.hive.serde2.RegexSerDe",
                       ImmutableMap.of("input.regex", "escapeme!'"))
    .buildWithFileFormat("TEXTFILE");

  String expectedStatement =
    "CREATE EXTERNAL TABLE IF NOT EXISTS dataset_myfiles "
      + "(f1 string, f2 int, f3 double, f4 binary, f5 array<int>) COMMENT 'CDAP Dataset' "
      + "PARTITIONED BY (f1 STRING, f2 INT) "
      + "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' "
      + "WITH SERDEPROPERTIES ('input.regex'='escapeme!\\'') "
      + "STORED AS TEXTFILE LOCATION 'hdfs://namenode/my/path' "
      + "TBLPROPERTIES ('cdap.name'='myfiles', 'cdap.version'='"
      + ProjectInfo.getVersion().toString() + "')";
  Assert.assertEquals(expectedStatement, actualStatement);
}
Example usage of co.cask.cdap.api.dataset.lib.Partitioning in the project cdap (by caskdata):
From class CreateStatementBuilderTest, method testRowSerdeFormatsCreate.
@Test
public void testRowSerdeFormatsCreate() throws Exception {
  // Schema supplied as a CDAP Schema object instead of a raw hive-schema string.
  Schema schema = Schema.recordOf("record",
                                  Schema.Field.of("f1", Schema.of(Schema.Type.STRING)),
                                  Schema.Field.of("f2", Schema.of(Schema.Type.INT)),
                                  Schema.Field.of("f3", Schema.of(Schema.Type.DOUBLE)));
  Partitioning partitions = Partitioning.builder()
    .addStringField("f1")
    .addIntField("f2")
    .build();

  // Avro serde with explicit input/output formats; the schema literal rides along
  // as a table property.
  String actualStatement = new CreateStatementBuilder("myfiles", null, "dataset_myfiles", false)
    .setSchema(schema)
    .setTableProperties(ImmutableMap.of("avro.schema.literal", schema.toString()))
    .setLocation("hdfs://namenode/my/path")
    .setTableComment("CDAP Dataset")
    .setPartitioning(partitions)
    .setRowFormatSerde("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
    .buildWithFormats("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat",
                      "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat");

  String expectedStatement =
    "CREATE EXTERNAL TABLE IF NOT EXISTS dataset_myfiles (f1 string, f2 int, f3 double) "
      + "COMMENT 'CDAP Dataset' "
      + "PARTITIONED BY (f1 STRING, f2 INT) "
      + "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' "
      + "STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' "
      + "OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' "
      + "LOCATION 'hdfs://namenode/my/path' "
      + "TBLPROPERTIES ('avro.schema.literal'='" + schema.toString() + "', "
      + "'cdap.name'='myfiles', 'cdap.version'='"
      + ProjectInfo.getVersion().toString() + "')";
  Assert.assertEquals(expectedStatement, actualStatement);
}
Example usage of co.cask.cdap.api.dataset.lib.Partitioning in the project cdap (by caskdata):
From class ConnectorSource, method configure.
// Not the standard configurePipeline method: a WorkflowConfigurer is required to
// create a local dataset. We may want to expose local datasets in cdap-etl-api,
// but that is a separate track.
public void configure(WorkflowConfigurer workflowConfigurer) {
  // Connector output is partitioned by a single string field, "phase".
  Partitioning phasePartitioning = Partitioning.builder()
    .addField("phase", Partitioning.FieldType.STRING)
    .build();
  // Register a text-backed PartitionedFileSet as a workflow-local dataset.
  workflowConfigurer.createLocalDataset(
    datasetName,
    PartitionedFileSet.class,
    PartitionedFileSetProperties.builder()
      .setPartitioning(phasePartitioning)
      .setInputFormat(TextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .build());
}
Example usage of co.cask.cdap.api.dataset.lib.Partitioning in the project cdap (by caskdata):
From class PartitioningTest, method testBuilderGetters.
@Test
public void testBuilderGetters() {
  // Fields a-c use the generic addField(name, type); d-f use the typed shortcuts.
  // Both paths must record the same field types.
  Partitioning partitioning = Partitioning.builder()
    .addField("a", FieldType.STRING)
    .addField("b", FieldType.INT)
    .addField("c", FieldType.LONG)
    .addStringField("d")
    .addIntField("e")
    .addLongField("f")
    .build();

  // Data-driven check: expected type for each field name, in declaration order.
  String[] names = { "a", "b", "c", "d", "e", "f" };
  FieldType[] expectedTypes = {
    FieldType.STRING, FieldType.INT, FieldType.LONG,
    FieldType.STRING, FieldType.INT, FieldType.LONG
  };
  for (int i = 0; i < names.length; i++) {
    Assert.assertEquals(expectedTypes[i], partitioning.getFieldType(names[i]));
  }

  // Unknown fields yield null rather than throwing.
  Assert.assertNull(partitioning.getFieldType("x"));
  // Fix: expected value goes first in assertEquals (was swapped in the original),
  // so a failure reports expected/actual correctly.
  Assert.assertEquals(ImmutableSet.of("a", "b", "c", "d", "e", "f"),
                      partitioning.getFields().keySet());
}
Aggregations