Example usage of co.cask.cdap.api.dataset.lib.Partitioning in the project cdap (by caskdata):
From class CreateStatementBuilderTest, method testRowDelimitedCreate.
@Test
public void testRowDelimitedCreate() throws Exception {
  // Table partitioned on two fields, rows stored as comma-delimited text.
  String hiveSchema = "f1 string, f2 int, f3 double, f4 binary, f5 array<int>";
  Partitioning partitions = Partitioning.builder()
    .addStringField("f1")
    .addIntField("f2")
    .build();

  // Build the DDL through the fluent builder, then compare to the literal statement.
  String actualStatement = new CreateStatementBuilder("myfiles", "abc", "dataset_myfiles", false)
    .setSchema(hiveSchema)
    .setLocation("hdfs://namenode/my/path")
    .setTableComment("CDAP Dataset")
    .setPartitioning(partitions)
    .setRowFormatDelimited(",", null)
    .buildWithFileFormat("TEXTFILE");

  String expectedStatement =
    "CREATE EXTERNAL TABLE IF NOT EXISTS abc.dataset_myfiles "
      + "(f1 string, f2 int, f3 double, f4 binary, f5 array<int>) COMMENT 'CDAP Dataset' "
      + "PARTITIONED BY (f1 STRING, f2 INT) "
      + "ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' "
      + "STORED AS TEXTFILE LOCATION 'hdfs://namenode/my/path' "
      + "TBLPROPERTIES ('cdap.name'='myfiles', 'cdap.version'='"
      + ProjectInfo.getVersion().toString() + "')";
  Assert.assertEquals(expectedStatement, actualStatement);
}
Example usage of co.cask.cdap.api.dataset.lib.Partitioning in the project cdap (by caskdata):
From class CreateStatementBuilderTest, method testRowSerdeCreate.
@Test
public void testRowSerdeCreate() throws Exception {
  // No database prefix here (null database), rows deserialized by RegexSerDe.
  String hiveSchema = "f1 string, f2 int, f3 double, f4 binary, f5 array<int>";
  Partitioning partitions = Partitioning.builder()
    .addStringField("f1")
    .addIntField("f2")
    .build();

  // The serde property value contains a single quote; the builder must escape it in the DDL.
  String actualStatement = new CreateStatementBuilder("myfiles", null, "dataset_myfiles", false)
    .setSchema(hiveSchema)
    .setLocation("hdfs://namenode/my/path")
    .setTableComment("CDAP Dataset")
    .setPartitioning(partitions)
    .setRowFormatSerde("org.apache.hadoop.hive.serde2.RegexSerDe",
                       ImmutableMap.of("input.regex", "escapeme!'"))
    .buildWithFileFormat("TEXTFILE");

  String expectedStatement =
    "CREATE EXTERNAL TABLE IF NOT EXISTS dataset_myfiles "
      + "(f1 string, f2 int, f3 double, f4 binary, f5 array<int>) COMMENT 'CDAP Dataset' "
      + "PARTITIONED BY (f1 STRING, f2 INT) "
      + "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' "
      + "WITH SERDEPROPERTIES ('input.regex'='escapeme!\\'') "
      + "STORED AS TEXTFILE LOCATION 'hdfs://namenode/my/path' "
      + "TBLPROPERTIES ('cdap.name'='myfiles', 'cdap.version'='"
      + ProjectInfo.getVersion().toString() + "')";
  Assert.assertEquals(expectedStatement, actualStatement);
}
Example usage of co.cask.cdap.api.dataset.lib.Partitioning in the project cdap (by caskdata):
From class CreateStatementBuilderTest, method testRowSerdeFormatsCreate.
@Test
public void testRowSerdeFormatsCreate() throws Exception {
  // Schema supplied as a CDAP Schema object instead of a raw hive-schema string.
  Schema schema = Schema.recordOf("record",
                                  Schema.Field.of("f1", Schema.of(Schema.Type.STRING)),
                                  Schema.Field.of("f2", Schema.of(Schema.Type.INT)),
                                  Schema.Field.of("f3", Schema.of(Schema.Type.DOUBLE)));
  Partitioning partitions = Partitioning.builder()
    .addStringField("f1")
    .addIntField("f2")
    .build();

  // Avro serde with explicit input/output formats; the schema literal rides along
  // as a table property.
  String actualStatement = new CreateStatementBuilder("myfiles", null, "dataset_myfiles", false)
    .setSchema(schema)
    .setTableProperties(ImmutableMap.of("avro.schema.literal", schema.toString()))
    .setLocation("hdfs://namenode/my/path")
    .setTableComment("CDAP Dataset")
    .setPartitioning(partitions)
    .setRowFormatSerde("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
    .buildWithFormats("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat",
                      "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat");

  String expectedStatement =
    "CREATE EXTERNAL TABLE IF NOT EXISTS dataset_myfiles (f1 string, f2 int, f3 double) "
      + "COMMENT 'CDAP Dataset' "
      + "PARTITIONED BY (f1 STRING, f2 INT) "
      + "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' "
      + "STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' "
      + "OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' "
      + "LOCATION 'hdfs://namenode/my/path' "
      + "TBLPROPERTIES ('avro.schema.literal'='" + schema.toString() + "', "
      + "'cdap.name'='myfiles', 'cdap.version'='"
      + ProjectInfo.getVersion().toString() + "')";
  Assert.assertEquals(expectedStatement, actualStatement);
}
Example usage of co.cask.cdap.api.dataset.lib.Partitioning in the project cdap (by caskdata):
From class ConnectorSource, method configure.
// Not the standard configurePipeline method: a WorkflowConfigurer is required to
// create a local dataset. We may want to expose local datasets in cdap-etl-api,
// but that is a separate track.
public void configure(WorkflowConfigurer workflowConfigurer) {
  // Connector output is partitioned by a single string field, "phase".
  Partitioning phasePartitioning = Partitioning.builder()
    .addField("phase", Partitioning.FieldType.STRING)
    .build();
  // Register a text-backed PartitionedFileSet as a workflow-local dataset.
  workflowConfigurer.createLocalDataset(
    datasetName,
    PartitionedFileSet.class,
    PartitionedFileSetProperties.builder()
      .setPartitioning(phasePartitioning)
      .setInputFormat(TextInputFormat.class)
      .setOutputFormat(TextOutputFormat.class)
      .build());
}
Example usage of co.cask.cdap.api.dataset.lib.Partitioning in the project cdap (by caskdata):
From class PartitioningTest, method testBuilderGetters.
@Test
public void testBuilderGetters() {
  // Fields a-c use the generic addField(name, type); d-f use the typed shortcuts.
  // Both paths must record the same field types.
  Partitioning partitioning = Partitioning.builder()
    .addField("a", FieldType.STRING)
    .addField("b", FieldType.INT)
    .addField("c", FieldType.LONG)
    .addStringField("d")
    .addIntField("e")
    .addLongField("f")
    .build();

  // Data-driven check: expected type for each field name, in declaration order.
  String[] names = { "a", "b", "c", "d", "e", "f" };
  FieldType[] expectedTypes = {
    FieldType.STRING, FieldType.INT, FieldType.LONG,
    FieldType.STRING, FieldType.INT, FieldType.LONG
  };
  for (int i = 0; i < names.length; i++) {
    Assert.assertEquals(expectedTypes[i], partitioning.getFieldType(names[i]));
  }

  // Unknown fields yield null rather than throwing.
  Assert.assertNull(partitioning.getFieldType("x"));
  // Fix: expected value goes first in assertEquals (was swapped in the original),
  // so a failure reports expected/actual correctly.
  Assert.assertEquals(ImmutableSet.of("a", "b", "c", "d", "e", "f"),
                      partitioning.getFields().keySet());
}
Aggregations