use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreStructuredRecordTestRun method testRecordScannableAndWritableIsOK.
@Test
public void testRecordScannableAndWritableIsOK() throws Exception {
  DatasetId instanceId = NAMESPACE_ID.dataset("tabul");
  datasetFramework.addInstance("TableWrapper", instanceId,
    DatasetProperties.builder()
      .add(DatasetProperties.SCHEMA,
           Schema.recordOf("intRecord",
                           Schema.Field.of("x", Schema.of(Schema.Type.STRING))).toString())
      .build());
  DatasetSpecification spec = datasetFramework.getDatasetSpec(instanceId);
  try {
    exploreTableManager.enableDataset(instanceId, spec, false);
    runCommand(NAMESPACE_ID, "describe dataset_tabul", true,
               Lists.newArrayList(
                 new ColumnDesc("col_name", "STRING", 1, "from deserializer"),
                 new ColumnDesc("data_type", "STRING", 2, "from deserializer"),
                 new ColumnDesc("comment", "STRING", 3, "from deserializer")),
               Lists.newArrayList(
                 new QueryResult(Lists.<Object>newArrayList("x", "string", "from deserializer"))));
  } finally {
    datasetFramework.deleteInstance(instanceId);
  }
}
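The pattern above relies on ColumnDesc's four-argument constructor: column name, Hive type string, 1-based position, and a comment that may be null. A minimal sketch of building an expected result schema with it (the column names here are made up for illustration):

  List<ColumnDesc> expected = Lists.newArrayList(
    // positions are 1-based; the comment argument is null when Hive supplies none
    new ColumnDesc("id", "STRING", 1, null),
    new ColumnDesc("count", "INT", 2, null));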
use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreStructuredRecordTestRun method testInsert.
@Test
public void testInsert() throws Exception {
  DatasetId copyTable = NAMESPACE_ID.dataset("emailCopy");
  datasetFramework.addInstance(Table.class.getName(), copyTable,
    TableProperties.builder()
      .setSchema(EmailTableDefinition.SCHEMA)
      .setRowFieldName("id")
      .build());
  try {
    String command = String.format("insert into %s select * from %s",
                                   getDatasetHiveName(copyTable), MY_TABLE_NAME);
    ExploreExecutionResult result = exploreClient.submit(NAMESPACE_ID, command).get();
    Assert.assertEquals(QueryStatus.OpStatus.FINISHED, result.getStatus().getStatus());
    command = String.format("select id, subject, body, sender from %s", getDatasetHiveName(copyTable));
    runCommand(NAMESPACE_ID, command, true,
               Lists.newArrayList(
                 new ColumnDesc("id", "STRING", 1, null),
                 new ColumnDesc("subject", "STRING", 2, null),
                 new ColumnDesc("body", "STRING", 3, null),
                 new ColumnDesc("sender", "STRING", 4, null)),
               Lists.newArrayList(
                 new QueryResult(Lists.<Object>newArrayList(
                   "email1", "this is the subject", "this is the body", "sljackson@boss.com"))));
  } finally {
    datasetFramework.deleteInstance(copyTable);
  }
}
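Instead of going through runCommand, an expected ColumnDesc list can also be checked directly against the schema of an ad-hoc query, assuming ExploreExecutionResult#getResultSchema returns List<ColumnDesc> as it does in testSelectStar below:

  ExploreExecutionResult result = exploreClient
    .submit(NAMESPACE_ID, "select id, subject from " + getDatasetHiveName(copyTable)).get();
  Assert.assertEquals(
    Lists.newArrayList(
      new ColumnDesc("id", "STRING", 1, null),
      new ColumnDesc("subject", "STRING", 2, null)),
    result.getResultSchema());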
use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreObjectMappedTableTestRun method testSelectStar.
public void testSelectStar(String tableToQuery, String tableInSchema) throws Exception {
  List<ColumnDesc> expectedSchema = Lists.newArrayList(
    new ColumnDesc(tableInSchema + ".row_key", "STRING", 1, null),
    new ColumnDesc(tableInSchema + ".bytearrayfield", "BINARY", 2, null),
    new ColumnDesc(tableInSchema + ".doublefield", "DOUBLE", 3, null),
    new ColumnDesc(tableInSchema + ".floatfield", "FLOAT", 4, null),
    new ColumnDesc(tableInSchema + ".intfield", "INT", 5, null),
    new ColumnDesc(tableInSchema + ".longfield", "BIGINT", 6, null),
    new ColumnDesc(tableInSchema + ".stringfield", "STRING", 7, null));
  ExploreExecutionResult results = exploreClient.submit(NAMESPACE_ID, "select * from " + tableToQuery).get();
  // check the schema
  Assert.assertEquals(expectedSchema, results.getResultSchema());
  List<Object> columns = results.next().getColumns();
  // check record1
  Assert.assertEquals("123", columns.get(0));
  Assert.assertArrayEquals(record1.byteArrayField, (byte[]) columns.get(1));
  Assert.assertTrue(Math.abs(record1.doubleField - (Double) columns.get(2)) < 0.000001);
  // note: Hive returns FLOAT columns as doubles
  Assert.assertTrue(Math.abs(record1.floatField - (Double) columns.get(3)) < 0.000001);
  Assert.assertEquals(record1.intField, columns.get(4));
  Assert.assertEquals(record1.longField, columns.get(5));
  Assert.assertEquals(record1.stringField, columns.get(6));
  // check record2
  columns = results.next().getColumns();
  Assert.assertEquals("456", columns.get(0));
  Assert.assertArrayEquals(record2.byteArrayField, (byte[]) columns.get(1));
  Assert.assertTrue(Math.abs(record2.doubleField - (Double) columns.get(2)) < 0.000001);
  Assert.assertTrue(Math.abs(record2.floatField - (Double) columns.get(3)) < 0.000001);
  Assert.assertEquals(record2.intField, columns.get(4));
  Assert.assertEquals(record2.longField, columns.get(5));
  Assert.assertEquals(record2.stringField, columns.get(6));
  // there should be no more rows
  Assert.assertFalse(results.hasNext());
}
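The repeated epsilon checks could be folded into a small helper. This is a hypothetical addition, not part of the test class, capturing the fact that Hive hands both DOUBLE and FLOAT columns back as Double:

  // hypothetical helper: compare a numeric result column against an expected value
  private static void assertColumnClose(double expected, Object actual) {
    Assert.assertTrue(Math.abs(expected - (Double) actual) < 0.000001);
  }

With it, the float check would read assertColumnClose(record1.floatField, columns.get(3)).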
use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedTextSchemaUpdate.
@Test
public void testPartitionedTextSchemaUpdate() throws Exception {
  final DatasetId datasetId = NAMESPACE_ID.dataset("txtschemaupd");
  final String tableName = getDatasetHiveName(datasetId);
  // create a partitioned file set
  datasetFramework.addInstance(PartitionedFileSet.class.getName(), datasetId,
    PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addIntField("number").build())
      .setEnableExploreOnCreate(true)
      .setExploreSchema("key STRING, value STRING")
      .setExploreFormat("csv")
      .build());
  // verify that the Hive table was created for this file set
  runCommand(NAMESPACE_ID, "show tables", true,
             Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")),
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
  // access the dataset instance to perform data operations
  PartitionedFileSet partitioned = datasetFramework.getDataset(datasetId, DatasetDefinition.NO_ARGUMENTS, null);
  Assert.assertNotNull(partitioned);
  FileSet fileSet = partitioned.getEmbeddedFileSet();
  // add a partition. Hive expects a partition to be a directory, so we create a dir with one file
  Location location1 = fileSet.getLocation("file1/nn");
  FileWriterHelper.generateMultiDelimitersFile(location1.getOutputStream(), ImmutableList.of(",", "\1", ":"), 1, 2);
  addPartition(partitioned, PartitionKey.builder().addIntField("number", 1).build(), "file1");
  // verify that the partition was added to Hive
  runCommand(NAMESPACE_ID, "show partitions " + tableName, true,
             Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")),
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("number=1"))));
  // verify that we can query the key-values in the file with Hive
  List<ColumnDesc> expectedColumns = Lists.newArrayList(
    new ColumnDesc(tableName + ".key", "STRING", 1, null),
    new ColumnDesc(tableName + ".value", "STRING", 2, null),
    new ColumnDesc(tableName + ".number", "INT", 3, null));
  // the text line has the form 1,x\1x:1; the format is csv -> key=1 value=x\1x:1
  runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=1", true, expectedColumns,
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("1", "x\1x:1", 1))));
  // update the dataset properties with a different schema
  datasetFramework.updateInstance(datasetId,
    PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addIntField("number").build())
      .setEnableExploreOnCreate(true)
      .setExploreSchema("str STRING")
      .setExploreFormat("csv")
      .build());
  // the existing partition should now be read with the new schema; validate with a query
  expectedColumns = Lists.newArrayList(
    new ColumnDesc(tableName + ".str", "STRING", 1, null),
    new ColumnDesc(tableName + ".number", "INT", 2, null));
  // the same text line 1,x\1x:1 parsed as csv against the single-column schema -> str=1
  runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=1", true, expectedColumns,
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("1", 1))));
}
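The PartitionedFileSetProperties builder chain recurs throughout these tests; a hypothetical helper could factor it out, assuming the builder's build() yields the DatasetProperties that addInstance and updateInstance accept:

  // hypothetical helper: properties for a file set partitioned by an int "number"
  // field, explorable with the given text schema and format
  private static DatasetProperties numberPartitionedProperties(String schema, String format) {
    return PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addIntField("number").build())
      .setEnableExploreOnCreate(true)
      .setExploreSchema(schema)
      .setExploreFormat(format)
      .build();
  }

The create call would then shrink to datasetFramework.addInstance(PartitionedFileSet.class.getName(), datasetId, numberPartitionedProperties("key STRING, value STRING", "csv")).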
use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedTextFileUpdate.
@Test
public void testPartitionedTextFileUpdate() throws Exception {
  final DatasetId datasetId = NAMESPACE_ID.dataset("txtupd");
  final String tableName = getDatasetHiveName(datasetId);
  // create a partitioned file set
  datasetFramework.addInstance(PartitionedFileSet.class.getName(), datasetId,
    PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addIntField("number").build())
      .setEnableExploreOnCreate(true)
      .setExploreSchema("key STRING, value STRING")
      .setExploreFormat("csv")
      .build());
  // verify that the Hive table was created for this file set
  runCommand(NAMESPACE_ID, "show tables", true,
             Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")),
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
  // access the dataset instance to perform data operations
  PartitionedFileSet partitioned = datasetFramework.getDataset(datasetId, DatasetDefinition.NO_ARGUMENTS, null);
  Assert.assertNotNull(partitioned);
  FileSet fileSet = partitioned.getEmbeddedFileSet();
  // add a partition. Hive expects a partition to be a directory, so we create a dir with one file
  Location location1 = fileSet.getLocation("file1/nn");
  FileWriterHelper.generateMultiDelimitersFile(location1.getOutputStream(), ImmutableList.of(",", "\1", ":"), 1, 2);
  addPartition(partitioned, PartitionKey.builder().addIntField("number", 1).build(), "file1");
  // verify that the partition was added to Hive
  runCommand(NAMESPACE_ID, "show partitions " + tableName, true,
             Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")),
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("number=1"))));
  // verify that we can query the key-values in the file with Hive
  List<ColumnDesc> expectedColumns = Lists.newArrayList(
    new ColumnDesc(tableName + ".key", "STRING", 1, null),
    new ColumnDesc(tableName + ".value", "STRING", 2, null),
    new ColumnDesc(tableName + ".number", "INT", 3, null));
  // the text line has the form 1,x\1x:1; the format is csv -> key=1 value=x\1x:1
  runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=1", true, expectedColumns,
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("1", "x\1x:1", 1))));
  // update the dataset properties with a different format (text, with the default delimiter)
  datasetFramework.updateInstance(datasetId,
    PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addIntField("number").build())
      .setEnableExploreOnCreate(true)
      .setExploreSchema("key STRING, value STRING")
      .setExploreFormat("text")
      .build());
  // add another partition
  Location location2 = fileSet.getLocation("file2/nn");
  FileWriterHelper.generateMultiDelimitersFile(location2.getOutputStream(), ImmutableList.of(",", "\1", ":"), 2, 3);
  addPartition(partitioned, PartitionKey.builder().addIntField("number", 2).build(), "file2");
  // the new partition should have the new format; validate with a query.
  // the text line has the form 2,x\1x:2; the format is text -> key=2,x value=x:2
  runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=2", true, expectedColumns,
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("2,x", "x:2", 2))));
  // update the dataset properties with a different delimiter
  datasetFramework.updateInstance(datasetId,
    PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addIntField("number").build())
      .setEnableExploreOnCreate(true)
      .setExploreSchema("key STRING, value STRING")
      .setExploreFormat("text")
      .setExploreFormatProperty("delimiter", ":")
      .build());
  // add another partition
  Location location3 = fileSet.getLocation("file3/nn");
  FileWriterHelper.generateMultiDelimitersFile(location3.getOutputStream(), ImmutableList.of(",", "\1", ":"), 3, 4);
  addPartition(partitioned, PartitionKey.builder().addIntField("number", 3).build(), "file3");
  // the new partition should have the new delimiter; validate with a query.
  // the text line has the form 3,x\1x:3; the delimiter is ':' -> key=3,x\1x value=3
  runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=3", true, expectedColumns,
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("3,x\1x", "3", 3))));
  // update the dataset properties with a different format (avro)
  datasetFramework.updateInstance(datasetId,
    PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addIntField("number").build())
      .setEnableExploreOnCreate(true)
      .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
      .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
      .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
      .setTableProperty("avro.schema.literal", SCHEMA.toString())
      .build());
  // add another partition
  Location location4 = fileSet.getLocation("file4/nn");
  FileWriterHelper.generateAvroFile(location4.getOutputStream(), "x", 4, 5);
  addPartition(partitioned, PartitionKey.builder().addIntField("number", 4).build(), "file4");
  // the new partition should have the new format; validate with a query.
  // the avro file has key=x4, value=#4
  runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=4", true, expectedColumns,
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x4", "#4", 4))));
}
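The SCHEMA constant supplied as the avro.schema.literal table property is defined elsewhere in the test class. Based on the expected (key, value) string columns, its shape is presumably something like the following sketch; the record name "kv" and the use of the Avro parser here are assumptions for illustration:

  // assumed shape of the Avro schema literal; the real constant lives in the test class
  org.apache.avro.Schema schema = new org.apache.avro.Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"kv\",\"fields\":[" +
    "{\"name\":\"key\",\"type\":\"string\"}," +
    "{\"name\":\"value\",\"type\":\"string\"}]}");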