use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method validatePartitionsInHive.
/**
 * Runs two Hive commands against {@code tableName} and hands the expected outcome to {@code runCommand}:
 * a {@code show partitions} listing, followed by an ordered select of the partition contents.
 *
 * @param tableName name of the Hive table to inspect
 * @param expected partition keys from which the expected results of both commands are derived
 * @param keyOnly when true only the {@code key} column is selected; otherwise {@code key} and {@code value}
 * @throws Exception propagated from {@code runCommand}
 */
private void validatePartitionsInHive(String tableName, final Collection<PartitionKey> expected, boolean keyOnly) throws Exception {
  // Step 1: the partition listing, expected to match the given partition keys.
  ColumnDesc partitionColumn = new ColumnDesc("partition", "STRING", 1, "from deserializer");
  runCommand(NAMESPACE_ID, "show partitions " + tableName, true,
             Lists.newArrayList(partitionColumn),
             partitionKeys2PartitionResults(expected));

  // Step 2: the partition contents. The same column list is used for the projection and the ordering.
  String projection = keyOnly ? "key" : "key,value";
  String selectQuery = "SELECT " + projection + " FROM " + tableName + " ORDER BY " + projection;
  ColumnDesc keyColumn = new ColumnDesc("key", "STRING", 1, null);

  if (keyOnly) {
    runCommand(NAMESPACE_ID, selectQuery, true,
               Lists.newArrayList(keyColumn),
               partitionKeys2QueryResults(expected, keyOnly));
  } else {
    ColumnDesc valueColumn = new ColumnDesc("value", "STRING", 2, null);
    runCommand(NAMESPACE_ID, selectQuery, true,
               Lists.newArrayList(keyColumn, valueColumn),
               partitionKeys2QueryResults(expected, keyOnly));
  }
}
use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testOrcFileset.
/**
 * Creates an explorable {@code fileSet} dataset configured with the ORC SerDe and input/output formats,
 * inserts two rows through Hive, reads them back, and finally checks that deleting the dataset leaves
 * {@code show tables} empty.
 */
@Test
public void testOrcFileset() throws Exception {
  final DatasetId datasetInstanceId = NAMESPACE_ID.dataset("orcfiles");
  final String tableName = getDatasetHiveName(datasetInstanceId);

  // create an explorable file set that stores its data as ORC, with schema (id int, name string)
  datasetFramework.addInstance(
    "fileSet",
    datasetInstanceId,
    FileSetProperties.builder()
      .setEnableExploreOnCreate(true)
      .setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde")
      .setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")
      .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")
      .setExploreSchema("id int, name string")
      .build());

  // the Hive table for the file set should now be listed
  ColumnDesc tabNameColumn = new ColumnDesc("tab_name", "STRING", 1, "from deserializer");
  QueryResult tableRow = new QueryResult(Lists.<Object>newArrayList(tableName));
  runCommand(NAMESPACE_ID, "show tables", true,
             Lists.newArrayList(tabNameColumn),
             Lists.newArrayList(tableRow));

  // write two rows through Hive; wait for the statement to finish and release its result right away
  String insertStatement =
    String.format("insert into table %s values (1, 'samuel'), (2, 'dwayne')", tableName);
  exploreClient.submit(NAMESPACE_ID, insertStatement).get().close();

  // read the inserted rows back through Hive
  ColumnDesc idColumn = new ColumnDesc(tableName + ".id", "INT", 1, null);
  ColumnDesc nameColumn = new ColumnDesc(tableName + ".name", "STRING", 2, null);
  QueryResult samuelRow = new QueryResult(Lists.<Object>newArrayList(1, "samuel"));
  QueryResult dwayneRow = new QueryResult(Lists.<Object>newArrayList(2, "dwayne"));
  runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName, true,
             Lists.newArrayList(idColumn, nameColumn),
             Lists.newArrayList(samuelRow, dwayneRow));

  // deleting the dataset is expected to remove the Hive table as well
  datasetFramework.deleteInstance(datasetInstanceId);
  runCommand(NAMESPACE_ID, "show tables", false,
             Lists.newArrayList(tabNameColumn),
             Collections.<QueryResult>emptyList());
}
use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testTimePartitionedFileSet.
/**
 * Walks a {@code timePartitionedFileSet} through its Hive lifecycle: creation, adding three hourly
 * partitions backed by Avro files, querying them (with and without an {@code hour} filter), dropping
 * one partition, deleting the dataset, and creating it again under the same name.
 */
@Test
public void testTimePartitionedFileSet() throws Exception {
  final DatasetId datasetInstanceId = NAMESPACE_ID.dataset("parts");
  final String tableName = getDatasetHiveName(datasetInstanceId);

  // create a time partitioned file set that is explorable as an Avro table
  datasetFramework.addInstance(
    "timePartitionedFileSet",
    datasetInstanceId,
    FileSetProperties.builder()
      .setBasePath("somePath")
      .setEnableExploreOnCreate(true)
      .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
      .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
      .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
      .setTableProperty("avro.schema.literal", SCHEMA.toString())
      .build());

  // the Hive table for the file set should now be listed
  ColumnDesc tabNameColumn = new ColumnDesc("tab_name", "STRING", 1, "from deserializer");
  QueryResult tableRow = new QueryResult(Lists.<Object>newArrayList(tableName));
  runCommand(NAMESPACE_ID, "show tables", true,
             Lists.newArrayList(tabNameColumn),
             Lists.newArrayList(tableRow));

  // obtain the dataset instance for the data operations below
  TimePartitionedFileSet partitionedFiles =
    datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
  Assert.assertNotNull(partitionedFiles);
  Assert.assertTrue(partitionedFiles instanceof TransactionAware);

  // Hive expects every partition to be a directory, hence each partition is a directory holding one file
  long oneAm = DATE_FORMAT.parse("12/10/14 1:00 am").getTime();
  long twoAm = DATE_FORMAT.parse("12/10/14 2:00 am").getTime();
  long threeAm = DATE_FORMAT.parse("12/10/14 3:00 am").getTime();
  Location firstFile = partitionedFiles.getEmbeddedFileSet().getLocation("file1/nn");
  Location secondFile = partitionedFiles.getEmbeddedFileSet().getLocation("file2/nn");
  Location thirdFile = partitionedFiles.getEmbeddedFileSet().getLocation("file3/nn");
  FileWriterHelper.generateAvroFile(firstFile.getOutputStream(), "x", 1, 2);
  FileWriterHelper.generateAvroFile(secondFile.getOutputStream(), "y", 2, 3);
  FileWriterHelper.generateAvroFile(thirdFile.getOutputStream(), "x", 3, 4);
  addTimePartition(partitionedFiles, oneAm, "file1");
  addTimePartition(partitionedFiles, twoAm, "file2");
  addTimePartition(partitionedFiles, threeAm, "file3");

  // expectations shared by the checks below
  String showPartitions = "show partitions " + tableName;
  String selectAll = "SELECT key, value FROM " + tableName + " ORDER BY key, value";
  ColumnDesc partitionColumn = new ColumnDesc("partition", "STRING", 1, "from deserializer");
  ColumnDesc keyColumn = new ColumnDesc("key", "STRING", 1, null);
  ColumnDesc valueColumn = new ColumnDesc("value", "STRING", 2, null);
  QueryResult oneAmPartition =
    new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=1/minute=0"));
  QueryResult twoAmPartition =
    new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=2/minute=0"));
  QueryResult threeAmPartition =
    new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=3/minute=0"));
  QueryResult rowX1 = new QueryResult(Lists.<Object>newArrayList("x1", "#1"));
  QueryResult rowX3 = new QueryResult(Lists.<Object>newArrayList("x3", "#3"));
  QueryResult rowY2 = new QueryResult(Lists.<Object>newArrayList("y2", "#2"));

  // all three partitions should be registered in Hive
  runCommand(NAMESPACE_ID, showPartitions, true,
             Lists.newArrayList(partitionColumn),
             Lists.newArrayList(oneAmPartition, twoAmPartition, threeAmPartition));

  // the contents of all partitions should be queryable
  runCommand(NAMESPACE_ID, selectAll, true,
             Lists.newArrayList(keyColumn, valueColumn),
             Lists.newArrayList(rowX1, rowX3, rowY2));

  // filtering on the hour partition column should only return the 2 am data
  runCommand(NAMESPACE_ID, "SELECT key, value FROM " + tableName + " WHERE hour = 2 ORDER BY key, value", true,
             Lists.newArrayList(keyColumn, valueColumn),
             Lists.newArrayList(rowY2));

  // remove the 2 am partition
  dropTimePartition(partitionedFiles, twoAm);

  // its data should no longer show up in query results
  runCommand(NAMESPACE_ID, selectAll, true,
             Lists.newArrayList(keyColumn, valueColumn),
             Lists.newArrayList(rowX1, rowX3));

  // and the partition itself should be gone from Hive
  runCommand(NAMESPACE_ID, showPartitions, true,
             Lists.newArrayList(partitionColumn),
             Lists.newArrayList(oneAmPartition, threeAmPartition));

  // deleting the dataset is expected to remove the Hive table as well
  datasetFramework.deleteInstance(datasetInstanceId);
  runCommand(NAMESPACE_ID, "show tables", false,
             Lists.newArrayList(tabNameColumn),
             Collections.<QueryResult>emptyList());

  // create the dataset again with identical properties
  datasetFramework.addInstance(
    "timePartitionedFileSet",
    datasetInstanceId,
    FileSetProperties.builder()
      .setBasePath("somePath")
      .setEnableExploreOnCreate(true)
      .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
      .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
      .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
      .setTableProperty("avro.schema.literal", SCHEMA.toString())
      .build());

  // the Hive table should be listed once more
  runCommand(NAMESPACE_ID, "show tables", true,
             Lists.newArrayList(tabNameColumn),
             Lists.newArrayList(tableRow));
}
use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreServiceTestRun method testJoin.
/**
 * Creates a second {@code keyStructValueTable} dataset, writes keys "2" and "3" into it inside a
 * transaction, and then runs inner, right outer, left outer and full outer joins between
 * {@code MY_TABLE_NAME} and the new table. The dataset is deleted again in the {@code finally} block.
 */
@Test
public void testJoin() throws Exception {
  DatasetId myTable1 = NAMESPACE_ID.dataset("my_table_1");
  String myTable1Name = getDatasetHiveName(myTable1);

  // admin operation: create the dataset instance used as the other side of the joins
  datasetFramework.addInstance("keyStructValueTable", myTable1, DatasetProperties.EMPTY);
  try {
    Transaction tx1 = transactionManager.startShort(100);

    // obtain the dataset instance and write two entries under the transaction
    KeyStructValueTableDefinition.KeyStructValueTable table =
      datasetFramework.getDataset(myTable1, DatasetDefinition.NO_ARGUMENTS, null);
    Assert.assertNotNull(table);
    table.startTx(tx1);

    KeyValue.Value value1 = new KeyValue.Value("two", Lists.newArrayList(20, 21, 22, 23, 24));
    KeyValue.Value value2 = new KeyValue.Value("third", Lists.newArrayList(30, 31, 32, 33, 34));
    table.put("2", value1);
    table.put("3", value2);
    Assert.assertEquals(value1, table.get("2"));

    // commit: dataset first, then the transaction manager, then the post-commit hook
    Assert.assertTrue(table.commitTx());
    transactionManager.canCommit(tx1.getTransactionId(), table.getTxChanges());
    transactionManager.commit(tx1.getTransactionId(), tx1.getWritePointer());
    table.postTxCommit();

    // column descriptors shared by the join checks
    String structType = "struct<name:string,ints:array<int>>";
    ColumnDesc leftKey = new ColumnDesc(MY_TABLE_NAME + ".key", "STRING", 1, null);
    ColumnDesc leftValue = new ColumnDesc(MY_TABLE_NAME + ".value", structType, 2, null);
    ColumnDesc rightKey = new ColumnDesc(myTable1Name + ".key", "STRING", 3, null);
    ColumnDesc rightValue = new ColumnDesc(myTable1Name + ".value", structType, 4, null);

    // expected values as rendered in the query results
    String leftFirstJson = "{\"name\":\"first\",\"ints\":[1,2,3,4,5]}";
    String leftTwoJson = "{\"name\":\"two\",\"ints\":[10,11,12,13,14]}";
    String rightTwoJson = "{\"name\":\"two\",\"ints\":[20,21,22,23,24]}";
    String rightThirdJson = "{\"name\":\"third\",\"ints\":[30,31,32,33,34]}";

    // expected rows: key "1" only on the left, key "2" on both sides, key "3" only on the right
    QueryResult leftOnlyRow = new QueryResult(Lists.<Object>newArrayList("1", leftFirstJson, null, null));
    QueryResult matchedRow = new QueryResult(Lists.<Object>newArrayList("2", leftTwoJson, "2", rightTwoJson));
    QueryResult rightOnlyRow = new QueryResult(Lists.<Object>newArrayList(null, null, "3", rightThirdJson));

    // inner join, projecting only the left table's columns
    String innerJoin = String.format(
      "select %s.key, %s.value from %s join %s on (%s.key=%s.key)",
      MY_TABLE_NAME, MY_TABLE_NAME, MY_TABLE_NAME, myTable1Name, MY_TABLE_NAME, myTable1Name);
    runCommand(NAMESPACE_ID, innerJoin, true,
               Lists.newArrayList(leftKey, leftValue),
               Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("2", leftTwoJson))));

    // right outer join
    String rightOuterJoin = String.format(
      "select %s.key, %s.value, %s.key, %s.value from %s right outer join %s on (%s.key=%s.key)",
      MY_TABLE_NAME, MY_TABLE_NAME, myTable1Name, myTable1Name,
      MY_TABLE_NAME, myTable1Name, MY_TABLE_NAME, myTable1Name);
    runCommand(NAMESPACE_ID, rightOuterJoin, true,
               Lists.newArrayList(leftKey, leftValue, rightKey, rightValue),
               Lists.newArrayList(matchedRow, rightOnlyRow));

    // left outer join
    String leftOuterJoin = String.format(
      "select %s.key, %s.value, %s.key, %s.value from %s left outer join %s on (%s.key=%s.key)",
      MY_TABLE_NAME, MY_TABLE_NAME, myTable1Name, myTable1Name,
      MY_TABLE_NAME, myTable1Name, MY_TABLE_NAME, myTable1Name);
    runCommand(NAMESPACE_ID, leftOuterJoin, true,
               Lists.newArrayList(leftKey, leftValue, rightKey, rightValue),
               Lists.newArrayList(leftOnlyRow, matchedRow));

    // full outer join
    String fullOuterJoin = String.format(
      "select %s.key, %s.value, %s.key, %s.value from %s full outer join %s on (%s.key=%s.key)",
      MY_TABLE_NAME, MY_TABLE_NAME, myTable1Name, myTable1Name,
      MY_TABLE_NAME, myTable1Name, MY_TABLE_NAME, myTable1Name);
    runCommand(NAMESPACE_ID, fullOuterJoin, true,
               Lists.newArrayList(leftKey, leftValue, rightKey, rightValue),
               Lists.newArrayList(leftOnlyRow, matchedRow, rightOnlyRow));
  } finally {
    datasetFramework.deleteInstance(myTable1);
  }
}
use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreServiceStreamTest method testStreamNameWithHyphen.
/**
 * Creates a stream whose name contains a hyphen, sends a single event to it, and queries the event body
 * through the table name derived from {@code "stream_test"}. The stream is dropped in the {@code finally}
 * block.
 */
@Test
public void testStreamNameWithHyphen() throws Exception {
  StreamId streamId = NAMESPACE_ID.stream("stream-test");
  grantAndAssertSuccess(streamId, USER, EnumSet.allOf(Action.class));
  createStream(streamId);
  try {
    sendStreamEvent(streamId, Collections.<String, String>emptyMap(), Bytes.toBytes("Dummy"));

    // Streams with '-' are replaced with '_', so the table is looked up under the underscore name
    String selectBody = "select body from " + getTableName("stream_test");
    ColumnDesc bodyColumn = new ColumnDesc("body", "STRING", 1, null);
    QueryResult dummyEvent = new QueryResult(Lists.<Object>newArrayList("Dummy"));
    runCommand(NAMESPACE_ID, selectBody, true,
               Lists.newArrayList(bodyColumn),
               Lists.newArrayList(dummyEvent));
  } finally {
    dropStream(streamId);
  }
}
Aggregations