use of io.cdap.cdap.proto.ColumnDesc in project cdap by caskdata.
the class ExploreMetadataTestRun method testGetColumns.
@Test
public void testGetColumns() throws Exception {
ArrayList<ColumnDesc> expectedColumnDescs = Lists.newArrayList(new ColumnDesc("TABLE_CAT", "STRING", 1, "Catalog name. NULL if not applicable"), new ColumnDesc("TABLE_SCHEM", "STRING", 2, "Schema name"), new ColumnDesc("TABLE_NAME", "STRING", 3, "Table name"), new ColumnDesc("COLUMN_NAME", "STRING", 4, "Column name"), new ColumnDesc("DATA_TYPE", "INT", 5, "SQL type from java.sql.Types"), new ColumnDesc("TYPE_NAME", "STRING", 6, "Data source dependent type name, " + "for a UDT the type name is fully qualified"), new ColumnDesc("COLUMN_SIZE", "INT", 7, "Column size. For char or date types" + " this is the maximum number of characters, for numeric or decimal" + " types this is precision."), new ColumnDesc("BUFFER_LENGTH", "TINYINT", 8, "Unused"), new ColumnDesc("DECIMAL_DIGITS", "INT", 9, "The number of fractional digits"), new ColumnDesc("NUM_PREC_RADIX", "INT", 10, "Radix (typically either 10 or 2)"), new ColumnDesc("NULLABLE", "INT", 11, "Is NULL allowed"), new ColumnDesc("REMARKS", "STRING", 12, "Comment describing column (may be null)"), new ColumnDesc("COLUMN_DEF", "STRING", 13, "Default value (may be null)"), new ColumnDesc("SQL_DATA_TYPE", "INT", 14, "Unused"), new ColumnDesc("SQL_DATETIME_SUB", "INT", 15, "Unused"), new ColumnDesc("CHAR_OCTET_LENGTH", "INT", 16, "For char types the maximum number of bytes in the column"), new ColumnDesc("ORDINAL_POSITION", "INT", 17, "Index of column in table (starting at 1)"), new ColumnDesc("IS_NULLABLE", "STRING", 18, "\"NO\" means column definitely does not " + "allow NULL values; \"YES\" means the column might allow NULL values. " + "An empty string means nobody knows."), new ColumnDesc("SCOPE_CATALOG", "STRING", 19, "Catalog of table that is the scope " + "of a reference attribute (null if DATA_TYPE isn't REF)"), new ColumnDesc("SCOPE_SCHEMA", "STRING", 20, "Schema of table that is the scope of a " + "reference attribute (null if the DATA_TYPE isn't REF)"), new ColumnDesc("SCOPE_TABLE", "STRING", 21, "Table name that this the scope " + "of a reference attribure (null if the DATA_TYPE isn't REF)"), new ColumnDesc("SOURCE_DATA_TYPE", "SMALLINT", 22, "Source type of a distinct type " + "or user-generated Ref type, SQL type from java.sql.Types " + "(null if DATA_TYPE isn't DISTINCT or user-generated REF)"), new ColumnDesc("IS_AUTO_INCREMENT", "STRING", 23, "Indicates whether this column is auto incremented."));
// Get all columns
ListenableFuture<ExploreExecutionResult> future = getExploreClient().columns(null, null, "%", "%");
List<QueryResult> expectedColumns = Lists.newArrayList(getExpectedColumns(NAMESPACE_DATABASE));
expectedColumns.addAll(getExpectedColumns(OTHER_NAMESPACE_DATABASE));
assertStatementResult(future, true, expectedColumnDescs, expectedColumns);
// Get all columns in a namespace
future = getExploreClient().columns(null, OTHER_NAMESPACE_ID.getNamespace(), "%", "%");
assertStatementResult(future, true, expectedColumnDescs, getExpectedColumns(OTHER_NAMESPACE_DATABASE));
}
use of io.cdap.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testTimePartitionedFileSet.
@Test
public void testTimePartitionedFileSet() throws Exception {
final DatasetId datasetInstanceId = NAMESPACE_ID.dataset("parts");
final String tableName = getDatasetHiveName(datasetInstanceId);
// create a time partitioned file set
datasetFramework.addInstance("timePartitionedFileSet", datasetInstanceId, FileSetProperties.builder().setBasePath("somePath").setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe").setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat").setTableProperty("avro.schema.literal", SCHEMA.toString()).build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
// Accessing dataset instance to perform data operations
TimePartitionedFileSet tpfs = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(tpfs);
Assert.assertTrue(tpfs instanceof TransactionAware);
// add some partitions. Beware that Hive expects a partition to be a directory, so we create dirs with one file
long time1 = DATE_FORMAT.parse("12/10/14 1:00 am").getTime();
long time2 = DATE_FORMAT.parse("12/10/14 2:00 am").getTime();
long time3 = DATE_FORMAT.parse("12/10/14 3:00 am").getTime();
Location location1 = tpfs.getEmbeddedFileSet().getLocation("file1/nn");
Location location2 = tpfs.getEmbeddedFileSet().getLocation("file2/nn");
Location location3 = tpfs.getEmbeddedFileSet().getLocation("file3/nn");
FileWriterHelper.generateAvroFile(location1.getOutputStream(), "x", 1, 2);
FileWriterHelper.generateAvroFile(location2.getOutputStream(), "y", 2, 3);
FileWriterHelper.generateAvroFile(location3.getOutputStream(), "x", 3, 4);
addTimePartition(tpfs, time1, "file1");
addTimePartition(tpfs, time2, "file2");
addTimePartition(tpfs, time3, "file3");
// verify that the partitions were added to Hive
runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=1/minute=0")), new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=2/minute=0")), new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=3/minute=0"))));
// verify that we can query the key-values in the file with Hive
runCommand(NAMESPACE_ID, "SELECT key, value FROM " + tableName + " ORDER BY key, value", true, Lists.newArrayList(new ColumnDesc("key", "STRING", 1, null), new ColumnDesc("value", "STRING", 2, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", "#1")), new QueryResult(Lists.<Object>newArrayList("x3", "#3")), new QueryResult(Lists.<Object>newArrayList("y2", "#2"))));
// verify that we can query the key-values in the file with Hive
runCommand(NAMESPACE_ID, "SELECT key, value FROM " + tableName + " WHERE hour = 2 ORDER BY key, value", true, Lists.newArrayList(new ColumnDesc("key", "STRING", 1, null), new ColumnDesc("value", "STRING", 2, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("y2", "#2"))));
// remove a partition
dropTimePartition(tpfs, time2);
// verify that we can query the key-values in the file with Hive
runCommand(NAMESPACE_ID, "SELECT key, value FROM " + tableName + " ORDER BY key, value", true, Lists.newArrayList(new ColumnDesc("key", "STRING", 1, null), new ColumnDesc("value", "STRING", 2, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", "#1")), new QueryResult(Lists.<Object>newArrayList("x3", "#3"))));
// verify the partition was removed from Hive
runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=1/minute=0")), new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=3/minute=0"))));
// drop the dataset
datasetFramework.deleteInstance(datasetInstanceId);
// verify the Hive table is gone
runCommand(NAMESPACE_ID, "show tables", false, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Collections.<QueryResult>emptyList());
datasetFramework.addInstance("timePartitionedFileSet", datasetInstanceId, FileSetProperties.builder().setBasePath("somePath").setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe").setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat").setTableProperty("avro.schema.literal", SCHEMA.toString()).build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
}
use of io.cdap.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedTextFile.
// this tests mainly the support for different text formats. Other features (partitioning etc.) are tested above.
private void testPartitionedTextFile(String name, String format, String delim, String fileDelim) throws Exception {
final DatasetId datasetInstanceId = NAMESPACE_ID.dataset(name);
final String tableName = getDatasetHiveName(datasetInstanceId);
// create a time partitioned file set
PartitionedFileSetProperties.Builder builder = (PartitionedFileSetProperties.Builder) PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setBasePath(name).setEnableExploreOnCreate(true).setExploreSchema("key STRING, value INT").setExploreFormat(format);
if (delim != null) {
builder.setExploreFormatProperty("delimiter", delim);
}
datasetFramework.addInstance("partitionedFileSet", datasetInstanceId, builder.build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
// Accessing dataset instance to perform data operations
PartitionedFileSet partitioned = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(partitioned);
FileSet fileSet = partitioned.getEmbeddedFileSet();
// add a partitions. Beware that Hive expects a partition to be a directory, so we create a dir with one file
Location location1 = fileSet.getLocation("file1/nn");
FileWriterHelper.generateTextFile(location1.getOutputStream(), fileDelim, "x", 1, 2);
PartitionKey key1 = PartitionKey.builder().addIntField("number", 1).build();
addPartition(partitioned, key1, "file1");
// verify that the partitions were added to Hive
runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("number=1"))));
// verify that we can query the key-values in the file with Hive
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " ORDER BY key", true, Lists.newArrayList(new ColumnDesc(tableName + ".key", "STRING", 1, null), new ColumnDesc(tableName + ".value", "INT", 2, null), new ColumnDesc(tableName + ".number", "INT", 3, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
// drop a partition and query again
dropPartition(partitioned, key1);
// drop the dataset
datasetFramework.deleteInstance(datasetInstanceId);
// verify the Hive table is gone
runCommand(NAMESPACE_ID, "show tables", false, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Collections.<QueryResult>emptyList());
}
use of io.cdap.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedExisting.
private void testPartitionedExisting(String reuseProperty, boolean possessed) throws Exception {
final DatasetId dummyInstanceId = NAMESPACE_ID.dataset("dummy");
final DatasetId datasetInstanceId = NAMESPACE_ID.dataset("tpExisting");
File path = new File(tmpFolder.newFolder(), "base");
String tableName = "reuse";
// create a PFS in order to create a table in Hive and add a partition
// create a time partitioned file set
DatasetProperties props = PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setBasePath(path.toString()).setEnableExploreOnCreate(true).setExploreTableName(tableName).setExploreSchema("key STRING, value INT").setExploreFormat("csv").build();
datasetFramework.addInstance(PartitionedFileSet.class.getName(), dummyInstanceId, props);
PartitionedFileSet dummy = datasetFramework.getDataset(dummyInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(dummy);
Location location = dummy.getEmbeddedFileSet().getLocation("number1").append("file1");
PartitionKey key = PartitionKey.builder().addIntField("number", 1).build();
FileWriterHelper.generateTextFile(location.getOutputStream(), ",", "x", 1, 2);
addPartition(dummy, key, "number1");
// validate data
List<ColumnDesc> expectedColumns = Lists.newArrayList(new ColumnDesc(tableName + ".key", "STRING", 1, null), new ColumnDesc(tableName + ".value", "INT", 2, null), new ColumnDesc(tableName + ".number", "INT", 3, null));
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName, true, expectedColumns, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
props = PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setBasePath(path.toString()).setEnableExploreOnCreate(true).setExploreTableName(tableName).setExploreSchema("key STRING, value INT").setExploreFormat("csv").add(reuseProperty, "true").build();
datasetFramework.addInstance(PartitionedFileSet.class.getName(), datasetInstanceId, props);
PartitionedFileSet partitioned = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(partitioned);
props = PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setBasePath(path.toString()).setEnableExploreOnCreate(true).setExploreTableName(tableName).setExploreSchema("k STRING, v INT").setExploreFormat("csv").add(reuseProperty, "true").build();
datasetFramework.updateInstance(datasetInstanceId, props);
// validate data
if (!possessed) {
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName, true, expectedColumns, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
} else {
List<ColumnDesc> newExpectedColumns = Lists.newArrayList(new ColumnDesc(tableName + ".k", "STRING", 1, null), new ColumnDesc(tableName + ".v", "INT", 2, null), new ColumnDesc(tableName + ".number", "INT", 3, null));
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName, true, newExpectedColumns, null);
}
datasetFramework.deleteInstance(datasetInstanceId);
if (!possessed) {
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName, true, expectedColumns, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
} else {
runCommand(NAMESPACE_ID, "SHOW tables", false, null, Collections.<QueryResult>emptyList());
}
datasetFramework.deleteInstance(dummyInstanceId);
}
use of io.cdap.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testCreateAddAlterDrop.
private void testCreateAddAlterDrop(@Nullable String dbName, @Nullable String tableName) throws Exception {
DatasetId datasetInstanceId = NAMESPACE_ID.dataset("files");
String hiveTableName = getDatasetHiveName(datasetInstanceId);
String showTablesCommand = "show tables";
FileSetProperties.Builder props = FileSetProperties.builder().setBasePath("myPath").setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe").setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat").setTableProperty("avro.schema.literal", SCHEMA.toString());
if (tableName != null) {
props.setExploreTableName(tableName);
hiveTableName = tableName;
}
String queryTableName = hiveTableName;
if (dbName != null) {
props.setExploreDatabaseName(dbName);
runCommand(NAMESPACE_ID, "create database " + dbName, false, null, null);
showTablesCommand += " in " + dbName;
queryTableName = dbName + "." + queryTableName;
}
// create a time partitioned file set
datasetFramework.addInstance("fileSet", datasetInstanceId, props.build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, showTablesCommand, true, null, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(hiveTableName))));
// Accessing dataset instance to perform data operations
FileSet fileSet = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(fileSet);
// add a file
FileWriterHelper.generateAvroFile(fileSet.getLocation("file1").getOutputStream(), "a", 0, 3);
// verify that we can query the key-values in the file with Hive
runCommand(NAMESPACE_ID, "SELECT * FROM " + queryTableName, true, Lists.newArrayList(new ColumnDesc(hiveTableName + ".key", "STRING", 1, null), new ColumnDesc(hiveTableName + ".value", "STRING", 2, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("a0", "#0")), new QueryResult(Lists.<Object>newArrayList("a1", "#1")), new QueryResult(Lists.<Object>newArrayList("a2", "#2"))));
// add another file
FileWriterHelper.generateAvroFile(fileSet.getLocation("file2").getOutputStream(), "b", 3, 5);
// verify that we can query the key-values in the file with Hive
runCommand(NAMESPACE_ID, "SELECT count(*) AS count FROM " + queryTableName, true, Lists.newArrayList(new ColumnDesc("count", "BIGINT", 1, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(5L))));
// disable explore by updating the props
datasetFramework.updateInstance(datasetInstanceId, props.setEnableExploreOnCreate(false).build());
// verify the Hive table is gone
runCommand(NAMESPACE_ID, showTablesCommand, false, null, Collections.<QueryResult>emptyList());
// re-enable explore by updating the props
datasetFramework.updateInstance(datasetInstanceId, props.setEnableExploreOnCreate(true).build());
// verify that we can query again
runCommand(NAMESPACE_ID, "SELECT count(*) AS count FROM " + queryTableName, true, Lists.newArrayList(new ColumnDesc("count", "BIGINT", 1, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(5L))));
// change the explore schema by updating the props
datasetFramework.updateInstance(datasetInstanceId, props.setTableProperty("avro.schema.literal", K_SCHEMA.toString()).build());
// verify that we can query the key-values in the file with Hive
runCommand(NAMESPACE_ID, "SELECT * FROM " + queryTableName + " ORDER BY key", true, Lists.newArrayList(new ColumnDesc(hiveTableName + ".key", "STRING", 1, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("a0")), new QueryResult(Lists.<Object>newArrayList("a1")), new QueryResult(Lists.<Object>newArrayList("a2")), new QueryResult(Lists.<Object>newArrayList("b3")), new QueryResult(Lists.<Object>newArrayList("b4"))));
// drop the dataset
datasetFramework.deleteInstance(datasetInstanceId);
// verify the Hive table is gone
runCommand(NAMESPACE_ID, showTablesCommand, false, null, Collections.<QueryResult>emptyList());
// drop the database if needed
if (dbName != null) {
runCommand(NAMESPACE_ID, "drop database " + dbName, false, null, null);
}
}
Aggregations