use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedFileSet.
private void testPartitionedFileSet(@Nullable String dbName, @Nullable String tableName) throws Exception {
DatasetId datasetInstanceId = NAMESPACE_ID.dataset("parted");
String hiveTableName = getDatasetHiveName(datasetInstanceId);
String showTablesCommand = "show tables";
FileSetProperties.Builder props = PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addStringField("str").addIntField("num").build()).setBasePath("parted").setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe").setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat").setTableProperty("avro.schema.literal", SCHEMA.toString());
if (tableName != null) {
props.setExploreTableName(tableName);
hiveTableName = tableName;
}
String queryTableName = hiveTableName;
if (dbName != null) {
props.setExploreDatabaseName(dbName);
runCommand(NAMESPACE_ID, "create database " + dbName, false, null, null);
showTablesCommand += " in " + dbName;
queryTableName = dbName + "." + queryTableName;
}
// create a time partitioned file set
datasetFramework.addInstance("partitionedFileSet", datasetInstanceId, props.build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, showTablesCommand, true, null, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(hiveTableName))));
// Accessing dataset instance to perform data operations
final PartitionedFileSet partitioned = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(partitioned);
FileSet fileSet = partitioned.getEmbeddedFileSet();
// add some partitions. Beware that Hive expects a partition to be a directory, so we create dirs with one file
Location locationX1 = fileSet.getLocation("fileX1/nn");
Location locationY1 = fileSet.getLocation("fileY1/nn");
Location locationX2 = fileSet.getLocation("fileX2/nn");
Location locationY2 = fileSet.getLocation("fileY2/nn");
FileWriterHelper.generateAvroFile(locationX1.getOutputStream(), "x", 1, 2);
FileWriterHelper.generateAvroFile(locationY1.getOutputStream(), "y", 1, 2);
FileWriterHelper.generateAvroFile(locationX2.getOutputStream(), "x", 2, 3);
FileWriterHelper.generateAvroFile(locationY2.getOutputStream(), "y", 2, 3);
final PartitionKey keyX1 = PartitionKey.builder().addStringField("str", "x").addIntField("num", 1).build();
PartitionKey keyY1 = PartitionKey.builder().addStringField("str", "y").addIntField("num", 1).build();
final PartitionKey keyX2 = PartitionKey.builder().addStringField("str", "x").addIntField("num", 2).build();
PartitionKey keyY2 = PartitionKey.builder().addStringField("str", "y").addIntField("num", 2).build();
addPartition(partitioned, keyX1, "fileX1");
addPartition(partitioned, keyY1, "fileY1");
addPartition(partitioned, keyX2, "fileX2");
addPartition(partitioned, keyY2, "fileY2");
// verify that the partitions were added to Hive
validatePartitions(queryTableName, partitioned, ImmutableList.of(keyX1, keyX2, keyY1, keyY2));
// verify that count() and where... work in Hive
runCommand(NAMESPACE_ID, "SELECT count(*) AS count FROM " + queryTableName, true, Lists.newArrayList(new ColumnDesc("count", "BIGINT", 1, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(4L))));
runCommand(NAMESPACE_ID, "SELECT * FROM " + queryTableName + " WHERE num = 2 ORDER BY key, value", true, Lists.newArrayList(new ColumnDesc(hiveTableName + ".key", "STRING", 1, null), new ColumnDesc(hiveTableName + ".value", "STRING", 2, null), new ColumnDesc(hiveTableName + ".str", "STRING", 3, null), new ColumnDesc(hiveTableName + ".num", "INT", 4, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x2", "#2", "x", 2)), new QueryResult(Lists.<Object>newArrayList("y2", "#2", "y", 2))));
// drop a partition and query again
dropPartition(partitioned, keyX2);
validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1, keyY2));
// attempt a transaction that drops one partition, adds another, and then fails
try {
doTransaction(partitioned, new Runnable() {
@Override
public void run() {
partitioned.dropPartition(keyX1);
partitioned.addPartition(keyX2, "fileX2");
Assert.fail("fail tx");
}
});
} catch (TransactionFailureException e) {
// expected
}
// validate that both the drop and addPartition were undone
validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1, keyY2));
// attempt a transaction that attempts to add an existing partition, hence fails
try {
doTransaction(partitioned, new Runnable() {
@Override
public void run() {
partitioned.addPartition(keyX1, "fileX1");
throw new RuntimeException("on purpose");
}
});
} catch (TransactionFailureException e) {
// expected if the cause is not "on purpose"
Assert.assertTrue(e.getCause() instanceof DataSetException);
}
// validate that both the drop and addPartition were undone
validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1, keyY2));
// drop a partition directly from hive
runCommand(NAMESPACE_ID, "ALTER TABLE " + queryTableName + " DROP PARTITION (str='y', num=2)", false, null, null);
// verify that one more value is gone now, namely y2, in Hive, but the PFS still has it
validatePartitionsInHive(queryTableName, ImmutableSet.of(keyX1, keyY1));
validatePartitionsInPFS(partitioned, ImmutableSet.of(keyX1, keyY1, keyY2));
// make sure the partition can still be dropped from the PFS dataset
dropPartition(partitioned, keyY2);
validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1));
// change the explore schema by updating the props
datasetFramework.updateInstance(datasetInstanceId, props.setTableProperty("avro.schema.literal", K_SCHEMA.toString()).build());
// valudate the schema was updated
validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1), true);
// disable explore by updating the props
datasetFramework.updateInstance(datasetInstanceId, props.setEnableExploreOnCreate(false).build());
// verify the Hive table is gone
runCommand(NAMESPACE_ID, showTablesCommand, false, null, Collections.<QueryResult>emptyList());
// re-enable explore by updating the props
datasetFramework.updateInstance(datasetInstanceId, props.setEnableExploreOnCreate(true).build());
// verify the Hive table is back
runCommand(NAMESPACE_ID, showTablesCommand, true, null, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(hiveTableName))));
// drop the dataset
datasetFramework.deleteInstance(datasetInstanceId);
// verify the Hive table is gone
runCommand(NAMESPACE_ID, "show tables", false, null, Collections.<QueryResult>emptyList());
}
use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedExisting.
private void testPartitionedExisting(String reuseProperty, boolean possessed) throws Exception {
final DatasetId dummyInstanceId = NAMESPACE_ID.dataset("dummy");
final DatasetId datasetInstanceId = NAMESPACE_ID.dataset("tpExisting");
File path = new File(tmpFolder.newFolder(), "base");
String tableName = "reuse";
// create a PFS in order to create a table in Hive and add a partition
// create a time partitioned file set
DatasetProperties props = PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setBasePath(path.toString()).setEnableExploreOnCreate(true).setExploreTableName(tableName).setExploreSchema("key STRING, value INT").setExploreFormat("csv").build();
datasetFramework.addInstance(PartitionedFileSet.class.getName(), dummyInstanceId, props);
PartitionedFileSet dummy = datasetFramework.getDataset(dummyInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(dummy);
Location location = dummy.getEmbeddedFileSet().getLocation("number1").append("file1");
PartitionKey key = PartitionKey.builder().addIntField("number", 1).build();
FileWriterHelper.generateTextFile(location.getOutputStream(), ",", "x", 1, 2);
addPartition(dummy, key, "number1");
// validate data
List<ColumnDesc> expectedColumns = Lists.newArrayList(new ColumnDesc(tableName + ".key", "STRING", 1, null), new ColumnDesc(tableName + ".value", "INT", 2, null), new ColumnDesc(tableName + ".number", "INT", 3, null));
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName, true, expectedColumns, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
props = PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setBasePath(path.toString()).setEnableExploreOnCreate(true).setExploreTableName(tableName).setExploreSchema("key STRING, value INT").setExploreFormat("csv").add(reuseProperty, "true").build();
datasetFramework.addInstance(PartitionedFileSet.class.getName(), datasetInstanceId, props);
PartitionedFileSet partitioned = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(partitioned);
props = PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setBasePath(path.toString()).setEnableExploreOnCreate(true).setExploreTableName(tableName).setExploreSchema("k STRING, v INT").setExploreFormat("csv").add(reuseProperty, "true").build();
datasetFramework.updateInstance(datasetInstanceId, props);
// validate data
if (!possessed) {
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName, true, expectedColumns, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
} else {
List<ColumnDesc> newExpectedColumns = Lists.newArrayList(new ColumnDesc(tableName + ".k", "STRING", 1, null), new ColumnDesc(tableName + ".v", "INT", 2, null), new ColumnDesc(tableName + ".number", "INT", 3, null));
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName, true, newExpectedColumns, null);
}
datasetFramework.deleteInstance(datasetInstanceId);
if (!possessed) {
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName, true, expectedColumns, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
} else {
runCommand(NAMESPACE_ID, "SHOW tables", false, null, Collections.<QueryResult>emptyList());
}
datasetFramework.deleteInstance(dummyInstanceId);
}
use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedTextFile.
// this tests mainly the support for different text formats. Other features (partitioning etc.) are tested above.
private void testPartitionedTextFile(String name, String format, String delim, String fileDelim) throws Exception {
final DatasetId datasetInstanceId = NAMESPACE_ID.dataset(name);
final String tableName = getDatasetHiveName(datasetInstanceId);
// create a time partitioned file set
PartitionedFileSetProperties.Builder builder = (PartitionedFileSetProperties.Builder) PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setBasePath(name).setEnableExploreOnCreate(true).setExploreSchema("key STRING, value INT").setExploreFormat(format);
if (delim != null) {
builder.setExploreFormatProperty("delimiter", delim);
}
datasetFramework.addInstance("partitionedFileSet", datasetInstanceId, builder.build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
// Accessing dataset instance to perform data operations
PartitionedFileSet partitioned = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(partitioned);
FileSet fileSet = partitioned.getEmbeddedFileSet();
// add a partitions. Beware that Hive expects a partition to be a directory, so we create a dir with one file
Location location1 = fileSet.getLocation("file1/nn");
FileWriterHelper.generateTextFile(location1.getOutputStream(), fileDelim, "x", 1, 2);
PartitionKey key1 = PartitionKey.builder().addIntField("number", 1).build();
addPartition(partitioned, key1, "file1");
// verify that the partitions were added to Hive
runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("number=1"))));
// verify that we can query the key-values in the file with Hive
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " ORDER BY key", true, Lists.newArrayList(new ColumnDesc(tableName + ".key", "STRING", 1, null), new ColumnDesc(tableName + ".value", "INT", 2, null), new ColumnDesc(tableName + ".number", "INT", 3, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
// drop a partition and query again
dropPartition(partitioned, key1);
// drop the dataset
datasetFramework.deleteInstance(datasetInstanceId);
// verify the Hive table is gone
runCommand(NAMESPACE_ID, "show tables", false, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Collections.<QueryResult>emptyList());
}
use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class ExploreMetadataTestRun method testGetColumns.
@Test
public void testGetColumns() throws Exception {
ArrayList<ColumnDesc> expectedColumnDescs = Lists.newArrayList(new ColumnDesc("TABLE_CAT", "STRING", 1, "Catalog name. NULL if not applicable"), new ColumnDesc("TABLE_SCHEM", "STRING", 2, "Schema name"), new ColumnDesc("TABLE_NAME", "STRING", 3, "Table name"), new ColumnDesc("COLUMN_NAME", "STRING", 4, "Column name"), new ColumnDesc("DATA_TYPE", "INT", 5, "SQL type from java.sql.Types"), new ColumnDesc("TYPE_NAME", "STRING", 6, "Data source dependent type name, " + "for a UDT the type name is fully qualified"), new ColumnDesc("COLUMN_SIZE", "INT", 7, "Column size. For char or date types" + " this is the maximum number of characters, for numeric or decimal" + " types this is precision."), new ColumnDesc("BUFFER_LENGTH", "TINYINT", 8, "Unused"), new ColumnDesc("DECIMAL_DIGITS", "INT", 9, "The number of fractional digits"), new ColumnDesc("NUM_PREC_RADIX", "INT", 10, "Radix (typically either 10 or 2)"), new ColumnDesc("NULLABLE", "INT", 11, "Is NULL allowed"), new ColumnDesc("REMARKS", "STRING", 12, "Comment describing column (may be null)"), new ColumnDesc("COLUMN_DEF", "STRING", 13, "Default value (may be null)"), new ColumnDesc("SQL_DATA_TYPE", "INT", 14, "Unused"), new ColumnDesc("SQL_DATETIME_SUB", "INT", 15, "Unused"), new ColumnDesc("CHAR_OCTET_LENGTH", "INT", 16, "For char types the maximum number of bytes in the column"), new ColumnDesc("ORDINAL_POSITION", "INT", 17, "Index of column in table (starting at 1)"), new ColumnDesc("IS_NULLABLE", "STRING", 18, "\"NO\" means column definitely does not " + "allow NULL values; \"YES\" means the column might allow NULL values. " + "An empty string means nobody knows."), new ColumnDesc("SCOPE_CATALOG", "STRING", 19, "Catalog of table that is the scope " + "of a reference attribute (null if DATA_TYPE isn't REF)"), new ColumnDesc("SCOPE_SCHEMA", "STRING", 20, "Schema of table that is the scope of a " + "reference attribute (null if the DATA_TYPE isn't REF)"), new ColumnDesc("SCOPE_TABLE", "STRING", 21, "Table name that this the scope " + "of a reference attribure (null if the DATA_TYPE isn't REF)"), new ColumnDesc("SOURCE_DATA_TYPE", "SMALLINT", 22, "Source type of a distinct type " + "or user-generated Ref type, SQL type from java.sql.Types " + "(null if DATA_TYPE isn't DISTINCT or user-generated REF)"), new ColumnDesc("IS_AUTO_INCREMENT", "STRING", 23, "Indicates whether this column is auto incremented."));
// Get all columns
ListenableFuture<ExploreExecutionResult> future = getExploreClient().columns(null, null, "%", "%");
List<QueryResult> expectedColumns = Lists.newArrayList(getExpectedColumns(NAMESPACE_DATABASE));
expectedColumns.addAll(getExpectedColumns(OTHER_NAMESPACE_DATABASE));
assertStatementResult(future, true, expectedColumnDescs, expectedColumns);
// Get all columns in a namespace
future = getExploreClient().columns(null, OTHER_NAMESPACE_ID.getNamespace(), "%", "%");
assertStatementResult(future, true, expectedColumnDescs, getExpectedColumns(OTHER_NAMESPACE_DATABASE));
}
use of co.cask.cdap.proto.ColumnDesc in project cdap by caskdata.
the class ExploreResultSetTest method testResultSet.
@Test
public void testResultSet() throws Exception {
ExploreClient exploreClient = new MockExploreClient(ImmutableMap.of("mock_query", (List<ColumnDesc>) Lists.newArrayList(new ColumnDesc("column1", "STRING", 1, ""), new ColumnDesc("column2", "int", 2, ""), new ColumnDesc("column3", "char", 3, ""), new ColumnDesc("column4", "float", 4, ""), new ColumnDesc("column5", "double", 5, ""), new ColumnDesc("column6", "boolean", 6, ""), new ColumnDesc("column7", "tinyint", 7, ""), new ColumnDesc("column8", "smallint", 8, ""), new ColumnDesc("column9", "bigint", 9, ""), new ColumnDesc("column10", "date", 10, ""), new ColumnDesc("column11", "timestamp", 11, ""), new ColumnDesc("column12", "decimal", 12, ""), new ColumnDesc("column14", "map<string,string>", 13, ""), new ColumnDesc("column15", "array<string>", 14, ""), new ColumnDesc("column16", "struct<name:string,attr:string>", 15, ""))), ImmutableMap.of("mock_query", (List<QueryResult>) Lists.newArrayList(new QueryResult(ImmutableList.<Object>of("value1", 1, "c", 0.1f, 0.2d, true, 0x1, (short) 2, (long) 10, "2014-06-20", "2014-06-20 07:37:00", "1000000000", "\"{\"key1\":\"value1\"}", "[\"a\",\"b\",\"c\"]", "{\"name\":\"first\",\"attr\":\"second\"}")))));
ResultSet resultSet = new ExploreResultSet(exploreClient.submit(new NamespaceId(ns), "mock_query").get(), new ExploreStatement(null, exploreClient, ns), 0);
Assert.assertTrue(resultSet.next());
Assert.assertEquals(resultSet.getObject(1), resultSet.getObject("column1"));
Assert.assertEquals("value1", resultSet.getString(1));
Assert.assertEquals(1, resultSet.getInt(2));
Assert.assertEquals("c", resultSet.getString(3));
Assert.assertEquals(0.1f, resultSet.getFloat(4), 0.01);
Assert.assertEquals(0.2d, resultSet.getDouble(5), 0.01);
Assert.assertEquals(true, resultSet.getBoolean(6));
Assert.assertEquals(0x1, resultSet.getByte(7));
Assert.assertEquals(2, resultSet.getShort(8));
Assert.assertEquals(10, resultSet.getLong(9));
Assert.assertEquals(Date.valueOf("2014-06-20"), resultSet.getDate(10));
Assert.assertEquals(Timestamp.valueOf("2014-06-20 07:37:00"), resultSet.getTimestamp(11));
Assert.assertEquals(new BigDecimal("1000000000"), resultSet.getBigDecimal(12));
Assert.assertEquals("\"{\"key1\":\"value1\"}", resultSet.getString(13));
Assert.assertEquals("[\"a\",\"b\",\"c\"]", resultSet.getString(14));
Assert.assertEquals("{\"name\":\"first\",\"attr\":\"second\"}", resultSet.getString(15));
Assert.assertFalse(resultSet.next());
Assert.assertFalse(resultSet.next());
try {
resultSet.getObject(1);
} catch (SQLException e) {
// Expected: no more rows
}
}
Aggregations