
Example 26 with ColumnDesc

Use of io.cdap.cdap.proto.ColumnDesc in project cdap by caskdata.

The class BaseHiveExploreService, method getStatus.

@Override
public QueryStatus getStatus(QueryHandle handle) throws ExploreException, HandleNotFoundException, SQLException {
    startAndWait();
    InactiveOperationInfo inactiveOperationInfo = inactiveHandleCache.getIfPresent(handle);
    if (inactiveOperationInfo != null) {
        // Operation has been made inactive, so return the saved status.
        LOG.trace("Returning saved status for inactive handle {}", handle);
        return inactiveOperationInfo.getStatus();
    }
    try {
        // Fetch status from Hive
        QueryStatus status = fetchStatus(getActiveOperationInfo(handle));
        LOG.trace("Status of handle {} is {}", handle, status);
        // Query finished with no results, or ended in error: either way the handle can be timed out aggressively
        if (status.getStatus() == QueryStatus.OpStatus.FINISHED && !status.hasResults()) {
            // In case of a query that writes to a Dataset, we will always fall into this condition,
            // and timing out aggressively will also close the transaction and make the writes visible
            timeoutAggressively(handle, getResultSchema(handle), status);
        } else if (status.getStatus() == QueryStatus.OpStatus.ERROR) {
            // getResultSchema will fail if the query is in error
            timeoutAggressively(handle, ImmutableList.<ColumnDesc>of(), status);
        }
        return status;
    } catch (HiveSQLException e) {
        throw getSqlException(e);
    }
}
Also used: HiveSQLException (org.apache.hive.service.cli.HiveSQLException), ColumnDesc (io.cdap.cdap.proto.ColumnDesc), QueryStatus (io.cdap.cdap.proto.QueryStatus)
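
For context, a caller typically polls getStatus until the query reaches a terminal state. The sketch below is not taken from the CDAP sources: the exploreService reference, the helper name waitForCompletion, and the 200 ms poll interval are illustrative assumptions; only the getStatus(QueryHandle) signature and the FINISHED and ERROR states appear in the example above.

import java.util.concurrent.TimeUnit;
import io.cdap.cdap.proto.QueryHandle;
import io.cdap.cdap.proto.QueryStatus;

// Hypothetical polling helper (sketch only). Assumes a BaseHiveExploreService reference
// named exploreService; the poll interval is an arbitrary choice.
private QueryStatus waitForCompletion(QueryHandle handle) throws Exception {
    QueryStatus status = exploreService.getStatus(handle);
    while (status.getStatus() != QueryStatus.OpStatus.FINISHED
        && status.getStatus() != QueryStatus.OpStatus.ERROR) {
        TimeUnit.MILLISECONDS.sleep(200);
        status = exploreService.getStatus(handle);
    }
    return status;
}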

Example 27 with ColumnDesc

Use of io.cdap.cdap.proto.ColumnDesc in project cdap by caskdata.

The class HiveExploreServiceFileSetTestRun, method testPartitionedTextFileUpdate.

@Test
public void testPartitionedTextFileUpdate() throws Exception {
    final DatasetId datasetId = NAMESPACE_ID.dataset("txtupd");
    final String tableName = getDatasetHiveName(datasetId);
    // create a partitioned file set with a single int partition field "number"
    datasetFramework.addInstance(PartitionedFileSet.class.getName(), datasetId, PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setEnableExploreOnCreate(true).setExploreSchema("key STRING, value STRING").setExploreFormat("csv").build());
    // verify that the hive table was created for this file set
    runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
    // Accessing dataset instance to perform data operations
    PartitionedFileSet partitioned = datasetFramework.getDataset(datasetId, DatasetDefinition.NO_ARGUMENTS, null);
    Assert.assertNotNull(partitioned);
    FileSet fileSet = partitioned.getEmbeddedFileSet();
    // add a partition. Note that Hive expects a partition to be a directory, so we create a dir with one file
    Location location1 = fileSet.getLocation("file1/nn");
    FileWriterHelper.generateMultiDelimitersFile(location1.getOutputStream(), ImmutableList.of(",", "\1", ":"), 1, 2);
    addPartition(partitioned, PartitionKey.builder().addIntField("number", 1).build(), "file1");
    // verify that the partitions were added to Hive
    runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("number=1"))));
    // verify that we can query the key-values in the file with Hive.
    List<ColumnDesc> expectedColumns = Lists.newArrayList(new ColumnDesc(tableName + ".key", "STRING", 1, null), new ColumnDesc(tableName + ".value", "STRING", 2, null), new ColumnDesc(tableName + ".number", "INT", 3, null));
    runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=1", true, expectedColumns, Lists.newArrayList(// text line has the form 1,x\1x:1, format is csv -> key=1 value=x\1x:1
    new QueryResult(Lists.<Object>newArrayList("1", "x\1x:1", 1))));
    // update the dataset properties with a different format (text, default delimiter)
    datasetFramework.updateInstance(datasetId, PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setEnableExploreOnCreate(true).setExploreSchema("key STRING, value STRING").setExploreFormat("text").build());
    // add another partition
    Location location2 = fileSet.getLocation("file2/nn");
    FileWriterHelper.generateMultiDelimitersFile(location2.getOutputStream(), ImmutableList.of(",", "\1", ":"), 2, 3);
    addPartition(partitioned, PartitionKey.builder().addIntField("number", 2).build(), "file2");
    // new partition should have new format, validate with query
    runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=2", true, expectedColumns, Lists.newArrayList(// text line has the form 2,x\1x:2, format is text -> key=2,x value=x:2
    new QueryResult(Lists.<Object>newArrayList("2,x", "x:2", 2))));
    // update the dataset properties with a different delimiter
    datasetFramework.updateInstance(datasetId, PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setEnableExploreOnCreate(true).setExploreSchema("key STRING, value STRING").setExploreFormat("text").setExploreFormatProperty("delimiter", ":").build());
    // add another partition
    Location location3 = fileSet.getLocation("file3/nn");
    FileWriterHelper.generateMultiDelimitersFile(location3.getOutputStream(), ImmutableList.of(",", "\1", ":"), 3, 4);
    addPartition(partitioned, PartitionKey.builder().addIntField("number", 3).build(), "file3");
    // new partition should have new format, validate with query
    runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=3", true, expectedColumns, Lists.newArrayList(// text line has the form 2,x\1x:2, format is text -> key=3,x\1x value=3
    new QueryResult(Lists.<Object>newArrayList("3,x\1x", "3", 3))));
    // update the dataset properties with a different format (avro)
    datasetFramework.updateInstance(datasetId, PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe").setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat").setTableProperty("avro.schema.literal", SCHEMA.toString()).build());
    // add another partition
    Location location4 = fileSet.getLocation("file4/nn");
    FileWriterHelper.generateAvroFile(location4.getOutputStream(), "x", 4, 5);
    addPartition(partitioned, PartitionKey.builder().addIntField("number", 4).build(), "file4");
    // new partition should have new format, validate with query
    runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=4", true, expectedColumns, Lists.newArrayList(// avro file has key=x4, value=#4
    new QueryResult(Lists.<Object>newArrayList("x4", "#4", 4))));
}
Also used: QueryResult (io.cdap.cdap.proto.QueryResult), PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet), FileSet (io.cdap.cdap.api.dataset.lib.FileSet), TimePartitionedFileSet (io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet), ColumnDesc (io.cdap.cdap.proto.ColumnDesc), DatasetId (io.cdap.cdap.proto.id.DatasetId), Location (org.apache.twill.filesystem.Location), Test (org.junit.Test)
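
To make the expected rows above easier to follow: every generated text line contains all three delimiters, and the explore format decides which one separates key from value. The snippet below is purely illustrative plain Java, not Hive or CDAP code; it simply reproduces the splits implied by the expected QueryResults.

// Illustrative only: reproducing the key/value splits expected in the test above.
String line1 = "1,x\u0001x:1";              // generated for number=1 ('\u0001' is the ^A character written as \1 in the comments)
String line2 = "2,x\u0001x:2";              // generated for number=2
String line3 = "3,x\u0001x:3";              // generated for number=3
String[] kv1 = line1.split(",", 2);         // format "csv":                  key="1",          value="x\u0001x:1"
String[] kv2 = line2.split("\u0001", 2);    // format "text", default delim:  key="2,x",        value="x:2"
String[] kv3 = line3.split(":", 2);         // format "text", delimiter ":":  key="3,x\u0001x", value="3"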

Example 28 with ColumnDesc

Use of io.cdap.cdap.proto.ColumnDesc in project cdap by caskdata.

The class HiveExploreServiceFileSetTestRun, method testOrcFileset.

@Test
public void testOrcFileset() throws Exception {
    final DatasetId datasetInstanceId = NAMESPACE_ID.dataset("orcfiles");
    final String tableName = getDatasetHiveName(datasetInstanceId);
    // create a file set configured for Explore with the ORC SerDe and input/output formats
    datasetFramework.addInstance("fileSet", datasetInstanceId, FileSetProperties.builder().setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde").setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat").setExploreSchema("id int, name string").build());
    // verify that the hive table was created for this file set
    runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
    // insert data into the table
    ExploreExecutionResult result = exploreClient.submit(NAMESPACE_ID, String.format("insert into table %s values (1, 'samuel'), (2, 'dwayne')", tableName)).get();
    result.close();
    // verify that we can query the key-values in the file with Hive
    runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName, true, Lists.newArrayList(new ColumnDesc(tableName + ".id", "INT", 1, null), new ColumnDesc(tableName + ".name", "STRING", 2, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(1, "samuel")), new QueryResult(Lists.<Object>newArrayList(2, "dwayne"))));
    // drop the dataset
    datasetFramework.deleteInstance(datasetInstanceId);
    // verify the Hive table is gone
    runCommand(NAMESPACE_ID, "show tables", false, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Collections.<QueryResult>emptyList());
}
Also used: QueryResult (io.cdap.cdap.proto.QueryResult), ColumnDesc (io.cdap.cdap.proto.ColumnDesc), ExploreExecutionResult (io.cdap.cdap.explore.client.ExploreExecutionResult), DatasetId (io.cdap.cdap.proto.id.DatasetId), Test (org.junit.Test)
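
The insert statement above already goes through exploreClient.submit, and the same client can read results back. The sketch below is an assumption-laden illustration rather than code from the test: it assumes the ExploreExecutionResult returned by submit(...).get() iterates QueryResult rows, exposes its schema via getResultSchema(), is closeable, and that QueryResult exposes its values via getColumns(); NAMESPACE_ID, exploreClient, and tableName reuse identifiers from the test, and the fragment would sit inside a method declared to throw Exception.

import java.util.List;
import io.cdap.cdap.explore.client.ExploreExecutionResult;
import io.cdap.cdap.proto.ColumnDesc;
import io.cdap.cdap.proto.QueryResult;

// Sketch only: read the ORC-backed table back through the explore client.
// Assumes ExploreExecutionResult is an Iterator<QueryResult> with getResultSchema() and close().
try (ExploreExecutionResult result =
         exploreClient.submit(NAMESPACE_ID, "SELECT * FROM " + tableName).get()) {
    List<ColumnDesc> schema = result.getResultSchema();    // expected: <table>.id INT, <table>.name STRING
    while (result.hasNext()) {
        QueryResult row = result.next();
        System.out.println(row.getColumns());               // expected rows: [1, samuel] and [2, dwayne]
    }
}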

Example 29 with ColumnDesc

Use of io.cdap.cdap.proto.ColumnDesc in project cdap by caskdata.

The class HiveExploreServiceFileSetTestRun, method validatePartitionsInHive.

private void validatePartitionsInHive(String tableName, final Collection<PartitionKey> expected, boolean keyOnly) throws Exception {
    // validate the partitions
    runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), partitionKeys2PartitionResults(expected));
    // and validate the contents of the partitions
    runCommand(NAMESPACE_ID, "SELECT key" + (keyOnly ? "" : ",value") + " FROM " + tableName + " ORDER BY key" + (keyOnly ? "" : ",value"), true, keyOnly ? Lists.newArrayList(new ColumnDesc("key", "STRING", 1, null)) : Lists.newArrayList(new ColumnDesc("key", "STRING", 1, null), new ColumnDesc("value", "STRING", 2, null)), partitionKeys2QueryResults(expected, keyOnly));
}
Also used: ColumnDesc (io.cdap.cdap.proto.ColumnDesc)
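
A hypothetical invocation of this helper, assuming a table partitioned by a single int field "number" as in the update test above; the expected partition keys are made up for illustration.

import java.util.Collection;
import com.google.common.collect.ImmutableList;
import io.cdap.cdap.api.dataset.lib.PartitionKey;

// Hypothetical usage (illustrative only): assert that exactly partitions number=1 and number=2
// exist in Hive and that their key/value contents match the expected query results.
Collection<PartitionKey> expected = ImmutableList.of(
    PartitionKey.builder().addIntField("number", 1).build(),
    PartitionKey.builder().addIntField("number", 2).build());
validatePartitionsInHive(tableName, expected, false);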

Example 30 with ColumnDesc

Use of io.cdap.cdap.proto.ColumnDesc in project cdap by caskdata.

The class HiveExploreServiceTestRun, method testJoin.

@Test
public void testJoin() throws Exception {
    DatasetId myTable1 = NAMESPACE_ID.dataset("my_table_1");
    String myTable1Name = getDatasetHiveName(myTable1);
    // Performing admin operations to create dataset instance
    datasetFramework.addInstance("keyStructValueTable", myTable1, DatasetProperties.EMPTY);
    try {
        Transaction tx1 = transactionManager.startShort(100);
        // Accessing dataset instance to perform data operations
        KeyStructValueTableDefinition.KeyStructValueTable table = datasetFramework.getDataset(myTable1, DatasetDefinition.NO_ARGUMENTS, null);
        Assert.assertNotNull(table);
        table.startTx(tx1);
        KeyValue.Value value1 = new KeyValue.Value("two", Lists.newArrayList(20, 21, 22, 23, 24));
        KeyValue.Value value2 = new KeyValue.Value("third", Lists.newArrayList(30, 31, 32, 33, 34));
        table.put("2", value1);
        table.put("3", value2);
        Assert.assertEquals(value1, table.get("2"));
        Assert.assertTrue(table.commitTx());
        transactionManager.canCommit(tx1.getTransactionId(), table.getTxChanges());
        transactionManager.commit(tx1.getTransactionId(), tx1.getWritePointer());
        table.postTxCommit();
        String query = String.format("select %s.key, %s.value from %s join %s on (%s.key=%s.key)", MY_TABLE_NAME, MY_TABLE_NAME, MY_TABLE_NAME, myTable1Name, MY_TABLE_NAME, myTable1Name);
        runCommand(NAMESPACE_ID, query, true, Lists.newArrayList(new ColumnDesc(MY_TABLE_NAME + ".key", "STRING", 1, null), new ColumnDesc(MY_TABLE_NAME + ".value", "struct<name:string,ints:array<int>>", 2, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("2", "{\"name\":\"two\",\"ints\":[10,11,12,13,14]}"))));
        query = String.format("select %s.key, %s.value, %s.key, %s.value " + "from %s right outer join %s on (%s.key=%s.key)", MY_TABLE_NAME, MY_TABLE_NAME, myTable1Name, myTable1Name, MY_TABLE_NAME, myTable1Name, MY_TABLE_NAME, myTable1Name);
        runCommand(NAMESPACE_ID, query, true, Lists.newArrayList(new ColumnDesc(MY_TABLE_NAME + ".key", "STRING", 1, null), new ColumnDesc(MY_TABLE_NAME + ".value", "struct<name:string,ints:array<int>>", 2, null), new ColumnDesc(myTable1Name + ".key", "STRING", 3, null), new ColumnDesc(myTable1Name + ".value", "struct<name:string,ints:array<int>>", 4, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("2", "{\"name\":\"two\",\"ints\":[10,11,12,13,14]}", "2", "{\"name\":\"two\",\"ints\":[20,21,22,23,24]}")), new QueryResult(Lists.<Object>newArrayList(null, null, "3", "{\"name\":\"third\",\"ints\":[30,31,32,33,34]}"))));
        query = String.format("select %s.key, %s.value, %s.key, %s.value from %s " + "left outer join %s on (%s.key=%s.key)", MY_TABLE_NAME, MY_TABLE_NAME, myTable1Name, myTable1Name, MY_TABLE_NAME, myTable1Name, MY_TABLE_NAME, myTable1Name);
        runCommand(NAMESPACE_ID, query, true, Lists.newArrayList(new ColumnDesc(MY_TABLE_NAME + ".key", "STRING", 1, null), new ColumnDesc(MY_TABLE_NAME + ".value", "struct<name:string,ints:array<int>>", 2, null), new ColumnDesc(myTable1Name + ".key", "STRING", 3, null), new ColumnDesc(myTable1Name + ".value", "struct<name:string,ints:array<int>>", 4, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("1", "{\"name\":\"first\",\"ints\":[1,2,3,4,5]}", null, null)), new QueryResult(Lists.<Object>newArrayList("2", "{\"name\":\"two\",\"ints\":[10,11,12,13,14]}", "2", "{\"name\":\"two\",\"ints\":[20,21,22,23,24]}"))));
        query = String.format("select %s.key, %s.value, %s.key, %s.value from %s " + "full outer join %s on (%s.key=%s.key)", MY_TABLE_NAME, MY_TABLE_NAME, myTable1Name, myTable1Name, MY_TABLE_NAME, myTable1Name, MY_TABLE_NAME, myTable1Name);
        runCommand(NAMESPACE_ID, query, true, Lists.newArrayList(new ColumnDesc(MY_TABLE_NAME + ".key", "STRING", 1, null), new ColumnDesc(MY_TABLE_NAME + ".value", "struct<name:string,ints:array<int>>", 2, null), new ColumnDesc(myTable1Name + ".key", "STRING", 3, null), new ColumnDesc(myTable1Name + ".value", "struct<name:string,ints:array<int>>", 4, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("1", "{\"name\":\"first\",\"ints\":[1,2,3,4,5]}", null, null)), new QueryResult(Lists.<Object>newArrayList("2", "{\"name\":\"two\",\"ints\":[10,11,12,13,14]}", "2", "{\"name\":\"two\",\"ints\":[20,21,22,23,24]}")), new QueryResult(Lists.<Object>newArrayList(null, null, "3", "{\"name\":\"third\",\"ints\":[30,31,32,33,34]}"))));
    } finally {
        datasetFramework.deleteInstance(myTable1);
    }
}
Also used: QueryResult (io.cdap.cdap.proto.QueryResult), KeyValue (io.cdap.cdap.explore.service.datasets.KeyStructValueTableDefinition.KeyValue), Transaction (org.apache.tephra.Transaction), KeyStructValueTableDefinition (io.cdap.cdap.explore.service.datasets.KeyStructValueTableDefinition), ColumnDesc (io.cdap.cdap.proto.ColumnDesc), DatasetId (io.cdap.cdap.proto.id.DatasetId), Test (org.junit.Test)

Aggregations

ColumnDesc (io.cdap.cdap.proto.ColumnDesc): 36
QueryResult (io.cdap.cdap.proto.QueryResult): 23
Test (org.junit.Test): 21
DatasetId (io.cdap.cdap.proto.id.DatasetId): 16
ExploreExecutionResult (io.cdap.cdap.explore.client.ExploreExecutionResult): 12
TimePartitionedFileSet (io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet): 9
Location (org.apache.twill.filesystem.Location): 8
PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet): 7
SQLException (java.sql.SQLException): 7
FileSet (io.cdap.cdap.api.dataset.lib.FileSet): 6
Schema (io.cdap.cdap.api.data.schema.Schema): 4
Table (io.cdap.cdap.api.dataset.table.Table): 4
ExploreClient (io.cdap.cdap.explore.client.ExploreClient): 4
MockExploreClient (io.cdap.cdap.explore.client.MockExploreClient): 4
QueryStatus (io.cdap.cdap.proto.QueryStatus): 4
ResultSet (java.sql.ResultSet): 4
ImmutableList (com.google.common.collect.ImmutableList): 3
PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey): 3
PartitionedFileSetProperties (io.cdap.cdap.api.dataset.lib.PartitionedFileSetProperties): 3
QueryHandle (io.cdap.cdap.proto.QueryHandle): 3