Search in sources :

Example 1 with TimePartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class HiveExploreServiceFileSetTestRun method testTimePartitionedFileSet.

/**
 * Exercises Hive exploration of a time-partitioned file set: the Hive table appears when the
 * dataset is created, partitions added to and dropped from the dataset show up in Hive,
 * the Avro contents are queryable, and the table disappears and reappears when the dataset is
 * deleted and created again.
 */
@Test
public void testTimePartitionedFileSet() throws Exception {
    final DatasetId partsId = NAMESPACE_ID.dataset("parts");
    final String hiveTable = getDatasetHiveName(partsId);
    // statements that are issued more than once below
    final String showPartitions = "show partitions " + hiveTable;
    final String selectAllOrdered = "SELECT key, value FROM " + hiveTable + " ORDER BY key, value";

    // create an explorable, Avro-backed time partitioned file set
    datasetFramework.addInstance(
        "timePartitionedFileSet",
        partsId,
        FileSetProperties.builder()
            .setBasePath("somePath")
            .setEnableExploreOnCreate(true)
            .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
            .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
            .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
            .setTableProperty("avro.schema.literal", SCHEMA.toString())
            .build());

    // the Hive table must exist now
    runCommand(
        NAMESPACE_ID,
        "show tables",
        true,
        Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")),
        Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(hiveTable))));

    // obtain the dataset instance for the data operations
    TimePartitionedFileSet partitioned =
        datasetFramework.getDataset(partsId, DatasetDefinition.NO_ARGUMENTS, null);
    Assert.assertNotNull(partitioned);
    Assert.assertTrue(partitioned instanceof TransactionAware);

    // Hive expects every partition to be a directory, hence one directory per partition,
    // each holding a single Avro file
    long oneAm = DATE_FORMAT.parse("12/10/14 1:00 am").getTime();
    long twoAm = DATE_FORMAT.parse("12/10/14 2:00 am").getTime();
    long threeAm = DATE_FORMAT.parse("12/10/14 3:00 am").getTime();
    Location firstFile = partitioned.getEmbeddedFileSet().getLocation("file1/nn");
    Location secondFile = partitioned.getEmbeddedFileSet().getLocation("file2/nn");
    Location thirdFile = partitioned.getEmbeddedFileSet().getLocation("file3/nn");
    FileWriterHelper.generateAvroFile(firstFile.getOutputStream(), "x", 1, 2);
    FileWriterHelper.generateAvroFile(secondFile.getOutputStream(), "y", 2, 3);
    FileWriterHelper.generateAvroFile(thirdFile.getOutputStream(), "x", 3, 4);
    addTimePartition(partitioned, oneAm, "file1");
    addTimePartition(partitioned, twoAm, "file2");
    addTimePartition(partitioned, threeAm, "file3");

    // all three partitions must be registered in Hive
    runCommand(
        NAMESPACE_ID,
        showPartitions,
        true,
        Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")),
        Lists.newArrayList(
            new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=1/minute=0")),
            new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=2/minute=0")),
            new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=3/minute=0"))));

    // the key-values of all three files can be queried
    runCommand(
        NAMESPACE_ID,
        selectAllOrdered,
        true,
        Lists.newArrayList(
            new ColumnDesc("key", "STRING", 1, null),
            new ColumnDesc("value", "STRING", 2, null)),
        Lists.newArrayList(
            new QueryResult(Lists.<Object>newArrayList("x1", "#1")),
            new QueryResult(Lists.<Object>newArrayList("x3", "#3")),
            new QueryResult(Lists.<Object>newArrayList("y2", "#2"))));

    // filtering on a partition column only returns the rows of that partition
    runCommand(
        NAMESPACE_ID,
        "SELECT key, value FROM " + hiveTable + " WHERE hour = 2 ORDER BY key, value",
        true,
        Lists.newArrayList(
            new ColumnDesc("key", "STRING", 1, null),
            new ColumnDesc("value", "STRING", 2, null)),
        Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("y2", "#2"))));

    // drop the 2:00 am partition
    dropTimePartition(partitioned, twoAm);

    // its rows are no longer returned by a full query
    runCommand(
        NAMESPACE_ID,
        selectAllOrdered,
        true,
        Lists.newArrayList(
            new ColumnDesc("key", "STRING", 1, null),
            new ColumnDesc("value", "STRING", 2, null)),
        Lists.newArrayList(
            new QueryResult(Lists.<Object>newArrayList("x1", "#1")),
            new QueryResult(Lists.<Object>newArrayList("x3", "#3"))));

    // and the partition itself is gone from Hive
    runCommand(
        NAMESPACE_ID,
        showPartitions,
        true,
        Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")),
        Lists.newArrayList(
            new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=1/minute=0")),
            new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=3/minute=0"))));

    // delete the dataset
    datasetFramework.deleteInstance(partsId);

    // the Hive table must be gone as well
    runCommand(
        NAMESPACE_ID,
        "show tables",
        false,
        Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")),
        Collections.<QueryResult>emptyList());

    // create the dataset again with the same name and properties
    datasetFramework.addInstance(
        "timePartitionedFileSet",
        partsId,
        FileSetProperties.builder()
            .setBasePath("somePath")
            .setEnableExploreOnCreate(true)
            .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
            .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
            .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
            .setTableProperty("avro.schema.literal", SCHEMA.toString())
            .build());

    // the Hive table must have been created again
    runCommand(
        NAMESPACE_ID,
        "show tables",
        true,
        Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")),
        Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(hiveTable))));
}
Also used : QueryResult(io.cdap.cdap.proto.QueryResult) TransactionAware(org.apache.tephra.TransactionAware) ColumnDesc(io.cdap.cdap.proto.ColumnDesc) TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet) DatasetId(io.cdap.cdap.proto.id.DatasetId) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)

Example 2 with TimePartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class HiveExploreServiceFileSetTestRun method testTPFSWithDateTimestamp.

/**
 * Verifies that DATE and TIMESTAMP columns of an Avro-backed time-partitioned file set can be
 * queried through Hive, and that the Hive table is removed and re-created together with the
 * dataset.
 *
 * <p>The test runs with the JVM default time zone set to UTC. The previous default is restored
 * when the test finishes, whether it passes or fails, so that the change does not leak into
 * other tests running in the same JVM.
 */
@Test
public void testTPFSWithDateTimestamp() throws Exception {
    // TimeZone.setDefault() is JVM-wide: remember the current default so it can be put back
    final TimeZone originalTimeZone = TimeZone.getDefault();
    // NOTE(review): presumably UTC is needed so that the expected timestamp string below is
    // rendered independently of the machine's zone - confirm against generateAvroFile
    TimeZone.setDefault(TimeZone.getTimeZone("UTC"));
    try {
        final DatasetId datasetInstanceId = NAMESPACE_ID.dataset("dtfs");
        final String tableName = getDatasetHiveName(datasetInstanceId);
        // record schema with a DATE and a nullable TIMESTAMP_MILLIS logical type
        final Schema dtSchema = Schema.recordOf("dt", Schema.Field.of("id", Schema.of(Schema.Type.INT)), Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("dt", Schema.of(Schema.LogicalType.DATE)), Schema.Field.of("ts", Schema.nullableOf(Schema.of(Schema.LogicalType.TIMESTAMP_MILLIS))));
        // create a time partitioned file set
        datasetFramework.addInstance("timePartitionedFileSet", datasetInstanceId, FileSetProperties.builder().setBasePath("somePath").setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe").setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat").setTableProperty("avro.schema.literal", dtSchema.toString()).build());
        // verify that the hive table was created for this file set
        runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.newArrayList(tableName))));
        // Accessing dataset instance to perform data operations
        TimePartitionedFileSet tpfs = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
        Assert.assertNotNull(tpfs);
        // Hive expects a partition to be a directory, so we create a dir with one file
        Location location1 = tpfs.getEmbeddedFileSet().getLocation("file1/nn");
        generateAvroFile(location1.getOutputStream(), dtSchema);
        // register that directory as the partition for 1:00 am
        long time1 = DATE_FORMAT.parse("12/10/14 1:00 am").getTime();
        addTimePartition(tpfs, time1, "file1");
        // verify that we can query the date and timestamp in the file with Hive
        runCommand(NAMESPACE_ID, "SELECT id, name, dt, ts FROM " + tableName + " LIMIT 50", true, Lists.newArrayList(new ColumnDesc("id", "INT", 1, null), new ColumnDesc("name", "STRING", 2, null), new ColumnDesc("dt", "DATE", 3, null), new ColumnDesc("ts", "TIMESTAMP", 4, null)), Lists.newArrayList(new QueryResult(Lists.newArrayList(1, "alice", "1970-01-01", "2018-09-07 16:09:50.595"))));
        // drop the dataset
        datasetFramework.deleteInstance(datasetInstanceId);
        // verify the Hive table is gone
        runCommand(NAMESPACE_ID, "show tables", false, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Collections.emptyList());
        // create the file set again with the same name and properties
        datasetFramework.addInstance("timePartitionedFileSet", datasetInstanceId, FileSetProperties.builder().setBasePath("somePath").setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe").setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat").setTableProperty("avro.schema.literal", dtSchema.toString()).build());
        // verify that the hive table was created for this file set
        runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.newArrayList(tableName))));
    } finally {
        // undo the global change even if an assertion above failed
        TimeZone.setDefault(originalTimeZone);
    }
}
Also used : QueryResult(io.cdap.cdap.proto.QueryResult) Schema(io.cdap.cdap.api.data.schema.Schema) ColumnDesc(io.cdap.cdap.proto.ColumnDesc) TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet) DatasetId(io.cdap.cdap.proto.id.DatasetId) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)

Example 3 with TimePartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class TimePartitionedFileSetTest method testAddGetPartitions.

/**
 * Verifies adding, looking up, listing by time range, and dropping partitions of a
 * time-partitioned file set, all within a single transaction. Also checks that adding a
 * partition for an existing time fails with a {@link DataSetException}, while dropping a
 * partition that does not exist is silently accepted.
 */
@Test
public void testAddGetPartitions() throws Exception {
    final TimePartitionedFileSet fileSet = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
    TransactionAware txAwareDataset = (TransactionAware) fileSet;
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // this is an arbitrary date to use as the test time
            long time = DATE_FORMAT.parse("12/10/14 5:10 am").getTime();
            long time2 = time + HOUR;
            String firstPath = "first/partition";
            String secondPath = "second/partition";
            // make sure the file set has no partitions initially
            validateTimePartition(fileSet, time, null);
            validateTimePartitions(fileSet, 0L, MAX, Collections.<Long, String>emptyMap());
            // add a partition, verify getPartition() works
            fileSet.addPartition(time, firstPath);
            validateTimePartition(fileSet, time, firstPath);
            // expected listings (partition time -> relative path) used by the range checks below
            Map<Long, String> expectNone = Collections.emptyMap();
            Map<Long, String> expectFirst = ImmutableMap.of(time, firstPath);
            Map<Long, String> expectSecond = ImmutableMap.of(time2, secondPath);
            Map<Long, String> expectBoth = ImmutableMap.of(time, firstPath, time2, secondPath);
            // verify various ways to list partitions with various ranges; the expectations
            // treat the start of a range as inclusive and the end as exclusive
            validateTimePartitions(fileSet, time + MINUTE, MAX, expectNone);
            validateTimePartitions(fileSet, 0L, time, expectNone);
            validateTimePartitions(fileSet, 0L, MAX, expectFirst);
            validateTimePartitions(fileSet, 0L, time + MINUTE, expectFirst);
            validateTimePartitions(fileSet, 0L, time + HOUR, expectFirst);
            validateTimePartitions(fileSet, time - HOUR, time + HOUR, expectFirst);
            // add and verify another partition
            fileSet.addPartition(time2, secondPath);
            validateTimePartition(fileSet, time2, secondPath);
            // verify various ways to list partitions with various ranges
            validateTimePartitions(fileSet, 0L, MAX, expectBoth);
            validateTimePartitions(fileSet, time, time + 30 * MINUTE, expectFirst);
            validateTimePartitions(fileSet, time + 30 * MINUTE, time2, expectNone);
            validateTimePartitions(fileSet, time + 30 * MINUTE, time2 + 30 * MINUTE, expectSecond);
            validateTimePartitions(fileSet, time - 30 * MINUTE, time2 + 30 * MINUTE, expectBoth);
            // try to add another partition with the same key
            try {
                fileSet.addPartition(time2, "third/partition");
                Assert.fail("Should have thrown Exception for duplicate partition");
            } catch (DataSetException ignored) {
                // expected: a partition for time2 already exists
            }
            // remove first partition and validate
            fileSet.dropPartition(time);
            validateTimePartition(fileSet, time, null);
            // verify various ways to list partitions with various ranges
            validateTimePartitions(fileSet, 0L, MAX, expectSecond);
            validateTimePartitions(fileSet, time, time + 30 * MINUTE, expectNone);
            validateTimePartitions(fileSet, time + 30 * MINUTE, time2, expectNone);
            validateTimePartitions(fileSet, time + 30 * MINUTE, time2 + 30 * MINUTE, expectSecond);
            validateTimePartitions(fileSet, time - 30 * MINUTE, time2 + 30 * MINUTE, expectSecond);
            // dropping the partition that was already removed must not throw
            try {
                fileSet.dropPartition(time);
            } catch (DataSetException e) {
                Assert.fail("Should not have thrown Exception for removing non-existent partition");
            }
        }
    });
}
Also used : DataSetException(io.cdap.cdap.api.dataset.DataSetException) TransactionAware(org.apache.tephra.TransactionAware) TransactionExecutor(org.apache.tephra.TransactionExecutor) TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) TransactionFailureException(org.apache.tephra.TransactionFailureException) UnauthorizedException(io.cdap.cdap.security.spi.authorization.UnauthorizedException) DataSetException(io.cdap.cdap.api.dataset.DataSetException) DatasetManagementException(io.cdap.cdap.api.dataset.DatasetManagementException) IOException(java.io.IOException) Test(org.junit.Test)

Example 4 with TimePartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class TimePartitionedFileSetTest method testOutputPartitionPath.

/**
 * Tests that the output file path is set correctly, based on the output partition time.
 *
 * <p>Four cases are covered: only the output partition time is given; both time and partition
 * key are given (the time is expected to win); only the partition key is given; and neither is
 * given, which must make {@code getOutputFormatConfiguration()} fail with a
 * {@link DataSetException}.
 */
@Test
public void testOutputPartitionPath() throws Exception {
    // test specifying output time
    Date date = DATE_FORMAT.parse("1/1/15 8:42 pm");
    Map<String, String> args = Maps.newHashMap();
    TimePartitionedFileSetArguments.setOutputPartitionTime(args, date.getTime());
    // format the output path in the JVM's current time zone
    // NOTE(review): the expected "20_42" assumes DATE_FORMAT parses in that same zone - confirm
    TimeZone timeZone = Calendar.getInstance().getTimeZone();
    TimePartitionedFileSetArguments.setOutputPathFormat(args, "yyyy-MM-dd/HH_mm", timeZone.getID());
    TimePartitionedFileSet ds = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args);
    String outputPath = ds.getEmbeddedFileSet().getOutputLocation().toURI().getPath();
    Assert.assertTrue(outputPath.endsWith("2015-01-01/20_42"));
    Map<String, String> outputConfig = ds.getOutputFormatConfiguration();
    Assert.assertTrue(outputConfig.get(FileOutputFormat.OUTDIR).endsWith("2015-01-01/20_42"));
    // test specifying output time and partition key -> time should prevail
    PartitionKey key = PartitionKey.builder().addIntField("year", 2014).addIntField("month", 1).addIntField("day", 1).addIntField("hour", 20).addIntField("minute", 54).build();
    // The key has to be in the arguments before the dataset is instantiated. Whether
    // getInstance() keeps a live reference to the map is not visible here, so adding the key
    // afterwards could leave this instance with exactly the same arguments as the first case.
    TimePartitionedFileSetArguments.setOutputPartitionKey(args, key);
    TimePartitionedFileSet ds1 = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args);
    outputConfig = ds1.getOutputFormatConfiguration();
    Assert.assertTrue(outputConfig.get(FileOutputFormat.OUTDIR).endsWith("2015-01-01/20_42"));
    // test specifying only the partition key -> the path is derived from the key (minute 54)
    args.clear();
    TimePartitionedFileSetArguments.setOutputPartitionKey(args, key);
    TimePartitionedFileSet ds2 = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args);
    outputConfig = ds2.getOutputFormatConfiguration();
    Assert.assertTrue(outputConfig.get(FileOutputFormat.OUTDIR).endsWith("54"));
    // test specifying neither output time nor partition key -> must fail
    args.clear();
    TimePartitionedFileSet ds3 = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args);
    try {
        ds3.getOutputFormatConfiguration();
        Assert.fail("getOutputFormatConfiguration should have failed with neither output time nor partition key");
    } catch (DataSetException ignored) {
        // expected: there is nothing to derive an output location from
    }
}
Also used : TimeZone(java.util.TimeZone) DataSetException(io.cdap.cdap.api.dataset.DataSetException) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey) TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet) Date(java.util.Date) Test(org.junit.Test)

Example 5 with TimePartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class TimePartitionedFileSetTest method testInputConfigurationFailure.

/**
 * Instantiates the time-partitioned file set with the given runtime arguments and asserts,
 * inside a transaction, that {@code getInputFormatConfiguration()} throws an exception.
 *
 * @param arguments runtime arguments to instantiate the dataset with
 * @param why text appended to the assertion message if no exception is thrown
 * @throws Exception if executing the transaction fails
 */
private void testInputConfigurationFailure(Map<String, String> arguments, final String why) throws Exception {
    final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments);
    final TransactionExecutor.Subroutine expectRejection = new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            boolean rejected = false;
            try {
                tpfs.getInputFormatConfiguration();
            } catch (Exception expected) {
                // any exception counts as the expected failure
                rejected = true;
            }
            if (!rejected) {
                Assert.fail("getInputFormatConfiguration should fail " + why);
            }
        }
    };
    dsFrameworkUtil.newInMemoryTransactionExecutor((TransactionAware) tpfs).execute(expectRejection);
}
Also used : TransactionAware(org.apache.tephra.TransactionAware) TransactionExecutor(org.apache.tephra.TransactionExecutor) TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet) TransactionFailureException(org.apache.tephra.TransactionFailureException) UnauthorizedException(io.cdap.cdap.security.spi.authorization.UnauthorizedException) DataSetException(io.cdap.cdap.api.dataset.DataSetException) DatasetManagementException(io.cdap.cdap.api.dataset.DatasetManagementException) IOException(java.io.IOException)

Aggregations

TimePartitionedFileSet (io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet)15 TransactionAware (org.apache.tephra.TransactionAware)9 Test (org.junit.Test)9 DataSetException (io.cdap.cdap.api.dataset.DataSetException)7 TransactionExecutor (org.apache.tephra.TransactionExecutor)7 DatasetManagementException (io.cdap.cdap.api.dataset.DatasetManagementException)6 UnauthorizedException (io.cdap.cdap.security.spi.authorization.UnauthorizedException)6 IOException (java.io.IOException)6 TransactionFailureException (org.apache.tephra.TransactionFailureException)6 ImmutableMap (com.google.common.collect.ImmutableMap)4 Date (java.util.Date)4 Map (java.util.Map)4 Location (org.apache.twill.filesystem.Location)3 TimePartitionDetail (io.cdap.cdap.api.dataset.lib.TimePartitionDetail)2 ColumnDesc (io.cdap.cdap.proto.ColumnDesc)2 QueryResult (io.cdap.cdap.proto.QueryResult)2 DatasetId (io.cdap.cdap.proto.id.DatasetId)2 Schema (io.cdap.cdap.api.data.schema.Schema)1 Partition (io.cdap.cdap.api.dataset.lib.Partition)1 PartitionDetail (io.cdap.cdap.api.dataset.lib.PartitionDetail)1