Example 1 with TimePartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

The class TimePartitionedFileSetTest, method testInputConfigurationFailure.

private void testInputConfigurationFailure(Map<String, String> arguments, final String why) throws Exception {
    final TimePartitionedFileSet dataset = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments);
    TransactionAware txAwareDataset = (TransactionAware) dataset;
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            try {
                // with the given (invalid) arguments, computing the input format
                // configuration is expected to fail
                dataset.getInputFormatConfiguration();
                Assert.fail("getInputFormatConfiguration should fail " + why);
            } catch (Exception e) {
                // expected
            }
        }
    });
}
Also used: TransactionAware (org.apache.tephra.TransactionAware), TransactionExecutor (org.apache.tephra.TransactionExecutor), TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet), TransactionFailureException (org.apache.tephra.TransactionFailureException), DatasetManagementException (co.cask.cdap.api.dataset.DatasetManagementException), IOException (java.io.IOException), DataSetException (co.cask.cdap.api.dataset.DataSetException)
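
The argument maps that drive this helper are not part of this result. As a minimal hedged sketch (an assumption, not taken from the project), an incomplete input time range is one way to trigger the failure; TimePartitionedFileSetArguments is the same helper class used in Example 5:

Map<String, String> arguments = Maps.newHashMap();
// set only a start time, leaving the input range incomplete (hypothetical case)
TimePartitionedFileSetArguments.setInputStartTime(arguments, System.currentTimeMillis());
testInputConfigurationFailure(arguments, "when only an input start time is given");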

Example 2 with TimePartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

The class TimePartitionedFileSetTest, method testInputPartitionPaths.

/**
 * Tests that the TPFS sets the file input paths correctly for the input time range.
 */
@Test
public void testInputPartitionPaths() throws Exception {
    // make sure the dataset has no partitions
    final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
    TransactionAware txAwareDataset = (TransactionAware) tpfs;
    TransactionExecutor txnl = dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset);
    txnl.execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            validateTimePartitions(tpfs, 0L, MAX, Collections.<Long, String>emptyMap());
        }
    });
    Date date = DATE_FORMAT.parse("6/4/12 10:00 am");
    final long time = date.getTime();
    txnl.execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            tpfs.addPartition(time, "file");
            tpfs.addPartition(time + 5 * MINUTE, "file5");
            tpfs.addPartition(time + 10 * MINUTE, "file10");
            tpfs.addPartition(time + 12 * MINUTE, "file12");
        }
    });
    validateInputPaths(time, -10, -5);
    validateInputPaths(time, -10, 2, "file");
    validateInputPaths(time, 1, 11, "file5", "file10");
    validateInputPaths(time, 1, 15, "file5", "file10", "file12");
    validateInputPaths(time, 5, 10, "file5");
}
Also used: TransactionAware (org.apache.tephra.TransactionAware), TransactionExecutor (org.apache.tephra.TransactionExecutor), TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet), TransactionFailureException (org.apache.tephra.TransactionFailureException), DatasetManagementException (co.cask.cdap.api.dataset.DatasetManagementException), IOException (java.io.IOException), DataSetException (co.cask.cdap.api.dataset.DataSetException), Date (java.util.Date), Test (org.junit.Test)
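
The validateInputPaths helper and the DATE_FORMAT constant are referenced but not included in this result. A hedged sketch of what they plausibly look like, assuming the start and end arguments are minute offsets relative to time (consistent with the calls above):

// DATE_FORMAT is not shown in this result; any java.text.SimpleDateFormat
// pattern that parses literals like "6/4/12 10:00 am" would do (assumed definition)
private static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat("M/d/yy h:mm a");

// assumed shape of the helper: startMinutes/endMinutes are offsets from `time`
private void validateInputPaths(long time, long startMinutes, long endMinutes,
                                String... expected) throws Exception {
    Map<String, String> arguments = Maps.newHashMap();
    TimePartitionedFileSetArguments.setInputStartTime(arguments, time + startMinutes * MINUTE);
    TimePartitionedFileSetArguments.setInputEndTime(arguments, time + endMinutes * MINUTE);
    TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments);
    // in the real test this runs inside a transaction, as in Example 1;
    // the configuration should select exactly the expected partition paths
    Map<String, String> config = tpfs.getInputFormatConfiguration();
    Assert.assertNotNull(config);
    // ... compare the input paths recorded in `config` against `expected` ...
}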

Example 3 with TimePartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

The class TimePartitionedFileSetTest, method testAddGetPartitions.

@Test
public void testAddGetPartitions() throws Exception {
    final TimePartitionedFileSet fileSet = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
    TransactionAware txAwareDataset = (TransactionAware) fileSet;
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // this is an arbitrary date to use as the test time
            long time = DATE_FORMAT.parse("12/10/14 5:10 am").getTime();
            long time2 = time + HOUR;
            String firstPath = "first/partition";
            String secondPath = "second/partition";
            // make sure the file set has no partitions initially
            validateTimePartition(fileSet, time, null);
            validateTimePartitions(fileSet, 0L, MAX, Collections.<Long, String>emptyMap());
            // add a partition, verify getPartition() works
            fileSet.addPartition(time, firstPath);
            validateTimePartition(fileSet, time, firstPath);
            Map<Long, String> expectNone = Collections.emptyMap();
            Map<Long, String> expectFirst = ImmutableMap.of(time, firstPath);
            Map<Long, String> expectSecond = ImmutableMap.of(time2, secondPath);
            Map<Long, String> expectBoth = ImmutableMap.of(time, firstPath, time2, secondPath);
            // verify various ways to list partitions with various ranges
            validateTimePartitions(fileSet, time + MINUTE, MAX, expectNone);
            validateTimePartitions(fileSet, 0L, time, expectNone);
            validateTimePartitions(fileSet, 0L, MAX, expectFirst);
            validateTimePartitions(fileSet, 0L, time + MINUTE, expectFirst);
            validateTimePartitions(fileSet, 0L, time + HOUR, expectFirst);
            validateTimePartitions(fileSet, time - HOUR, time + HOUR, expectFirst);
            // add and verify another partition
            fileSet.addPartition(time2, secondPath);
            validateTimePartition(fileSet, time2, secondPath);
            // verify various ways to list partitions with various ranges
            validateTimePartitions(fileSet, 0L, MAX, expectBoth);
            validateTimePartitions(fileSet, time, time + 30 * MINUTE, expectFirst);
            validateTimePartitions(fileSet, time + 30 * MINUTE, time2, expectNone);
            validateTimePartitions(fileSet, time + 30 * MINUTE, time2 + 30 * MINUTE, expectSecond);
            validateTimePartitions(fileSet, time - 30 * MINUTE, time2 + 30 * MINUTE, expectBoth);
            // try to add another partition with the same key
            try {
                fileSet.addPartition(time2, "third/partition");
                Assert.fail("Should have thrown Exception for duplicate partition");
            } catch (DataSetException e) {
            //expected
            }
            // remove first partition and validate
            fileSet.dropPartition(time);
            validateTimePartition(fileSet, time, null);
            // verify various ways to list partitions with various ranges
            validateTimePartitions(fileSet, 0L, MAX, expectSecond);
            validateTimePartitions(fileSet, time, time + 30 * MINUTE, expectNone);
            validateTimePartitions(fileSet, time + 30 * MINUTE, time2, expectNone);
            validateTimePartitions(fileSet, time + 30 * MINUTE, time2 + 30 * MINUTE, expectSecond);
            validateTimePartitions(fileSet, time - 30 * MINUTE, time2 + 30 * MINUTE, expectSecond);
            // try to drop the first partition again; removing a non-existent partition should not fail
            try {
                fileSet.dropPartition(time);
            } catch (DataSetException e) {
                Assert.fail("Should not have have thrown Exception for removing non-existent partition");
            }
        }
    });
}
Also used: DataSetException (co.cask.cdap.api.dataset.DataSetException), TransactionAware (org.apache.tephra.TransactionAware), TransactionExecutor (org.apache.tephra.TransactionExecutor), TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet), Map (java.util.Map), ImmutableMap (com.google.common.collect.ImmutableMap), TransactionFailureException (org.apache.tephra.TransactionFailureException), DatasetManagementException (co.cask.cdap.api.dataset.DatasetManagementException), IOException (java.io.IOException), Test (org.junit.Test)
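
The validateTimePartition helper is not shown either. A plausible sketch (an assumption) built on getPartitionByTime() and getRelativePath(), both of which appear in Example 5; a null expected path means no partition should exist for that time:

private void validateTimePartition(TimePartitionedFileSet dataset, long time, String path) {
    // look up the partition registered for the given time, if any
    TimePartitionDetail detail = dataset.getPartitionByTime(time);
    if (path == null) {
        Assert.assertNull(detail);
    } else {
        Assert.assertNotNull(detail);
        Assert.assertEquals(path, detail.getRelativePath());
    }
}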

Example 4 with TimePartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

The class HiveExploreServiceFileSetTestRun, method testTimePartitionedFileSet.

@Test
public void testTimePartitionedFileSet() throws Exception {
    final DatasetId datasetInstanceId = NAMESPACE_ID.dataset("parts");
    final String tableName = getDatasetHiveName(datasetInstanceId);
    // create a time partitioned file set
    datasetFramework.addInstance("timePartitionedFileSet", datasetInstanceId, FileSetProperties.builder().setBasePath("somePath").setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe").setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat").setTableProperty("avro.schema.literal", SCHEMA.toString()).build());
    // verify that the hive table was created for this file set
    runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
    // Accessing dataset instance to perform data operations
    TimePartitionedFileSet tpfs = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
    Assert.assertNotNull(tpfs);
    Assert.assertTrue(tpfs instanceof TransactionAware);
    // add some partitions. Beware that Hive expects a partition to be a directory, so we create dirs with one file
    long time1 = DATE_FORMAT.parse("12/10/14 1:00 am").getTime();
    long time2 = DATE_FORMAT.parse("12/10/14 2:00 am").getTime();
    long time3 = DATE_FORMAT.parse("12/10/14 3:00 am").getTime();
    Location location1 = tpfs.getEmbeddedFileSet().getLocation("file1/nn");
    Location location2 = tpfs.getEmbeddedFileSet().getLocation("file2/nn");
    Location location3 = tpfs.getEmbeddedFileSet().getLocation("file3/nn");
    FileWriterHelper.generateAvroFile(location1.getOutputStream(), "x", 1, 2);
    FileWriterHelper.generateAvroFile(location2.getOutputStream(), "y", 2, 3);
    FileWriterHelper.generateAvroFile(location3.getOutputStream(), "x", 3, 4);
    addTimePartition(tpfs, time1, "file1");
    addTimePartition(tpfs, time2, "file2");
    addTimePartition(tpfs, time3, "file3");
    // verify that the partitions were added to Hive
    runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=1/minute=0")), new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=2/minute=0")), new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=3/minute=0"))));
    // verify that we can query the key-values in the file with Hive
    runCommand(NAMESPACE_ID, "SELECT key, value FROM " + tableName + " ORDER BY key, value", true, Lists.newArrayList(new ColumnDesc("key", "STRING", 1, null), new ColumnDesc("value", "STRING", 2, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", "#1")), new QueryResult(Lists.<Object>newArrayList("x3", "#3")), new QueryResult(Lists.<Object>newArrayList("y2", "#2"))));
    // verify that we can query the key-values in the file with Hive
    runCommand(NAMESPACE_ID, "SELECT key, value FROM " + tableName + " WHERE hour = 2 ORDER BY key, value", true, Lists.newArrayList(new ColumnDesc("key", "STRING", 1, null), new ColumnDesc("value", "STRING", 2, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("y2", "#2"))));
    // remove a partition
    dropTimePartition(tpfs, time2);
    // verify that the data in the remaining partitions is still queryable
    runCommand(NAMESPACE_ID, "SELECT key, value FROM " + tableName + " ORDER BY key, value", true,
        Lists.newArrayList(new ColumnDesc("key", "STRING", 1, null), new ColumnDesc("value", "STRING", 2, null)),
        Lists.newArrayList(
            new QueryResult(Lists.<Object>newArrayList("x1", "#1")),
            new QueryResult(Lists.<Object>newArrayList("x3", "#3"))));
    // verify the partition was removed from Hive
    runCommand(NAMESPACE_ID, "show partitions " + tableName, true,
        Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")),
        Lists.newArrayList(
            new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=1/minute=0")),
            new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=3/minute=0"))));
    // drop the dataset
    datasetFramework.deleteInstance(datasetInstanceId);
    // verify the Hive table is gone
    runCommand(NAMESPACE_ID, "show tables", false, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Collections.<QueryResult>emptyList());
    datasetFramework.addInstance("timePartitionedFileSet", datasetInstanceId, FileSetProperties.builder().setBasePath("somePath").setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe").setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat").setTableProperty("avro.schema.literal", SCHEMA.toString()).build());
    // verify that the hive table was created for this file set
    runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
}
Also used: QueryResult (co.cask.cdap.proto.QueryResult), TransactionAware (org.apache.tephra.TransactionAware), ColumnDesc (co.cask.cdap.proto.ColumnDesc), TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet), DatasetId (co.cask.cdap.proto.id.DatasetId), Location (org.apache.twill.filesystem.Location), Test (org.junit.Test)
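
The addTimePartition and dropTimePartition helpers are not included in this result. Since the dataset is TransactionAware, they presumably wrap addPartition and dropPartition in a short transaction; a sketch under that assumption, where executorFor(...) is a hypothetical stand-in for however the test class obtains a TransactionExecutor:

private void addTimePartition(final TimePartitionedFileSet tpfs, final long time, final String path)
        throws Exception {
    // executorFor is hypothetical: any TransactionExecutor over this TransactionAware dataset
    executorFor((TransactionAware) tpfs).execute(new TransactionExecutor.Subroutine() {
        @Override
        public void apply() throws Exception {
            tpfs.addPartition(time, path);
        }
    });
}

private void dropTimePartition(final TimePartitionedFileSet tpfs, final long time)
        throws Exception {
    executorFor((TransactionAware) tpfs).execute(new TransactionExecutor.Subroutine() {
        @Override
        public void apply() throws Exception {
            tpfs.dropPartition(time);
        }
    });
}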

Example 5 with TimePartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

The class MapReduceWithPartitionedTest, method testTimePartitionedWithMR.

@Test
public void testTimePartitionedWithMR() throws Exception {
    final ApplicationWithPrograms app = deployApp(AppWithTimePartitionedFileSet.class);
    // write a value to the input table
    final Table table = datasetCache.getDataset(AppWithTimePartitionedFileSet.INPUT);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            table.put(Bytes.toBytes("x"), AppWithTimePartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("1"));
        }
    });
    final long time = DATE_FORMAT.parse("1/15/15 11:15 am").getTime();
    final long time5 = time + TimeUnit.MINUTES.toMillis(5);
    // run the partition writer m/r with this output partition time
    Map<String, String> runtimeArguments = Maps.newHashMap();
    Map<String, String> outputArgs = Maps.newHashMap();
    TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, time);
    final ImmutableMap<String, String> assignedMetadata = ImmutableMap.of("region", "13", "data.source.name", "input", "data.source.type", "table");
    TimePartitionedFileSetArguments.setOutputPartitionMetadata(outputArgs, assignedMetadata);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, outputArgs));
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
    // this should have created a partition in the tpfs
    final TimePartitionedFileSet tpfs = datasetCache.getDataset(TIME_PARTITIONED);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) tpfs).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            TimePartitionDetail partition = tpfs.getPartitionByTime(time);
            Assert.assertNotNull(partition);
            String path = partition.getRelativePath();
            Assert.assertNotNull(path);
            Assert.assertTrue(path.contains("2015-01-15/11-15"));
            Assert.assertEquals(assignedMetadata, partition.getMetadata().asMap());
        }
    });
    // delete the data in the input table and write a new row
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            table.delete(Bytes.toBytes("x"));
            table.put(Bytes.toBytes("y"), AppWithTimePartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("2"));
        }
    });
    // now run the m/r again with a new partition time, say 5 minutes later
    TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, time5);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, outputArgs));
    // make the MapReduce program add the partition in destroy(), to validate that this does not fail the job
    runtimeArguments.put(AppWithTimePartitionedFileSet.COMPAT_ADD_PARTITION, "true");
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
    // this should have created a partition in the tpfs
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) tpfs).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Partition partition = tpfs.getPartitionByTime(time5);
            Assert.assertNotNull(partition);
            String path = partition.getRelativePath();
            Assert.assertNotNull(path);
            Assert.assertTrue(path.contains("2015-01-15/11-20"));
        }
    });
    // now run a map/reduce that reads all the partitions
    runtimeArguments = Maps.newHashMap();
    Map<String, String> inputArgs = Maps.newHashMap();
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(5));
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time5 + TimeUnit.MINUTES.toMillis(5));
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "a");
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read both partitions - and written both x and y to row a
    final Table output = datasetCache.getDataset(AppWithTimePartitionedFileSet.OUTPUT);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("a"));
            Assert.assertEquals("1", row.getString("x"));
            Assert.assertEquals("2", row.getString("y"));
        }
    });
    // now run a map/reduce that reads a range of the partitions, namely the first one
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(5));
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time + TimeUnit.MINUTES.toMillis(2));
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "b");
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read the first partition only - and written only x to row b
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("b"));
            Assert.assertEquals("1", row.getString("x"));
            Assert.assertNull(row.get("y"));
        }
    });
    // now run a map/reduce that reads no partitions (because the range matches nothing)
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(10));
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time - TimeUnit.MINUTES.toMillis(9));
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "n");
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read no partitions - and written nothing to row n
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("n"));
            Assert.assertTrue(row.isEmpty());
        }
    });
}
Also used: Partition (co.cask.cdap.api.dataset.lib.Partition), Table (co.cask.cdap.api.dataset.table.Table), TransactionExecutor (org.apache.tephra.TransactionExecutor), ApplicationWithPrograms (co.cask.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms), TransactionAware (org.apache.tephra.TransactionAware), BasicArguments (co.cask.cdap.internal.app.runtime.BasicArguments), TimePartitionDetail (co.cask.cdap.api.dataset.lib.TimePartitionDetail), Row (co.cask.cdap.api.dataset.table.Row), TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet), Test (org.junit.Test)
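
A note on the repeated RuntimeArguments.addScope calls in this test: dataset-scoped arguments are prefixed so that they reach only the named dataset. A minimal sketch of the pattern (the exact key shape is an assumption; TIME_PARTITIONED is a constant of the test app whose value is not shown here):

Map<String, String> outputArgs = Maps.newHashMap();
TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, time);
// addScope returns a new map whose keys carry the scope and dataset name,
// roughly "dataset.<name>.<key>" (assumed shape), so per-dataset arguments
// can coexist with generic runtime arguments in one map
Map<String, String> scoped = RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, outputArgs);
runtimeArguments.putAll(scoped);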

Aggregations

TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet): 16 usages
TransactionAware (org.apache.tephra.TransactionAware): 9 usages
Test (org.junit.Test): 9 usages
DataSetException (co.cask.cdap.api.dataset.DataSetException): 7 usages
TransactionExecutor (org.apache.tephra.TransactionExecutor): 7 usages
DatasetManagementException (co.cask.cdap.api.dataset.DatasetManagementException): 6 usages
IOException (java.io.IOException): 6 usages
TransactionFailureException (org.apache.tephra.TransactionFailureException): 6 usages
ImmutableMap (com.google.common.collect.ImmutableMap): 4 usages
Date (java.util.Date): 4 usages
Map (java.util.Map): 4 usages
TimePartitionDetail (co.cask.cdap.api.dataset.lib.TimePartitionDetail): 2 usages
ApplicationManager (co.cask.cdap.test.ApplicationManager): 2 usages
Location (org.apache.twill.filesystem.Location): 2 usages
Partition (co.cask.cdap.api.dataset.lib.Partition): 1 usage
PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail): 1 usage
PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter): 1 usage
PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey): 1 usage
PartitionOutput (co.cask.cdap.api.dataset.lib.PartitionOutput): 1 usage
TimePartitionOutput (co.cask.cdap.api.dataset.lib.TimePartitionOutput): 1 usage