Search in sources :

Example 11 with TimePartitionedFileSet

use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class TimePartitionedFileSetTest method testTimePartitionedInputArguments.

@Test
public void testTimePartitionedInputArguments() throws Exception {
    final long time8 = DATE_FORMAT.parse("10/17/2014 8:42 am").getTime();
    final long time9 = DATE_FORMAT.parse("10/17/2014 9:42 am").getTime();
    final String path8 = "8:42";
    final String path9 = "9:42";
    final PartitionFilter filter9 = PartitionFilter.builder().addRangeCondition("hour", 9, null).build();
    // add a few partitions
    {
        final TimePartitionedFileSet dataset = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
        final TransactionAware txAwareDataset = (TransactionAware) dataset;
        dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

            @Override
            public void apply() throws Exception {
                dataset.addPartition(time8, path8);
                dataset.addPartition(time9, path9);
            }
        });
    }
    // test specifying time range for input
    Map<String, String> arguments = Maps.newHashMap();
    TimePartitionedFileSetArguments.setInputStartTime(arguments, time8 - 30 * MINUTE);
    TimePartitionedFileSetArguments.setInputEndTime(arguments, time8 + 30 * MINUTE);
    testInputConfiguration(arguments, path8);
    // add a partition filter. it should not have an effect as long as there is a time range
    TimePartitionedFileSetArguments.setInputPartitionFilter(arguments, filter9);
    testInputConfiguration(arguments, path8);
    // test specifying input with a partition filter
    arguments.clear();
    TimePartitionedFileSetArguments.setInputPartitionFilter(arguments, filter9);
    testInputConfiguration(arguments, path9);
    // test specifying only a start time or only an end time for input, or none
    arguments.clear();
    TimePartitionedFileSetArguments.setInputStartTime(arguments, time8 + 30 * MINUTE);
    testInputConfigurationFailure(arguments, " with only a start time");
    arguments.clear();
    TimePartitionedFileSetArguments.setInputEndTime(arguments, time8 + 30 * MINUTE);
    testInputConfigurationFailure(arguments, " with only an end time");
}
Also used : PartitionFilter(co.cask.cdap.api.dataset.lib.PartitionFilter) TransactionAware(org.apache.tephra.TransactionAware) TimePartitionedFileSet(co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) Test(org.junit.Test)

Example 12 with TimePartitionedFileSet

use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class TimePartitionedFileSetTest method validateInputPaths.

/**
   * Validates that the output configuration of the tpfs, when instantiated with (time - start * minutes) as
   * input start time and (time + end * minutes) as input end time, returns the expected list of paths.
   */
private void validateInputPaths(long time, long start, long end, final String... expected) throws IOException, DatasetManagementException, InterruptedException, TransactionFailureException {
    Map<String, String> arguments = Maps.newHashMap();
    TimePartitionedFileSetArguments.setInputStartTime(arguments, time + start * MINUTE);
    TimePartitionedFileSetArguments.setInputEndTime(arguments, time + end * MINUTE);
    final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments);
    TransactionAware txAwareDataset = (TransactionAware) tpfs;
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            Map<String, String> inputConfig = tpfs.getInputFormatConfiguration();
            String inputs = inputConfig.get(FileInputFormat.INPUT_DIR);
            Assert.assertNotNull(inputs);
            if (expected.length == 0) {
                Assert.assertTrue(inputs.isEmpty());
                return;
            }
            String[] inputPaths = inputs.split(",");
            Assert.assertEquals(expected.length, inputPaths.length);
            // order is not guaranteed.
            Arrays.sort(expected);
            Arrays.sort(inputPaths);
            for (int i = 0; i < expected.length; i++) {
                // every input path is absolute, whereas expected paths are relative
                Assert.assertTrue("path #" + i + " does not match", inputPaths[i].endsWith(expected[i]));
            }
        }
    });
}
Also used : TransactionAware(org.apache.tephra.TransactionAware) TransactionExecutor(org.apache.tephra.TransactionExecutor) TimePartitionedFileSet(co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) TransactionFailureException(org.apache.tephra.TransactionFailureException) DatasetManagementException(co.cask.cdap.api.dataset.DatasetManagementException) IOException(java.io.IOException) DataSetException(co.cask.cdap.api.dataset.DataSetException)

Example 13 with TimePartitionedFileSet

use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class TimePartitionedFileSetTest method testPartitionMetadata.

@Test
public void testPartitionMetadata() throws Exception {
    final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
    TransactionAware txAware = (TransactionAware) tpfs;
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAware).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // make sure the dataset has no partitions
            validateTimePartitions(tpfs, 0L, MAX, Collections.<Long, String>emptyMap());
            Date date = DATE_FORMAT.parse("6/4/12 10:00 am");
            long time = date.getTime();
            // keep track of all the metadata added
            Map<String, String> allMetadata = Maps.newHashMap();
            Map<String, String> metadata = ImmutableMap.of("key1", "value1", "key2", "value3", "key100", "value4");
            tpfs.addPartition(time, "file", metadata);
            allMetadata.putAll(metadata);
            TimePartitionDetail partitionByTime = tpfs.getPartitionByTime(time);
            Assert.assertNotNull(partitionByTime);
            Assert.assertEquals(metadata, partitionByTime.getMetadata().asMap());
            tpfs.addMetadata(time, "key3", "value4");
            allMetadata.put("key3", "value4");
            try {
                // attempting to update an existing key throws a DatasetException
                tpfs.addMetadata(time, "key3", "value5");
                Assert.fail("Expected not to be able to update an existing metadata entry");
            } catch (DataSetException expected) {
            }
            Map<String, String> newMetadata = ImmutableMap.of("key4", "value4", "key5", "value5");
            tpfs.addMetadata(time, newMetadata);
            allMetadata.putAll(newMetadata);
            partitionByTime = tpfs.getPartitionByTime(time);
            Assert.assertNotNull(partitionByTime);
            Assert.assertEquals(allMetadata, partitionByTime.getMetadata().asMap());
        }
    });
}
Also used : DataSetException(co.cask.cdap.api.dataset.DataSetException) TransactionAware(org.apache.tephra.TransactionAware) TransactionExecutor(org.apache.tephra.TransactionExecutor) TimePartitionDetail(co.cask.cdap.api.dataset.lib.TimePartitionDetail) TimePartitionedFileSet(co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) TransactionFailureException(org.apache.tephra.TransactionFailureException) DatasetManagementException(co.cask.cdap.api.dataset.DatasetManagementException) IOException(java.io.IOException) DataSetException(co.cask.cdap.api.dataset.DataSetException) Date(java.util.Date) Test(org.junit.Test)

Example 14 with TimePartitionedFileSet

use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class TimePartitionedFileSetTest method testInputConfiguration.

private void testInputConfiguration(Map<String, String> arguments, final String expectedPath) throws Exception {
    final TimePartitionedFileSet dataset = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments);
    TransactionAware txAwareDataset = (TransactionAware) dataset;
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            Map<String, String> inputConf = dataset.getInputFormatConfiguration();
            String input = inputConf.get(FileInputFormat.INPUT_DIR);
            Assert.assertNotNull(input);
            String[] inputs = input.split(",");
            Assert.assertEquals(1, inputs.length);
            Assert.assertTrue(inputs[0].endsWith(expectedPath));
        }
    });
}
Also used : TransactionAware(org.apache.tephra.TransactionAware) TransactionExecutor(org.apache.tephra.TransactionExecutor) TimePartitionedFileSet(co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) TransactionFailureException(org.apache.tephra.TransactionFailureException) DatasetManagementException(co.cask.cdap.api.dataset.DatasetManagementException) IOException(java.io.IOException) DataSetException(co.cask.cdap.api.dataset.DataSetException)

Example 15 with TimePartitionedFileSet

use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class TimePartitionedFileSetTest method testOutputPartitionPath.

/**
   * Tests that the output file path is set correctly, based on the output partition time.
   */
@Test
public void testOutputPartitionPath() throws Exception {
    // test specifying output time
    Date date = DATE_FORMAT.parse("1/1/15 8:42 pm");
    Map<String, String> args = Maps.newHashMap();
    TimePartitionedFileSetArguments.setOutputPartitionTime(args, date.getTime());
    TimeZone timeZone = Calendar.getInstance().getTimeZone();
    TimePartitionedFileSetArguments.setOutputPathFormat(args, "yyyy-MM-dd/HH_mm", timeZone.getID());
    TimePartitionedFileSet ds = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args);
    String outputPath = ds.getEmbeddedFileSet().getOutputLocation().toURI().getPath();
    Assert.assertTrue(outputPath.endsWith("2015-01-01/20_42"));
    Map<String, String> outputConfig = ds.getOutputFormatConfiguration();
    Assert.assertTrue(outputConfig.get(FileOutputFormat.OUTDIR).endsWith("2015-01-01/20_42"));
    // test specifying output time and partition key -> time should prevail
    PartitionKey key = PartitionKey.builder().addIntField("year", 2014).addIntField("month", 1).addIntField("day", 1).addIntField("hour", 20).addIntField("minute", 54).build();
    TimePartitionedFileSet ds1 = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args);
    TimePartitionedFileSetArguments.setOutputPartitionKey(args, key);
    outputConfig = ds1.getOutputFormatConfiguration();
    Assert.assertTrue(outputConfig.get(FileOutputFormat.OUTDIR).endsWith("2015-01-01/20_42"));
    args.clear();
    TimePartitionedFileSetArguments.setOutputPartitionKey(args, key);
    TimePartitionedFileSet ds2 = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args);
    outputConfig = ds2.getOutputFormatConfiguration();
    Assert.assertTrue(outputConfig.get(FileOutputFormat.OUTDIR).endsWith("54"));
    args.clear();
    TimePartitionedFileSet ds3 = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args);
    try {
        ds3.getOutputFormatConfiguration();
        Assert.fail("getOutputFormatConfiguration should have failed with neither output time nor partition key");
    } catch (DataSetException e) {
    // expected
    }
}
Also used : TimeZone(java.util.TimeZone) DataSetException(co.cask.cdap.api.dataset.DataSetException) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) TimePartitionedFileSet(co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) Date(java.util.Date) Test(org.junit.Test)

Aggregations

TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet)16 TransactionAware (org.apache.tephra.TransactionAware)9 Test (org.junit.Test)9 DataSetException (co.cask.cdap.api.dataset.DataSetException)7 TransactionExecutor (org.apache.tephra.TransactionExecutor)7 DatasetManagementException (co.cask.cdap.api.dataset.DatasetManagementException)6 IOException (java.io.IOException)6 TransactionFailureException (org.apache.tephra.TransactionFailureException)6 ImmutableMap (com.google.common.collect.ImmutableMap)4 Date (java.util.Date)4 Map (java.util.Map)4 TimePartitionDetail (co.cask.cdap.api.dataset.lib.TimePartitionDetail)2 ApplicationManager (co.cask.cdap.test.ApplicationManager)2 Location (org.apache.twill.filesystem.Location)2 Partition (co.cask.cdap.api.dataset.lib.Partition)1 PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail)1 PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter)1 PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey)1 PartitionOutput (co.cask.cdap.api.dataset.lib.PartitionOutput)1 TimePartitionOutput (co.cask.cdap.api.dataset.lib.TimePartitionOutput)1