Examples with TimePartitionedFileSet - io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet

Example 6 with TimePartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class TimePartitionedFileSetTest method testPartitionMetadata.

/**
 * Walks a single time partition through the metadata operations of the dataset inside one
 * transaction: creating the partition with metadata, adding entries, overwriting an entry via
 * {@code setMetadata}, verifying that {@code addMetadata} refuses to overwrite, and removing entries.
 */
@Test
public void testPartitionMetadata() throws Exception {
    final TimePartitionedFileSet fileSet = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
    TransactionExecutor.Subroutine scenario = new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // the dataset must start out without any partitions
            validateTimePartitions(fileSet, 0L, MAX, Collections.<Long, String>emptyMap());
            long partitionTime = DATE_FORMAT.parse("6/4/12 10:00 am").getTime();

            // create the partition together with its initial metadata
            Map<String, String> initial = ImmutableMap.of("key1", "value1", "key2", "value3", "key100", "value4");
            fileSet.addPartition(partitionTime, "file", initial);
            // running record of everything the partition is expected to hold
            Map<String, String> expectedMetadata = Maps.newHashMap();
            expectedMetadata.putAll(initial);
            assertStoredMetadata(partitionTime, initial);

            // add one brand-new entry
            fileSet.addMetadata(partitionTime, "key3", "value4");
            expectedMetadata.put("key3", "value4");

            // setMetadata replaces the value of a key that is already present
            fileSet.setMetadata(partitionTime, Collections.singletonMap("key3", "value5"));
            expectedMetadata.put("key3", "value5");

            // add several new entries at once
            Map<String, String> extra = ImmutableMap.of("key4", "value4", "key5", "value5");
            fileSet.addMetadata(partitionTime, extra);
            expectedMetadata.putAll(extra);

            // addMetadata, unlike setMetadata, must reject a key that already exists
            boolean rejected = false;
            try {
                fileSet.addMetadata(partitionTime, "key3", "value5");
            } catch (DataSetException expected) {
                rejected = true;
            }
            if (!rejected) {
                Assert.fail("Expected not to be able to update an existing metadata entry");
            }
            assertStoredMetadata(partitionTime, expectedMetadata);

            // removal tolerates a key that was never set ('key6')
            fileSet.removeMetadata(partitionTime, ImmutableSet.of("key4", "key5", "key6"));
            expectedMetadata.remove("key4");
            expectedMetadata.remove("key5");
            assertStoredMetadata(partitionTime, expectedMetadata);
        }

        /** Asserts that the partition for the given time exists and carries exactly the expected metadata. */
        private void assertStoredMetadata(long partitionTime, Map<String, String> expected) throws Exception {
            TimePartitionDetail detail = fileSet.getPartitionByTime(partitionTime);
            Assert.assertNotNull(detail);
            Assert.assertEquals(expected, detail.getMetadata().asMap());
        }
    };
    dsFrameworkUtil.newInMemoryTransactionExecutor((TransactionAware) fileSet).execute(scenario);
}

Also used : DataSetException(io.cdap.cdap.api.dataset.DataSetException) TransactionAware(org.apache.tephra.TransactionAware) TransactionExecutor(org.apache.tephra.TransactionExecutor) TimePartitionDetail(io.cdap.cdap.api.dataset.lib.TimePartitionDetail) TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) TransactionFailureException(org.apache.tephra.TransactionFailureException) UnauthorizedException(io.cdap.cdap.security.spi.authorization.UnauthorizedException) DataSetException(io.cdap.cdap.api.dataset.DataSetException) DatasetManagementException(io.cdap.cdap.api.dataset.DatasetManagementException) IOException(java.io.IOException) Date(java.util.Date) Test(org.junit.Test)

Example 7 with TimePartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class TimePartitionedFileSetTest method validateInputPaths.

/**
 * Validates the input paths reported by the input format configuration of the tpfs when it is
 * instantiated with {@code time + start * MINUTE} as the input start time and
 * {@code time + end * MINUTE} as the input end time.
 *
 * <p>Both offsets are added to {@code time}; pass a negative offset for a bound that lies before it.
 * The comparison is order-insensitive, and each actual path only has to end with the corresponding
 * expected path because actual paths are absolute whereas expected paths are relative.
 *
 * @param time the reference time
 * @param start offset of the input start time from {@code time}, in multiples of {@code MINUTE}
 * @param end offset of the input end time from {@code time}, in multiples of {@code MINUTE}
 * @param expected the relative paths expected as input; the array is not modified.
 *                 If empty, the configured input directory string must be empty
 */
private void validateInputPaths(long time, long start, long end, final String... expected) throws IOException, DatasetManagementException, InterruptedException, TransactionFailureException, UnauthorizedException {
    Map<String, String> arguments = Maps.newHashMap();
    TimePartitionedFileSetArguments.setInputStartTime(arguments, time + start * MINUTE);
    TimePartitionedFileSetArguments.setInputEndTime(arguments, time + end * MINUTE);
    // Sort a private copy: sorting the varargs array directly would reorder an array owned by the caller.
    final String[] expectedPaths = expected.clone();
    Arrays.sort(expectedPaths);
    final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments);
    TransactionAware txAwareDataset = (TransactionAware) tpfs;
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            Map<String, String> inputConfig = tpfs.getInputFormatConfiguration();
            String inputs = inputConfig.get(FileInputFormat.INPUT_DIR);
            Assert.assertNotNull(inputs);
            if (expectedPaths.length == 0) {
                // splitting an empty string would still yield one (empty) element, so check it directly
                Assert.assertTrue(inputs.isEmpty());
                return;
            }
            String[] inputPaths = inputs.split(",");
            Assert.assertEquals(expectedPaths.length, inputPaths.length);
            // order is not guaranteed, so compare both sides in sorted order
            Arrays.sort(inputPaths);
            for (int i = 0; i < expectedPaths.length; i++) {
                // every input path is absolute, whereas expected paths are relative
                Assert.assertTrue("path #" + i + " does not match", inputPaths[i].endsWith(expectedPaths[i]));
            }
        }
    });
}

Also used : TransactionAware(org.apache.tephra.TransactionAware) TransactionExecutor(org.apache.tephra.TransactionExecutor) TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) TransactionFailureException(org.apache.tephra.TransactionFailureException) UnauthorizedException(io.cdap.cdap.security.spi.authorization.UnauthorizedException) DataSetException(io.cdap.cdap.api.dataset.DataSetException) DatasetManagementException(io.cdap.cdap.api.dataset.DatasetManagementException) IOException(java.io.IOException)

Example 8 with TimePartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class TimePartitionedFileSetTest method testTimePartitionedInputArguments.

/**
 * Verifies how the input runtime arguments of the dataset are interpreted: a complete time range
 * selects partitions (and wins over a partition filter given at the same time), a partition filter
 * alone selects partitions, and a time range with only one of its two bounds is rejected.
 */
@Test
public void testTimePartitionedInputArguments() throws Exception {
    final String earlyPath = "8:42";
    final String latePath = "9:42";
    final long earlyTime = DATE_FORMAT.parse("10/17/2014 8:42 am").getTime();
    final long lateTime = DATE_FORMAT.parse("10/17/2014 9:42 am").getTime();
    final long halfHour = 30 * MINUTE;
    final PartitionFilter fromNine = PartitionFilter.builder().addRangeCondition("hour", 9, null).build();

    // register one partition per timestamp
    final TimePartitionedFileSet fileSet = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
    TransactionExecutor.Subroutine addPartitions = new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            fileSet.addPartition(earlyTime, earlyPath);
            fileSet.addPartition(lateTime, latePath);
        }
    };
    dsFrameworkUtil.newInMemoryTransactionExecutor((TransactionAware) fileSet).execute(addPartitions);

    Map<String, String> inputArgs = Maps.newHashMap();

    // a time range of half an hour around the early partition selects only that partition
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, earlyTime - halfHour);
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, earlyTime + halfHour);
    testInputConfiguration(inputArgs, earlyPath);

    // a partition filter given in addition to a time range must not change the outcome
    TimePartitionedFileSetArguments.setInputPartitionFilter(inputArgs, fromNine);
    testInputConfiguration(inputArgs, earlyPath);

    // the partition filter on its own selects the late partition
    inputArgs.clear();
    TimePartitionedFileSetArguments.setInputPartitionFilter(inputArgs, fromNine);
    testInputConfiguration(inputArgs, latePath);

    // a start time without an end time is expected to fail
    inputArgs.clear();
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, earlyTime + halfHour);
    testInputConfigurationFailure(inputArgs, " with only a start time");

    // an end time without a start time is expected to fail as well
    inputArgs.clear();
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, earlyTime + halfHour);
    testInputConfigurationFailure(inputArgs, " with only an end time");
}

Also used : PartitionFilter(io.cdap.cdap.api.dataset.lib.PartitionFilter) TransactionAware(org.apache.tephra.TransactionAware) TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet) Test(org.junit.Test)

Example 9 with TimePartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class PartitionCorrectorTestRun method createPartition.

/**
 * Creates a partition for the given time that contains one file named "file" holding the single
 * line {@code "<i>,x<i>"}, then registers the partition and flushes the dataset manager.
 *
 * @param tpfsManager manager of the time-partitioned file set to write to
 * @param time the partition time
 * @param i the number used to build the content of the file
 */
private void createPartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long time, int i) throws Exception {
    String line = String.format("%d,x%d", i, i);
    TimePartitionOutput partitionOutput = tpfsManager.get().getPartitionOutput(time);
    // the file must be written and closed before the partition is registered
    try (PrintStream writer = new PrintStream(partitionOutput.getLocation().append("file").getOutputStream())) {
        writer.println(line);
    }
    partitionOutput.addPartition();
    tpfsManager.flush();
}

Also used : PrintStream(java.io.PrintStream) TimePartitionOutput(io.cdap.cdap.api.dataset.lib.TimePartitionOutput) TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet)

Example 10 with TimePartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.

the class SparkFileSetTestRun method testSparkWithTimePartitionedFileSet.

/**
 * Runs the given Spark program with the "tpfs" dataset as both input and output, then verifies
 * that an output partition exists for the output time set through the dataset arguments and for
 * the custom output key passed as a plain runtime argument. All partitions are dropped afterwards.
 *
 * @param applicationManager manager of the deployed application that contains the Spark program
 * @param sparkProgram name of the Spark program to run
 */
private void testSparkWithTimePartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
    DataSetManager<TimePartitionedFileSet> manager = getDataset("tpfs");
    long inputTime = System.currentTimeMillis();
    long outputTime = inputTime + TimeUnit.HOURS.toMillis(1);
    long customInputKey = 987654321L;
    long customOutputKey = 123456789L;

    // one input partition for the time window and one for the custom input key
    addTimePartition(manager, inputTime);
    addTimePartition(manager, customInputKey);

    // dataset arguments: a narrow input window around inputTime, and the output partition time
    Map<String, String> inputWindow = new HashMap<>();
    TimePartitionedFileSetArguments.setInputStartTime(inputWindow, inputTime - 100);
    TimePartitionedFileSetArguments.setInputEndTime(inputWindow, inputTime + 100);
    Map<String, String> outputTarget = new HashMap<>();
    TimePartitionedFileSetArguments.setOutputPartitionTime(outputTarget, outputTime);

    // runtime arguments: the scoped dataset arguments plus the plain program arguments
    Map<String, String> runtimeArgs = new HashMap<>(RuntimeArguments.addScope(Scope.DATASET, "tpfs", inputWindow));
    runtimeArgs.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", outputTarget));
    runtimeArgs.put("input", "tpfs");
    runtimeArgs.put("output", "tpfs");
    runtimeArgs.put("outputKey", String.valueOf(customOutputKey));
    runtimeArgs.put("inputKey", String.valueOf(customInputKey));

    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(runtimeArgs);
    sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
    manager.flush();

    TimePartitionedFileSet fileSet = manager.get();
    // output partition addressed through the dataset arguments
    PartitionDetail defaultOutput = fileSet.getPartitionByTime(outputTime);
    Assert.assertNotNull("Output partition is null while for running without custom dataset arguments", defaultOutput);
    validateFileOutput(defaultOutput.getLocation());
    // output partition addressed through the custom output key
    PartitionDetail customOutput = fileSet.getPartitionByTime(customOutputKey);
    Assert.assertNotNull("Output partition is null while for running with custom dataset arguments", customOutput);
    validateFileOutput(customOutput.getLocation());

    // drop every partition this run created
    fileSet.dropPartition(inputTime);
    fileSet.dropPartition(customInputKey);
    fileSet.dropPartition(defaultOutput.getPartitionKey());
    fileSet.dropPartition(customOutput.getPartitionKey());
    manager.flush();
}

Also used : SparkManager(io.cdap.cdap.test.SparkManager) HashMap(java.util.HashMap) TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet) PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail)

Aggregations

TimePartitionedFileSet (io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet) 51, Test (org.junit.Test) 32, ApplicationManager (io.cdap.cdap.test.ApplicationManager) 20, TransactionAware (org.apache.tephra.TransactionAware) 18, GenericRecord (org.apache.avro.generic.GenericRecord) 17, Schema (io.cdap.cdap.api.data.schema.Schema) 14, DataSetException (io.cdap.cdap.api.dataset.DataSetException) 14, TransactionExecutor (org.apache.tephra.TransactionExecutor) 14, IOException (java.io.IOException) 13, DatasetManagementException (io.cdap.cdap.api.dataset.DatasetManagementException) 12, ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig) 12, UnauthorizedException (io.cdap.cdap.security.spi.authorization.UnauthorizedException) 12, TransactionFailureException (org.apache.tephra.TransactionFailureException) 12, GenericRecordBuilder (org.apache.avro.generic.GenericRecordBuilder) 11, Table (io.cdap.cdap.api.dataset.table.Table) 10, Location (org.apache.twill.filesystem.Location) 10, ImmutableMap (com.google.common.collect.ImmutableMap) 8, Date (java.util.Date) 8, Map (java.util.Map) 8, StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord) 7