use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
the class PartitionCorrectorTestRun method testPartitionCorrector.
@Test
public void testPartitionCorrector() throws Exception {
  ApplicationManager appManager = deployApplication(PartitionExploreCorrectorTestApp.class);
  final int numPartitions = 10;
  addDatasetInstance(TimePartitionedFileSet.class.getName(), "tpfs",
                     PartitionedFileSetProperties.builder()
                       .setExploreFormat("csv")
                       .setExploreSchema("key int, value string")
                       .setEnableExploreOnCreate(true)
                       .build());
  DataSetManager<TimePartitionedFileSet> tpfsManager = getDataset("tpfs");
  Date date = DATE_FORMAT.parse("6/4/12 10:00 am");
  long baseTime = date.getTime();
  for (int i = 0; i < numPartitions; i++) {
    createPartition(tpfsManager, baseTime + TimeUnit.MINUTES.toMillis(1) * i, i);
  }
  validateAllPartitions(numPartitions);
  dropAllPartitions();
  validateAllPartitions(0);
  // All partitions are missing: drop/recreate the Hive table and add all partitions.
  WorkerManager workerManager = appManager.getWorkerManager("PartitionWorker")
    .start(ImmutableMap.of("dataset.name", "tpfs", "batch.size", "5", "verbose", "true"));
  workerManager.waitForRun(ProgramRunStatus.COMPLETED, 60, TimeUnit.SECONDS);
  validateAllPartitions(numPartitions);
  dropAllPartitions();
  for (int i = numPartitions; i < 2 * numPartitions; i++) {
    createPartition(tpfsManager, baseTime + TimeUnit.MINUTES.toMillis(1) * i, i);
  }
  validateAllPartitions(numPartitions);
  // Some partitions are missing and some are present: keep the Hive table and try to add all partitions.
  workerManager = appManager.getWorkerManager("PartitionWorker")
    .start(ImmutableMap.of("dataset.name", "tpfs", "batch.size", "8",
                           "verbose", "false", "disable.explore", "false"));
  workerManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 60, TimeUnit.SECONDS);
  validateAllPartitions(2 * numPartitions);
}
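The helpers createPartition, dropAllPartitions and validateAllPartitions are private to PartitionCorrectorTestRun and are not part of the excerpt. As a rough illustration only, a createPartition helper consistent with the call sites above could write a single CSV line matching the "key int, value string" explore schema and register the partition; this is a hedged sketch, not the project's actual helper:
// Hypothetical sketch: only the signature is inferred from the calls above; the body is an assumption.
private void createPartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long time, int i) throws Exception {
  TimePartitionedFileSet tpfs = tpfsManager.get();
  PartitionOutput output = tpfs.getPartitionOutput(time);
  // Write one "key,value" line so Explore can read it back with the declared CSV schema.
  try (PrintStream out = new PrintStream(output.getLocation().append("file").getOutputStream(), true, "UTF-8")) {
    out.println(i + ",value" + i);
  }
  output.addPartition();
  tpfsManager.flush();
}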
use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by cdapio.
the class SparkFileSetTestRun method addTimePartition.
private void addTimePartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long inputTime)
  throws IOException, TransactionFailureException, InterruptedException {
  TimePartitionedFileSet tpfs = tpfsManager.get();
  PartitionOutput partitionOutput = tpfs.getPartitionOutput(inputTime);
  Location location = partitionOutput.getLocation();
  prepareFileInput(location);
  partitionOutput.addPartition();
  tpfsManager.flush();
}
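prepareFileInput is another SparkFileSetTestRun helper that is not shown here. A hedged sketch of what it could do, assuming the Spark program simply reads text lines from the partition directory (the file name and contents below are illustrative, not the project's actual test data):
// Hypothetical sketch: places a small text file inside the partition location for the Spark job to read.
private void prepareFileInput(Location location) throws IOException {
  try (PrintStream out = new PrintStream(location.append("inputFile").getOutputStream(), true, "UTF-8")) {
    out.println("13 characters");
    out.println("7 chars");
  }
}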
use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by cdapio.
the class SparkFileSetTestRun method testSparkWithTimePartitionedFileSet.
private void testSparkWithTimePartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  long customOutputPartitionKey = 123456789L;
  long customInputPartitionKey = 987654321L;
  DataSetManager<TimePartitionedFileSet> tpfsManager = getDataset("tpfs");
  long inputTime = System.currentTimeMillis();
  long outputTime = inputTime + TimeUnit.HOURS.toMillis(1);
  addTimePartition(tpfsManager, inputTime);
  addTimePartition(tpfsManager, customInputPartitionKey);
  Map<String, String> inputArgs = new HashMap<>();
  TimePartitionedFileSetArguments.setInputStartTime(inputArgs, inputTime - 100);
  TimePartitionedFileSetArguments.setInputEndTime(inputArgs, inputTime + 100);
  Map<String, String> outputArgs = new HashMap<>();
  TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, outputTime);
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", outputArgs));
  args.put("input", "tpfs");
  args.put("output", "tpfs");
  args.put("outputKey", String.valueOf(customOutputPartitionKey));
  args.put("inputKey", String.valueOf(customInputPartitionKey));
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
  tpfsManager.flush();
  TimePartitionedFileSet tpfs = tpfsManager.get();
  PartitionDetail partition = tpfs.getPartitionByTime(outputTime);
  Assert.assertNotNull("Output partition is null when running without custom dataset arguments", partition);
  validateFileOutput(partition.getLocation());
  PartitionDetail customPartition = tpfs.getPartitionByTime(customOutputPartitionKey);
  Assert.assertNotNull("Output partition is null when running with custom dataset arguments", customPartition);
  validateFileOutput(customPartition.getLocation());
  // Clean up after running the test.
  tpfs.dropPartition(inputTime);
  tpfs.dropPartition(customInputPartitionKey);
  tpfs.dropPartition(partition.getPartitionKey());
  tpfs.dropPartition(customPartition.getPartitionKey());
  tpfsManager.flush();
}
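validateFileOutput is not included in the excerpt. A hedged sketch of such a check, assuming the Spark program writes Hadoop-style part-* text files into the output partition (the real SparkFileSetTestRun helper may compare exact file contents instead):
// Hypothetical sketch: asserts that the output partition contains at least one non-empty part-* file.
private void validateFileOutput(Location location) throws IOException {
  Assert.assertTrue(location.isDirectory());
  boolean foundPartFile = false;
  for (Location file : location.list()) {
    if (file.getName().startsWith("part-")) {
      foundPartFile = true;
      Assert.assertTrue("empty output file " + file, file.length() > 0);
    }
  }
  Assert.assertTrue("no part-* file found under " + location, foundPartFile);
}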
use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by cdapio.
the class TimePartitionedFileSetTest method testInputPartitionPaths.
/**
 * Tests that the TPFS sets the file input paths correctly for the input time range.
 */
@Test
public void testInputPartitionPaths() throws Exception {
  // make sure the dataset has no partitions
  final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
  TransactionAware txAwareDataset = (TransactionAware) tpfs;
  TransactionExecutor txnl = dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset);
  txnl.execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      validateTimePartitions(tpfs, 0L, MAX, Collections.<Long, String>emptyMap());
    }
  });
  Date date = DATE_FORMAT.parse("6/4/12 10:00 am");
  final long time = date.getTime();
  txnl.execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      tpfs.addPartition(time, "file");
      tpfs.addPartition(time + 5 * MINUTE, "file5");
      tpfs.addPartition(time + 10 * MINUTE, "file10");
      tpfs.addPartition(time + 12 * MINUTE, "file12");
    }
  });
  validateInputPaths(time, -10, -5);
  validateInputPaths(time, -10, 2, "file");
  validateInputPaths(time, 1, 11, "file5", "file10");
  validateInputPaths(time, 1, 15, "file5", "file10", "file12");
  validateInputPaths(time, 5, 10, "file5");
}
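The validateTimePartitions and validateInputPaths helpers are defined elsewhere in TimePartitionedFileSetTest. As an illustration of how a relative time range (in minutes) selects partitions, here is a hedged sketch that checks the same expectations through the public getPartitionsByTime API; the project's actual helper likely verifies the paths wired into the input format configuration instead:
// Hypothetical sketch: the partitions returned for [time + startMinutes, time + endMinutes)
// should have exactly the expected relative paths.
private void validateInputPaths(final long time, final long startMinutes, final long endMinutes,
                                final String... expected) throws Exception {
  final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
  TransactionExecutor txnl = dsFrameworkUtil.newInMemoryTransactionExecutor((TransactionAware) tpfs);
  txnl.execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      Set<String> actualPaths = new HashSet<>();
      for (TimePartitionDetail partition
          : tpfs.getPartitionsByTime(time + startMinutes * MINUTE, time + endMinutes * MINUTE)) {
        actualPaths.add(partition.getRelativePath());
      }
      Assert.assertEquals(ImmutableSet.copyOf(expected), actualPaths);
    }
  });
}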