use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
the class PartitionCorrectorTestRun method testPartitionCorrector.
@Test
public void testPartitionCorrector() throws Exception {
  ApplicationManager appManager = deployApplication(PartitionExploreCorrectorTestApp.class);
  final int numPartitions = 10;
  addDatasetInstance(TimePartitionedFileSet.class.getName(), "tpfs",
                     PartitionedFileSetProperties.builder()
                       .setExploreFormat("csv")
                       .setExploreSchema("key int, value string")
                       .setEnableExploreOnCreate(true)
                       .build());
  DataSetManager<TimePartitionedFileSet> tpfsManager = getDataset("tpfs");
  Date date = DATE_FORMAT.parse("6/4/12 10:00 am");
  long baseTime = date.getTime();
  for (int i = 0; i < numPartitions; i++) {
    createPartition(tpfsManager, baseTime + TimeUnit.MINUTES.toMillis(1) * i, i);
  }
  validateAllPartitions(numPartitions);
  dropAllPartitions();
  validateAllPartitions(0);
  // All partitions are missing: drop/recreate the Hive table and add all partitions.
  WorkerManager workerManager = appManager.getWorkerManager("PartitionWorker")
    .start(ImmutableMap.of("dataset.name", "tpfs", "batch.size", "5", "verbose", "true"));
  workerManager.waitForRun(ProgramRunStatus.COMPLETED, 60, TimeUnit.SECONDS);
  validateAllPartitions(numPartitions);
  dropAllPartitions();
  for (int i = numPartitions; i < 2 * numPartitions; i++) {
    createPartition(tpfsManager, baseTime + TimeUnit.MINUTES.toMillis(1) * i, i);
  }
  validateAllPartitions(numPartitions);
  // Some partitions are missing and some are present: keep the Hive table and try to add all partitions.
  workerManager = appManager.getWorkerManager("PartitionWorker")
    .start(ImmutableMap.of("dataset.name", "tpfs", "batch.size", "8",
                           "verbose", "false", "disable.explore", "false"));
  workerManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 60, TimeUnit.SECONDS);
  validateAllPartitions(2 * numPartitions);
}
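The helpers createPartition, dropAllPartitions and validateAllPartitions are private to PartitionCorrectorTestRun and are not part of the excerpt. As a rough illustration only, a createPartition helper consistent with the call sites above could write a single CSV line matching the "key int, value string" explore schema and register the partition; this is a hedged sketch, not the project's actual helper:
// Hypothetical sketch: only the signature is inferred from the calls above; the body is an assumption.
private void createPartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long time, int i) throws Exception {
  TimePartitionedFileSet tpfs = tpfsManager.get();
  PartitionOutput output = tpfs.getPartitionOutput(time);
  // Write one "key,value" line so Explore can read it back with the declared CSV schema.
  try (PrintStream out = new PrintStream(output.getLocation().append("file").getOutputStream(), true, "UTF-8")) {
    out.println(i + ",value" + i);
  }
  output.addPartition();
  tpfsManager.flush();
}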
use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by cdapio.
the class SparkFileSetTestRun method addTimePartition.
private void addTimePartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long inputTime)
  throws IOException, TransactionFailureException, InterruptedException {
  TimePartitionedFileSet tpfs = tpfsManager.get();
  PartitionOutput partitionOutput = tpfs.getPartitionOutput(inputTime);
  Location location = partitionOutput.getLocation();
  prepareFileInput(location);
  partitionOutput.addPartition();
  tpfsManager.flush();
}
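prepareFileInput is another SparkFileSetTestRun helper that is not shown here. A hedged sketch of what it could do, assuming the Spark program simply reads text lines from the partition directory (the file name and contents below are illustrative, not the project's actual test data):
// Hypothetical sketch: places a small text file inside the partition location for the Spark job to read.
private void prepareFileInput(Location location) throws IOException {
  try (PrintStream out = new PrintStream(location.append("inputFile").getOutputStream(), true, "UTF-8")) {
    out.println("13 characters");
    out.println("7 chars");
  }
}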
use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by cdapio.
the class SparkFileSetTestRun method testSparkWithTimePartitionedFileSet.
private void testSparkWithTimePartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  long customOutputPartitionKey = 123456789L;
  long customInputPartitionKey = 987654321L;
  DataSetManager<TimePartitionedFileSet> tpfsManager = getDataset("tpfs");
  long inputTime = System.currentTimeMillis();
  long outputTime = inputTime + TimeUnit.HOURS.toMillis(1);
  addTimePartition(tpfsManager, inputTime);
  addTimePartition(tpfsManager, customInputPartitionKey);
  Map<String, String> inputArgs = new HashMap<>();
  TimePartitionedFileSetArguments.setInputStartTime(inputArgs, inputTime - 100);
  TimePartitionedFileSetArguments.setInputEndTime(inputArgs, inputTime + 100);
  Map<String, String> outputArgs = new HashMap<>();
  TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, outputTime);
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", outputArgs));
  args.put("input", "tpfs");
  args.put("output", "tpfs");
  args.put("outputKey", String.valueOf(customOutputPartitionKey));
  args.put("inputKey", String.valueOf(customInputPartitionKey));
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
  tpfsManager.flush();
  TimePartitionedFileSet tpfs = tpfsManager.get();
  PartitionDetail partition = tpfs.getPartitionByTime(outputTime);
  Assert.assertNotNull("Output partition is null when running without custom dataset arguments", partition);
  validateFileOutput(partition.getLocation());
  PartitionDetail customPartition = tpfs.getPartitionByTime(customOutputPartitionKey);
  Assert.assertNotNull("Output partition is null when running with custom dataset arguments", customPartition);
  validateFileOutput(customPartition.getLocation());
  // Clean up after running the test.
  tpfs.dropPartition(inputTime);
  tpfs.dropPartition(customInputPartitionKey);
  tpfs.dropPartition(partition.getPartitionKey());
  tpfs.dropPartition(customPartition.getPartitionKey());
  tpfsManager.flush();
}
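validateFileOutput is not included in the excerpt. A hedged sketch of such a check, assuming the Spark program writes Hadoop-style part-* text files into the output partition (the real SparkFileSetTestRun helper may compare exact file contents instead):
// Hypothetical sketch: asserts that the output partition contains at least one non-empty part-* file.
private void validateFileOutput(Location location) throws IOException {
  Assert.assertTrue(location.isDirectory());
  boolean foundPartFile = false;
  for (Location file : location.list()) {
    if (file.getName().startsWith("part-")) {
      foundPartFile = true;
      Assert.assertTrue("empty output file " + file, file.length() > 0);
    }
  }
  Assert.assertTrue("no part-* file found under " + location, foundPartFile);
}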
use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by cdapio.
the class TimePartitionedFileSetTest method testInputPartitionPaths.
/**
 * Tests that the TPFS sets the file input paths correctly for the input time range.
 */
@Test
public void testInputPartitionPaths() throws Exception {
  // make sure the dataset has no partitions
  final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
  TransactionAware txAwareDataset = (TransactionAware) tpfs;
  TransactionExecutor txnl = dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset);
  txnl.execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      validateTimePartitions(tpfs, 0L, MAX, Collections.<Long, String>emptyMap());
    }
  });
  Date date = DATE_FORMAT.parse("6/4/12 10:00 am");
  final long time = date.getTime();
  txnl.execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      tpfs.addPartition(time, "file");
      tpfs.addPartition(time + 5 * MINUTE, "file5");
      tpfs.addPartition(time + 10 * MINUTE, "file10");
      tpfs.addPartition(time + 12 * MINUTE, "file12");
    }
  });
  validateInputPaths(time, -10, -5);
  validateInputPaths(time, -10, 2, "file");
  validateInputPaths(time, 1, 11, "file5", "file10");
  validateInputPaths(time, 1, 15, "file5", "file10", "file12");
  validateInputPaths(time, 5, 10, "file5");
}
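The validateTimePartitions and validateInputPaths helpers are defined elsewhere in TimePartitionedFileSetTest. As an illustration of how a relative time range (in minutes) selects partitions, here is a hedged sketch that checks the same expectations through the public getPartitionsByTime API; the project's actual helper likely verifies the paths wired into the input format configuration instead:
// Hypothetical sketch: the partitions returned for [time + startMinutes, time + endMinutes)
// should have exactly the expected relative paths.
private void validateInputPaths(final long time, final long startMinutes, final long endMinutes,
                                final String... expected) throws Exception {
  final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
  TransactionExecutor txnl = dsFrameworkUtil.newInMemoryTransactionExecutor((TransactionAware) tpfs);
  txnl.execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      Set<String> actualPaths = new HashSet<>();
      for (TimePartitionDetail partition
          : tpfs.getPartitionsByTime(time + startMinutes * MINUTE, time + endMinutes * MINUTE)) {
        actualPaths.add(partition.getRelativePath());
      }
      Assert.assertEquals(ImmutableSet.copyOf(expected), actualPaths);
    }
  });
}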