
Example 21 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

The class PartitionedFileSetTest, method testAddRemoveGetPartitions.

@Test
@Category(SlowTests.class)
public void testAddRemoveGetPartitions() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final PartitionKey[][][] keys = new PartitionKey[4][4][4];
    final String[][][] paths = new String[4][4][4];
    final Set<BasicPartition> allPartitionDetails = Sets.newHashSet();
    // add a bunch of partitions
    for (int s = 0; s < 4; s++) {
        for (int i = 0; i < 4; i++) {
            for (int l = 0; l < 4; l++) {
                final PartitionKey key = PartitionKey.builder().addField("s", String.format("%c-%d", 'a' + s, s)).addField("i", i * 100).addField("l", 15L - 10 * l).build();
                BasicPartition basicPartition = dsFrameworkUtil.newTransactionExecutor((TransactionAware) dataset).execute(new Callable<BasicPartition>() {

                    @Override
                    public BasicPartition call() throws Exception {
                        PartitionOutput p = dataset.getPartitionOutput(key);
                        p.addPartition();
                        return new BasicPartition((PartitionedFileSetDataset) dataset, p.getRelativePath(), p.getPartitionKey());
                    }
                });
                keys[s][i][l] = key;
                paths[s][i][l] = basicPartition.getRelativePath();
                allPartitionDetails.add(basicPartition);
            }
        }
    }
    // validate getPartition with exact partition key
    for (int s = 0; s < 4; s++) {
        for (int i = 0; i < 4; i++) {
            for (int l = 0; l < 4; l++) {
                final PartitionKey key = keys[s][i][l];
                final String path = paths[s][i][l];
                dsFrameworkUtil.newTransactionExecutor((TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {

                    @Override
                    public void apply() throws Exception {
                        PartitionDetail partitionDetail = dataset.getPartition(key);
                        Assert.assertNotNull(partitionDetail);
                        Assert.assertEquals(path, partitionDetail.getRelativePath());
                    }
                });
                // also test getPartitionPaths() and getPartitions() with a filter that matches exactly this key
                @SuppressWarnings({ "unchecked", "unused" }) boolean success = testFilter(dataset, allPartitionDetails, PartitionFilter.builder().addValueCondition("l", key.getField("l")).addValueCondition("s", key.getField("s")).addValueCondition("i", key.getField("i")).build());
            }
        }
    }
    // test that the query works without a filter
    testFilter(dataset, allPartitionDetails, null);
    // generate a list of partition filters with exhaustive coverage
    List<PartitionFilter> filters = generateFilters();
    // test all kinds of filters
    testAllFilters(dataset, allPartitionDetails, filters);
    // remove a few of the partitions and test again, repeatedly
    PartitionKey[] keysToRemove = { keys[1][2][3], keys[0][1][0], keys[2][3][2], keys[3][1][2] };
    for (final PartitionKey key : keysToRemove) {
        // remove in a transaction
        dsFrameworkUtil.newTransactionExecutor((TransactionAware) dataset).execute(new TransactionExecutor.Procedure<PartitionKey>() {

            @Override
            public void apply(PartitionKey partitionKey) throws Exception {
                dataset.dropPartition(partitionKey);
            }
        }, key);
        // remove the dropped partition from the expected set, then test all filters again
        BasicPartition toRemove = Iterables.tryFind(allPartitionDetails, new com.google.common.base.Predicate<BasicPartition>() {

            @Override
            public boolean apply(BasicPartition partition) {
                return key.equals(partition.getPartitionKey());
            }
        }).get();
        allPartitionDetails.remove(toRemove);
        testAllFilters(dataset, allPartitionDetails, filters);
    }
}
Also used: PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail), Predicate(co.cask.cdap.api.Predicate), PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet), TransactionExecutor(org.apache.tephra.TransactionExecutor), PartitionNotFoundException(co.cask.cdap.api.dataset.PartitionNotFoundException), IOException(java.io.IOException), DataSetException(co.cask.cdap.api.dataset.DataSetException), PartitionFilter(co.cask.cdap.api.dataset.lib.PartitionFilter), PartitionOutput(co.cask.cdap.api.dataset.lib.PartitionOutput), TransactionAware(org.apache.tephra.TransactionAware), PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey), Category(org.junit.experimental.categories.Category), Test(org.junit.Test)
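
The testFilter helper that this test calls repeatedly is defined elsewhere in PartitionedFileSetTest and is not shown here. A minimal sketch of what such a helper might look like, assuming PartitionedFileSet.getPartitions(filter) returns the matching PartitionDetails and PartitionFilter.match(key) evaluates the filter against a single key; the project's actual helper may differ:

// Sketch of the testFilter helper referenced above; an assumption, not the
// project's verbatim code. A null filter is treated as "match everything".
private boolean testFilter(PartitionedFileSet dataset,
                           Set<BasicPartition> allPartitionDetails,
                           PartitionFilter filter) throws Exception {
    // compute the subset of known partitions the filter should match
    Set<BasicPartition> expected = Sets.newHashSet();
    for (BasicPartition partition : allPartitionDetails) {
        if (filter == null || filter.match(partition.getPartitionKey())) {
            expected.add(partition);
        }
    }
    // query the dataset and compare the result with the expected subset
    Set<BasicPartition> actual = Sets.newHashSet();
    for (PartitionDetail detail : dataset.getPartitions(filter)) {
        actual.add(new BasicPartition((PartitionedFileSetDataset) dataset,
                                      detail.getRelativePath(), detail.getPartitionKey()));
    }
    Assert.assertEquals(expected, actual);
    return true;
}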

Example 22 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

The class PartitionConsumerTest, method testSimpleConcurrency.

@Test
public void testSimpleConcurrency() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    final Set<PartitionKey> partitionKeys = new HashSet<>();
    for (int i = 0; i < 10; i++) {
        partitionKeys.add(generateUniqueKey());
    }
    // have ConcurrentPartitionConsumers that share the same state.
    InMemoryStatePersistor persistor = new InMemoryStatePersistor();
    ConsumerConfiguration configuration = ConsumerConfiguration.builder().setMaxRetries(3).build();
    final PartitionConsumer partitionConsumer1 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);
    final PartitionConsumer partitionConsumer2 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);
    final PartitionConsumer partitionConsumer3 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);
    // add all ten keys to the partitioned fileset
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            for (final PartitionKey partitionKey : partitionKeys) {
                dataset.getPartitionOutput(partitionKey).addPartition();
            }
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // with limit = 1, the returned iterator is only size 1, even though there are more unconsumed partitions
            List<PartitionDetail> consumedBy1 = partitionConsumer1.consumePartitions(1).getPartitions();
            Assert.assertEquals(1, consumedBy1.size());
            // partitionConsumer2 asks for 10 partitions, but 1 is currently in progress by partitionConsumer1, so it only
            // gets the remaining 9 partitions
            List<PartitionDetail> consumedBy2 = partitionConsumer2.consumePartitions(10).getPartitions();
            Assert.assertEquals(9, consumedBy2.size());
            // partitionConsumer3 tries to consume partitions, but all are marked in-progress by partitionConsumer 1 and 2
            Assert.assertEquals(0, partitionConsumer3.consumePartitions().getPartitions().size());
            // partitionConsumer1 aborts its partition, so it then becomes available for partitionConsumer3
            partitionConsumer1.onFinish(consumedBy1, false);
            consumedBy1.clear();
            // query with a limit of 2, but only the one partition that partitionConsumer1 released is available
            List<PartitionDetail> consumedBy3 = partitionConsumer3.consumePartitions(2).getPartitions();
            Assert.assertEquals(1, consumedBy3.size());
            // partitionConsumer3 marks that it successfully processed its partition
            partitionConsumer3.onFinish(consumedBy3, true);
            // test onFinishWithKeys API
            List<PartitionKey> keysConsumedBy2 = Lists.transform(consumedBy2, new Function<PartitionDetail, PartitionKey>() {

                @Override
                public PartitionKey apply(PartitionDetail input) {
                    return input.getPartitionKey();
                }
            });
            partitionConsumer2.onFinishWithKeys(keysConsumedBy2, true);
            // at this point, all partitions are processed, so no additional partitions are available for consumption
            Assert.assertEquals(0, partitionConsumer3.consumePartitions().getPartitions().size());
            List<PartitionDetail> allProcessedPartitions = new ArrayList<>();
            allProcessedPartitions.addAll(consumedBy1);
            allProcessedPartitions.addAll(consumedBy2);
            allProcessedPartitions.addAll(consumedBy3);
            // ordering may be different, since all the partitions were added in the same transaction
            Assert.assertEquals(partitionKeys, toKeys(allProcessedPartitions));
        }
    });
}
Also used: ConcurrentPartitionConsumer(co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer), PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet), TransactionExecutor(org.apache.tephra.TransactionExecutor), PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail), Function(com.google.common.base.Function), TransactionAware(org.apache.tephra.TransactionAware), ConsumerConfiguration(co.cask.cdap.api.dataset.lib.partitioned.ConsumerConfiguration), PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey), ArrayList(java.util.ArrayList), ImmutableList(com.google.common.collect.ImmutableList), List(java.util.List), PartitionConsumer(co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer), HashSet(java.util.HashSet), Test(org.junit.Test)
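
The final assertion relies on a toKeys helper that is not shown in this snippet. A plausible sketch, assuming it simply collects the key of each consumed PartitionDetail into a set:

// Assumed shape of the toKeys helper used in the final assertion above.
private Set<PartitionKey> toKeys(Iterable<? extends PartitionDetail> partitions) {
    Set<PartitionKey> keys = new HashSet<>();
    for (PartitionDetail partition : partitions) {
        keys.add(partition.getPartitionKey());
    }
    return keys;
}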

Example 23 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

The class PartitionedFileSetDataset, method getInputFormatConfiguration.

@Override
public Map<String, String> getInputFormatConfiguration() {
    Collection<PartitionKey> inputKeys = getInputKeys();
    List<Location> inputLocations = new ArrayList<>(inputKeys.size());
    Map<String, PartitionKey> pathToKey = new HashMap<>(inputKeys.size());
    // resolve each input key to its partition's location; remember the reverse
    // mapping from location URI back to partition key
    for (PartitionKey key : inputKeys) {
        PartitionDetail partition = getPartition(key);
        String path = Objects.requireNonNull(partition).getRelativePath();
        Location partitionLocation = files.getLocation(path);
        inputLocations.add(partitionLocation);
        pathToKey.put(partitionLocation.toURI().toString(), key);
    }
    // embed the path-to-key mapping in the configuration so the read side can
    // recover the partition key for each input path
    Map<String, String> inputFormatConfiguration = files.getInputFormatConfiguration(inputLocations);
    inputFormatConfiguration.put(PATH_TO_PARTITIONING_MAPPING, GSON.toJson(pathToKey));
    return inputFormatConfiguration;
}
Also used: HashMap(java.util.HashMap), ArrayList(java.util.ArrayList), PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey), PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail), Location(org.apache.twill.filesystem.Location)
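
The mapping stored under PATH_TO_PARTITIONING_MAPPING exists so that the read side can recover the PartitionKey for each input path. A hedged sketch of that lookup, assuming GSON is configured with whatever type adapters PartitionKey needs and that paths are keyed by Location.toURI().toString() as above; keyForPath is a hypothetical name, not a CDAP method:

// Hypothetical read-side lookup; not CDAP's actual input format code.
private PartitionKey keyForPath(Map<String, String> inputFormatConfiguration, String pathUri) {
    String json = inputFormatConfiguration.get(PATH_TO_PARTITIONING_MAPPING);
    // deserialize the Map<String, PartitionKey> serialized by the method above
    java.lang.reflect.Type mapType =
        new com.google.gson.reflect.TypeToken<Map<String, PartitionKey>>() { }.getType();
    Map<String, PartitionKey> pathToKey = GSON.fromJson(json, mapType);
    return pathToKey.get(pathUri);
}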

Example 24 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

The class SparkFileSetTestRun, method testSparkWithTimePartitionedFileSet.

private void testSparkWithTimePartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
    long customOutputPartitionKey = 123456789L;
    long customInputPartitionKey = 987654321L;
    DataSetManager<TimePartitionedFileSet> tpfsManager = getDataset("tpfs");
    long inputTime = System.currentTimeMillis();
    long outputTime = inputTime + TimeUnit.HOURS.toMillis(1);
    addTimePartition(tpfsManager, inputTime);
    addTimePartition(tpfsManager, customInputPartitionKey);
    Map<String, String> inputArgs = new HashMap<>();
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, inputTime - 100);
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, inputTime + 100);
    Map<String, String> outputArgs = new HashMap<>();
    TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, outputTime);
    Map<String, String> args = new HashMap<>();
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", inputArgs));
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", outputArgs));
    args.put("input", "tpfs");
    args.put("output", "tpfs");
    args.put("outputKey", String.valueOf(customOutputPartitionKey));
    args.put("inputKey", String.valueOf(customInputPartitionKey));
    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
    sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
    tpfsManager.flush();
    TimePartitionedFileSet tpfs = tpfsManager.get();
    PartitionDetail partition = tpfs.getPartitionByTime(outputTime);
    Assert.assertNotNull("Output partition is null while for running without custom dataset arguments", partition);
    validateFileOutput(partition.getLocation());
    PartitionDetail customPartition = tpfs.getPartitionByTime(customOutputPartitionKey);
    Assert.assertNotNull("Output partition is null while for running with custom dataset arguments", customPartition);
    validateFileOutput(customPartition.getLocation());
    // Cleanup after running the test
    tpfs.getPartitionOutput(inputTime).getLocation().delete(true);
    tpfs.getPartitionOutput(customInputPartitionKey).getLocation().delete(true);
    partition.getLocation().delete(true);
    customPartition.getLocation().delete(true);
    tpfs.dropPartition(inputTime);
    tpfs.dropPartition(customInputPartitionKey);
    tpfs.dropPartition(partition.getPartitionKey());
    tpfs.dropPartition(customPartition.getPartitionKey());
    tpfsManager.flush();
}
Also used: SparkManager(co.cask.cdap.test.SparkManager), HashMap(java.util.HashMap), TimePartitionedFileSet(co.cask.cdap.api.dataset.lib.TimePartitionedFileSet), PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail)
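
The addTimePartition helper called at the top of the test is defined elsewhere. A minimal sketch of what it plausibly does, assuming TimePartitionedFileSet.getPartitionOutput(time) (used later in the same test); the file name and contents are illustrative, not the actual test data:

// Assumed shape of the addTimePartition helper: write one small file into a
// new time partition, register the partition, and flush the dataset manager.
private void addTimePartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long time)
        throws Exception {
    TimePartitionedFileSet tpfs = tpfsManager.get();
    PartitionOutput output = tpfs.getPartitionOutput(time);
    try (java.io.OutputStream out = output.getLocation().append("file").getOutputStream()) {
        out.write("some test input".getBytes(java.nio.charset.StandardCharsets.UTF_8));
    }
    output.addPartition();
    tpfsManager.flush();
}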

Example 25 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

The class PartitionConsumingTestRun, method testWordCountOnFileSet.

private void testWordCountOnFileSet(Function<ApplicationManager, ProgramManager> runProgram, boolean produceOutputPartitionEachRun) throws Exception {
    ApplicationManager applicationManager = deployApplication(AppWithPartitionConsumers.class);
    ServiceManager serviceManager = applicationManager.getServiceManager("DatasetService").start();
    serviceManager.waitForStatus(true);
    URL serviceURL = serviceManager.getServiceURL();
    // write a file to the file set using the service and run the WordCount MapReduce job on that one partition
    createPartition(serviceURL, LINE1, "1");
    ProgramManager programManager = runProgram.apply(applicationManager);
    programManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    Assert.assertEquals(new Long(2), getCount(serviceURL, "a"));
    Assert.assertEquals(new Long(1), getCount(serviceURL, "b"));
    Assert.assertEquals(new Long(0), getCount(serviceURL, "c"));
    // create two additional partitions
    createPartition(serviceURL, LINE2, "2");
    createPartition(serviceURL, LINE3, "3");
    // running the program now processes these two new partitions (LINE2 and
    // LINE3) and updates the counts dataset accordingly
    programManager = runProgram.apply(applicationManager);
    programManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
    Assert.assertEquals(new Long(3), getCount(serviceURL, "a"));
    Assert.assertEquals(new Long(3), getCount(serviceURL, "b"));
    Assert.assertEquals(new Long(3), getCount(serviceURL, "c"));
    // running the program without adding new partitions does not affect the counts dataset
    programManager = runProgram.apply(applicationManager);
    programManager.waitForRuns(ProgramRunStatus.COMPLETED, 3, 5, TimeUnit.MINUTES);
    Assert.assertEquals(new Long(3), getCount(serviceURL, "a"));
    Assert.assertEquals(new Long(3), getCount(serviceURL, "b"));
    Assert.assertEquals(new Long(3), getCount(serviceURL, "c"));
    DataSetManager<PartitionedFileSet> outputLines = getDataset("outputLines");
    Set<PartitionDetail> partitions = outputLines.get().getPartitions(PartitionFilter.ALWAYS_MATCH);
    // each of the three MapReduce runs produces an output partition (even if there's no input data)
    // however, a Worker run doesn't produce a new output partition if there's no new input partition
    Assert.assertEquals(produceOutputPartitionEachRun ? 3 : 2, partitions.size());
    // only the counts are stored in the "outputLines" dataset
    List<String> expectedCounts = Lists.newArrayList("1", "1", "2", "2", "3");
    List<String> outputRecords = getDataFromExplore("outputLines");
    Collections.sort(outputRecords);
    Assert.assertEquals(expectedCounts, outputRecords);
}
Also used: ApplicationManager(co.cask.cdap.test.ApplicationManager), ServiceManager(co.cask.cdap.test.ServiceManager), PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet), PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail), URL(java.net.URL), ProgramManager(co.cask.cdap.test.ProgramManager)
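
The createPartition helper is likewise defined elsewhere in the test class. The sketch below is an assumption about its shape: it PUTs the line content to a per-partition endpoint on the DatasetService; the "lines?partition=" path and the expected 200 response are illustrative, not the app's confirmed API:

// Hypothetical createPartition helper; the endpoint path and payload format
// are assumptions, not confirmed by the snippet above.
private void createPartition(URL serviceURL, String line, String partition) throws IOException {
    URL url = new URL(serviceURL, "lines?partition=" + partition);
    java.net.HttpURLConnection connection = (java.net.HttpURLConnection) url.openConnection();
    try {
        connection.setDoOutput(true);
        connection.setRequestMethod("PUT");
        connection.getOutputStream().write(line.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        Assert.assertEquals(200, connection.getResponseCode());
    } finally {
        connection.disconnect();
    }
}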

Aggregations

PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail): 25
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 17
PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey): 11
Test (org.junit.Test): 11
TransactionAware (org.apache.tephra.TransactionAware): 9
TransactionExecutor (org.apache.tephra.TransactionExecutor): 9
Location (org.apache.twill.filesystem.Location): 8
IOException (java.io.IOException): 7
HashMap (java.util.HashMap): 7
HashSet (java.util.HashSet): 7
DataSetException (co.cask.cdap.api.dataset.DataSetException): 6
PartitionNotFoundException (co.cask.cdap.api.dataset.PartitionNotFoundException): 5
PartitionOutput (co.cask.cdap.api.dataset.lib.PartitionOutput): 5
Predicate (co.cask.cdap.api.Predicate): 3
PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter): 3
ConcurrentPartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer): 3
ConsumerConfiguration (co.cask.cdap.api.dataset.lib.partitioned.ConsumerConfiguration): 3
PartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer): 3
ApplicationManager (co.cask.cdap.test.ApplicationManager): 3
ServiceManager (co.cask.cdap.test.ServiceManager): 3