Search in sources:

Example 66 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

The class PartitionKeyCodec, method deserialize.

@Override
public PartitionKey deserialize(JsonElement jsonElement, Type type, JsonDeserializationContext jsonDeserializationContext) throws JsonParseException {
    JsonObject jsonObject = jsonElement.getAsJsonObject();
    PartitionKey.Builder builder = PartitionKey.builder();
    for (Map.Entry<String, JsonElement> entry : jsonObject.entrySet()) {
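        // each field value was serialized as a JSON array; deserializeComparable restores the original typed value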
        JsonArray jsonArray = entry.getValue().getAsJsonArray();
        builder.addField(entry.getKey(), deserializeComparable(jsonArray, jsonDeserializationContext));
    }
    return builder.build();
}
Also used : JsonArray(com.google.gson.JsonArray) JsonElement(com.google.gson.JsonElement) JsonObject(com.google.gson.JsonObject) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) Map(java.util.Map)
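For context, here is a minimal round-trip sketch of how this codec might be wired into Gson. It assumes PartitionKeyCodec also implements JsonSerializer<PartitionKey> (or that a matching serializer is registered), so the registration covers both directions; the field names and values below are illustrative only.

import co.cask.cdap.api.dataset.lib.PartitionKey;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

public class PartitionKeyCodecExample {
    public static void main(String[] args) {
        // register the codec so Gson knows how to (de)serialize PartitionKey
        Gson gson = new GsonBuilder()
            .registerTypeAdapter(PartitionKey.class, new PartitionKeyCodec())
            .create();
        PartitionKey key = PartitionKey.builder()
            .addStringField("type", "x")
            .addLongField("time", 150000L)
            .build();
        String json = gson.toJson(key, PartitionKey.class);
        PartitionKey restored = gson.fromJson(json, PartitionKey.class);
        // the round trip should preserve both field names and typed values
        System.out.println(key.equals(restored));
    }
}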

Example 67 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

The class ConcurrentPartitionConsumer, method abort.

/**
 * Resets the process state of the given partition keys, as they were not successfully processed, or discards a
 * partition if it has already been attempted the configured maximum number of times.
 */
protected void abort(ConsumerWorkingSet workingSet, List<? extends PartitionKey> partitionKeys) {
    List<PartitionKey> discardedPartitions = new ArrayList<>();
    for (PartitionKey key : partitionKeys) {
        ConsumablePartition consumablePartition = workingSet.lookup(key);
        assertInProgress(consumablePartition);
        // either reset its processState, or remove it from the workingSet, depending on how many tries it already has
        if (consumablePartition.getNumFailures() < getConfiguration().getMaxRetries()) {
            consumablePartition.retry();
        } else {
            discardedPartitions.add(key);
            workingSet.lookup(key).discard();
        }
    }
    if (!discardedPartitions.isEmpty()) {
        LOG.warn("Discarded keys due to being retried {} times: {}", getConfiguration().getMaxRetries(), discardedPartitions);
    }
}
Also used : ArrayList(java.util.ArrayList) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey)
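For context, the getMaxRetries() threshold checked above comes from the ConsumerConfiguration the consumer was created with. Below is a minimal sketch of building such a configuration; the setMaxRetries builder method is an assumption inferred from this snippet, so verify it against the CDAP version in use.

import co.cask.cdap.api.dataset.lib.partitioned.ConsumerConfiguration;

public class ConsumerConfigExample {
    public static void main(String[] args) {
        // partitions that fail more than this many times are discarded by abort() above
        ConsumerConfiguration config = ConsumerConfiguration.builder()
            // assumed builder method; getConfiguration().getMaxRetries() reads this value back
            .setMaxRetries(3)
            .build();
        System.out.println(config.getMaxRetries());
    }
}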

Example 68 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

The class SingleWriter, method write.

@Override
public void write(K key, V value) throws IOException, InterruptedException {
    PartitionKey partitionKey = dynamicPartitioner.getPartitionKey(key, value);
    if (!partitionKey.equals(currPartitionKey)) {
        // make sure we haven't written to this partition previously
        if (closedKeys.contains(partitionKey)) {
            throw new IllegalStateException(String.format("Encountered a partition key for which the writer has already been closed: '%s'.", partitionKey));
        }
        // currPartitionKey can be null for the first key value pair, in which case there's no writer to close
        if (currPartitionKey != null) {
            // close the existing RecordWriter and create a new one for the new PartitionKey
            currRecordWriter.close(currContext);
            closedKeys.add(currPartitionKey);
        }
        currPartitionKey = partitionKey;
        currContext = getKeySpecificContext(currPartitionKey);
        currRecordWriter = getBaseRecordWriter(currContext);
    }
    currRecordWriter.write(key, value);
}
Also used : PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey)
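The partitionKey here is supplied by a DynamicPartitioner, which derives the partition from each record. A hedged sketch of such a partitioner follows; the subclass name, partition field, and Hadoop key/value types are illustrative only. Note that SingleWriter keeps a single open RecordWriter and refuses to reopen a closed partition, so records must arrive grouped by partition key.

import co.cask.cdap.api.dataset.lib.DynamicPartitioner;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

// Illustrative partitioner: every record lands in the partition named by its key.
public class CategoryPartitioner extends DynamicPartitioner<Text, NullWritable> {
    @Override
    public PartitionKey getPartitionKey(Text key, NullWritable value) {
        return PartitionKey.builder()
            .addStringField("category", key.toString())
            .build();
    }
}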

Example 69 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

The class MapReduceWithPartitionedTest, method testPartitionedFileSetWithMR.

private void testPartitionedFileSetWithMR(boolean useCombineFileInputFormat) throws Exception {
    ApplicationWithPrograms app = deployApp(AppWithPartitionedFileSet.class, new AppWithPartitionedFileSet.AppConfig(useCombineFileInputFormat));
    // write a value to the input table
    final Table table = datasetCache.getDataset(AppWithPartitionedFileSet.INPUT);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            table.put(Bytes.toBytes("x"), AppWithPartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("1"));
        }
    });
    // a partition key for the map/reduce output
    final PartitionKey keyX = PartitionKey.builder().addStringField("type", "x").addLongField("time", 150000L).build();
    // run the partition writer m/r with this output partition time
    Map<String, String> runtimeArguments = Maps.newHashMap();
    Map<String, String> outputArgs = Maps.newHashMap();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, keyX);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, outputArgs));
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
    // this should have created a partition in the partitioned file set
    final PartitionedFileSet dataset = datasetCache.getDataset(PARTITIONED);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Partition partition = dataset.getPartition(keyX);
            Assert.assertNotNull(partition);
            String path = partition.getRelativePath();
            Assert.assertTrue(path.contains("x"));
            Assert.assertTrue(path.contains("150000"));
        }
    });
    // delete the data in the input table and write a new row
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            table.delete(Bytes.toBytes("x"));
            table.put(Bytes.toBytes("y"), AppWithPartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("2"));
        }
    });
    // a new partition key for the next map/reduce
    final PartitionKey keyY = PartitionKey.builder().addStringField("type", "y").addLongField("time", 200000L).build();
    // now run the m/r again with a new partition time, 50 seconds later
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, keyY);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, outputArgs));
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
    // this should have created another partition in the partitioned file set
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Partition partition = dataset.getPartition(keyY);
            Assert.assertNotNull(partition);
            String path = partition.getRelativePath();
            Assert.assertNotNull(path);
            Assert.assertTrue(path.contains("y"));
            Assert.assertTrue(path.contains("200000"));
        }
    });
    // a partition filter that matches the outputs of both map/reduces
    PartitionFilter filterXY = PartitionFilter.builder().addRangeCondition("type", "x", "z").build();
    // now run a map/reduce that reads all the partitions
    runtimeArguments = Maps.newHashMap();
    Map<String, String> inputArgs = Maps.newHashMap();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterXY);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "a");
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read both partitions - and written both x and y to row a
    final Table output = datasetCache.getDataset(AppWithPartitionedFileSet.OUTPUT);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("a"));
            Assert.assertEquals("1", row.getString("x"));
            Assert.assertEquals("{type=x, time=150000}", row.getString("x_key"));
            Assert.assertEquals("2", row.getString("y"));
            Assert.assertEquals("{type=y, time=200000}", row.getString("y_key"));
        }
    });
    // a partition filter that matches the output key of the first map/reduce
    PartitionFilter filterX = PartitionFilter.builder().addValueCondition("type", "x").addRangeCondition("time", null, 160000L).build();
    // now run a map/reduce that reads a range of the partitions, namely the first one
    inputArgs.clear();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterX);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "b");
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read the first partition only - and written only x to row b
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("b"));
            Assert.assertEquals("1", row.getString("x"));
            Assert.assertEquals("{type=x, time=150000}", row.getString("x_key"));
            Assert.assertNull(row.get("y"));
            Assert.assertNull(row.get("y_key"));
        }
    });
    // a partition filter that matches no key
    PartitionFilter filterMT = PartitionFilter.builder().addValueCondition("type", "nosuchthing").build();
    // now run a map/reduce that reads an empty range of partitions (the filter matches nothing)
    inputArgs.clear();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterMT);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "n");
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read no partitions - and written nothing to row n
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("n"));
            Assert.assertTrue(row.isEmpty());
        }
    });
}
Also used : Partition(co.cask.cdap.api.dataset.lib.Partition) Table(co.cask.cdap.api.dataset.table.Table) TransactionExecutor(org.apache.tephra.TransactionExecutor) TimePartitionedFileSet(co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) PartitionFilter(co.cask.cdap.api.dataset.lib.PartitionFilter) ApplicationWithPrograms(co.cask.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms) TransactionAware(org.apache.tephra.TransactionAware) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) BasicArguments(co.cask.cdap.internal.app.runtime.BasicArguments) Row(co.cask.cdap.api.dataset.table.Row)
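The same filters can also be evaluated directly against the dataset, outside of a MapReduce, to inspect which partitions exist. A minimal sketch follows; it assumes PartitionedFileSet.getPartitions(PartitionFilter) returning PartitionDetails and, like the assertions above, must run inside a transaction.

import co.cask.cdap.api.dataset.lib.PartitionDetail;
import co.cask.cdap.api.dataset.lib.PartitionFilter;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import java.util.Set;

// List the partitions matching the same range condition the test uses for filterXY.
void listMatchingPartitions(PartitionedFileSet dataset) {
    PartitionFilter filterXY = PartitionFilter.builder().addRangeCondition("type", "x", "z").build();
    // assumed API: getPartitions(filter) returns the matching PartitionDetails
    Set<PartitionDetail> partitions = dataset.getPartitions(filterXY);
    for (PartitionDetail partition : partitions) {
        System.out.println(partition.getPartitionKey() + " -> " + partition.getRelativePath());
    }
}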

Example 70 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

The class AlertPublisherSink, method prepareRun.

@Override
public void prepareRun(BatchSinkContext context) throws Exception {
    Map<String, String> arguments = new HashMap<>();
    PartitionKey outputPartition = PartitionKey.builder().addStringField("phase", phaseName).build();
    PartitionedFileSetArguments.setOutputPartitionKey(arguments, outputPartition);
    context.addOutput(Output.ofDataset(datasetName, arguments));
}
Also used : HashMap(java.util.HashMap) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey)
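For the read side of the same pattern, a batch source's prepareRun can select exactly the partition written above by setting an input partition filter on the "phase" field. This is a hedged sketch only: it assumes the mirror-image setInput/Input.ofDataset calls and reuses the datasetName and phaseName fields from the example.

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    Map<String, String> arguments = new HashMap<>();
    // match only the partition written for this phase by the sink above
    PartitionFilter filter = PartitionFilter.builder().addValueCondition("phase", phaseName).build();
    PartitionedFileSetArguments.setInputPartitionFilter(arguments, filter);
    // assumed mirror of context.addOutput(Output.ofDataset(...)) in the snippet above
    context.setInput(Input.ofDataset(datasetName, arguments));
}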

Aggregations

PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey): 70 usages
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 32 usages
Test (org.junit.Test): 28 usages
TransactionAware (org.apache.tephra.TransactionAware): 19 usages
TransactionExecutor (org.apache.tephra.TransactionExecutor): 19 usages
IOException (java.io.IOException): 16 usages
HashMap (java.util.HashMap): 15 usages
PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail): 12 usages
ConcurrentPartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer): 11 usages
PartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer): 11 usages
ArrayList (java.util.ArrayList): 11 usages
List (java.util.List): 11 usages
HashSet (java.util.HashSet): 10 usages
DataSetException (co.cask.cdap.api.dataset.DataSetException): 9 usages
ImmutableList (com.google.common.collect.ImmutableList): 9 usages
Partition (co.cask.cdap.api.dataset.lib.Partition): 8 usages
Map (java.util.Map): 8 usages
Location (org.apache.twill.filesystem.Location): 8 usages
PartitionNotFoundException (co.cask.cdap.api.dataset.PartitionNotFoundException): 7 usages
ConsumerConfiguration (co.cask.cdap.api.dataset.lib.partitioned.ConsumerConfiguration): 7 usages