Example 6 with PartitionKey

Use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class ExploreExecutorHttpHandler, method addPartition.

@POST
@Path("datasets/{dataset}/partitions")
public void addPartition(final FullHttpRequest request, final HttpResponder responder,
                         @PathParam("namespace-id") String namespace,
                         @PathParam("dataset") String datasetName,
                         @HeaderParam(Constants.Security.Headers.PROGRAM_ID) String programId) throws Exception {
    final DatasetId datasetId = new DatasetId(namespace, datasetName);
    propagateUserId(request);
    // perform the partition operation as the impersonated entity
    impersonator.doAs(getEntityToImpersonate(datasetId, programId), new Callable<Void>() {
        @Override
        public Void call() throws Exception {
            doPartitionOperation(request, responder, datasetId, new PartitionOperation() {
                @Override
                public QueryHandle submitOperation(PartitionKey partitionKey, Map<String, String> properties)
                    throws ExploreException, SQLException {
                    // the partition properties must carry the partition's file system path
                    String fsPath = properties.get("path");
                    if (fsPath == null) {
                        responder.sendString(HttpResponseStatus.BAD_REQUEST, "path was not specified.");
                        return null;
                    }
                    return exploreTableManager.addPartition(datasetId, properties, partitionKey, fsPath);
                }
            });
            return null;
        }
    });
}
Also used: PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey), Map (java.util.Map), ExploreException (io.cdap.cdap.explore.service.ExploreException), UnsupportedTypeException (io.cdap.cdap.api.data.schema.UnsupportedTypeException), UnauthorizedException (io.cdap.cdap.security.spi.authorization.UnauthorizedException), SQLException (java.sql.SQLException), JsonSyntaxException (com.google.gson.JsonSyntaxException), DatasetManagementException (io.cdap.cdap.api.dataset.DatasetManagementException), IOException (java.io.IOException), BadRequestException (io.cdap.cdap.common.BadRequestException), DatasetId (io.cdap.cdap.proto.id.DatasetId), Path (javax.ws.rs.Path), POST (javax.ws.rs.POST)
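
For orientation, here is a minimal caller-side sketch of the inputs this handler ultimately consumes: a DatasetId built from namespace and dataset name, a partition key, and a properties map that must carry a "path" entry (otherwise the handler answers 400). The dataset name and path value below are made-up placeholders; how the request body is parsed into these objects happens inside doPartitionOperation, which is not shown in this example.

import io.cdap.cdap.api.dataset.lib.PartitionKey;
import io.cdap.cdap.proto.id.DatasetId;
import java.util.HashMap;
import java.util.Map;

// hypothetical values for illustration only
DatasetId datasetId = new DatasetId("default", "myPartitionedFileSet");
PartitionKey key = PartitionKey.builder()
    .addStringField("type", "x")
    .addLongField("time", 150000L)
    .build();
Map<String, String> properties = new HashMap<>();
// without this entry, addPartition responds with BAD_REQUEST
properties.put("path", "type.x/time.150000");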

Example 7 with PartitionKey

Use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class MapReduceWithPartitionedTest, method testPartitionedFileSetWithMR.

private void testPartitionedFileSetWithMR(boolean useCombineFileInputFormat) throws Exception {
    ApplicationWithPrograms app = deployApp(AppWithPartitionedFileSet.class,
        new AppWithPartitionedFileSet.AppConfig(useCombineFileInputFormat));
    // write a value to the input table
    final Table table = datasetCache.getDataset(AppWithPartitionedFileSet.INPUT);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {
        @Override
        public void apply() {
            table.put(Bytes.toBytes("x"), AppWithPartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("1"));
        }
    });
    // a partition key for the map/reduce output
    final PartitionKey keyX = PartitionKey.builder()
        .addStringField("type", "x")
        .addLongField("time", 150000L)
        .build();
    // run the partition writer m/r with this output partition key
    Map<String, String> runtimeArguments = Maps.newHashMap();
    Map<String, String> outputArgs = Maps.newHashMap();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, keyX);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, outputArgs));
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
    // this should have created a partition in the partitioned file set
    final PartitionedFileSet dataset = datasetCache.getDataset(PARTITIONED);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {
        @Override
        public void apply() {
            Partition partition = dataset.getPartition(keyX);
            Assert.assertNotNull(partition);
            String path = partition.getRelativePath();
            Assert.assertTrue(path.contains("x"));
            Assert.assertTrue(path.contains("150000"));
        }
    });
    // delete the data in the input table and write a new row
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {
        @Override
        public void apply() {
            table.delete(Bytes.toBytes("x"));
            table.put(Bytes.toBytes("y"), AppWithPartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("2"));
        }
    });
    // a new partition key for the next map/reduce
    final PartitionKey keyY = PartitionKey.builder()
        .addStringField("type", "y")
        .addLongField("time", 200000L)
        .build();
    // now run the m/r again with a new partition time, 50 seconds later
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, keyY);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, outputArgs));
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
    // this should have created a second partition in the partitioned file set
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {
        @Override
        public void apply() {
            Partition partition = dataset.getPartition(keyY);
            Assert.assertNotNull(partition);
            String path = partition.getRelativePath();
            Assert.assertNotNull(path);
            Assert.assertTrue(path.contains("y"));
            Assert.assertTrue(path.contains("200000"));
        }
    });
    // a partition filter that matches the outputs of both map/reduces
    PartitionFilter filterXY = PartitionFilter.builder().addRangeCondition("type", "x", "z").build();
    // now run a map/reduce that reads all the partitions
    runtimeArguments = Maps.newHashMap();
    Map<String, String> inputArgs = Maps.newHashMap();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterXY);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "a");
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read both partitions - and written both x and y to row a
    final Table output = datasetCache.getDataset(AppWithPartitionedFileSet.OUTPUT);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {
        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("a"));
            Assert.assertEquals("1", row.getString("x"));
            Assert.assertEquals("{type=x, time=150000}", row.getString("x_key"));
            Assert.assertEquals("2", row.getString("y"));
            Assert.assertEquals("{type=y, time=200000}", row.getString("y_key"));
        }
    });
    // a partition filter that matches the output key of the first map/reduce
    PartitionFilter filterX = PartitionFilter.builder()
        .addValueCondition("type", "x")
        .addRangeCondition("time", null, 160000L)
        .build();
    // now run a map/reduce that reads a range of the partitions, namely the first one
    inputArgs.clear();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterX);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "b");
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read the first partition only - and written only x to row b
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {
        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("b"));
            Assert.assertEquals("1", row.getString("x"));
            Assert.assertEquals("{type=x, time=150000}", row.getString("x_key"));
            Assert.assertNull(row.get("y"));
            Assert.assertNull(row.get("y_key"));
        }
    });
    // a partition filter that matches no key
    PartitionFilter filterMT = PartitionFilter.builder().addValueCondition("type", "nosuchthing").build();
    // now run a map/reduce that reads an empty range of partitions (the filter matches nothing)
    inputArgs.clear();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterMT);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "n");
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read no partitions - and written nothing to row n
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {
        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("n"));
            Assert.assertTrue(row.isEmpty());
        }
    });
}
Also used: Partition (io.cdap.cdap.api.dataset.lib.Partition), Table (io.cdap.cdap.api.dataset.table.Table), TransactionExecutor (org.apache.tephra.TransactionExecutor), TimePartitionedFileSet (io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet), PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet), PartitionFilter (io.cdap.cdap.api.dataset.lib.PartitionFilter), ApplicationWithPrograms (io.cdap.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms), TransactionAware (org.apache.tephra.TransactionAware), PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey), BasicArguments (io.cdap.cdap.internal.app.runtime.BasicArguments), Row (io.cdap.cdap.api.dataset.table.Row)
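
The key/filter pairing the test relies on can also be checked directly. A minimal sketch, assuming PartitionFilter exposes a match(PartitionKey) check and the range semantics the test implies (lower bound inclusive, upper bound exclusive); treat both assumptions as hedged rather than confirmed API behavior.

import io.cdap.cdap.api.dataset.lib.PartitionFilter;
import io.cdap.cdap.api.dataset.lib.PartitionKey;

PartitionKey keyX = PartitionKey.builder()
    .addStringField("type", "x")
    .addLongField("time", 150000L)
    .build();
// range condition from "x" (inclusive) to "z" (exclusive), so "x" and "y" match
PartitionFilter filterXY = PartitionFilter.builder()
    .addRangeCondition("type", "x", "z")
    .build();
boolean matches = filterXY.match(keyX); // expected: true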

Example 8 with PartitionKey

Use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class PartitionedFileSetDataset, method getOutputFormatClassName.

@Override
public String getOutputFormatClassName() {
    checkNotExternal();
    PartitionKey outputKey = PartitionedFileSetArguments.getOutputPartitionKey(runtimeArguments, getPartitioning());
    if (outputKey == null) {
        // no explicit output partition key was given: write each record to a
        // partition determined dynamically at runtime
        return "io.cdap.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitioningOutputFormat";
    }
    // a fixed output partition key was given: delegate to the embedded file set
    return files.getOutputFormatClassName();
}
Also used: PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey)
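
To make the branch concrete, a small sketch of the two configurations, reusing only the argument helper already shown in Example 7; the key values are placeholders.

import io.cdap.cdap.api.dataset.lib.PartitionKey;
import io.cdap.cdap.api.dataset.lib.PartitionedFileSetArguments;
import java.util.HashMap;
import java.util.Map;

Map<String, String> outputArgs = new HashMap<>();
PartitionKey fixedKey = PartitionKey.builder()
    .addStringField("type", "x")
    .addLongField("time", 150000L)
    .build();
// with an explicit key, getOutputPartitionKey(...) is non-null and the
// underlying file set's own output format is used
PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, fixedKey);
// omitting the call above leaves the key null, which selects
// DynamicPartitioningOutputFormat instead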

Example 9 with PartitionKey

Use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class PartitionedFileSetDataset, method parseRowKey.

@VisibleForTesting
static PartitionKey parseRowKey(byte[] rowKey, Partitioning partitioning) {
    PartitionKey.Builder builder = PartitionKey.builder();
    int offset = 0;
    boolean first = true;
    // fields are laid out in partitioning order, separated by single \0 bytes;
    // the first field carries no leading separator
    for (Map.Entry<String, FieldType> entry : partitioning.getFields().entrySet()) {
        String fieldName = entry.getKey();
        FieldType fieldType = entry.getValue();
        if (!first) {
            if (offset >= rowKey.length) {
                throw new IllegalArgumentException(String.format(
                    "Invalid row key: Expecting field '%s' at offset %d "
                        + "but the end of the row key is reached.", fieldName, offset));
            }
            if (rowKey[offset] != 0) {
                throw new IllegalArgumentException(String.format(
                    "Invalid row key: Expecting field separator \\0 before field '%s' at offset %d "
                        + "but found byte value %x.", fieldName, offset, rowKey[offset]));
            }
            // skip the separator byte
            offset++;
        }
        first = false;
        int size = FieldTypes.determineLengthInBytes(rowKey, offset, fieldType);
        if (size + offset > rowKey.length) {
            throw new IllegalArgumentException(String.format(
                "Invalid row key: Expecting field '%s' of type %s, "
                    + "requiring %d bytes at offset %d, but only %d bytes remain.",
                fieldName, fieldType.name(), size, offset, rowKey.length - offset));
        }
        Comparable fieldValue = FieldTypes.fromBytes(rowKey, offset, size, fieldType);
        offset += size;
        builder.addField(fieldName, fieldValue);
    }
    // every byte of the row key must be consumed
    if (offset != rowKey.length) {
        throw new IllegalArgumentException(String.format(
            "Invalid row key: Read all fields at offset %d but %d extra bytes remain.",
            offset, rowKey.length - offset));
    }
    return builder.build();
}
Also used: PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey), Map (java.util.Map), ImmutableMap (com.google.common.collect.ImmutableMap), HashMap (java.util.HashMap), FieldType (io.cdap.cdap.api.dataset.lib.Partitioning.FieldType), VisibleForTesting (com.google.common.annotations.VisibleForTesting)
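
The encoding side is not shown in this example, but the layout parseRowKey expects can be illustrated. The following is a toy sketch, not CDAP's actual codec (the real field encoding lives in FieldTypes): field encodings in partitioning order, joined by single \0 separator bytes, with no separator before the first field.

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

// toy layout only, for a key of one string field and one long field
static byte[] encodeRowKeySketch(String type, long time) {
    byte[] typeBytes = type.getBytes(StandardCharsets.UTF_8);
    ByteBuffer buf = ByteBuffer.allocate(typeBytes.length + 1 + Long.BYTES);
    buf.put(typeBytes);   // first field: no leading separator
    buf.put((byte) 0);    // the \0 separator parseRowKey checks for
    buf.putLong(time);    // fixed-width long field
    return buf.array();
}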

Example 10 with PartitionKey

Use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class PartitionedFileSetDataset, method getInputFormatConfiguration.

@Override
public Map<String, String> getInputFormatConfiguration() {
    Collection<PartitionKey> inputKeys = getInputKeys();
    List<Location> inputLocations = new ArrayList<>(inputKeys.size());
    Map<String, PartitionKey> pathToKey = new HashMap<>(inputKeys.size());
    for (PartitionKey key : inputKeys) {
        // resolve each input partition to its location within the file set
        PartitionDetail partition = getPartition(key);
        String path = Objects.requireNonNull(partition).getRelativePath();
        Location partitionLocation = files.getLocation(path);
        inputLocations.add(partitionLocation);
        pathToKey.put(partitionLocation.toURI().toString(), key);
    }
    // delegate to the underlying file set, then attach the path-to-key mapping
    // so the partition key can be recovered for each input path
    Map<String, String> inputFormatConfiguration = files.getInputFormatConfiguration(inputLocations);
    inputFormatConfiguration.put(PATH_TO_PARTITIONING_MAPPING, GSON.toJson(pathToKey));
    return inputFormatConfiguration;
}
Also used: HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey), PartitionDetail (io.cdap.cdap.api.dataset.lib.PartitionDetail), Location (org.apache.twill.filesystem.Location)
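
On the read side, a consumer of this configuration has to recover the serialized mapping. A hedged sketch of that step, assuming plain Gson can round-trip the map written by GSON.toJson(pathToKey) above; CDAP may register custom type adapters for PartitionKey, so treat this as an approximation, and note that the helper name and parameters are hypothetical.

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import io.cdap.cdap.api.dataset.lib.PartitionKey;
import java.lang.reflect.Type;
import java.util.Map;

// hypothetical helper: look up the partition key for one input path's URI
static PartitionKey keyForPath(String serializedMapping, String pathUri) {
    Type mapType = new TypeToken<Map<String, PartitionKey>>() { }.getType();
    // plain Gson deserialization is an assumption; a custom adapter may be needed
    Map<String, PartitionKey> pathToKey = new Gson().fromJson(serializedMapping, mapType);
    return pathToKey.get(pathUri);
}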

Aggregations

PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey): 121 uses
Test (org.junit.Test): 55 uses
PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet): 53 uses
TransactionAware (org.apache.tephra.TransactionAware): 34 uses
TransactionExecutor (org.apache.tephra.TransactionExecutor): 34 uses
IOException (java.io.IOException): 26 uses
PartitionDetail (io.cdap.cdap.api.dataset.lib.PartitionDetail): 23 uses
ConcurrentPartitionConsumer (io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer): 22 uses
PartitionConsumer (io.cdap.cdap.api.dataset.lib.partitioned.PartitionConsumer): 22 uses
ArrayList (java.util.ArrayList): 22 uses
List (java.util.List): 22 uses
HashMap (java.util.HashMap): 21 uses
ImmutableList (com.google.common.collect.ImmutableList): 18 uses
DataSetException (io.cdap.cdap.api.dataset.DataSetException): 18 uses
HashSet (java.util.HashSet): 18 uses
Partition (io.cdap.cdap.api.dataset.lib.Partition): 14 uses
ConsumerConfiguration (io.cdap.cdap.api.dataset.lib.partitioned.ConsumerConfiguration): 14 uses
DatasetId (io.cdap.cdap.proto.id.DatasetId): 14 uses
Map (java.util.Map): 14 uses
Location (org.apache.twill.filesystem.Location): 14 uses