Example 6 with PartitionedFileSet

Use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From the class ExploreTableManager, the method generateFileSetCreateStatement.

/**
 * Generate a create statement for a ((time-)partitioned) file set.
 *
 * @param datasetId the dataset id
 * @param dataset the instantiated dataset
 * @param properties the properties from the dataset specification
 * @param truncating whether this call to create() is part of a truncate() operation. The effect is:
 *                   if possessExisting is true, then the truncate() has just dropped this
 *                   dataset, which also deleted the explore table, so we must recreate it.
 *
 * @return a CREATE TABLE statement, or null if the dataset is not explorable
 * @throws IllegalArgumentException if the schema cannot be parsed, or if shouldErrorOnMissingSchema is true and
 *                                  the dataset spec does not contain a schema.
 */
@Nullable
private String generateFileSetCreateStatement(DatasetId datasetId, Dataset dataset,
                                              Map<String, String> properties, boolean truncating)
    throws IllegalArgumentException, ExploreException {
    String tableName = tableNaming.getTableName(datasetId, properties);
    String databaseName = ExploreProperties.getExploreDatabaseName(properties);
    Map<String, String> tableProperties = FileSetProperties.getTableProperties(properties);
    // if this dataset reuses an existing table, do not attempt to create it
    if (FileSetProperties.isUseExisting(tableProperties) || (FileSetProperties.isPossessExisting(tableProperties) && !truncating)) {
        try {
            exploreService.getTableInfo(datasetId.getNamespace(), databaseName, tableName);
            // table exists: do not attempt to create
            return null;
        } catch (TableNotFoundException e) {
            throw new ExploreException(String.format(
                "Dataset '%s' is configured to use an existing explore table, "
                    + "but table '%s' does not exist in database '%s'.",
                datasetId.getDataset(), tableName, databaseName));
        }
    }
    Location baseLocation;
    Partitioning partitioning = null;
    if (dataset instanceof PartitionedFileSet) {
        partitioning = ((PartitionedFileSet) dataset).getPartitioning();
        baseLocation = ((PartitionedFileSet) dataset).getEmbeddedFileSet().getBaseLocation();
    } else {
        baseLocation = ((FileSet) dataset).getBaseLocation();
    }
    CreateStatementBuilder createStatementBuilder =
        new CreateStatementBuilder(datasetId.getDataset(), databaseName, tableName, shouldEscapeColumns)
            .setLocation(baseLocation)
            .setPartitioning(partitioning)
            .setTableProperties(tableProperties);
    String schema = FileSetProperties.getExploreSchema(properties);
    String format = FileSetProperties.getExploreFormat(properties);
    if (format != null) {
        if ("parquet".equals(format)) {
            return createStatementBuilder.setSchema(FileSetProperties.getExploreSchema(properties)).buildWithFileFormat("parquet");
        }
        // for text and csv, we know what to do
        Preconditions.checkArgument("text".equals(format) || "csv".equals(format), "Only text and csv are supported as native formats");
        Preconditions.checkNotNull(schema, "for native formats, explore schema must be given in dataset properties");
        String delimiter = null;
        if ("text".equals(format)) {
            delimiter = FileSetProperties.getExploreFormatProperties(properties).get("delimiter");
        } else if ("csv".equals(format)) {
            delimiter = ",";
        }
        return createStatementBuilder.setSchema(schema).setRowFormatDelimited(delimiter, null).buildWithFileFormat("TEXTFILE");
    } else {
        // the schema may be null; for example, Avro-backed tables can be created by setting the avro.schema.literal table property
        if (schema != null) {
            createStatementBuilder.setSchema(schema);
        }
        // format not given, look for serde, input format, etc.
        String serde = FileSetProperties.getSerDe(properties);
        String inputFormat = FileSetProperties.getExploreInputFormat(properties);
        String outputFormat = FileSetProperties.getExploreOutputFormat(properties);
        Preconditions.checkArgument(serde != null && inputFormat != null && outputFormat != null,
            "All of SerDe, InputFormat and OutputFormat must be given in dataset properties");
        return createStatementBuilder.setRowFormatSerde(serde).buildWithFormats(inputFormat, outputFormat);
    }
}
Also used: Partitioning(io.cdap.cdap.api.dataset.lib.Partitioning) CreateStatementBuilder(io.cdap.cdap.explore.table.CreateStatementBuilder) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) Location(org.apache.twill.filesystem.Location) Nullable(javax.annotation.Nullable)
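
For context, a minimal sketch of how a dataset could declare the explore properties this method reads. It assumes the FileSetProperties.Builder setters shown (setEnableExploreOnCreate, setExploreFormat, setExploreSchema); the application class, dataset name, and schema are illustrative, and imports are omitted as in the excerpts above.

// Sketch: a FileSet configured so that generateFileSetCreateStatement() can derive a CREATE TABLE for it.
public class ExploreReadyApp extends AbstractApplication {

    @Override
    public void configure() {
        createDataset("records", FileSet.class, FileSetProperties.builder()
            .setBasePath("records")
            .setEnableExploreOnCreate(true)
            // "csv" is one of the native formats accepted above ("text" or "csv"); csv implies a "," delimiter
            .setExploreFormat("csv")
            // for native formats the explore schema is required (see the Preconditions.checkNotNull above)
            .setExploreSchema("ts bigint, body string")
            .build());
    }
}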

Example 7 with PartitionedFileSet

Use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From the class ExploreExecutorHttpHandler, the method doPartitionOperation.

private void doPartitionOperation(FullHttpRequest request, HttpResponder responder, DatasetId datasetId, PartitionOperation partitionOperation) {
    try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
        Dataset dataset;
        try {
            dataset = datasetInstantiator.getDataset(datasetId);
        } catch (Exception e) {
            LOG.error("Exception instantiating dataset {}.", datasetId, e);
            responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR, "Exception instantiating dataset " + datasetId);
            return;
        }
        try {
            if (!(dataset instanceof PartitionedFileSet)) {
                responder.sendString(HttpResponseStatus.BAD_REQUEST, "not a partitioned dataset.");
                return;
            }
            Partitioning partitioning = ((PartitionedFileSet) dataset).getPartitioning();
            Reader reader = new InputStreamReader(new ByteBufInputStream(request.content()));
            Map<String, String> properties = GSON.fromJson(reader, new TypeToken<Map<String, String>>() {
            }.getType());
            PartitionKey partitionKey;
            try {
                partitionKey = PartitionedFileSetArguments.getOutputPartitionKey(properties, partitioning);
            } catch (Exception e) {
                responder.sendString(HttpResponseStatus.BAD_REQUEST, "invalid partition key: " + e.getMessage());
                return;
            }
            if (partitionKey == null) {
                responder.sendString(HttpResponseStatus.BAD_REQUEST, "no partition key was given.");
                return;
            }
            QueryHandle handle = partitionOperation.submitOperation(partitionKey, properties);
            if (handle == null) {
                return;
            }
            JsonObject json = new JsonObject();
            json.addProperty("handle", handle.getHandle());
            responder.sendJson(HttpResponseStatus.OK, json.toString());
        } finally {
            Closeables.closeQuietly(dataset);
        }
    } catch (Throwable e) {
        LOG.error("Got exception:", e);
        responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR, e.getMessage());
    }
}
Also used: InputStreamReader(java.io.InputStreamReader) Dataset(io.cdap.cdap.api.dataset.Dataset) Reader(java.io.Reader) JsonObject(com.google.gson.JsonObject) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) ByteBufInputStream(io.netty.buffer.ByteBufInputStream) ExploreException(io.cdap.cdap.explore.service.ExploreException) UnsupportedTypeException(io.cdap.cdap.api.data.schema.UnsupportedTypeException) UnauthorizedException(io.cdap.cdap.security.spi.authorization.UnauthorizedException) SQLException(java.sql.SQLException) JsonSyntaxException(com.google.gson.JsonSyntaxException) DatasetManagementException(io.cdap.cdap.api.dataset.DatasetManagementException) IOException(java.io.IOException) BadRequestException(io.cdap.cdap.common.BadRequestException) Partitioning(io.cdap.cdap.api.dataset.lib.Partitioning) SystemDatasetInstantiator(io.cdap.cdap.data.dataset.SystemDatasetInstantiator) TypeToken(com.google.common.reflect.TypeToken) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey) QueryHandle(io.cdap.cdap.proto.QueryHandle)
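
The handler reads the request body as a JSON map of string arguments, from which PartitionedFileSetArguments.getOutputPartitionKey reconstructs a typed PartitionKey using the dataset's Partitioning. A minimal round-trip sketch of that conversion; the partitioning and field values are illustrative and mirror the other examples on this page, and imports are omitted as in the excerpts above.

// Sketch: writing a partition key into an argument map and reading it back with the partitioning.
public class PartitionKeyRoundTrip {

    public static void main(String[] args) {
        Partitioning partitioning = Partitioning.builder()
            .addStringField("type")
            .addLongField("time")
            .build();

        Map<String, String> arguments = new HashMap<>();
        PartitionedFileSetArguments.setOutputPartitionKey(arguments,
            PartitionKey.builder().addStringField("type", "x").addLongField("time", 150000L).build());

        // getOutputPartitionKey uses the partitioning to restore each field with its declared type
        PartitionKey key = PartitionedFileSetArguments.getOutputPartitionKey(arguments, partitioning);
        System.out.println(key);
    }
}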

Example 8 with PartitionedFileSet

Use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From the class MapReduceWithPartitionedTest, the method testPartitionedFileSetWithMR.

private void testPartitionedFileSetWithMR(boolean useCombineFileInputFormat) throws Exception {
    ApplicationWithPrograms app = deployApp(AppWithPartitionedFileSet.class, new AppWithPartitionedFileSet.AppConfig(useCombineFileInputFormat));
    // write a value to the input table
    final Table table = datasetCache.getDataset(AppWithPartitionedFileSet.INPUT);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            table.put(Bytes.toBytes("x"), AppWithPartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("1"));
        }
    });
    // a partition key for the map/reduce output
    final PartitionKey keyX = PartitionKey.builder().addStringField("type", "x").addLongField("time", 150000L).build();
    // run the partition writer m/r with this output partition time
    Map<String, String> runtimeArguments = Maps.newHashMap();
    Map<String, String> outputArgs = Maps.newHashMap();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, keyX);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, outputArgs));
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
    // this should have created a partition in the tpfs
    final PartitionedFileSet dataset = datasetCache.getDataset(PARTITIONED);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Partition partition = dataset.getPartition(keyX);
            Assert.assertNotNull(partition);
            String path = partition.getRelativePath();
            Assert.assertTrue(path.contains("x"));
            Assert.assertTrue(path.contains("150000"));
        }
    });
    // delete the data in the input table and write a new row
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            table.delete(Bytes.toBytes("x"));
            table.put(Bytes.toBytes("y"), AppWithPartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("2"));
        }
    });
    // a new partition key for the next map/reduce
    final PartitionKey keyY = PartitionKey.builder().addStringField("type", "y").addLongField("time", 200000L).build();
    // now run the m/r again with a new partition time, say 5 minutes later
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, keyY);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, outputArgs));
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
    // this should have created a partition in the tpfs
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Partition partition = dataset.getPartition(keyY);
            Assert.assertNotNull(partition);
            String path = partition.getRelativePath();
            Assert.assertNotNull(path);
            Assert.assertTrue(path.contains("y"));
            Assert.assertTrue(path.contains("200000"));
        }
    });
    // a partition filter that matches the outputs of both map/reduces
    PartitionFilter filterXY = PartitionFilter.builder().addRangeCondition("type", "x", "z").build();
    // now run a map/reduce that reads all the partitions
    runtimeArguments = Maps.newHashMap();
    Map<String, String> inputArgs = Maps.newHashMap();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterXY);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "a");
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read both partitions - and written both x and y to row a
    final Table output = datasetCache.getDataset(AppWithPartitionedFileSet.OUTPUT);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("a"));
            Assert.assertEquals("1", row.getString("x"));
            Assert.assertEquals("{type=x, time=150000}", row.getString("x_key"));
            Assert.assertEquals("2", row.getString("y"));
            Assert.assertEquals("{type=y, time=200000}", row.getString("y_key"));
        }
    });
    // a partition filter that matches the output key of the first map/reduce
    PartitionFilter filterX = PartitionFilter.builder().addValueCondition("type", "x").addRangeCondition("time", null, 160000L).build();
    // now run a map/reduce that reads a range of the partitions, namely the first one
    inputArgs.clear();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterX);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "b");
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read the first partition only - and written only x to row b
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("b"));
            Assert.assertEquals("1", row.getString("x"));
            Assert.assertEquals("{type=x, time=150000}", row.getString("x_key"));
            Assert.assertNull(row.get("y"));
            Assert.assertNull(row.get("y_key"));
        }
    });
    // a partition filter that matches no key
    PartitionFilter filterMT = PartitionFilter.builder().addValueCondition("type", "nosuchthing").build();
    // now run a map/reduce that reads an empty range of partitions (the filter matches nothing)
    inputArgs.clear();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterMT);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "n");
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read no partitions - and written nothing to row n
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("n"));
            Assert.assertTrue(row.isEmpty());
        }
    });
}
Also used: Partition (io.cdap.cdap.api.dataset.lib.Partition) Table (io.cdap.cdap.api.dataset.table.Table) TransactionExecutor (org.apache.tephra.TransactionExecutor) TimePartitionedFileSet (io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet) PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet) PartitionFilter (io.cdap.cdap.api.dataset.lib.PartitionFilter) ApplicationWithPrograms (io.cdap.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms) TransactionAware (org.apache.tephra.TransactionAware) PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey) BasicArguments (io.cdap.cdap.internal.app.runtime.BasicArguments) Row (io.cdap.cdap.api.dataset.table.Row)
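
The test exercises PartitionFilter only through MapReduce input arguments, but a filter can also be evaluated directly. A brief sketch using the keys and cutoff from the test above; it assumes PartitionFilter.match(PartitionKey) as in the CDAP API, and that addRangeCondition treats the lower bound as inclusive and the upper bound as exclusive. Imports are omitted as in the excerpts above.

// Sketch: the "time below 160000" filter from the test matches keyX but not keyY.
public class PartitionFilterSketch {

    public static void main(String[] args) {
        PartitionKey keyX = PartitionKey.builder().addStringField("type", "x").addLongField("time", 150000L).build();
        PartitionKey keyY = PartitionKey.builder().addStringField("type", "y").addLongField("time", 200000L).build();

        PartitionFilter filterX = PartitionFilter.builder()
            .addValueCondition("type", "x")
            // a null lower bound leaves the range unbounded below, as in the test
            .addRangeCondition("time", null, 160000L)
            .build();

        System.out.println(filterX.match(keyX));  // true: type is "x" and 150000 is below the 160000 cutoff
        System.out.println(filterX.match(keyY));  // false: type is "y"
    }
}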

Example 9 with PartitionedFileSet

Use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From the class PartitionedFileSetDefinition, the method getDataset.

@Override
public PartitionedFileSet getDataset(DatasetContext datasetContext, DatasetSpecification spec,
                                     Map<String, String> arguments, ClassLoader classLoader) throws IOException {
    // properties must contain the partitioning
    Partitioning partitioning = PartitionedFileSetProperties.getPartitioning(spec.getProperties());
    // make any necessary updates to the arguments
    arguments = updateArgumentsIfNeeded(arguments, partitioning);
    FileSet fileset = filesetDef.getDataset(datasetContext, spec.getSpecification(FILESET_NAME), arguments, classLoader);
    IndexedTable table = indexedTableDef.getDataset(datasetContext, spec.getSpecification(PARTITION_TABLE_NAME), arguments, classLoader);
    return new PartitionedFileSetDataset(datasetContext, spec.getName(), partitioning, fileset, table, spec, arguments, getExploreProvider());
}
Also used: Partitioning(io.cdap.cdap.api.dataset.lib.Partitioning) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) IndexedTable(io.cdap.cdap.api.dataset.lib.IndexedTable)
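
The partitioning that getDataset reads back from the spec is supplied when the dataset is created. A minimal sketch, assuming the PartitionedFileSetProperties and Partitioning builders shown here; the application class, dataset name, and partition fields are illustrative, and imports are omitted as in the excerpts above.

// Sketch: creating a PartitionedFileSet whose spec carries the partitioning that
// PartitionedFileSetDefinition later reads via PartitionedFileSetProperties.getPartitioning().
public class PartitionedApp extends AbstractApplication {

    @Override
    public void configure() {
        createDataset("results", PartitionedFileSet.class, PartitionedFileSetProperties.builder()
            .setPartitioning(Partitioning.builder()
                .addStringField("type")
                .addLongField("time")
                .build())
            // the embedded FileSet and the partition index table are both configured from this one property map
            .setBasePath("results")
            .build());
    }
}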

Example 10 with PartitionedFileSet

Use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From the class PartitionBatchInput, the method setInput.

/**
 * Used from the initialize method of the implementing batch job to configure a {@link PartitionedFileSet} as input,
 * selecting the set of {@link Partition}s to be processed by this run of the batch job.
 * It does this by reading back the previous consumer state, determining the new partitions to read, computing the
 * new state, and persisting it. It then configures this dataset as input to the MapReduce context that is passed in.
 *
 * @param mapreduceContext MapReduce context used to access the PartitionedFileSet, and on which the input is
 *                         configured
 * @param partitionedFileSetName the name of the {@link PartitionedFileSet} to consume partitions from
 * @param statePersistor a {@link DatasetStatePersistor} responsible for defining how the partition consumer state is
 *                       managed
 * @param consumerConfiguration defines parameters for the partition consumption
 * @return a BatchPartitionCommitter used to persist the state of the partition consumer
 */
public static BatchPartitionCommitter setInput(MapReduceContext mapreduceContext, String partitionedFileSetName,
                                               DatasetStatePersistor statePersistor,
                                               ConsumerConfiguration consumerConfiguration) {
    PartitionedFileSet partitionedFileSet = mapreduceContext.getDataset(partitionedFileSetName);
    final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(
        partitionedFileSet, new DelegatingStatePersistor(mapreduceContext, statePersistor), consumerConfiguration);
    final List<PartitionDetail> consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
    Map<String, String> arguments = new HashMap<>();
    PartitionedFileSetArguments.addInputPartitions(arguments, consumedPartitions);
    mapreduceContext.addInput(Input.ofDataset(partitionedFileSetName, arguments));
    return succeeded -> partitionConsumer.onFinish(consumedPartitions, succeeded);
}
Also used: Input(io.cdap.cdap.api.data.batch.Input) DatasetStatePersistor(io.cdap.cdap.api.dataset.lib.DatasetStatePersistor) PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail) List(java.util.List) PartitionedFileSetArguments(io.cdap.cdap.api.dataset.lib.PartitionedFileSetArguments) Beta(io.cdap.cdap.api.annotation.Beta) Map(java.util.Map) Partition(io.cdap.cdap.api.dataset.lib.Partition) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) HashMap(java.util.HashMap) MapReduceContext(io.cdap.cdap.api.mapreduce.MapReduceContext)
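
A usage sketch of how a MapReduce could call setInput from its initialize method and commit the consumer state when the run finishes. KVTableStatePersistor, ConsumerConfiguration.builder(), and the program-state check in destroy() are assumptions based on the CDAP partition-consumer API; the dataset and state-table names are illustrative, and imports are omitted as in the excerpts above.

// Sketch: consuming partitions of the "events" dataset incrementally, at most 50 per run.
public class IncrementalProcessingMapReduce extends AbstractMapReduce {

    private BatchPartitionCommitter partitionCommitter;

    @Override
    public void initialize() throws Exception {
        MapReduceContext context = getContext();
        // reads the previous consumer state, selects new partitions, persists the new state,
        // and registers the selected partitions as input (see setInput above)
        partitionCommitter = PartitionBatchInput.setInput(
            context, "events",
            new KVTableStatePersistor("consumerState", "state.key"),
            ConsumerConfiguration.builder().setMaxWorkUnits(50).build());
        // ... configure the mapper, reducer, and output dataset here ...
    }

    @Override
    public void destroy() {
        // marks the consumed partitions as processed on success, or makes them available again otherwise
        boolean succeeded = getContext().getState().getStatus() == ProgramStatus.COMPLETED;
        partitionCommitter.onFinish(succeeded);
    }
}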

Aggregations

Classes most frequently used together with PartitionedFileSet in the project, with their usage counts:

PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet): 112
Test (org.junit.Test): 75
PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey): 53
Location (org.apache.twill.filesystem.Location): 47
TransactionAware (org.apache.tephra.TransactionAware): 44
TransactionExecutor (org.apache.tephra.TransactionExecutor): 44
PartitionDetail (io.cdap.cdap.api.dataset.lib.PartitionDetail): 28
IOException (java.io.IOException): 26
DataSetException (io.cdap.cdap.api.dataset.DataSetException): 24
FileSet (io.cdap.cdap.api.dataset.lib.FileSet): 24
List (java.util.List): 24
PartitionNotFoundException (io.cdap.cdap.api.dataset.PartitionNotFoundException): 22
PartitionAlreadyExistsException (io.cdap.cdap.api.dataset.lib.PartitionAlreadyExistsException): 22
ConcurrentPartitionConsumer (io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer): 22
PartitionConsumer (io.cdap.cdap.api.dataset.lib.partitioned.PartitionConsumer): 22
HashSet (java.util.HashSet): 19
ImmutableList (com.google.common.collect.ImmutableList): 18
ArrayList (java.util.ArrayList): 18
TimePartitionedFileSet (io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet): 16
TransactionContext (org.apache.tephra.TransactionContext): 16