Example 16 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class AbstractContext, method createRuntimeProgramContext.

/**
   * Creates a new instance of {@link RuntimeProgramContext} to be
   * provided to a {@link RuntimeProgramContextAware} dataset.
   */
private RuntimeProgramContext createRuntimeProgramContext(final DatasetId datasetId) {
    return new RuntimeProgramContext() {

        @Override
        public void notifyNewPartitions(Collection<? extends PartitionKey> partitionKeys) throws IOException {
            String topic = cConf.get(Constants.Dataset.DATA_EVENT_TOPIC);
            if (Strings.isNullOrEmpty(topic)) {
                // Don't publish if there is no data event topic
                return;
            }
            TopicId dataEventTopic = NamespaceId.SYSTEM.topic(topic);
            MessagePublisher publisher = getMessagingContext().getMessagePublisher();
            byte[] payload = Bytes.toBytes(GSON.toJson(Notification.forPartitions(datasetId, partitionKeys)));
            int failure = 0;
            long startTime = System.currentTimeMillis();
            while (true) {
                try {
                    publisher.publish(dataEventTopic.getNamespace(), dataEventTopic.getTopic(), payload);
                    return;
                } catch (TopicNotFoundException e) {
                    // this shouldn't happen since the TMS creates the data event topic on startup.
                    throw new IOException("Unexpected exception due to missing topic '" + dataEventTopic + "'", e);
                } catch (IOException e) {
                    long sleepTime = retryStrategy.nextRetry(++failure, startTime);
                    if (sleepTime < 0) {
                        throw e;
                    }
                    try {
                        TimeUnit.MILLISECONDS.sleep(sleepTime);
                    } catch (InterruptedException ex) {
                        // If interrupted during sleep, restore the interrupt status and return
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            }
        }

        @Override
        public ProgramRunId getProgramRunId() {
            return programRunId;
        }

        @Nullable
        @Override
        public NamespacedEntityId getComponentId() {
            return AbstractContext.this.getComponentId();
        }
    };
}
Also used: RuntimeProgramContext (co.cask.cdap.data.RuntimeProgramContext) MessagePublisher (co.cask.cdap.api.messaging.MessagePublisher) TopicNotFoundException (co.cask.cdap.api.messaging.TopicNotFoundException) Collection (java.util.Collection) PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey) TopicId (co.cask.cdap.proto.id.TopicId) IOException (java.io.IOException)
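
The publish loop above relies on a retry strategy whose contract is visible from the call site: nextRetry returns the next sleep time in milliseconds, or a negative value to stop retrying. A minimal illustrative sketch of that contract, with a hypothetical fixed-delay implementation (these names are stand-ins, not the CDAP classes):

// Stand-in for the strategy used by the loop above: a non-negative result is the
// time to sleep before the next attempt; a negative result means give up.
interface RetryStrategy {
    long nextRetry(int failures, long startTimeMillis);
}

// Illustrative fixed-delay strategy bounded by a total timeout.
final class FixedDelayRetry implements RetryStrategy {

    private final long delayMillis;
    private final long timeoutMillis;

    FixedDelayRetry(long delayMillis, long timeoutMillis) {
        this.delayMillis = delayMillis;
        this.timeoutMillis = timeoutMillis;
    }

    @Override
    public long nextRetry(int failures, long startTimeMillis) {
        long elapsed = System.currentTimeMillis() - startTimeMillis;
        // Keep retrying with a constant delay until the timeout budget is spent.
        return elapsed + delayMillis > timeoutMillis ? -1L : delayMillis;
    }
}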

Example 17 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class DynamicPartitionerWithAvroTest, method groupByPartitionKey.

private Multimap<PartitionKey, GenericRecord> groupByPartitionKey(List<? extends GenericRecord> records, long now) {
    HashMultimap<PartitionKey, GenericRecord> groupedByPartitionKey = HashMultimap.create();
    for (GenericRecord record : records) {
        PartitionKey key = PartitionKey.builder().addLongField("time", now).addIntField("zip", (int) record.get("zip")).build();
        groupedByPartitionKey.put(key, record);
    }
    return groupedByPartitionKey;
}
Also used: PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey) GenericRecord (org.apache.avro.generic.GenericRecord)
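
For context, here is a hedged sketch of how input records for this helper could be constructed with Avro's GenericData (imports: org.apache.avro.Schema, org.apache.avro.generic.GenericData, java.util.Arrays). The schema is an assumption for illustration, not the test's actual schema:

// Minimal record schema carrying the "zip" field the helper reads.
Schema schema = new Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"Rec\",\"fields\":["
    + "{\"name\":\"name\",\"type\":\"string\"},"
    + "{\"name\":\"zip\",\"type\":\"int\"}]}");

GenericRecord r1 = new GenericData.Record(schema);
r1.put("name", "alice");
r1.put("zip", 98101);
GenericRecord r2 = new GenericData.Record(schema);
r2.put("name", "bob");
r2.put("zip", 98101);

// Both records share zip 98101, so they group under a single PartitionKey.
Multimap<PartitionKey, GenericRecord> grouped =
    groupByPartitionKey(Arrays.asList(r1, r2), System.currentTimeMillis());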

Example 18 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class DynamicPartitionerWithAvroTest, method runDynamicPartitionerMapReduce.

private void runDynamicPartitionerMapReduce(final List<? extends GenericRecord> records, boolean allowConcurrentWriters, boolean expectedStatus) throws Exception {
    ApplicationWithPrograms app = deployApp(AppWithMapReduceUsingAvroDynamicPartitioner.class);
    final long now = System.currentTimeMillis();
    final Multimap<PartitionKey, GenericRecord> keyToRecordsMap = groupByPartitionKey(records, now);
    // write values to the input kvTable
    final KeyValueTable kvTable = datasetCache.getDataset(INPUT_DATASET);
    Transactions.createTransactionExecutor(txExecutorFactory, kvTable).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            // the keys themselves are not used, but they must be unique
            for (int i = 0; i < records.size(); i++) {
                kvTable.write(Integer.toString(i), records.get(i).toString());
            }
        }
    });
    String allowConcurrencyKey = "dataset." + OUTPUT_DATASET + "." + PartitionedFileSetArguments.DYNAMIC_PARTITIONER_ALLOW_CONCURRENCY;
    // run the partition writer m/r with this output partition time
    ImmutableMap<String, String> arguments = ImmutableMap.of(OUTPUT_PARTITION_KEY, Long.toString(now), allowConcurrencyKey, Boolean.toString(allowConcurrentWriters));
    long startTime = System.currentTimeMillis();
    boolean status = runProgram(app, AppWithMapReduceUsingAvroDynamicPartitioner.DynamicPartitioningMapReduce.class, new BasicArguments(arguments));
    Assert.assertEquals(expectedStatus, status);
    if (!expectedStatus) {
        // if we expect the program to fail, no need to check the output data for expected results
        return;
    }
    // Verify notifications
    List<Notification> notifications = getDataNotifications(startTime);
    Assert.assertEquals(1, notifications.size());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(OUTPUT_DATASET), DatasetId.fromString(notifications.get(0).getProperties().get("datasetId")));
    // this should have created a partition in the pfs
    final PartitionedFileSet pfs = datasetCache.getDataset(OUTPUT_DATASET);
    final Location pfsBaseLocation = pfs.getEmbeddedFileSet().getBaseLocation();
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) pfs).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws IOException {
            Map<PartitionKey, PartitionDetail> partitions = new HashMap<>();
            for (PartitionDetail partition : pfs.getPartitions(null)) {
                partitions.put(partition.getPartitionKey(), partition);
                // check that the mapreduce wrote the output partition metadata to all the output partitions
                Assert.assertEquals(AppWithMapReduceUsingAvroDynamicPartitioner.DynamicPartitioningMapReduce.METADATA, partition.getMetadata().asMap());
            }
            Assert.assertEquals(3, partitions.size());
            Assert.assertEquals(keyToRecordsMap.keySet(), partitions.keySet());
            // Check relative paths of the partitions. Also check that their location = pfs baseLocation + relativePath
            for (Map.Entry<PartitionKey, PartitionDetail> partitionKeyEntry : partitions.entrySet()) {
                PartitionDetail partitionDetail = partitionKeyEntry.getValue();
                String relativePath = partitionDetail.getRelativePath();
                int zip = (int) partitionKeyEntry.getKey().getField("zip");
                Assert.assertEquals(Long.toString(now) + Path.SEPARATOR + zip, relativePath);
                Assert.assertEquals(pfsBaseLocation.append(relativePath), partitionDetail.getLocation());
            }
            for (Map.Entry<PartitionKey, Collection<GenericRecord>> keyToRecordsEntry : keyToRecordsMap.asMap().entrySet()) {
                Set<GenericRecord> genericRecords = new HashSet<>(keyToRecordsEntry.getValue());
                Assert.assertEquals(genericRecords, readOutput(partitions.get(keyToRecordsEntry.getKey()).getLocation()));
            }
        }
    });
}
Also used: HashSet (java.util.HashSet) PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet) Set (java.util.Set) PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail) Notification (co.cask.cdap.proto.Notification) ApplicationWithPrograms (co.cask.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms) BasicArguments (co.cask.cdap.internal.app.runtime.BasicArguments) GenericRecord (org.apache.avro.generic.GenericRecord) TransactionExecutor (org.apache.tephra.TransactionExecutor) IOException (java.io.IOException) KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable) TransactionAware (org.apache.tephra.TransactionAware) PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey) HashMap (java.util.HashMap) Map (java.util.Map) ImmutableMap (com.google.common.collect.ImmutableMap) Location (org.apache.twill.filesystem.Location)
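
The test drives partitioning through runtime arguments scoped with the "dataset.<name>." prefix. When an output partition key is known up front, rather than derived per record by a DynamicPartitioner, PartitionedFileSetArguments can encode it into the arguments map directly; a brief sketch, with illustrative field names:

Map<String, String> args = new HashMap<>();
PartitionKey outputKey = PartitionKey.builder()
    .addLongField("time", System.currentTimeMillis())
    .addIntField("zip", 98101)
    .build();
// Encodes each key field as an output-partition property in the arguments map.
PartitionedFileSetArguments.setOutputPartitionKey(args, outputKey);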

Example 19 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class ExploreExecutorHttpHandler, method doDropPartition.

private void doDropPartition(HttpRequest request, HttpResponder responder, DatasetId datasetId) {
    Dataset dataset;
    try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
        dataset = datasetInstantiator.getDataset(datasetId);
        if (dataset == null) {
            responder.sendString(HttpResponseStatus.NOT_FOUND, "Cannot load dataset " + datasetId);
            return;
        }
    } catch (IOException e) {
        String classNotFoundMessage = isClassNotFoundException(e);
        if (classNotFoundMessage != null) {
            JsonObject json = new JsonObject();
            json.addProperty("handle", QueryHandle.NO_OP.getHandle());
            responder.sendJson(HttpResponseStatus.OK, json);
            return;
        }
        LOG.error("Exception instantiating dataset {}.", datasetId, e);
        responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR, "Exception instantiating dataset " + datasetId);
        return;
    }
    try {
        if (!(dataset instanceof PartitionedFileSet)) {
            responder.sendString(HttpResponseStatus.BAD_REQUEST, "not a partitioned dataset.");
            return;
        }
        Partitioning partitioning = ((PartitionedFileSet) dataset).getPartitioning();
        Reader reader = new InputStreamReader(new ChannelBufferInputStream(request.getContent()));
        Map<String, String> properties = GSON.fromJson(reader, new TypeToken<Map<String, String>>() {
        }.getType());
        PartitionKey partitionKey;
        try {
            partitionKey = PartitionedFileSetArguments.getOutputPartitionKey(properties, partitioning);
        } catch (Exception e) {
            responder.sendString(HttpResponseStatus.BAD_REQUEST, "invalid partition key: " + e.getMessage());
            return;
        }
        if (partitionKey == null) {
            responder.sendString(HttpResponseStatus.BAD_REQUEST, "no partition key was given.");
            return;
        }
        QueryHandle handle = exploreTableManager.dropPartition(datasetId, properties, partitionKey);
        JsonObject json = new JsonObject();
        json.addProperty("handle", handle.getHandle());
        responder.sendJson(HttpResponseStatus.OK, json);
    } catch (Throwable e) {
        LOG.error("Got exception:", e);
        responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR, e.getMessage());
    }
}
Also used: InputStreamReader (java.io.InputStreamReader) Dataset (co.cask.cdap.api.dataset.Dataset) JsonObject (com.google.gson.JsonObject) Reader (java.io.Reader) PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet) IOException (java.io.IOException) BadRequestException (co.cask.cdap.common.BadRequestException) ExploreException (co.cask.cdap.explore.service.ExploreException) SQLException (java.sql.SQLException) DatasetManagementException (co.cask.cdap.api.dataset.DatasetManagementException) JsonSyntaxException (com.google.gson.JsonSyntaxException) UnsupportedTypeException (co.cask.cdap.api.data.schema.UnsupportedTypeException) Partitioning (co.cask.cdap.api.dataset.lib.Partitioning) SystemDatasetInstantiator (co.cask.cdap.data.dataset.SystemDatasetInstantiator) TypeToken (com.google.common.reflect.TypeToken) PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey) ChannelBufferInputStream (org.jboss.netty.buffer.ChannelBufferInputStream) QueryHandle (co.cask.cdap.proto.QueryHandle)
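
On the client side, the properties map this handler decodes can be produced with the same PartitionedFileSetArguments helper and serialized to JSON. A hedged sketch (the field names are illustrative and must match the dataset's partitioning):

Map<String, String> properties = new HashMap<>();
PartitionKey key = PartitionKey.builder()
    .addLongField("time", 1430000000000L)
    .addIntField("zip", 98101)
    .build();
PartitionedFileSetArguments.setOutputPartitionKey(properties, key);
// The handler's GSON.fromJson(...) round-trips this map, and
// getOutputPartitionKey(properties, partitioning) reconstructs the key.
String requestBody = new Gson().toJson(properties);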

Example 20 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class DynamicPartitioningOutputCommitter, method commitJob.

@Override
public void commitJob(JobContext context) throws IOException {
    Configuration configuration = context.getConfiguration();
    MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
    BasicMapReduceTaskContext taskContext = classLoader.getTaskContextProvider().get(this.taskContext);
    String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
    outputDataset = taskContext.getDataset(outputDatasetName);
    Partitioning partitioning = outputDataset.getPartitioning();
    Set<PartitionKey> partitionsToAdd = new HashSet<>();
    relativePaths = new HashSet<>();
    // Go over all files in the temporary directory and keep track of partitions to add for them
    FileStatus[] allCommittedTaskPaths = getAllCommittedTaskPaths(context);
    for (FileStatus committedTaskPath : allCommittedTaskPaths) {
        FileSystem fs = committedTaskPath.getPath().getFileSystem(configuration);
        RemoteIterator<LocatedFileStatus> fileIter = fs.listFiles(committedTaskPath.getPath(), true);
        while (fileIter.hasNext()) {
            Path path = fileIter.next().getPath();
            String relativePath = getRelative(committedTaskPath.getPath(), path);
            int lastPathSepIdx = relativePath.lastIndexOf(Path.SEPARATOR);
            if (lastPathSepIdx == -1) {
                // this shouldn't happen because each relative path should consist of at least one partition key and
                // the output file name
                LOG.warn("Skipping path '{}'. It's relative path '{}' has fewer than two parts", path, relativePath);
                continue;
            }
            // relativePath = "../key1/key2/part-m-00000"
            // relativeDir = "../key1/key2"
            // fileName = "part-m-00000"
            String relativeDir = relativePath.substring(0, lastPathSepIdx);
            String fileName = relativePath.substring(lastPathSepIdx + 1);
            Path finalDir = new Path(FileOutputFormat.getOutputPath(context), relativeDir);
            if (fs.exists(finalDir)) {
                throw new FileAlreadyExistsException("Final output path " + finalDir + " already exists");
            }
            PartitionKey partitionKey = getPartitionKey(partitioning, relativeDir);
            partitionsToAdd.add(partitionKey);
            relativePaths.add(relativeDir);
        }
    }
    // We need to copy to the parent of the FileOutputFormat's outputDir, since we added a _temporary_jobId suffix to
    // the original outputDir.
    Path finalOutput = FileOutputFormat.getOutputPath(context);
    FileSystem fs = finalOutput.getFileSystem(configuration);
    for (FileStatus stat : getAllCommittedTaskPaths(context)) {
        mergePaths(fs, stat, finalOutput);
    }
    // compute the metadata to be written to every output partition
    Map<String, String> metadata = ConfigurationUtil.getNamedConfigurations(this.taskContext.getConfiguration(), PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX);
    // create all the necessary partitions
    for (PartitionKey partitionKey : partitionsToAdd) {
        PartitionOutput partitionOutput = outputDataset.getPartitionOutput(partitionKey);
        partitionOutput.setMetadata(metadata);
        partitionOutput.addPartition();
    }
    // close the TaskContext, which flushes dataset operations
    try {
        taskContext.flushOperations();
    } catch (Exception e) {
        Throwables.propagateIfPossible(e, IOException.class);
        throw new IOException(e);
    }
    // delete the job-specific _temporary folder and create a _done file in the output folder
    cleanupJob(context);
    // mark all the final output paths with a _SUCCESS file, if configured to do so (default = true)
    if (configuration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) {
        for (String relativePath : relativePaths) {
            Path pathToMark = new Path(finalOutput, relativePath);
            Path markerPath = new Path(pathToMark, SUCCEEDED_FILE_NAME);
            fs.createNewFile(markerPath);
        }
    }
}
Also used: BasicMapReduceTaskContext (co.cask.cdap.internal.app.runtime.batch.BasicMapReduceTaskContext) Path (org.apache.hadoop.fs.Path) MapReduceClassLoader (co.cask.cdap.internal.app.runtime.batch.MapReduceClassLoader) FileAlreadyExistsException (org.apache.hadoop.mapred.FileAlreadyExistsException) FileStatus (org.apache.hadoop.fs.FileStatus) LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus) Configuration (org.apache.hadoop.conf.Configuration) IOException (java.io.IOException) Partitioning (co.cask.cdap.api.dataset.lib.Partitioning) PartitionOutput (co.cask.cdap.api.dataset.lib.PartitionOutput) FileSystem (org.apache.hadoop.fs.FileSystem) PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey) HashSet (java.util.HashSet)
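
The getPartitionKey helper is not shown above. Here is a simplified, illustrative version that maps a relative directory such as "1234567890/90210" back to a PartitionKey, assuming one path segment per partition field in declaration order (the actual implementation may differ):

private static PartitionKey partitionKeyFromRelativeDir(Partitioning partitioning, String relativeDir) {
    String[] segments = relativeDir.split(Path.SEPARATOR);
    PartitionKey.Builder builder = PartitionKey.builder();
    int i = 0;
    // Partitioning.getFields() preserves field order, so segments align with fields.
    for (Map.Entry<String, Partitioning.FieldType> field : partitioning.getFields().entrySet()) {
        builder.addField(field.getKey(), field.getValue().parse(segments[i++]));
    }
    return builder.build();
}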

Aggregations

PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey): 59 usages
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 28 usages
Test (org.junit.Test): 27 usages
TransactionAware (org.apache.tephra.TransactionAware): 17 usages
TransactionExecutor (org.apache.tephra.TransactionExecutor): 17 usages
IOException (java.io.IOException): 12 usages
HashMap (java.util.HashMap): 12 usages
PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail): 11 usages
ConcurrentPartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer): 11 usages
PartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer): 11 usages
ArrayList (java.util.ArrayList): 11 usages
List (java.util.List): 11 usages
HashSet (java.util.HashSet): 10 usages
DataSetException (co.cask.cdap.api.dataset.DataSetException): 9 usages
ImmutableList (com.google.common.collect.ImmutableList): 9 usages
PartitionNotFoundException (co.cask.cdap.api.dataset.PartitionNotFoundException): 7 usages
Partition (co.cask.cdap.api.dataset.lib.Partition): 7 usages
ConsumerConfiguration (co.cask.cdap.api.dataset.lib.partitioned.ConsumerConfiguration): 7 usages
TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet): 6 usages
Location (org.apache.twill.filesystem.Location): 6 usages