Example 6 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

The class ClicksAndViewsMapReduceTest, method getDataFromFile.

private Set<String> getDataFromFile() throws Exception {
    DataSetManager<PartitionedFileSet> cleanRecords = getDataset(ClicksAndViews.JOINED);
    Set<String> cleanData = new HashSet<>();
    // we configured the MapReduce to write to this partition when starting it
    PartitionDetail partition = cleanRecords.get().getPartition(PartitionKey.builder().addLongField("runtime", OUTPUT_PARTITION_RUNTIME).build());
    Assert.assertNotNull(partition);
    for (Location location : partition.getLocation().list()) {
        if (location.getName().startsWith("part-")) {
            try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(location.getInputStream()))) {
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    cleanData.add(line);
                }
            }
        }
    }
    return cleanData;
}
Also used : InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) HashSet(java.util.HashSet) Location(org.apache.twill.filesystem.Location)
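
For context, the partition read above must first have been registered in the PartitionedFileSet. Below is a minimal sketch of the write side, assuming the PartitionOutput API listed in the Aggregations section and java.io.PrintWriter; the file name and sample record are purely illustrative, since in the test the data is actually produced by the ClicksAndViews MapReduce.

// Hypothetical write path; in the real test the MapReduce job creates this partition.
PartitionedFileSet joined = cleanRecords.get();
PartitionOutput output = joined.getPartitionOutput(
    PartitionKey.builder().addLongField("runtime", OUTPUT_PARTITION_RUNTIME).build());
try (PrintWriter writer = new PrintWriter(output.getLocation().append("part-0").getOutputStream())) {
    // illustrative record only
    writer.println("example-joined-record");
}
// register the partition so getPartition() above can find it
output.addPartition();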

Example 7 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

The class ConcurrentPartitionConsumer, method selectPartitions.

private List<PartitionDetail> selectPartitions(PartitionAcceptor acceptor, ConsumerWorkingSet workingSet) {
    long now = System.currentTimeMillis();
    List<PartitionDetail> toConsume = new ArrayList<>();
    Iterator<ConsumablePartition> iter = workingSet.getPartitions().iterator();
    while (iter.hasNext()) {
        ConsumablePartition consumablePartition = iter.next();
        if (ProcessState.AVAILABLE != consumablePartition.getProcessState()) {
            continue;
        }
        PartitionDetail partition = getPartitionedFileSet().getPartition(consumablePartition.getPartitionKey());
        if (partition == null) {
            // no longer exists, so skip it and remove it from the working set
            iter.remove();
            continue;
        }
        PartitionAcceptor.Return accept = acceptor.accept(partition);
        switch(accept) {
            case ACCEPT:
                consumablePartition.take();
                consumablePartition.setTimestamp(now);
                toConsume.add(partition);
                continue;
            case SKIP:
                continue;
            case STOP:
                return toConsume;
        }
    }
    return toConsume;
}
Also used : ArrayList(java.util.ArrayList) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail)
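
The acceptor decides, per partition, whether to take it, skip it, or stop scanning the working set. A minimal sketch of a custom acceptor, assuming PartitionAcceptor exposes the single accept(PartitionDetail) method and the Return enum used in the switch above:

// Sketch: accept at most a fixed number of partitions per consume call.
public class LimitingAcceptor implements PartitionAcceptor {
    private final int limit;
    private int accepted;

    public LimitingAcceptor(int limit) {
        this.limit = limit;
    }

    @Override
    public Return accept(PartitionDetail partitionDetail) {
        if (accepted >= limit) {
            // enough partitions for this run; stop scanning the working set
            return Return.STOP;
        }
        accepted++;
        return Return.ACCEPT;
    }
}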

Example 8 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

The class ConsumerWorkingSet, method populate.

/**
   * Populates the ConsumerWorkingSet by fetching partitions from the given PartitionedFileSet.
   *
   * @param partitionedFileSet the PartitionedFileSet to fetch partitions from
   * @param configuration the ConsumerConfiguration which defines parameters for consuming
   */
public void populate(PartitionedFileSet partitionedFileSet, ConsumerConfiguration configuration) {
    int numToPopulate = configuration.getMaxWorkingSetSize() - partitions.size();
    Predicate<PartitionDetail> predicate = configuration.getPartitionPredicate();
    co.cask.cdap.api.dataset.lib.PartitionConsumerResult result = partitionedFileSet.consumePartitions(partitionConsumerState, numToPopulate, predicate);
    List<PartitionDetail> partitions = result.getPartitions();
    for (PartitionDetail partition : partitions) {
        addPartition(partition.getPartitionKey());
    }
    partitionConsumerState = result.getPartitionConsumerState();
}
Also used : PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail)
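
The predicate supplied by the ConsumerConfiguration lets a consumer restrict which partitions enter the working set. A minimal sketch, assuming co.cask.cdap.api.Predicate's single apply method and a hypothetical "status" entry in the partition metadata (the key is illustrative, not part of the CDAP API):

// Hypothetical predicate: only consume partitions whose metadata marks them completed.
Predicate<PartitionDetail> onlyCompleted = new Predicate<PartitionDetail>() {
    @Override
    public boolean apply(PartitionDetail partition) {
        // "status" is an illustrative metadata key
        return "completed".equals(partition.getMetadata().asMap().get("status"));
    }
};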

Example 9 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

The class DynamicPartitionerWithAvroTest, method runDynamicPartitionerMapReduce.

private void runDynamicPartitionerMapReduce(final List<? extends GenericRecord> records, boolean allowConcurrentWriters, boolean expectedStatus) throws Exception {
    ApplicationWithPrograms app = deployApp(AppWithMapReduceUsingAvroDynamicPartitioner.class);
    final long now = System.currentTimeMillis();
    final Multimap<PartitionKey, GenericRecord> keyToRecordsMap = groupByPartitionKey(records, now);
    // write values to the input kvTable
    final KeyValueTable kvTable = datasetCache.getDataset(INPUT_DATASET);
    Transactions.createTransactionExecutor(txExecutorFactory, kvTable).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            // the keys are not used; it matters that they're unique though
            for (int i = 0; i < records.size(); i++) {
                kvTable.write(Integer.toString(i), records.get(i).toString());
            }
        }
    });
    String allowConcurrencyKey = "dataset." + OUTPUT_DATASET + "." + PartitionedFileSetArguments.DYNAMIC_PARTITIONER_ALLOW_CONCURRENCY;
    // run the partition writer m/r with this output partition time
    ImmutableMap<String, String> arguments = ImmutableMap.of(OUTPUT_PARTITION_KEY, Long.toString(now), allowConcurrencyKey, Boolean.toString(allowConcurrentWriters));
    long startTime = System.currentTimeMillis();
    boolean status = runProgram(app, AppWithMapReduceUsingAvroDynamicPartitioner.DynamicPartitioningMapReduce.class, new BasicArguments(arguments));
    Assert.assertEquals(expectedStatus, status);
    if (!expectedStatus) {
        // if we expect the program to fail, no need to check the output data for expected results
        return;
    }
    // Verify notifications
    List<Notification> notifications = getDataNotifications(startTime);
    Assert.assertEquals(1, notifications.size());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(OUTPUT_DATASET), DatasetId.fromString(notifications.get(0).getProperties().get("datasetId")));
    // this should have created a partition in the pfs
    final PartitionedFileSet pfs = datasetCache.getDataset(OUTPUT_DATASET);
    final Location pfsBaseLocation = pfs.getEmbeddedFileSet().getBaseLocation();
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) pfs).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws IOException {
            Map<PartitionKey, PartitionDetail> partitions = new HashMap<>();
            for (PartitionDetail partition : pfs.getPartitions(null)) {
                partitions.put(partition.getPartitionKey(), partition);
                // check that the mapreduce wrote the output partition metadata to all the output partitions
                Assert.assertEquals(AppWithMapReduceUsingAvroDynamicPartitioner.DynamicPartitioningMapReduce.METADATA, partition.getMetadata().asMap());
            }
            Assert.assertEquals(3, partitions.size());
            Assert.assertEquals(keyToRecordsMap.keySet(), partitions.keySet());
            // Check relative paths of the partitions. Also check that their location = pfs baseLocation + relativePath
            for (Map.Entry<PartitionKey, PartitionDetail> partitionKeyEntry : partitions.entrySet()) {
                PartitionDetail partitionDetail = partitionKeyEntry.getValue();
                String relativePath = partitionDetail.getRelativePath();
                int zip = (int) partitionKeyEntry.getKey().getField("zip");
                Assert.assertEquals(Long.toString(now) + Path.SEPARATOR + zip, relativePath);
                Assert.assertEquals(pfsBaseLocation.append(relativePath), partitionDetail.getLocation());
            }
            for (Map.Entry<PartitionKey, Collection<GenericRecord>> keyToRecordsEntry : keyToRecordsMap.asMap().entrySet()) {
                Set<GenericRecord> genericRecords = new HashSet<>(keyToRecordsEntry.getValue());
                Assert.assertEquals(genericRecords, readOutput(partitions.get(keyToRecordsEntry.getKey()).getLocation()));
            }
        }
    });
}
Also used : HashSet(java.util.HashSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) Set(java.util.Set) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) Notification(co.cask.cdap.proto.Notification) ApplicationWithPrograms(co.cask.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms) BasicArguments(co.cask.cdap.internal.app.runtime.BasicArguments) GenericRecord(org.apache.avro.generic.GenericRecord) TransactionExecutor(org.apache.tephra.TransactionExecutor) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) IOException(java.io.IOException) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) TransactionAware(org.apache.tephra.TransactionAware) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) HashMap(java.util.HashMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) Location(org.apache.twill.filesystem.Location)
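
The MapReduce under test routes each record to an output partition through a DynamicPartitioner. A simplified sketch of such a partitioner, assuming the co.cask.cdap.api.dataset.lib.DynamicPartitioner contract of deriving a PartitionKey from each output record and Hadoop's NullWritable as the key type; the real class in the test also prefixes the key with the runtime timestamp, which is omitted here.

// Sketch only: maps each Avro record to a partition by its "zip" field.
public static final class ZipDynamicPartitioner extends DynamicPartitioner<NullWritable, GenericRecord> {
    @Override
    public PartitionKey getPartitionKey(NullWritable key, GenericRecord record) {
        return PartitionKey.builder()
            .addIntField("zip", (Integer) record.get("zip"))
            .build();
    }
}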

Example 10 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

The class ConnectorSource, method prepareRun.

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    Map<String, String> arguments = new HashMap<>();
    PartitionedFileSet inputFileset = context.getDataset(datasetName);
    for (PartitionDetail partitionDetail : inputFileset.getPartitions(PartitionFilter.ALWAYS_MATCH)) {
        PartitionedFileSetArguments.addInputPartition(arguments, partitionDetail);
    }
    context.setInput(Input.ofDataset(datasetName, arguments));
}
Also used : HashMap(java.util.HashMap) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail)
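
PartitionFilter.ALWAYS_MATCH pulls in every partition; a narrower filter could be substituted. A minimal sketch, assuming the PartitionFilter builder's addValueCondition and an illustrative "zip" partitioning field:

// Hypothetical filter: only partitions whose "zip" key field equals 10011.
PartitionFilter filter = PartitionFilter.builder()
    .addValueCondition("zip", 10011)
    .build();
for (PartitionDetail partitionDetail : inputFileset.getPartitions(filter)) {
    PartitionedFileSetArguments.addInputPartition(arguments, partitionDetail);
}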

Aggregations

PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail)25 PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet)17 PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey)11 Test (org.junit.Test)11 TransactionAware (org.apache.tephra.TransactionAware)9 TransactionExecutor (org.apache.tephra.TransactionExecutor)9 Location (org.apache.twill.filesystem.Location)8 IOException (java.io.IOException)7 HashMap (java.util.HashMap)7 HashSet (java.util.HashSet)7 DataSetException (co.cask.cdap.api.dataset.DataSetException)6 PartitionNotFoundException (co.cask.cdap.api.dataset.PartitionNotFoundException)5 PartitionOutput (co.cask.cdap.api.dataset.lib.PartitionOutput)5 Predicate (co.cask.cdap.api.Predicate)3 PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter)3 ConcurrentPartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer)3 ConsumerConfiguration (co.cask.cdap.api.dataset.lib.partitioned.ConsumerConfiguration)3 PartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer)3 ApplicationManager (co.cask.cdap.test.ApplicationManager)3 ServiceManager (co.cask.cdap.test.ServiceManager)3