Example 1 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

From the class DynamicPartitionerWithAvroTest, the method runDynamicPartitionerMapReduce:

private void runDynamicPartitionerMapReduce(final List<? extends GenericRecord> records, boolean allowConcurrentWriters, boolean expectedStatus) throws Exception {
    ApplicationWithPrograms app = deployApp(AppWithMapReduceUsingAvroDynamicPartitioner.class);
    final long now = System.currentTimeMillis();
    final Multimap<PartitionKey, GenericRecord> keyToRecordsMap = groupByPartitionKey(records, now);
    // write values to the input kvTable
    final KeyValueTable kvTable = datasetCache.getDataset(INPUT_DATASET);
    Transactions.createTransactionExecutor(txExecutorFactory, kvTable).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            // the keys are not used; it matters that they're unique though
            for (int i = 0; i < records.size(); i++) {
                kvTable.write(Integer.toString(i), records.get(i).toString());
            }
        }
    });
    String allowConcurrencyKey = "dataset." + OUTPUT_DATASET + "." + PartitionedFileSetArguments.DYNAMIC_PARTITIONER_ALLOW_CONCURRENCY;
    // run the partition writer MapReduce with this output partition time
    ImmutableMap<String, String> arguments = ImmutableMap.of(OUTPUT_PARTITION_KEY, Long.toString(now), allowConcurrencyKey, Boolean.toString(allowConcurrentWriters));
    long startTime = System.currentTimeMillis();
    boolean status = runProgram(app, AppWithMapReduceUsingAvroDynamicPartitioner.DynamicPartitioningMapReduce.class, new BasicArguments(arguments));
    Assert.assertEquals(expectedStatus, status);
    if (!expectedStatus) {
        // if we expect the program to fail, no need to check the output data for expected results
        return;
    }
    // Verify notifications
    List<Notification> notifications = getDataNotifications(startTime);
    Assert.assertEquals(1, notifications.size());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(OUTPUT_DATASET), DatasetId.fromString(notifications.get(0).getProperties().get("datasetId")));
    // this should have created a partition in the pfs
    final PartitionedFileSet pfs = datasetCache.getDataset(OUTPUT_DATASET);
    final Location pfsBaseLocation = pfs.getEmbeddedFileSet().getBaseLocation();
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) pfs).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws IOException {
            Map<PartitionKey, PartitionDetail> partitions = new HashMap<>();
            for (PartitionDetail partition : pfs.getPartitions(null)) {
                partitions.put(partition.getPartitionKey(), partition);
                // check that the mapreduce wrote the output partition metadata to all the output partitions
                Assert.assertEquals(AppWithMapReduceUsingAvroDynamicPartitioner.DynamicPartitioningMapReduce.METADATA, partition.getMetadata().asMap());
            }
            Assert.assertEquals(3, partitions.size());
            Assert.assertEquals(keyToRecordsMap.keySet(), partitions.keySet());
            // Check relative paths of the partitions. Also check that their location = pfs baseLocation + relativePath
            for (Map.Entry<PartitionKey, PartitionDetail> partitionKeyEntry : partitions.entrySet()) {
                PartitionDetail partitionDetail = partitionKeyEntry.getValue();
                String relativePath = partitionDetail.getRelativePath();
                int zip = (int) partitionKeyEntry.getKey().getField("zip");
                Assert.assertEquals(Long.toString(now) + Path.SEPARATOR + zip, relativePath);
                Assert.assertEquals(pfsBaseLocation.append(relativePath), partitionDetail.getLocation());
            }
            for (Map.Entry<PartitionKey, Collection<GenericRecord>> keyToRecordsEntry : keyToRecordsMap.asMap().entrySet()) {
                Set<GenericRecord> genericRecords = new HashSet<>(keyToRecordsEntry.getValue());
                Assert.assertEquals(genericRecords, readOutput(partitions.get(keyToRecordsEntry.getKey()).getLocation()));
            }
        }
    });
}
Also used : HashSet(java.util.HashSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) Set(java.util.Set) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) Notification(co.cask.cdap.proto.Notification) ApplicationWithPrograms(co.cask.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms) BasicArguments(co.cask.cdap.internal.app.runtime.BasicArguments) GenericRecord(org.apache.avro.generic.GenericRecord) TransactionExecutor(org.apache.tephra.TransactionExecutor) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) IOException(java.io.IOException) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) TransactionAware(org.apache.tephra.TransactionAware) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) HashMap(java.util.HashMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) Location(org.apache.twill.filesystem.Location)
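The application deployed above supplies its own DynamicPartitioner, which is what turns each output record into a partition key at write time; that class is not reproduced on this page. Below is a minimal sketch of such a partitioner, assuming an Avro record with a "zip" field and an output time passed in through a runtime argument (the class name and the argument name are assumptions, not the actual AppWithMapReduceUsingAvroDynamicPartitioner code):

import co.cask.cdap.api.dataset.lib.DynamicPartitioner;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.mapreduce.MapReduceTaskContext;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.NullWritable;

// Hypothetical partitioner: derives a (time, zip) partition key for every record the MapReduce writes,
// matching the "<time>/<zip>" relative paths asserted in the test above.
public class TimeAndZipPartitioner extends DynamicPartitioner<AvroKey<GenericRecord>, NullWritable> {

    private long outputTime;

    @Override
    public void initialize(MapReduceTaskContext<AvroKey<GenericRecord>, NullWritable> context) {
        // "output.partition.time" is an assumed runtime argument name, standing in for the
        // OUTPUT_PARTITION_KEY argument the test passes with the current time
        this.outputTime = Long.parseLong(context.getRuntimeArguments().get("output.partition.time"));
    }

    @Override
    public PartitionKey getPartitionKey(AvroKey<GenericRecord> key, NullWritable value) {
        return PartitionKey.builder()
            .addLongField("time", outputTime)
            .addIntField("zip", (Integer) key.datum().get("zip"))
            .build();
    }
}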

Example 2 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

From the class SparkFileSetTestRun, the method testSparkWithPartitionedFileSet:

private void testSparkWithPartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
    DataSetManager<PartitionedFileSet> pfsManager = getDataset("pfs");
    PartitionedFileSet pfs = pfsManager.get();
    PartitionOutput partitionOutput = pfs.getPartitionOutput(PartitionKey.builder().addStringField("x", "nn").build());
    Location location = partitionOutput.getLocation();
    prepareFileInput(location);
    partitionOutput.addPartition();
    pfsManager.flush();
    Map<String, String> inputArgs = new HashMap<>();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, PartitionFilter.builder().addRangeCondition("x", "na", "nx").build());
    Map<String, String> outputArgs = new HashMap<>();
    PartitionKey outputKey = PartitionKey.builder().addStringField("x", "xx").build();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
    Map<String, String> args = new HashMap<>();
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", inputArgs));
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", outputArgs));
    args.put("input", "pfs");
    args.put("output", "pfs");
    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
    sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
    pfsManager.flush();
    PartitionDetail partition = pfs.getPartition(outputKey);
    Assert.assertNotNull(partition);
    validateFileOutput(partition.getLocation());
    // Clean up after the test completes
    pfs.dropPartition(partitionOutput.getPartitionKey());
    pfs.dropPartition(partition.getPartitionKey());
    pfsManager.flush();
}
Also used : SparkManager(co.cask.cdap.test.SparkManager) PartitionOutput(co.cask.cdap.api.dataset.lib.PartitionOutput) HashMap(java.util.HashMap) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) TimePartitionedFileSet(co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) Location(org.apache.twill.filesystem.Location)
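The test relies on a dataset named "pfs" whose partitioning has a single string field "x"; the application that creates it is not shown here. A minimal sketch of how such a PartitionedFileSet could be declared in an application's configure() method (the application class name and the text input/output formats are assumptions):

import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties;
import co.cask.cdap.api.dataset.lib.Partitioning;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// Hypothetical application: creates the "pfs" dataset with a single string partition field "x",
// so that PartitionKey.builder().addStringField("x", ...) and the range filter on "x" above are valid.
public class SparkFileSetExampleApp extends AbstractApplication {

    @Override
    public void configure() {
        createDataset("pfs", PartitionedFileSet.class, PartitionedFileSetProperties.builder()
            .setPartitioning(Partitioning.builder().addStringField("x").build())
            // assumed formats; the real application may use different input/output formats
            .setInputFormat(TextInputFormat.class)
            .setOutputFormat(TextOutputFormat.class)
            .build());
    }
}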

Example 3 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

From the class FileUploadServiceTestRun, the method testFileUploadService:

@Test
public void testFileUploadService() throws Exception {
    ApplicationManager appManager = deployApplication(FileUploadApp.class);
    // Start the service
    ServiceManager serviceManager = appManager.getServiceManager(FileUploadApp.SERVICE_NAME).start();
    serviceManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    try {
        // Upload URL is "base/upload/pfs/[partition_value]", where the partition value is a long
        URI serviceURI = serviceManager.getServiceURL(10, TimeUnit.SECONDS).toURI();
        // Upload with wrong MD5, should get 400.
        byte[] content = Strings.repeat("0123456789 ", 100).getBytes(Charsets.UTF_8);
        Assert.assertEquals(HttpURLConnection.HTTP_BAD_REQUEST, upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(), content, "123", 30));
        long beforeUploadTime = System.currentTimeMillis();
        // Upload with right MD5, should get 200
        Assert.assertEquals(HttpURLConnection.HTTP_OK, upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(), content, BaseEncoding.base64().encode(Hashing.md5().hashBytes(content).asBytes()), 20));
        // Inspect the partitioned file set and verify the content
        PartitionedFileSet pfs = (PartitionedFileSet) getDataset(FileUploadApp.PFS_NAME).get();
        PartitionDetail partition = pfs.getPartition(PartitionKey.builder().addLongField("time", 1).build());
        Assert.assertNotNull(partition);
        // Verify a notification should have been published for the new partition
        List<Notification> notifications = getDataNotifications(beforeUploadTime);
        // Should have one message
        Assert.assertEquals(1, notifications.size());
        verifyDataNotification(notifications.get(0), NamespaceId.DEFAULT.dataset(FileUploadApp.PFS_NAME), Collections.singletonList(PartitionKey.builder().addLongField("time", 1L).build()));
        // There should be one file under the partition directory
        List<Location> locations = partition.getLocation().list();
        Assert.assertEquals(1, locations.size());
        Assert.assertArrayEquals(content, ByteStreams.toByteArray(Locations.newInputSupplier(locations.get(0))));
        // Verify the tracking table of chunk sizes
        KeyValueTable trackingTable = (KeyValueTable) getDataset(FileUploadApp.KV_TABLE_NAME).get();
        CloseableIterator<KeyValue<byte[], byte[]>> iter = trackingTable.scan(null, null);
        // Sum up all chunk sizes as tracked by the tracking table.
        long sum = 0;
        int iterSize = 0;
        while (iter.hasNext()) {
            KeyValue<byte[], byte[]> kv = iter.next();
            sum += Bytes.toInt(kv.getKey()) * Bytes.toLong(kv.getValue());
            iterSize++;
        }
        // The iterator should have size >= 2, since we use different chunk sizes for the two different uploads
        Assert.assertTrue(iterSize >= 2);
        // The sum of all chunk sizes should be the same as the
        // content size * 2 (since we have one failed and one successful upload)
        Assert.assertEquals(content.length * 2, sum);
    } finally {
        serviceManager.stop();
        serviceManager.waitForRun(ProgramRunStatus.KILLED, 10, TimeUnit.SECONDS);
    }
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) KeyValue(co.cask.cdap.api.dataset.lib.KeyValue) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) URI(java.net.URI) Notification(co.cask.cdap.proto.Notification) ServiceManager(co.cask.cdap.test.ServiceManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)
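On the service side, the upload handler is expected to turn a successful upload into a new partition keyed by the long "time" field. Below is a minimal sketch of that step, assuming the handler already has the PartitionedFileSet and the validated upload bytes in hand (the method and file names are hypothetical, not FileUploadApp's actual code):

import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionOutput;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import org.apache.twill.filesystem.Location;
import java.io.IOException;
import java.io.OutputStream;

// Hypothetical handler fragment: writes the uploaded content into a new partition and registers it.
void writeUpload(PartitionedFileSet pfs, long time, byte[] content) throws IOException {
    PartitionOutput output = pfs.getPartitionOutput(PartitionKey.builder().addLongField("time", time).build());
    // one file under the partition directory, matching the single location the test finds
    Location file = output.getLocation().append("upload");
    try (OutputStream os = file.getOutputStream()) {
        os.write(content);
    }
    // until addPartition() is called, the partition (and its notification) is not visible
    output.addPartition();
}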

Example 4 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

From the class ClicksAndViewsMapReduceTest, the method getDataFromFile:

private Set<String> getDataFromFile() throws Exception {
    DataSetManager<PartitionedFileSet> cleanRecords = getDataset(ClicksAndViews.JOINED);
    Set<String> cleanData = new HashSet<>();
    // we configured the MapReduce to write to this partition when starting it
    PartitionDetail partition = cleanRecords.get().getPartition(PartitionKey.builder().addLongField("runtime", OUTPUT_PARTITION_RUNTIME).build());
    Assert.assertNotNull(partition);
    for (Location location : partition.getLocation().list()) {
        if (location.getName().startsWith("part-")) {
            try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(location.getInputStream()))) {
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    cleanData.add(line);
                }
            }
        }
    }
    return cleanData;
}
Also used : InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) HashSet(java.util.HashSet) Location(org.apache.twill.filesystem.Location)
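getDataFromFile() only reads the output back; the partition it looks up is selected when the MapReduce is started, by setting an output partition key in dataset-scoped runtime arguments. A minimal sketch of that start-up step follows (the MapReduce program name and the helper method are assumptions; the argument scoping mirrors Example 2):

import co.cask.cdap.api.common.RuntimeArguments;
import co.cask.cdap.api.common.Scope;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;
import co.cask.cdap.proto.ProgramRunStatus;
import co.cask.cdap.test.ApplicationManager;
import co.cask.cdap.test.MapReduceManager;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;

// Hypothetical helper: starts the joining MapReduce so that it writes into the
// runtime = OUTPUT_PARTITION_RUNTIME partition that getDataFromFile() reads back.
private void startJoiningMapReduce(ApplicationManager applicationManager) throws Exception {
    Map<String, String> outputArgs = new HashMap<>();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs,
        PartitionKey.builder().addLongField("runtime", OUTPUT_PARTITION_RUNTIME).build());
    // scope the arguments to the JOINED dataset, as in Example 2
    Map<String, String> args = new HashMap<>(RuntimeArguments.addScope(Scope.DATASET, ClicksAndViews.JOINED, outputArgs));
    MapReduceManager mrManager = applicationManager.getMapReduceManager("ClicksAndViewsMapReduce").start(args);
    mrManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
}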

Example 5 with PartitionDetail

Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

From the class PartitionedFileSetDataset, the method consumePartitions:

// PartitionConsumerState consists of two things:
// 1) A list of transaction IDs representing the transactions that were in progress during the previous call.
// Each of these transaction IDs needs to be checked for new partitions, because partitions may have been created
// by those transactions since the previous call.
// 2) A transaction ID from which to start scanning for new partitions. This is the exclusive end of the range at
// which the previous call stopped scanning for partitions.
// Note that each of the transaction IDs in (1) will be smaller than the transaction ID in (2).
@ReadWrite
@Override
public PartitionConsumerResult consumePartitions(PartitionConsumerState partitionConsumerState, int limit, Predicate<PartitionDetail> predicate) {
    List<Long> previousInProgress = partitionConsumerState.getVersionsToCheck();
    Set<Long> noLongerInProgress = setDiff(previousInProgress, tx.getInProgress());
    List<PartitionDetail> partitions = Lists.newArrayList();
    Iterator<Long> iter = noLongerInProgress.iterator();
    while (iter.hasNext()) {
        Long txId = iter.next();
        if (partitions.size() >= limit) {
            break;
        }
        try (Scanner scanner = partitionsTable.readByIndex(WRITE_PTR_COL, Bytes.toBytes(txId))) {
            scannerToPartitions(scanner, partitions, limit, predicate);
        }
        // remove the txId, since its partitions have already been added to the partitions list;
        // if it is not removed, it will be persisted in the state for the next scan
        iter.remove();
    }
    // exclusive scan end, to be used as the start for the next call to consumePartitions
    long scanUpTo;
    if (partitions.size() < limit) {
        // do not read our own writes (partitions created by the current transaction)
        scanUpTo = Math.min(tx.getWritePointer(), tx.getReadPointer() + 1);
        Long endTxId;
        try (Scanner scanner = partitionsTable.scanByIndex(WRITE_PTR_COL, Bytes.toBytes(partitionConsumerState.getStartVersion()), Bytes.toBytes(scanUpTo))) {
            endTxId = scannerToPartitions(scanner, partitions, limit, predicate);
        }
        if (endTxId != null) {
            // nonnull means that the scanner was not exhausted
            scanUpTo = endTxId;
        }
    } else {
        // if we have already hit the limit, don't scan; instead, carry the startVersion over as the start
        // for the next call to consumePartitions
        scanUpTo = partitionConsumerState.getStartVersion();
    }
    List<Long> inProgressBeforeScanEnd = Lists.newArrayList(noLongerInProgress);
    for (long txId : tx.getInProgress()) {
        if (txId >= scanUpTo) {
            break;
        }
        inProgressBeforeScanEnd.add(txId);
    }
    return new PartitionConsumerResult(new PartitionConsumerState(scanUpTo, inProgressBeforeScanEnd), partitions);
}
Also used : Scanner(co.cask.cdap.api.dataset.table.Scanner) PartitionConsumerResult(co.cask.cdap.api.dataset.lib.PartitionConsumerResult) PartitionConsumerState(co.cask.cdap.api.dataset.lib.PartitionConsumerState) AtomicLong(java.util.concurrent.atomic.AtomicLong) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) ReadWrite(co.cask.cdap.api.annotation.ReadWrite)
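From a caller's point of view, the state produced by consumePartitions is meant to be threaded through successive calls: start from PartitionConsumerState.FROM_BEGINNING, process the returned partitions, then feed the returned state back in on the next call. Below is a minimal sketch of one such step (process(...) is a hypothetical per-partition handler, not part of the CDAP API):

import co.cask.cdap.api.dataset.lib.PartitionConsumerResult;
import co.cask.cdap.api.dataset.lib.PartitionConsumerState;
import co.cask.cdap.api.dataset.lib.PartitionDetail;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;

// Hypothetical caller fragment: consumes whatever partitions became visible since the previous call.
PartitionConsumerState consumeNewPartitions(PartitionedFileSet pfs, PartitionConsumerState state) {
    PartitionConsumerResult result = pfs.consumePartitions(state);
    for (PartitionDetail partition : result.getPartitions()) {
        process(partition);  // hypothetical handler for each newly visible partition
    }
    // the returned state carries the still-in-progress transactions and the next scan start;
    // persist it and pass it into the next call
    return result.getNextState();
}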

Aggregations

PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail): 26 usages
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 18 usages
PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey): 12 usages
Test (org.junit.Test): 11 usages
TransactionAware (org.apache.tephra.TransactionAware): 10 usages
TransactionExecutor (org.apache.tephra.TransactionExecutor): 10 usages
Location (org.apache.twill.filesystem.Location): 9 usages
IOException (java.io.IOException): 8 usages
HashMap (java.util.HashMap): 8 usages
HashSet (java.util.HashSet): 8 usages
DataSetException (co.cask.cdap.api.dataset.DataSetException): 6 usages
PartitionNotFoundException (co.cask.cdap.api.dataset.PartitionNotFoundException): 5 usages
PartitionAlreadyExistsException (co.cask.cdap.api.dataset.lib.PartitionAlreadyExistsException): 5 usages
PartitionOutput (co.cask.cdap.api.dataset.lib.PartitionOutput): 5 usages
ImmutableMap (com.google.common.collect.ImmutableMap): 4 usages
Predicate (co.cask.cdap.api.Predicate): 3 usages
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 3 usages
Partition (co.cask.cdap.api.dataset.lib.Partition): 3 usages
PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter): 3 usages
ConcurrentPartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer): 3 usages