Example 11 with PartitionDetail

Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

The class SparkFileSetTestRun, method testSparkWithPartitionedFileSet.

private void testSparkWithPartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
    DataSetManager<PartitionedFileSet> pfsManager = getDataset("pfs");
    PartitionedFileSet pfs = pfsManager.get();
    // Create an input partition with key x="nn" and stage test input under its location
    PartitionOutput partitionOutput = pfs.getPartitionOutput(PartitionKey.builder().addStringField("x", "nn").build());
    Location location = partitionOutput.getLocation();
    prepareFileInput(location);
    partitionOutput.addPartition();
    pfsManager.flush();
    // Restrict the input to partitions whose x value lies between "na" and "nx"
    Map<String, String> inputArgs = new HashMap<>();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, PartitionFilter.builder().addRangeCondition("x", "na", "nx").build());
    Map<String, String> outputArgs = new HashMap<>();
    PartitionKey outputKey = PartitionKey.builder().addStringField("x", "xx").build();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
    Map<String, String> args = new HashMap<>();
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", inputArgs));
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", outputArgs));
    args.put("input", "pfs");
    args.put("output", "pfs");
    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
    sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
    pfsManager.flush();
    PartitionDetail partition = pfs.getPartition(outputKey);
    Assert.assertNotNull(partition);
    validateFileOutput(partition.getLocation());
    // Clean up after the test completes
    pfs.dropPartition(partitionOutput.getPartitionKey());
    pfs.dropPartition(partition.getPartitionKey());
    pfsManager.flush();
}
Also used : SparkManager(io.cdap.cdap.test.SparkManager) PartitionOutput(io.cdap.cdap.api.dataset.lib.PartitionOutput) HashMap(java.util.HashMap) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey) TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail) Location(org.apache.twill.filesystem.Location)
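
The PartitionedFileSet round trip exercised above can be reduced to a few calls. The following is a minimal sketch, not taken from the project: the class and method names (PartitionRoundTripSketch, roundTrip) are made up, and the PartitionedFileSet instance is assumed to be obtained elsewhere (for example from a DataSetManager, as in the test) and accessed within a transaction.

import io.cdap.cdap.api.dataset.lib.PartitionDetail;
import io.cdap.cdap.api.dataset.lib.PartitionKey;
import io.cdap.cdap.api.dataset.lib.PartitionOutput;
import io.cdap.cdap.api.dataset.lib.PartitionedFileSet;
import org.apache.twill.filesystem.Location;

public class PartitionRoundTripSketch {

    // Write one partition, read it back as a PartitionDetail, then drop it.
    static void roundTrip(PartitionedFileSet pfs) throws Exception {
        PartitionKey key = PartitionKey.builder().addStringField("x", "nn").build();

        // Reserve an output location for the partition, write into it, then register it.
        PartitionOutput output = pfs.getPartitionOutput(key);
        Location location = output.getLocation();
        // ... write input files under 'location' here ...
        output.addPartition();

        // getPartition returns null if no partition exists for the key.
        PartitionDetail detail = pfs.getPartition(key);
        if (detail != null) {
            System.out.println("Partition stored at " + detail.getLocation());
            pfs.dropPartition(detail.getPartitionKey());
        }
    }
}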

Example 12 with PartitionDetail

Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

The class SparkFileSetTestRun, method testSparkWithTimePartitionedFileSet.

private void testSparkWithTimePartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
    long customOutputPartitionKey = 123456789L;
    long customInputPartitionKey = 987654321L;
    DataSetManager<TimePartitionedFileSet> tpfsManager = getDataset("tpfs");
    long inputTime = System.currentTimeMillis();
    long outputTime = inputTime + TimeUnit.HOURS.toMillis(1);
    // Create two input partitions: one at the current time and one at the custom input key time
    addTimePartition(tpfsManager, inputTime);
    addTimePartition(tpfsManager, customInputPartitionKey);
    Map<String, String> inputArgs = new HashMap<>();
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, inputTime - 100);
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, inputTime + 100);
    Map<String, String> outputArgs = new HashMap<>();
    TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, outputTime);
    Map<String, String> args = new HashMap<>();
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", inputArgs));
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", outputArgs));
    args.put("input", "tpfs");
    args.put("output", "tpfs");
    args.put("outputKey", String.valueOf(customOutputPartitionKey));
    args.put("inputKey", String.valueOf(customInputPartitionKey));
    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
    sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
    tpfsManager.flush();
    TimePartitionedFileSet tpfs = tpfsManager.get();
    PartitionDetail partition = tpfs.getPartitionByTime(outputTime);
    Assert.assertNotNull("Output partition is null while for running without custom dataset arguments", partition);
    validateFileOutput(partition.getLocation());
    PartitionDetail customPartition = tpfs.getPartitionByTime(customOutputPartitionKey);
    Assert.assertNotNull("Output partition is null while for running with custom dataset arguments", customPartition);
    validateFileOutput(customPartition.getLocation());
    // Cleanup after running the test
    tpfs.dropPartition(inputTime);
    tpfs.dropPartition(customInputPartitionKey);
    tpfs.dropPartition(partition.getPartitionKey());
    tpfs.dropPartition(customPartition.getPartitionKey());
    tpfsManager.flush();
}
Also used : SparkManager(io.cdap.cdap.test.SparkManager) HashMap(java.util.HashMap) TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet) PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail)
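
For time-partitioned file sets, the input window and output partition are selected through dataset arguments and the result is read back by time. Below is a minimal sketch: the class and method names (TimePartitionArgsSketch, buildArgs, fetchOutput) are hypothetical, and the scoping of the arguments to a named dataset (RuntimeArguments.addScope in the test) is omitted here.

import io.cdap.cdap.api.dataset.lib.PartitionDetail;
import io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet;
import io.cdap.cdap.api.dataset.lib.TimePartitionedFileSetArguments;

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;

public class TimePartitionArgsSketch {

    // Select an input window of +/- 100 ms around inputTime and a single output
    // partition one hour later, expressed as plain dataset arguments.
    static Map<String, String> buildArgs(long inputTime) {
        Map<String, String> args = new HashMap<>();
        TimePartitionedFileSetArguments.setInputStartTime(args, inputTime - 100);
        TimePartitionedFileSetArguments.setInputEndTime(args, inputTime + 100);
        TimePartitionedFileSetArguments.setOutputPartitionTime(args, inputTime + TimeUnit.HOURS.toMillis(1));
        return args;
    }

    // After the program completes, the output partition can be looked up by its time.
    static PartitionDetail fetchOutput(TimePartitionedFileSet tpfs, long outputTime) throws Exception {
        return tpfs.getPartitionByTime(outputTime); // null if no partition exists for that time
    }
}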

Example 13 with PartitionDetail

Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

The class PartitionConsumingTestRun, method testWordCountOnFileSet.

private void testWordCountOnFileSet(Function<ApplicationManager, ProgramManager> runProgram, boolean produceOutputPartitionEachRun) throws Exception {
    ApplicationManager applicationManager = deployApplication(AppWithPartitionConsumers.class);
    ServiceManager serviceManager = applicationManager.getServiceManager("DatasetService").start();
    serviceManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    URL serviceURL = serviceManager.getServiceURL();
    // write a file to the file set using the service and run the WordCount MapReduce job on that one partition
    createPartition(serviceURL, LINE1, "1");
    ProgramManager programManager = runProgram.apply(applicationManager);
    programManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    Assert.assertEquals(new Long(2), getCount(serviceURL, "a"));
    Assert.assertEquals(new Long(1), getCount(serviceURL, "b"));
    Assert.assertEquals(new Long(0), getCount(serviceURL, "c"));
    // create two additional partitions
    createPartition(serviceURL, LINE2, "2");
    createPartition(serviceURL, LINE3, "3");
    // running the program again now processes these two new partitions (LINE2 and LINE3) and updates the counts
    // dataset accordingly
    programManager = runProgram.apply(applicationManager);
    programManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
    Assert.assertEquals(new Long(3), getCount(serviceURL, "a"));
    Assert.assertEquals(new Long(3), getCount(serviceURL, "b"));
    Assert.assertEquals(new Long(3), getCount(serviceURL, "c"));
    // running the program without adding new partitions does not affect the counts dataset
    programManager = runProgram.apply(applicationManager);
    programManager.waitForRuns(ProgramRunStatus.COMPLETED, 3, 5, TimeUnit.MINUTES);
    Assert.assertEquals(new Long(3), getCount(serviceURL, "a"));
    Assert.assertEquals(new Long(3), getCount(serviceURL, "b"));
    Assert.assertEquals(new Long(3), getCount(serviceURL, "c"));
    DataSetManager<PartitionedFileSet> outputLines = getDataset("outputLines");
    Set<PartitionDetail> partitions = outputLines.get().getPartitions(PartitionFilter.ALWAYS_MATCH);
    // each of the three MapReduce runs produces an output partition (even if there's no input data)
    // however, a Worker run doesn't produce a new output partition if there's no new input partition
    Assert.assertEquals(produceOutputPartitionEachRun ? 3 : 2, partitions.size());
    // we only store the counts to the "outputLines" dataset
    List<String> expectedCounts = Lists.newArrayList("1", "1", "2", "2", "3");
    List<String> outputRecords = getDataFromExplore("outputLines");
    Collections.sort(outputRecords);
    Assert.assertEquals(expectedCounts, outputRecords);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) ServiceManager(io.cdap.cdap.test.ServiceManager) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail) URL(java.net.URL) ProgramManager(io.cdap.cdap.test.ProgramManager)
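
Enumerating what a pipeline produced, as done at the end of the test above, reduces to a single getPartitions call with a filter. A minimal sketch follows, assuming a PartitionedFileSet obtained as in the test; the class and method names (PartitionCountSketch, countPartitions) are made up.

import io.cdap.cdap.api.dataset.lib.PartitionDetail;
import io.cdap.cdap.api.dataset.lib.PartitionFilter;
import io.cdap.cdap.api.dataset.lib.PartitionedFileSet;

import java.util.Set;

public class PartitionCountSketch {

    // List every registered partition and return how many there are.
    static int countPartitions(PartitionedFileSet pfs) throws Exception {
        Set<PartitionDetail> partitions = pfs.getPartitions(PartitionFilter.ALWAYS_MATCH);
        for (PartitionDetail partition : partitions) {
            System.out.println(partition.getPartitionKey() + " -> " + partition.getRelativePath());
        }
        return partitions.size();
    }
}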

Example 14 with PartitionDetail

Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

The class FileUploadServiceTestRun, method testFileUploadService.

@Test
public void testFileUploadService() throws Exception {
    ApplicationManager appManager = deployApplication(FileUploadApp.class);
    // Start the service
    ServiceManager serviceManager = appManager.getServiceManager(FileUploadApp.SERVICE_NAME).start();
    serviceManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    try {
        // Upload URL is "base/upload/pfs/[partition_value]", where the partition value is a long
        URI serviceURI = serviceManager.getServiceURL(10, TimeUnit.SECONDS).toURI();
        // Upload with wrong MD5, should get 400.
        byte[] content = Strings.repeat("0123456789 ", 100).getBytes(Charsets.UTF_8);
        Assert.assertEquals(HttpURLConnection.HTTP_BAD_REQUEST, upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(), content, "123", 30));
        long beforeUploadTime = System.currentTimeMillis();
        // Upload with right MD5, should get 200
        Assert.assertEquals(HttpURLConnection.HTTP_OK, upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(), content, Base64.getEncoder().encodeToString(Hashing.md5().hashBytes(content).asBytes()), 20));
        // Inspect the partitioned file set and verify the content
        PartitionedFileSet pfs = (PartitionedFileSet) getDataset(FileUploadApp.PFS_NAME).get();
        PartitionDetail partition = pfs.getPartition(PartitionKey.builder().addLongField("time", 1).build());
        Assert.assertNotNull(partition);
        // Verify a notification should have been published for the new partition
        List<Notification> notifications = getDataNotifications(beforeUploadTime);
        // Should have one message
        Assert.assertEquals(1, notifications.size());
        verifyDataNotification(notifications.get(0), NamespaceId.DEFAULT.dataset(FileUploadApp.PFS_NAME), Collections.singletonList(PartitionKey.builder().addLongField("time", 1L).build()));
        // There should be one file under the partition directory
        List<Location> locations = partition.getLocation().list();
        Assert.assertEquals(1, locations.size());
        Assert.assertArrayEquals(content, ByteStreams.toByteArray(Locations.newInputSupplier(locations.get(0))));
        // Verify the tracking table of chunk sizes
        KeyValueTable trackingTable = (KeyValueTable) getDataset(FileUploadApp.KV_TABLE_NAME).get();
        CloseableIterator<KeyValue<byte[], byte[]>> iter = trackingTable.scan(null, null);
        // Sum up all chunk sizes as tracked by the tracking table.
        long sum = 0;
        int iterSize = 0;
        while (iter.hasNext()) {
            KeyValue<byte[], byte[]> kv = iter.next();
            sum += Bytes.toInt(kv.getKey()) * Bytes.toLong(kv.getValue());
            iterSize++;
        }
        // The iterator should have size >= 2, since we use different chunk sizes for the two different uploads
        Assert.assertTrue(iterSize >= 2);
        // The sum of all chunk sizes should be the same as the
        // content size * 2 (since we have one failed and one successful upload)
        Assert.assertEquals(content.length * 2, sum);
    } finally {
        serviceManager.stop();
        serviceManager.waitForRun(ProgramRunStatus.KILLED, 10, TimeUnit.SECONDS);
    }
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) KeyValue(io.cdap.cdap.api.dataset.lib.KeyValue) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail) URI(java.net.URI) Notification(io.cdap.cdap.proto.Notification) ServiceManager(io.cdap.cdap.test.ServiceManager) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)
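
The lookup half of the upload test comes down to building a PartitionKey with a long field and asking the file set for the matching PartitionDetail. A minimal sketch, not part of the service itself; UploadedPartitionSketch and filesForUpload are hypothetical names, and the field name "time" matches the key used in the test.

import io.cdap.cdap.api.dataset.lib.PartitionDetail;
import io.cdap.cdap.api.dataset.lib.PartitionKey;
import io.cdap.cdap.api.dataset.lib.PartitionedFileSet;
import org.apache.twill.filesystem.Location;

import java.util.List;

public class UploadedPartitionSketch {

    // Fetch the partition written for a given upload key and list the files under it.
    static List<Location> filesForUpload(PartitionedFileSet pfs, long time) throws Exception {
        PartitionKey key = PartitionKey.builder().addLongField("time", time).build();
        PartitionDetail partition = pfs.getPartition(key);
        if (partition == null) {
            throw new IllegalStateException("No partition found for time " + time);
        }
        // The upload handler is expected to have written exactly one file here.
        return partition.getLocation().list();
    }
}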

Example 15 with PartitionDetail

Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by cdapio.

The class DynamicPartitionerWithAvroTest, method runDynamicPartitionerMR.

private void runDynamicPartitionerMR(final List<? extends GenericRecord> records, boolean allowConcurrentWriters, final boolean precreatePartitions, @Nullable final DynamicPartitioner.PartitionWriteOption partitionWriteOption, boolean expectedStatus) throws Exception {
    ApplicationWithPrograms app = deployApp(AppWithMapReduceUsingAvroDynamicPartitioner.class);
    final long now = System.currentTimeMillis();
    final Multimap<PartitionKey, GenericRecord> keyToRecordsMap = groupByPartitionKey(records, now);
    // write values to the input kvTable
    final KeyValueTable kvTable = datasetCache.getDataset(INPUT_DATASET);
    Transactions.createTransactionExecutor(txExecutorFactory, kvTable).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            // the keys are not used; it matters that they're unique though
            for (int i = 0; i < records.size(); i++) {
                kvTable.write(Integer.toString(i), records.get(i).toString());
            }
        }
    });
    final PartitionedFileSet pfs = datasetCache.getDataset(OUTPUT_DATASET);
    if (precreatePartitions) {
        Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) pfs).execute(new TransactionExecutor.Subroutine() {

            @Override
            public void apply() throws IOException {
                writeFile(pfs, createKey(now, 95111));
                writeFile(pfs, createKey(now, 98123));
                writeFile(pfs, createKey(now, 84125));
            }
        });
    }
    String allowConcurrencyKey = "dataset." + OUTPUT_DATASET + "." + PartitionedFileSetArguments.DYNAMIC_PARTITIONER_ALLOW_CONCURRENCY;
    // run the partition writer m/r with this output partition time
    Map<String, String> arguments = new HashMap<>();
    arguments.put(OUTPUT_PARTITION_KEY, Long.toString(now));
    arguments.put(allowConcurrencyKey, Boolean.toString(allowConcurrentWriters));
    if (partitionWriteOption != null) {
        arguments.put("partitionWriteOption", partitionWriteOption.name());
    }
    long startTime = System.currentTimeMillis();
    boolean status = runProgram(app, AppWithMapReduceUsingAvroDynamicPartitioner.DynamicPartitioningMapReduce.class, new BasicArguments(arguments));
    Assert.assertEquals(expectedStatus, status);
    if (!expectedStatus) {
        // if we expect the program to fail, no need to check the output data for expected results
        return;
    }
    // Verify notifications
    List<Notification> notifications = getDataNotifications(startTime);
    Assert.assertEquals(1, notifications.size());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(OUTPUT_DATASET), DatasetId.fromString(notifications.get(0).getProperties().get("datasetId")));
    // this should have created a partition in the pfs
    final Location pfsBaseLocation = pfs.getEmbeddedFileSet().getBaseLocation();
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) pfs).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws IOException {
            Map<PartitionKey, PartitionDetail> partitions = new HashMap<>();
            for (PartitionDetail partition : pfs.getPartitions(null)) {
                partitions.put(partition.getPartitionKey(), partition);
                // check that the mapreduce wrote the output partition metadata to all the output partitions
                Assert.assertEquals(getExpectedMetadata(precreatePartitions, partitionWriteOption), partition.getMetadata().asMap());
                // if files were precreated, and the option is to append, expect the empty file to exist
                // if partition write option is configured to overwrite, then the file is expected to not exist
                Location preexistingFile = partition.getLocation().append("file");
                if (precreatePartitions && partitionWriteOption == DynamicPartitioner.PartitionWriteOption.CREATE_OR_APPEND) {
                    Assert.assertTrue(preexistingFile.exists());
                    try (InputStream inputStream = preexistingFile.getInputStream()) {
                        Assert.assertEquals(-1, inputStream.read());
                    }
                } else {
                    Assert.assertFalse(preexistingFile.exists());
                }
            }
            Assert.assertEquals(3, partitions.size());
            Assert.assertEquals(keyToRecordsMap.keySet(), partitions.keySet());
            // Check relative paths of the partitions. Also check that their location = pfs baseLocation + relativePath
            for (Map.Entry<PartitionKey, PartitionDetail> partitionKeyEntry : partitions.entrySet()) {
                PartitionDetail partitionDetail = partitionKeyEntry.getValue();
                String relativePath = partitionDetail.getRelativePath();
                int zip = (int) partitionKeyEntry.getKey().getField("zip");
                Assert.assertEquals(Long.toString(now) + Path.SEPARATOR + zip, relativePath);
                Assert.assertEquals(pfsBaseLocation.append(relativePath), partitionDetail.getLocation());
            }
            for (Map.Entry<PartitionKey, Collection<GenericRecord>> keyToRecordsEntry : keyToRecordsMap.asMap().entrySet()) {
                Set<GenericRecord> genericRecords = new HashSet<>(keyToRecordsEntry.getValue());
                Assert.assertEquals(genericRecords, readOutput(partitions.get(keyToRecordsEntry.getKey()).getLocation()));
            }
        }
    });
}
Also used : HashSet(java.util.HashSet) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) Set(java.util.Set) HashMap(java.util.HashMap) PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail) Notification(io.cdap.cdap.proto.Notification) ApplicationWithPrograms(io.cdap.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms) BasicArguments(io.cdap.cdap.internal.app.runtime.BasicArguments) GenericRecord(org.apache.avro.generic.GenericRecord) InputStream(java.io.InputStream) TransactionExecutor(org.apache.tephra.TransactionExecutor) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) IOException(java.io.IOException) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) TransactionAware(org.apache.tephra.TransactionAware) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey) HashMap(java.util.HashMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) Location(org.apache.twill.filesystem.Location)
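
The verification block above boils down to reading each PartitionDetail's metadata, relative path, and absolute location, and checking that the location equals the file set's base location plus the relative path. A minimal standalone sketch follows; PartitionLayoutSketch and inspect are invented names, and the transaction wrapper used in the test is left out.

import io.cdap.cdap.api.dataset.lib.PartitionDetail;
import io.cdap.cdap.api.dataset.lib.PartitionedFileSet;
import org.apache.twill.filesystem.Location;

public class PartitionLayoutSketch {

    // Print each partition's metadata and check that its location is the
    // file set's base location plus the partition's relative path.
    static void inspect(PartitionedFileSet pfs) throws Exception {
        Location base = pfs.getEmbeddedFileSet().getBaseLocation();
        for (PartitionDetail partition : pfs.getPartitions(null)) {
            System.out.println(partition.getPartitionKey() + " metadata: " + partition.getMetadata().asMap());
            String relativePath = partition.getRelativePath();
            if (!partition.getLocation().equals(base.append(relativePath))) {
                throw new IllegalStateException("Unexpected location for partition " + relativePath);
            }
        }
    }
}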

Aggregations

PartitionDetail (io.cdap.cdap.api.dataset.lib.PartitionDetail): 45 usages
PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet): 28 usages
PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey): 23 usages
Test (org.junit.Test): 20 usages
TransactionAware (org.apache.tephra.TransactionAware): 18 usages
TransactionExecutor (org.apache.tephra.TransactionExecutor): 18 usages
IOException (java.io.IOException): 14 usages
HashMap (java.util.HashMap): 13 usages
DataSetException (io.cdap.cdap.api.dataset.DataSetException): 12 usages
Location (org.apache.twill.filesystem.Location): 11 usages
PartitionNotFoundException (io.cdap.cdap.api.dataset.PartitionNotFoundException): 10 usages
PartitionAlreadyExistsException (io.cdap.cdap.api.dataset.lib.PartitionAlreadyExistsException): 10 usages
PartitionOutput (io.cdap.cdap.api.dataset.lib.PartitionOutput): 10 usages
HashSet (java.util.HashSet): 10 usages
List (java.util.List): 8 usages
PartitionFilter (io.cdap.cdap.api.dataset.lib.PartitionFilter): 7 usages
ArrayList (java.util.ArrayList): 7 usages
ImmutableMap (com.google.common.collect.ImmutableMap): 6 usages
Predicate (io.cdap.cdap.api.Predicate): 6 usages
Partition (io.cdap.cdap.api.dataset.lib.Partition): 6 usages