Example 6 with KeyValue

use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.

the class FileUploadServiceTestRun method testFileUploadService.

@Test
public void testFileUploadService() throws Exception {
    ApplicationManager appManager = deployApplication(FileUploadApp.class);
    // Start the service
    ServiceManager serviceManager = appManager.getServiceManager(FileUploadApp.SERVICE_NAME).start();
    serviceManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    try {
        // Upload URL is "base/upload/pfs/[partition_value], which the partition value is a long
        URI serviceURI = serviceManager.getServiceURL(10, TimeUnit.SECONDS).toURI();
        // Upload with wrong MD5, should get 400.
        byte[] content = Strings.repeat("0123456789 ", 100).getBytes(Charsets.UTF_8);
        Assert.assertEquals(HttpURLConnection.HTTP_BAD_REQUEST, upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(), content, "123", 30));
        long beforeUploadTime = System.currentTimeMillis();
        // Upload with right MD5, should get 200
        Assert.assertEquals(HttpURLConnection.HTTP_OK, upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(), content, Base64.getEncoder().encodeToString(Hashing.md5().hashBytes(content).asBytes()), 20));
        // Inspect the partitioned file set and verify the content
        PartitionedFileSet pfs = (PartitionedFileSet) getDataset(FileUploadApp.PFS_NAME).get();
        PartitionDetail partition = pfs.getPartition(PartitionKey.builder().addLongField("time", 1).build());
        Assert.assertNotNull(partition);
        // Verify that a notification was published for the new partition
        List<Notification> notifications = getDataNotifications(beforeUploadTime);
        // Should have one message
        Assert.assertEquals(1, notifications.size());
        verifyDataNotification(notifications.get(0), NamespaceId.DEFAULT.dataset(FileUploadApp.PFS_NAME), Collections.singletonList(PartitionKey.builder().addLongField("time", 1L).build()));
        // There should be one file under the partition directory
        List<Location> locations = partition.getLocation().list();
        Assert.assertEquals(1, locations.size());
        Assert.assertArrayEquals(content, ByteStreams.toByteArray(Locations.newInputSupplier(locations.get(0))));
        // Verify the tracking table of chunk sizes
        KeyValueTable trackingTable = (KeyValueTable) getDataset(FileUploadApp.KV_TABLE_NAME).get();
        // Sum up all chunk sizes tracked by the tracking table.
        long sum = 0;
        int iterSize = 0;
        try (CloseableIterator<KeyValue<byte[], byte[]>> iter = trackingTable.scan(null, null)) {
            while (iter.hasNext()) {
                KeyValue<byte[], byte[]> kv = iter.next();
                sum += Bytes.toInt(kv.getKey()) * Bytes.toLong(kv.getValue());
                iterSize++;
            }
        }
        // The iterator should have at least two entries, since the two uploads use different chunk sizes
        Assert.assertTrue(iterSize >= 2);
        // The sum of all chunk sizes should equal the content size * 2
        // (one failed upload and one successful upload)
        Assert.assertEquals(content.length * 2, sum);
    } finally {
        serviceManager.stop();
        serviceManager.waitForRun(ProgramRunStatus.KILLED, 10, TimeUnit.SECONDS);
    }
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) KeyValue(io.cdap.cdap.api.dataset.lib.KeyValue) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail) URI(java.net.URI) Notification(io.cdap.cdap.proto.Notification) ServiceManager(io.cdap.cdap.test.ServiceManager) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)
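
The upload helper is not shown in this example. A minimal sketch, assuming it streams the content with HTTP chunked transfer encoding at the given chunk size and sends the digest in a Content-MD5 header (consistent with the 400 response for a wrong digest and the chunk-size bookkeeping above); the HTTP method is an assumption:

import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;

// Sketch only: the PUT method and the Content-MD5 header name are assumptions.
private int upload(URL url, byte[] content, String md5, int chunkSize) throws IOException {
    HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
    urlConn.setDoOutput(true);
    urlConn.setRequestMethod("PUT");
    // Stream in fixed-size chunks so the service sees and tracks the chunk size
    urlConn.setChunkedStreamingMode(chunkSize);
    urlConn.setRequestProperty("Content-MD5", md5);
    try (OutputStream os = urlConn.getOutputStream()) {
        os.write(content);
    }
    try {
        return urlConn.getResponseCode();
    } finally {
        urlConn.disconnect();
    }
}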

Example 7 with KeyValue

use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.

the class Spark2Test method testSparkWithLocalFiles.

private void testSparkWithLocalFiles(Class<? extends Application> appClass, String sparkProgram, String prefix) throws Exception {
    ApplicationManager applicationManager = deploy(NamespaceId.DEFAULT, appClass);
    URI localFile = createLocalPropertiesFile(prefix);
    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(Collections.singletonMap(SparkAppUsingLocalFiles.LOCAL_FILE_RUNTIME_ARG, localFile.toString()));
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    sparkManager.waitForStopped(120, TimeUnit.SECONDS);
    DataSetManager<KeyValueTable> kvTableManager = getDataset(SparkAppUsingLocalFiles.OUTPUT_DATASET_NAME);
    KeyValueTable kvTable = kvTableManager.get();
    Map<String, String> expected = ImmutableMap.of("a", "1", "b", "2", "c", "3");
    List<byte[]> deleteKeys = new ArrayList<>();
    try (CloseableIterator<KeyValue<byte[], byte[]>> scan = kvTable.scan(null, null)) {
        for (int i = 0; i < 3; i++) {
            KeyValue<byte[], byte[]> next = scan.next();
            Assert.assertEquals(expected.get(Bytes.toString(next.getKey())), Bytes.toString(next.getValue()));
            deleteKeys.add(next.getKey());
        }
        Assert.assertFalse(scan.hasNext());
    }
    // Cleanup after run
    kvTableManager.flush();
    for (byte[] key : deleteKeys) {
        kvTable.delete(key);
    }
    kvTableManager.flush();
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) SparkManager(io.cdap.cdap.test.SparkManager) KeyValue(io.cdap.cdap.api.dataset.lib.KeyValue) ArrayList(java.util.ArrayList) URI(java.net.URI) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable)
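
createLocalPropertiesFile is not shown either. A minimal sketch, assuming it writes the three pairs the assertions expect ("a=1", "b=2", "c=3") to a temporary file named with the given prefix:

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URI;

// Sketch only: the file contents mirror the expected map in the test above.
private URI createLocalPropertiesFile(String prefix) throws IOException {
    File file = File.createTempFile(prefix, ".properties");
    file.deleteOnExit();
    try (FileWriter writer = new FileWriter(file)) {
        writer.write("a=1\nb=2\nc=3\n");
    }
    return file.toURI();
}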

Example 8 with KeyValue

use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.

the class ObjectMappedTableDatasetTest method testScan.

@Test
public void testScan() throws Exception {
    dsFrameworkUtil.createInstance(ObjectMappedTable.class.getName(), RECORDS_ID, ObjectMappedTableProperties.builder().setType(Record.class).build());
    try {
        final ObjectMappedTableDataset<Record> records = dsFrameworkUtil.getInstance(RECORDS_ID);
        TransactionExecutor txnl = dsFrameworkUtil.newInMemoryTransactionExecutor((TransactionAware) records);
        Record record1 = new Record(Integer.MAX_VALUE, Long.MAX_VALUE, Float.MAX_VALUE, Double.MAX_VALUE, "foobar", Bytes.toBytes("foobar"), ByteBuffer.wrap(Bytes.toBytes("foobar")), UUID.randomUUID());
        Record record2 = new Record(Integer.MIN_VALUE, Long.MIN_VALUE, Float.MIN_VALUE, Double.MIN_VALUE, "baz", Bytes.toBytes("baz"), ByteBuffer.wrap(Bytes.toBytes("baz")), UUID.randomUUID());
        Record record3 = new Record(1, 0L, 3.14f, 3.14159265358979323846, "hello", Bytes.toBytes("world"), ByteBuffer.wrap(Bytes.toBytes("yo")), UUID.randomUUID());
        final List<KeyValue<byte[], Record>> recordList = Lists.newArrayList();
        recordList.add(new KeyValue<>(Bytes.toBytes("123"), record1));
        recordList.add(new KeyValue<>(Bytes.toBytes("456"), record2));
        recordList.add(new KeyValue<>(Bytes.toBytes("789"), record3));
        for (final KeyValue<byte[], Record> record : recordList) {
            txnl.execute(new TransactionExecutor.Subroutine() {

                @Override
                public void apply() throws Exception {
                    records.write(record.getKey(), record.getValue());
                }
            });
        }
        final List<KeyValue<byte[], Record>> actualList = Lists.newArrayList();
        txnl.execute(new TransactionExecutor.Subroutine() {

            @Override
            public void apply() throws Exception {
                CloseableIterator<KeyValue<byte[], Record>> results = records.scan((String) null, null);
                while (results.hasNext()) {
                    actualList.add(results.next());
                }
                results.close();
            }
        });
        Assert.assertEquals(recordList.size(), actualList.size());
        for (int i = 0; i < actualList.size(); i++) {
            KeyValue<byte[], Record> expected = recordList.get(i);
            KeyValue<byte[], Record> actual = actualList.get(i);
            Assert.assertArrayEquals(expected.getKey(), actual.getKey());
            Assert.assertEquals(expected.getValue(), actual.getValue());
        }
        txnl.execute(new TransactionExecutor.Subroutine() {

            @Override
            public void apply() throws Exception {
                CloseableIterator<KeyValue<byte[], Record>> results = records.scan("789", null);
                KeyValue<byte[], Record> actualRecord = results.next();
                Assert.assertFalse(results.hasNext());
                Assert.assertArrayEquals(actualRecord.getKey(), recordList.get(2).getKey());
                Assert.assertEquals(actualRecord.getValue(), recordList.get(2).getValue());
                results.close();
                results = records.scan(null, "124");
                actualRecord = results.next();
                Assert.assertFalse(results.hasNext());
                Assert.assertArrayEquals(actualRecord.getKey(), recordList.get(0).getKey());
                Assert.assertEquals(actualRecord.getValue(), recordList.get(0).getValue());
                results.close();
                results = records.scan(null, "123");
                Assert.assertFalse(results.hasNext());
                results.close();
            }
        });
        actualList.clear();
        txnl.execute(new TransactionExecutor.Subroutine() {

            @Override
            public void apply() throws Exception {
                Scan scan = new Scan(null, null);
                try (CloseableIterator<KeyValue<byte[], Record>> results = records.scan(scan)) {
                    while (results.hasNext()) {
                        actualList.add(results.next());
                    }
                }
            }
        });
        Assert.assertEquals(recordList.size(), actualList.size());
    } finally {
        dsFrameworkUtil.deleteInstance(RECORDS_ID);
    }
}
Also used : CloseableIterator(io.cdap.cdap.api.dataset.lib.CloseableIterator) KeyValue(io.cdap.cdap.api.dataset.lib.KeyValue) TransactionExecutor(org.apache.tephra.TransactionExecutor) Scan(io.cdap.cdap.api.dataset.table.Scan) ObjectMappedTable(io.cdap.cdap.api.dataset.lib.ObjectMappedTable) Test(org.junit.Test)
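
Since TransactionExecutor.Subroutine declares a single apply() method, the anonymous classes above can be collapsed into lambdas on a Java 8+ source level. A sketch of the write loop in that form:

// Equivalent lambda form of the write loop; behavior is unchanged.
for (KeyValue<byte[], Record> record : recordList) {
    txnl.execute(() -> records.write(record.getKey(), record.getValue()));
}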

Example 9 with KeyValue

use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.

the class SparkTest method testSparkWithGetDataset.

private void testSparkWithGetDataset(Class<? extends Application> appClass, String sparkProgram) throws Exception {
    ApplicationManager applicationManager = deploy(appClass);
    DataSetManager<FileSet> filesetManager = getDataset("logs");
    FileSet fileset = filesetManager.get();
    Location location = fileset.getLocation("nn");
    prepareInputFileSetWithLogData(location);
    Map<String, String> inputArgs = new HashMap<>();
    FileSetArguments.setInputPath(inputArgs, "nn");
    Map<String, String> args = new HashMap<>();
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "logs", inputArgs));
    args.put("input", "logs");
    args.put("output", "logStats");
    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram);
    sparkManager.startAndWaitForGoodRun(args, ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
    DataSetManager<KeyValueTable> logStatsManager = getDataset("logStats");
    KeyValueTable logStatsTable = logStatsManager.get();
    validateGetDatasetOutput(logStatsTable);
    // Cleanup after run
    location.delete(true);
    logStatsManager.flush();
    try (CloseableIterator<KeyValue<byte[], byte[]>> scan = logStatsTable.scan(null, null)) {
        while (scan.hasNext()) {
            logStatsTable.delete(scan.next().getKey());
        }
    }
    logStatsManager.flush();
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) SparkManager(io.cdap.cdap.test.SparkManager) KeyValue(io.cdap.cdap.api.dataset.lib.KeyValue) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) IdentityHashMap(java.util.IdentityHashMap) HashMap(java.util.HashMap) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) Location(org.apache.twill.filesystem.Location)
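
validateGetDatasetOutput is not shown, and its exact assertions depend on the Spark program under test. A hypothetical placeholder that only checks the table is non-empty:

// Hypothetical sketch: the real test asserts specific log statistics; this
// stand-in only verifies that the Spark program wrote at least one entry.
private void validateGetDatasetOutput(KeyValueTable logStatsTable) {
    try (CloseableIterator<KeyValue<byte[], byte[]>> scan = logStatsTable.scan(null, null)) {
        Assert.assertTrue("logStats table should not be empty", scan.hasNext());
    }
}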

Example 10 with KeyValue

use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.

the class SparkBatchSinkFactory method writeFromRDD.

/**
 * Writes the given RDD using one or more OutputFormats or CDAP datasets.
 * Returns the names of the outputs written using an OutputFormatProvider, for which lineage needs to be registered.
 */
public <K, V> Set<String> writeFromRDD(JavaPairRDD<K, V> rdd, JavaSparkExecutionContext sec, String sinkName) {
    Set<String> outputNames = sinkOutputs.get(sinkName);
    if (outputNames == null || outputNames.isEmpty()) {
        // should never happen if validation happened correctly at pipeline configure time
        throw new IllegalArgumentException(sinkName + " has no outputs. " + "Please check that the sink calls addOutput at some point.");
    }
    Set<String> lineageNames = new HashSet<>();
    Map<String, OutputFormatProvider> outputFormats = new HashMap<>();
    for (String outputName : outputNames) {
        NamedOutputFormatProvider outputFormatProvider = outputFormatProviders.get(outputName);
        if (outputFormatProvider != null) {
            outputFormats.put(outputName, outputFormatProvider);
            lineageNames.add(outputFormatProvider.name);
        }
        DatasetInfo datasetInfo = datasetInfos.get(outputName);
        if (datasetInfo != null) {
            sec.saveAsDataset(rdd, datasetInfo.getDatasetName(), datasetInfo.getDatasetArgs());
        }
    }
    if (outputFormats.isEmpty()) {
        return lineageNames;
    }
    if (outputFormats.size() == 1) {
        RDDUtils.saveUsingOutputFormat(outputFormats.values().iterator().next(), rdd);
        return lineageNames;
    }
    Configuration hConf = new Configuration();
    Map<String, Set<String>> sinkOutputs = Collections.singletonMap(sinkName, outputFormats.keySet());
    MultiOutputFormat.addOutputs(hConf, outputFormats, sinkOutputs);
    hConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, MultiOutputFormat.class.getName());
    // MultiOutputFormat requires the key to be the sink name and the value to be the actual key-value to
    // send to the delegate output format.
    JavaPairRDD<String, KeyValue<K, V>> multiRDD = rdd.mapToPair(kv -> new Tuple2<>(sinkName, new KeyValue<>(kv._1(), kv._2())));
    RDDUtils.saveHadoopDataset(multiRDD, hConf);
    return lineageNames;
}
Also used : Set(java.util.Set) HashSet(java.util.HashSet) KeyValue(io.cdap.cdap.api.dataset.lib.KeyValue) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) MultiOutputFormat(io.cdap.cdap.etl.common.output.MultiOutputFormat) OutputFormatProvider(io.cdap.cdap.api.data.batch.OutputFormatProvider) HashSet(java.util.HashSet)
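
A hedged usage sketch of writeFromRDD; sinkFactory, outputRDD, and registerLineage are hypothetical stand-in names (only the method signature above is from the source):

// Hypothetical caller: write the RDD to the sink named "mySink", then register
// lineage for every output that went through an OutputFormatProvider.
Set<String> lineageNames = sinkFactory.writeFromRDD(outputRDD, sec, "mySink");
for (String outputName : lineageNames) {
    registerLineage(outputName);  // hypothetical lineage hook
}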

Aggregations

KeyValue (io.cdap.cdap.api.dataset.lib.KeyValue)10 KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable)6 ApplicationManager (io.cdap.cdap.test.ApplicationManager)4 Location (org.apache.twill.filesystem.Location)3 Test (org.junit.Test)3 Gson (com.google.gson.Gson)2 CloseableIterator (io.cdap.cdap.api.dataset.lib.CloseableIterator)2 FileSet (io.cdap.cdap.api.dataset.lib.FileSet)2 ObjectMappedTable (io.cdap.cdap.api.dataset.lib.ObjectMappedTable)2 SparkAppUsingGetDataset (io.cdap.cdap.spark.app.SparkAppUsingGetDataset)2 SparkManager (io.cdap.cdap.test.SparkManager)2 HashMap (java.util.HashMap)2 Joiner (com.google.common.base.Joiner)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 ImmutableSet (com.google.common.collect.ImmutableSet)1 Iterables (com.google.common.collect.Iterables)1 Iterators (com.google.common.collect.Iterators)1 Maps (com.google.common.collect.Maps)1 ByteStreams (com.google.common.io.ByteStreams)1 TypeToken (com.google.gson.reflect.TypeToken)1