Search in sources :

Example 11 with KeyValueTable

use of co.cask.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.

the class SparkTest method testClassicSpark.

@Test
public void testClassicSpark() throws Exception {
    ApplicationManager appManager = deploy(TestSparkApp.class);
    for (Class<?> sparkClass : Arrays.asList(TestSparkApp.ClassicSpark.class, TestSparkApp.ScalaClassicSpark.class)) {
        final SparkManager sparkManager = appManager.getSparkManager(sparkClass.getSimpleName()).start();
        sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    }
    KeyValueTable resultTable = this.<KeyValueTable>getDataset("ResultTable").get();
    Assert.assertEquals(1L, Bytes.toLong(resultTable.read(ClassicSparkProgram.class.getName())));
    Assert.assertEquals(1L, Bytes.toLong(resultTable.read(ScalaClassicSparkProgram.class.getName())));
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) SparkManager(co.cask.cdap.test.SparkManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) TestSparkApp(co.cask.cdap.spark.app.TestSparkApp) ScalaClassicSparkProgram(co.cask.cdap.spark.app.ScalaClassicSparkProgram) ScalaClassicSparkProgram(co.cask.cdap.spark.app.ScalaClassicSparkProgram) ClassicSparkProgram(co.cask.cdap.spark.app.ClassicSparkProgram) Test(org.junit.Test)

Example 12 with KeyValueTable

use of co.cask.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.

the class SparkTest method testTransaction.

@Test
public void testTransaction() throws Exception {
    ApplicationManager applicationManager = deploy(TestSparkApp.class);
    StreamManager streamManager = getStreamManager("SparkStream");
    // Write some sentences to the stream
    streamManager.send("red fox");
    streamManager.send("brown fox");
    streamManager.send("grey fox");
    streamManager.send("brown bear");
    streamManager.send("black bear");
    // Run the spark program
    SparkManager sparkManager = applicationManager.getSparkManager(TransactionSpark.class.getSimpleName());
    sparkManager.start(ImmutableMap.of("source.stream", "SparkStream", "keyvalue.table", "KeyValueTable", "result.all.dataset", "SparkResult", "result.threshold", "2", "result.threshold.dataset", "SparkThresholdResult"));
    // Verify result from dataset before the Spark program terminates
    final DataSetManager<KeyValueTable> resultManager = getDataset("SparkThresholdResult");
    final KeyValueTable resultTable = resultManager.get();
    // Expect the threshold result dataset, with threshold >=2, contains [brown, fox, bear]
    Tasks.waitFor(ImmutableSet.of("brown", "fox", "bear"), new Callable<Set<String>>() {

        @Override
        public Set<String> call() throws Exception {
            // This is to start a new TX
            resultManager.flush();
            LOG.info("Reading from threshold result");
            try (CloseableIterator<KeyValue<byte[], byte[]>> itor = resultTable.scan(null, null)) {
                return ImmutableSet.copyOf(Iterators.transform(itor, input -> {
                    String word = Bytes.toString(input.getKey());
                    LOG.info("{}, {}", word, Bytes.toInt(input.getValue()));
                    return word;
                }));
            }
        }
    }, 3, TimeUnit.MINUTES, 1, TimeUnit.SECONDS);
    sparkManager.stop();
    sparkManager.waitForRun(ProgramRunStatus.KILLED, 60, TimeUnit.SECONDS);
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) CloseableIterator(co.cask.cdap.api.dataset.lib.CloseableIterator) SparkManager(co.cask.cdap.test.SparkManager) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) FileSet(co.cask.cdap.api.dataset.lib.FileSet) StreamManager(co.cask.cdap.test.StreamManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) IOException(java.io.IOException) TransactionSpark(co.cask.cdap.spark.app.TransactionSpark) Test(org.junit.Test)

Example 13 with KeyValueTable

use of co.cask.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.

the class SparkTest method testDynamicSpark.

@Test
public void testDynamicSpark() throws Exception {
    ApplicationManager appManager = deploy(TestSparkApp.class);
    // Populate data into the stream
    StreamManager streamManager = getStreamManager("SparkStream");
    for (int i = 0; i < 10; i++) {
        streamManager.send("Line " + (i + 1));
    }
    SparkManager sparkManager = appManager.getSparkManager(ScalaDynamicSpark.class.getSimpleName());
    sparkManager.start(ImmutableMap.of("input", "SparkStream", "output", "ResultTable", "tmpdir", TMP_FOLDER.newFolder().getAbsolutePath()));
    sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    // Validate the result written to dataset
    KeyValueTable resultTable = this.<KeyValueTable>getDataset("ResultTable").get();
    // There should be ten "Line"
    Assert.assertEquals(10, Bytes.toInt(resultTable.read("Line")));
    // Each number should appear once
    for (int i = 0; i < 10; i++) {
        Assert.assertEquals(1, Bytes.toInt(resultTable.read(Integer.toString(i + 1))));
    }
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) SparkManager(co.cask.cdap.test.SparkManager) StreamManager(co.cask.cdap.test.StreamManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) ScalaDynamicSpark(co.cask.cdap.spark.app.ScalaDynamicSpark) Test(org.junit.Test)

Example 14 with KeyValueTable

use of co.cask.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.

the class SparkLogParser method run.

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();
    Map<String, String> runtimeArguments = sec.getRuntimeArguments();
    String inputFileSet = runtimeArguments.get("input");
    final String outputTable = runtimeArguments.get("output");
    JavaPairRDD<LongWritable, Text> input = sec.fromDataset(inputFileSet);
    final JavaPairRDD<String, String> aggregated = input.mapToPair(new PairFunction<Tuple2<LongWritable, Text>, LogKey, LogStats>() {

        @Override
        public Tuple2<LogKey, LogStats> call(Tuple2<LongWritable, Text> input) throws Exception {
            return SparkAppUsingGetDataset.parse(input._2());
        }
    }).reduceByKey(new Function2<LogStats, LogStats, LogStats>() {

        @Override
        public LogStats call(LogStats stats1, LogStats stats2) throws Exception {
            return stats1.aggregate(stats2);
        }
    }).mapPartitionsToPair(new PairFlatMapFunction<Iterator<Tuple2<LogKey, LogStats>>, String, String>() {

        @Override
        public Iterable<Tuple2<String, String>> call(Iterator<Tuple2<LogKey, LogStats>> itor) throws Exception {
            final Gson gson = new Gson();
            return Lists.newArrayList(Iterators.transform(itor, new Function<Tuple2<LogKey, LogStats>, Tuple2<String, String>>() {

                @Override
                public Tuple2<String, String> apply(Tuple2<LogKey, LogStats> input) {
                    return new Tuple2<>(gson.toJson(input._1()), gson.toJson(input._2()));
                }
            }));
        }
    });
    // Collect all data to driver and write to dataset directly. That's the intend of the test.
    sec.execute(new TxRunnable() {

        @Override
        public void run(DatasetContext context) throws Exception {
            KeyValueTable kvTable = context.getDataset(outputTable);
            for (Map.Entry<String, String> entry : aggregated.collectAsMap().entrySet()) {
                kvTable.write(entry.getKey(), entry.getValue());
            }
        }
    });
}
Also used : Gson(com.google.gson.Gson) TxRunnable(co.cask.cdap.api.TxRunnable) Iterator(java.util.Iterator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) DatasetContext(co.cask.cdap.api.data.DatasetContext) LogKey(co.cask.cdap.spark.app.SparkAppUsingGetDataset.LogKey) Text(org.apache.hadoop.io.Text) Function2(org.apache.spark.api.java.function.Function2) LogStats(co.cask.cdap.spark.app.SparkAppUsingGetDataset.LogStats) Tuple2(scala.Tuple2) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable)

Example 15 with KeyValueTable

use of co.cask.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.

the class FileUploadServiceTestRun method testFileUploadService.

@Test
public void testFileUploadService() throws Exception {
    ApplicationManager appManager = deployApplication(FileUploadApp.class);
    // Start the service
    ServiceManager serviceManager = appManager.getServiceManager(FileUploadApp.SERVICE_NAME).start();
    serviceManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    try {
        // Upload URL is "base/upload/pfs/[partition_value], which the partition value is a long
        URI serviceURI = serviceManager.getServiceURL(10, TimeUnit.SECONDS).toURI();
        // Upload with wrong MD5, should get 400.
        byte[] content = Strings.repeat("0123456789 ", 100).getBytes(Charsets.UTF_8);
        Assert.assertEquals(HttpURLConnection.HTTP_BAD_REQUEST, upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(), content, "123", 30));
        long beforeUploadTime = System.currentTimeMillis();
        // Upload with right MD5, should get 200
        Assert.assertEquals(HttpURLConnection.HTTP_OK, upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(), content, BaseEncoding.base64().encode(Hashing.md5().hashBytes(content).asBytes()), 20));
        // Inspect the partitioned file set and verify the content
        PartitionedFileSet pfs = (PartitionedFileSet) getDataset(FileUploadApp.PFS_NAME).get();
        PartitionDetail partition = pfs.getPartition(PartitionKey.builder().addLongField("time", 1).build());
        Assert.assertNotNull(partition);
        // Verify a notification should have been published for the new partition
        List<Notification> notifications = getDataNotifications(beforeUploadTime);
        // Should have one message
        Assert.assertEquals(1, notifications.size());
        verifyDataNotification(notifications.get(0), NamespaceId.DEFAULT.dataset(FileUploadApp.PFS_NAME), Collections.singletonList(PartitionKey.builder().addLongField("time", 1L).build()));
        // There should be one file under the partition directory
        List<Location> locations = partition.getLocation().list();
        Assert.assertEquals(1, locations.size());
        Assert.assertArrayEquals(content, ByteStreams.toByteArray(Locations.newInputSupplier(locations.get(0))));
        // Verify the tracking table of chunks sizes
        KeyValueTable trackingTable = (KeyValueTable) getDataset(FileUploadApp.KV_TABLE_NAME).get();
        CloseableIterator<KeyValue<byte[], byte[]>> iter = trackingTable.scan(null, null);
        // Sum up all chunks sizes as being tracked by the tracking table.
        long sum = 0;
        int iterSize = 0;
        while (iter.hasNext()) {
            KeyValue<byte[], byte[]> kv = iter.next();
            sum += Bytes.toInt(kv.getKey()) * Bytes.toLong(kv.getValue());
            iterSize++;
        }
        // The iterator should have size >= 2, since we uses different chunk size for two different upload
        Assert.assertTrue(iterSize >= 2);
        // The sum of all chunks sizes should be the same as the
        // content size * 2 (since we have one failure, one success upload)
        Assert.assertEquals(content.length * 2, sum);
    } finally {
        serviceManager.stop();
        serviceManager.waitForRun(ProgramRunStatus.KILLED, 10, TimeUnit.SECONDS);
    }
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) KeyValue(co.cask.cdap.api.dataset.lib.KeyValue) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) URI(java.net.URI) Notification(co.cask.cdap.proto.Notification) ServiceManager(co.cask.cdap.test.ServiceManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)

Aggregations

KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable)84 Test (org.junit.Test)49 ApplicationManager (co.cask.cdap.test.ApplicationManager)45 SparkManager (co.cask.cdap.test.SparkManager)25 StreamManager (co.cask.cdap.test.StreamManager)16 IOException (java.io.IOException)16 TransactionExecutor (org.apache.tephra.TransactionExecutor)12 ApplicationWithPrograms (co.cask.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms)11 HashMap (java.util.HashMap)11 ArrayList (java.util.ArrayList)10 FileSet (co.cask.cdap.api.dataset.lib.FileSet)8 KeyValue (co.cask.cdap.api.dataset.lib.KeyValue)8 Table (co.cask.cdap.api.dataset.table.Table)8 NamespaceMeta (co.cask.cdap.proto.NamespaceMeta)8 ObjectStore (co.cask.cdap.api.dataset.lib.ObjectStore)7 MapReduceManager (co.cask.cdap.test.MapReduceManager)7 ServiceManager (co.cask.cdap.test.ServiceManager)7 WorkflowManager (co.cask.cdap.test.WorkflowManager)7 Set (java.util.Set)7 Category (org.junit.experimental.categories.Category)7