
Example 36 with KeyValueTable

Use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by cdapio.

From class SparkTest, method testSparkWithGetDataset.

private void testSparkWithGetDataset(Class<? extends Application> appClass, String sparkProgram) throws Exception {
    ApplicationManager applicationManager = deploy(appClass);
    DataSetManager<FileSet> filesetManager = getDataset("logs");
    FileSet fileset = filesetManager.get();
    Location location = fileset.getLocation("nn");
    prepareInputFileSetWithLogData(location);
    Map<String, String> inputArgs = new HashMap<>();
    FileSetArguments.setInputPath(inputArgs, "nn");
    Map<String, String> args = new HashMap<>();
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "logs", inputArgs));
    args.put("input", "logs");
    args.put("output", "logStats");
    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram);
    sparkManager.startAndWaitForGoodRun(args, ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
    DataSetManager<KeyValueTable> logStatsManager = getDataset("logStats");
    KeyValueTable logStatsTable = logStatsManager.get();
    validateGetDatasetOutput(logStatsTable);
    // Cleanup after run
    location.delete(true);
    logStatsManager.flush();
    try (CloseableIterator<KeyValue<byte[], byte[]>> scan = logStatsTable.scan(null, null)) {
        while (scan.hasNext()) {
            logStatsTable.delete(scan.next().getKey());
        }
    }
    logStatsManager.flush();
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) SparkManager(io.cdap.cdap.test.SparkManager) KeyValue(io.cdap.cdap.api.dataset.lib.KeyValue) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) IdentityHashMap(java.util.IdentityHashMap) HashMap(java.util.HashMap) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) Location(org.apache.twill.filesystem.Location)
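
The test delegates the actual output check to validateGetDatasetOutput, which is not shown on this page. Below is a minimal sketch of what such a check could look like, assuming the table holds the Gson-encoded LogKey/LogStats pairs written by SparkLogParser (see Example 40 below); the helper name and assertions are hypothetical, not the original implementation.

// Hypothetical sketch, not the actual validateGetDatasetOutput from SparkTest.
// Assumes the table was populated by SparkLogParser, i.e. keys and values are
// Gson-serialized LogKey and LogStats objects.
private void validateGetDatasetOutputSketch(KeyValueTable logStatsTable) {
    Gson gson = new Gson();
    int entries = 0;
    try (CloseableIterator<KeyValue<byte[], byte[]>> scan = logStatsTable.scan(null, null)) {
        while (scan.hasNext()) {
            KeyValue<byte[], byte[]> row = scan.next();
            // Both key and value should be valid JSON written by the Spark program.
            SparkAppUsingGetDataset.LogKey key =
                gson.fromJson(Bytes.toString(row.getKey()), SparkAppUsingGetDataset.LogKey.class);
            SparkAppUsingGetDataset.LogStats stats =
                gson.fromJson(Bytes.toString(row.getValue()), SparkAppUsingGetDataset.LogStats.class);
            Assert.assertNotNull(key);
            Assert.assertNotNull(stats);
            entries++;
        }
    }
    Assert.assertTrue("Expected at least one aggregated log entry", entries > 0);
}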

Example 37 with KeyValueTable

Use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by cdapio.

From class SparkTest, method testClassicSpark.

@Test
public void testClassicSpark() throws Exception {
    ApplicationManager appManager = deploy(TestSparkApp.class);
    for (Class<?> sparkClass : Arrays.asList(TestSparkApp.ClassicSpark.class, TestSparkApp.ScalaClassicSpark.class)) {
        final SparkManager sparkManager = appManager.getSparkManager(sparkClass.getSimpleName());
        sparkManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    }
    KeyValueTable resultTable = this.<KeyValueTable>getDataset("ResultTable").get();
    Assert.assertEquals(1L, Bytes.toLong(resultTable.read(ClassicSparkProgram.class.getName())));
    Assert.assertEquals(1L, Bytes.toLong(resultTable.read(ScalaClassicSparkProgram.class.getName())));
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) SparkManager(io.cdap.cdap.test.SparkManager) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) TestSparkApp(io.cdap.cdap.spark.app.TestSparkApp) ScalaClassicSparkProgram(io.cdap.cdap.spark.app.ScalaClassicSparkProgram) ClassicSparkProgram(io.cdap.cdap.spark.app.ClassicSparkProgram) ScalaClassicSparkProgram(io.cdap.cdap.spark.app.ScalaClassicSparkProgram) Test(org.junit.Test)
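
The call this.<KeyValueTable>getDataset("ResultTable").get() uses an explicit type witness to obtain the dataset in a single expression. An equivalent two-step form using the DataSetManager pattern seen in the other examples is sketched below; it is not part of the original test, just a readability alternative.

// Sketch: equivalent two-step access via DataSetManager, as used elsewhere in SparkTest.
DataSetManager<KeyValueTable> resultManager = getDataset("ResultTable");
KeyValueTable resultTable = resultManager.get();
Assert.assertEquals(1L, Bytes.toLong(resultTable.read(ClassicSparkProgram.class.getName())));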

Example 38 with KeyValueTable

Use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by cdapio.

From class SparkTest, method testTransaction.

@Test
public void testTransaction() throws Exception {
    ApplicationManager applicationManager = deploy(TestSparkApp.class);
    // Write some data to a local file
    File inputFile = TEMP_FOLDER.newFile();
    try (PrintWriter writer = new PrintWriter(Files.newBufferedWriter(inputFile.toPath(), StandardCharsets.UTF_8))) {
        writer.println("red fox");
        writer.println("brown fox");
        writer.println("grey fox");
        writer.println("brown bear");
        writer.println("black bear");
    }
    // Run the spark program
    SparkManager sparkManager = applicationManager.getSparkManager(TransactionSpark.class.getSimpleName());
    sparkManager.start(ImmutableMap.of("input.file", inputFile.getAbsolutePath(), "keyvalue.table", "KeyValueTable", "result.all.dataset", "SparkResult", "result.threshold", "2", "result.threshold.dataset", "SparkThresholdResult"));
    // Verify result from dataset before the Spark program terminates
    final DataSetManager<KeyValueTable> resultManager = getDataset("SparkThresholdResult");
    final KeyValueTable resultTable = resultManager.get();
    // Expect the threshold result dataset (threshold >= 2) to contain [brown, fox, bear]
    Tasks.waitFor(ImmutableSet.of("brown", "fox", "bear"), () -> {
        // Flush to start a new transaction so the scan sees data committed by the Spark program
        resultManager.flush();
        LOG.info("Reading from threshold result");
        try (CloseableIterator<KeyValue<byte[], byte[]>> itor = resultTable.scan(null, null)) {
            return ImmutableSet.copyOf(Iterators.transform(itor, input -> {
                String word = Bytes.toString(input.getKey());
                LOG.info("{}, {}", word, Bytes.toInt(input.getValue()));
                return word;
            }));
        }
    }, 3, TimeUnit.MINUTES, 1, TimeUnit.SECONDS);
    sparkManager.stop();
    sparkManager.waitForRun(ProgramRunStatus.KILLED, 60, TimeUnit.SECONDS);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) SparkManager(io.cdap.cdap.test.SparkManager) KeyValue(io.cdap.cdap.api.dataset.lib.KeyValue) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) File(java.io.File) PrintWriter(java.io.PrintWriter) TransactionSpark(io.cdap.cdap.spark.app.TransactionSpark) Test(org.junit.Test)
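
Each Tasks.waitFor iteration above calls resultManager.flush() before scanning; in the CDAP test framework this starts a new transaction so the scan observes data committed by the still-running Spark program. That flush-then-scan pattern can be factored into a small helper; the sketch below is not part of SparkTest and assumes java.util.Set and java.util.HashSet are imported.

// Sketch: read the current key set from a KeyValueTable, flushing the
// DataSetManager first so the read runs in a fresh transaction and sees
// data committed by other programs.
private Set<String> readKeys(DataSetManager<KeyValueTable> manager) {
    manager.flush();
    Set<String> keys = new HashSet<>();
    try (CloseableIterator<KeyValue<byte[], byte[]>> itor = manager.get().scan(null, null)) {
        while (itor.hasNext()) {
            keys.add(Bytes.toString(itor.next().getKey()));
        }
    }
    return keys;
}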

Example 39 with KeyValueTable

Use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by cdapio.

From class TestFrameworkTestRun, method executeWorkflow.

private String executeWorkflow(ApplicationManager applicationManager, Map<String, String> additionalParams, int expectedComplete) throws Exception {
    WorkflowManager wfManager = applicationManager.getWorkflowManager(WorkflowAppWithLocalDatasets.WORKFLOW_NAME);
    Map<String, String> runtimeArgs = new HashMap<>();
    File waitFile = new File(TMP_FOLDER.newFolder(), "/wait.file");
    File doneFile = new File(TMP_FOLDER.newFolder(), "/done.file");
    runtimeArgs.put("input.path", "input");
    runtimeArgs.put("output.path", "output");
    runtimeArgs.put("wait.file", waitFile.getAbsolutePath());
    runtimeArgs.put("done.file", doneFile.getAbsolutePath());
    runtimeArgs.putAll(additionalParams);
    wfManager.start(runtimeArgs);
    // Wait until custom action in the Workflow is triggered.
    while (!waitFile.exists()) {
        TimeUnit.MILLISECONDS.sleep(50);
    }
    // Now the Workflow should have RUNNING status. Get its run id.
    List<RunRecord> history = wfManager.getHistory(ProgramRunStatus.RUNNING);
    Assert.assertEquals(1, history.size());
    String runId = history.get(0).getPid();
    // Get the local datasets for this Workflow run
    DataSetManager<KeyValueTable> localDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.WORDCOUNT_DATASET + "." + runId));
    Assert.assertEquals("2", Bytes.toString(localDataset.get().read("text")));
    DataSetManager<FileSet> fileSetDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET + "." + runId));
    Assert.assertNotNull(fileSetDataset.get());
    // Local datasets should not exist at the namespace level
    localDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.WORDCOUNT_DATASET));
    Assert.assertNull(localDataset.get());
    fileSetDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET));
    Assert.assertNull(fileSetDataset.get());
    // Verify that the workflow hasn't completed on its own before we signal it to continue
    history = wfManager.getHistory(ProgramRunStatus.RUNNING);
    Assert.assertEquals(1, history.size());
    // Signal the Workflow to continue
    doneFile.createNewFile();
    // Wait for workflow to finish
    wfManager.waitForRuns(ProgramRunStatus.COMPLETED, expectedComplete, 1, TimeUnit.MINUTES);
    Map<String, WorkflowNodeStateDetail> nodeStateDetailMap = wfManager.getWorkflowNodeStates(runId);
    Map<String, String> workflowMetricsContext = new HashMap<>();
    workflowMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
    workflowMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
    workflowMetricsContext.put(Constants.Metrics.Tag.WORKFLOW, WorkflowAppWithLocalDatasets.WORKFLOW_NAME);
    workflowMetricsContext.put(Constants.Metrics.Tag.RUN_ID, runId);
    Map<String, String> writerContext = new HashMap<>(workflowMetricsContext);
    writerContext.put(Constants.Metrics.Tag.NODE, WorkflowAppWithLocalDatasets.LocalDatasetWriter.class.getSimpleName());
    Assert.assertEquals(2, getMetricsManager().getTotalMetric(writerContext, "user.num.lines"));
    Map<String, String> wfSparkMetricsContext = new HashMap<>(workflowMetricsContext);
    wfSparkMetricsContext.put(Constants.Metrics.Tag.NODE, "JavaSparkCSVToSpaceConverter");
    Assert.assertEquals(2, getMetricsManager().getTotalMetric(wfSparkMetricsContext, "user.num.lines"));
    // check in spark context
    Map<String, String> sparkMetricsContext = new HashMap<>();
    sparkMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
    sparkMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
    sparkMetricsContext.put(Constants.Metrics.Tag.SPARK, "JavaSparkCSVToSpaceConverter");
    sparkMetricsContext.put(Constants.Metrics.Tag.RUN_ID, nodeStateDetailMap.get("JavaSparkCSVToSpaceConverter").getRunId());
    Assert.assertEquals(2, getMetricsManager().getTotalMetric(sparkMetricsContext, "user.num.lines"));
    Map<String, String> appMetricsContext = new HashMap<>();
    appMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
    appMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
    // app metrics context should have sum from custom action and spark metrics.
    Assert.assertEquals(4, getMetricsManager().getTotalMetric(appMetricsContext, "user.num.lines"));
    Map<String, String> wfMRMetricsContext = new HashMap<>(workflowMetricsContext);
    wfMRMetricsContext.put(Constants.Metrics.Tag.NODE, "WordCount");
    Assert.assertEquals(7, getMetricsManager().getTotalMetric(wfMRMetricsContext, "user.num.words"));
    // mr metrics context
    Map<String, String> mrMetricsContext = new HashMap<>();
    mrMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
    mrMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
    mrMetricsContext.put(Constants.Metrics.Tag.MAPREDUCE, "WordCount");
    mrMetricsContext.put(Constants.Metrics.Tag.RUN_ID, nodeStateDetailMap.get("WordCount").getRunId());
    Assert.assertEquals(7, getMetricsManager().getTotalMetric(mrMetricsContext, "user.num.words"));
    final Map<String, String> readerContext = new HashMap<>(workflowMetricsContext);
    readerContext.put(Constants.Metrics.Tag.NODE, "readerAction");
    Tasks.waitFor(6L, new Callable<Long>() {

        @Override
        public Long call() throws Exception {
            return getMetricsManager().getTotalMetric(readerContext, "user.unique.words");
        }
    }, 60, TimeUnit.SECONDS);
    return runId;
}
Also used : FileSet(io.cdap.cdap.api.dataset.lib.FileSet) HashMap(java.util.HashMap) WorkflowManager(io.cdap.cdap.test.WorkflowManager) IOException(java.io.IOException) ConflictException(io.cdap.cdap.common.ConflictException) WorkflowNodeStateDetail(io.cdap.cdap.proto.WorkflowNodeStateDetail) RunRecord(io.cdap.cdap.proto.RunRecord) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) File(java.io.File)
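
Workflow local datasets are registered under the name "<dataset name>.<run id>", which is why the code above appends runId when fetching them and expects the bare dataset names to resolve to null at the namespace level. A trivial helper capturing that convention is sketched below; it is hypothetical and shown only for illustration.

// Hypothetical helper: compose the per-run name of a workflow local dataset.
// Local datasets exist only for the duration of a workflow run.
private static String localDatasetName(String datasetName, String runId) {
    return datasetName + "." + runId;
}
// e.g. getDataset(testSpace.dataset(localDatasetName(WorkflowAppWithLocalDatasets.WORDCOUNT_DATASET, runId)))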

Example 40 with KeyValueTable

Use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by cdapio.

From class SparkLogParser, method run.

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();
    Map<String, String> runtimeArguments = sec.getRuntimeArguments();
    String inputFileSet = runtimeArguments.get("input");
    final String outputTable = runtimeArguments.get("output");
    JavaPairRDD<LongWritable, Text> input = sec.fromDataset(inputFileSet);
    final JavaPairRDD<String, String> aggregated = input.mapToPair(new PairFunction<Tuple2<LongWritable, Text>, LogKey, LogStats>() {

        @Override
        public Tuple2<LogKey, LogStats> call(Tuple2<LongWritable, Text> input) throws Exception {
            return SparkAppUsingGetDataset.parse(input._2());
        }
    }).reduceByKey(new Function2<LogStats, LogStats, LogStats>() {

        @Override
        public LogStats call(LogStats stats1, LogStats stats2) throws Exception {
            return stats1.aggregate(stats2);
        }
    }).mapPartitionsToPair(new PairFlatMapFunction<Iterator<Tuple2<LogKey, LogStats>>, String, String>() {

        @Override
        public Iterator<Tuple2<String, String>> call(Iterator<Tuple2<LogKey, LogStats>> itor) throws Exception {
            final Gson gson = new Gson();
            return Lists.newArrayList(Iterators.transform(itor, new Function<Tuple2<LogKey, LogStats>, Tuple2<String, String>>() {

                @Override
                public Tuple2<String, String> apply(Tuple2<LogKey, LogStats> input) {
                    return new Tuple2<>(gson.toJson(input._1()), gson.toJson(input._2()));
                }
            })).iterator();
        }
    });
    // Collect all data to the driver and write to the dataset directly. That's the intent of the test.
    sec.execute(new TxRunnable() {

        @Override
        public void run(DatasetContext context) throws Exception {
            KeyValueTable kvTable = context.getDataset(outputTable);
            for (Map.Entry<String, String> entry : aggregated.collectAsMap().entrySet()) {
                kvTable.write(entry.getKey(), entry.getValue());
            }
        }
    });
}
Also used : LogKey(io.cdap.cdap.spark.app.SparkAppUsingGetDataset.LogKey) Gson(com.google.gson.Gson) Text(org.apache.hadoop.io.Text) Function2(org.apache.spark.api.java.function.Function2) LogStats(io.cdap.cdap.spark.app.SparkAppUsingGetDataset.LogStats) Function(com.google.common.base.Function) PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) PairFunction(org.apache.spark.api.java.function.PairFunction) Tuple2(scala.Tuple2) TxRunnable(io.cdap.cdap.api.TxRunnable) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) Iterator(java.util.Iterator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) DatasetContext(io.cdap.cdap.api.data.DatasetContext)
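
The program writes each aggregated (LogKey, LogStats) pair into the output KeyValueTable as JSON. Reading the result back follows the same transactional pattern in reverse; the sketch below is not part of SparkLogParser and assumes the Gson-encoded form written above plus the CDAP classes (Bytes, KeyValue, CloseableIterator) used in the other examples on this page.

    // Sketch: read the aggregated results back in a transaction and deserialize
    // the JSON written above.
    sec.execute(new TxRunnable() {

        @Override
        public void run(DatasetContext context) throws Exception {
            Gson gson = new Gson();
            KeyValueTable kvTable = context.getDataset(outputTable);
            try (CloseableIterator<KeyValue<byte[], byte[]>> scan = kvTable.scan(null, null)) {
                while (scan.hasNext()) {
                    KeyValue<byte[], byte[]> row = scan.next();
                    LogKey key = gson.fromJson(Bytes.toString(row.getKey()), LogKey.class);
                    LogStats stats = gson.fromJson(Bytes.toString(row.getValue()), LogStats.class);
                    // Consume key/stats here, e.g. validate or log them.
                }
            }
        }
    });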

Aggregations

KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 122
Test (org.junit.Test): 65
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 59
HashMap (java.util.HashMap): 27
SparkManager (io.cdap.cdap.test.SparkManager): 26
Table (io.cdap.cdap.api.dataset.table.Table): 21
TransactionExecutor (org.apache.tephra.TransactionExecutor): 20
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 19
FileSet (io.cdap.cdap.api.dataset.lib.FileSet): 18
ApplicationWithPrograms (io.cdap.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms): 18
KeyValue (io.cdap.cdap.api.dataset.lib.KeyValue): 14
ServiceManager (io.cdap.cdap.test.ServiceManager): 14
IOException (java.io.IOException): 14
ArrayList (java.util.ArrayList): 14
Location (org.apache.twill.filesystem.Location): 14
ImmutableMap (com.google.common.collect.ImmutableMap): 13
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 13
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 13
File (java.io.File): 13
URL (java.net.URL): 12