
Example 1 with KeyValue

Use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.

From the class BaseRDDCollection, method createMultiStoreTask.

@Override
public Runnable createMultiStoreTask(PhaseSpec phaseSpec, Set<String> group, Set<String> sinks, Map<String, StageStatisticsCollector> collectors) {
    return new Runnable() {

        @Override
        public void run() {
            // Tag each record with the name of every sink it should be written to
            PairFlatMapFunction<T, String, KeyValue<Object, Object>> multiSinkFunction =
                (PairFlatMapFunction<T, String, KeyValue<Object, Object>>) new MultiSinkFunction(sec, phaseSpec, group, collectors);
            JavaPairRDD<String, KeyValue<Object, Object>> taggedOutput = rdd.flatMapToPair(multiSinkFunction);
            // Write the tagged pair RDD to all sinks, then record lineage for each output
            for (String outputName : sinkFactory.writeCombinedRDD(taggedOutput, sec, sinks)) {
                recordLineage(outputName);
            }
        }
    };
}
Also used : KeyValue(io.cdap.cdap.api.dataset.lib.KeyValue) PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) MultiSinkFunction(io.cdap.cdap.etl.spark.function.MultiSinkFunction)
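
MultiSinkFunction is CDAP-internal and not shown here. As a hedged illustration only, a simplified stand-in might tag records like this; the method name tagBySink, the sink names, and the routing rule are invented for the sketch, and the assumed imports are java.util, scala.Tuple2, and the Spark and CDAP types already listed above.

// Hypothetical stand-in for MultiSinkFunction: emits each record as one
// (sinkName, KeyValue) pair per sink that should receive it.
private PairFlatMapFunction<String, String, KeyValue<Object, Object>> tagBySink() {
    return record -> {
        List<Tuple2<String, KeyValue<Object, Object>>> tagged = new ArrayList<>();
        // Assumed routing rule: every record goes to "mainSink"; records
        // containing "error" are additionally routed to "errorSink".
        tagged.add(new Tuple2<>("mainSink", new KeyValue<>(record, record.length())));
        if (record.contains("error")) {
            tagged.add(new Tuple2<>("errorSink", new KeyValue<>(record, record.length())));
        }
        return tagged.iterator();
    };
}

A pair RDD tagged this way, keyed by sink name, is the shape of input that writeCombinedRDD consumes above.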

Example 2 with KeyValue

Use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.

From the class TestFrameworkTestRun, method assertWorkerDatasetWrites.

private void assertWorkerDatasetWrites(byte[] startRow, byte[] endRow, int expectedCount, int expectedTotalCount) throws Exception {
    DataSetManager<KeyValueTable> datasetManager = getDataset(testSpace.dataset(AppUsingGetServiceURL.WORKER_INSTANCES_DATASET));
    KeyValueTable instancesTable = datasetManager.get();
    try (CloseableIterator<KeyValue<byte[], byte[]>> instancesIterator = instancesTable.scan(startRow, endRow)) {
        List<KeyValue<byte[], byte[]>> workerInstances = Lists.newArrayList(instancesIterator);
        // Assert that the worker starts with expectedCount instances
        Assert.assertEquals(expectedCount, workerInstances.size());
        // Assert that each instance of the worker knows the total number of instances
        for (KeyValue<byte[], byte[]> keyValue : workerInstances) {
            Assert.assertEquals(expectedTotalCount, Bytes.toInt(keyValue.getValue()));
        }
    }
}
Also used : KeyValue(io.cdap.cdap.api.dataset.lib.KeyValue) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable)
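
The writer side that populates WORKER_INSTANCES_DATASET is not shown here. A minimal sketch of what each worker instance might write so that the scan above finds it; recordInstanceCount is an invented helper, and encoding both key and value with Bytes.toBytes(int) is an assumption, not taken from the app.

// Hypothetical worker-side write: each instance stores the total instance
// count under its own instance ID, which the assertion above reads back.
private void recordInstanceCount(KeyValueTable instancesTable, int instanceId, int totalInstances) {
    instancesTable.write(Bytes.toBytes(instanceId), Bytes.toBytes(totalInstances));
}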

Example 3 with KeyValue

Use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.

From the class AuthorizationTest, method assertDatasetIsEmpty.

private void assertDatasetIsEmpty(NamespaceId namespaceId, String datasetName) throws Exception {
    DataSetManager<KeyValueTable> outTableManager = getDataset(namespaceId.dataset(datasetName));
    KeyValueTable outputTable = outTableManager.get();
    try (CloseableIterator<KeyValue<byte[], byte[]>> scanner = outputTable.scan(null, null)) {
        Assert.assertFalse(scanner.hasNext());
    }
}
Also used : KeyValue(io.cdap.cdap.api.dataset.lib.KeyValue) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable)
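
A natural companion assertion, sketched here rather than taken from AuthorizationTest, verifies an exact entry count by exhausting the same full-range scan; assertDatasetSize is an invented name, and Iterators.size is Guava's com.google.common.collect.Iterators.

// Hypothetical counterpart: assert the dataset holds exactly the expected
// number of entries by draining a full-range scan.
private void assertDatasetSize(NamespaceId namespaceId, String datasetName, int expectedSize) throws Exception {
    DataSetManager<KeyValueTable> tableManager = getDataset(namespaceId.dataset(datasetName));
    KeyValueTable table = tableManager.get();
    try (CloseableIterator<KeyValue<byte[], byte[]>> scanner = table.scan(null, null)) {
        Assert.assertEquals(expectedSize, Iterators.size(scanner));
    }
}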

Example 4 with KeyValue

Use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.

From the class SparkTest, method testTransaction.

@Test
public void testTransaction() throws Exception {
    ApplicationManager applicationManager = deploy(TestSparkApp.class);
    // Write some data to a local file
    File inputFile = TEMP_FOLDER.newFile();
    try (PrintWriter writer = new PrintWriter(Files.newBufferedWriter(inputFile.toPath(), StandardCharsets.UTF_8))) {
        writer.println("red fox");
        writer.println("brown fox");
        writer.println("grey fox");
        writer.println("brown bear");
        writer.println("black bear");
    }
    // Run the spark program
    SparkManager sparkManager = applicationManager.getSparkManager(TransactionSpark.class.getSimpleName());
    sparkManager.start(ImmutableMap.of("input.file", inputFile.getAbsolutePath(), "keyvalue.table", "KeyValueTable", "result.all.dataset", "SparkResult", "result.threshold", "2", "result.threshold.dataset", "SparkThresholdResult"));
    // Verify result from dataset before the Spark program terminates
    final DataSetManager<KeyValueTable> resultManager = getDataset("SparkThresholdResult");
    final KeyValueTable resultTable = resultManager.get();
    // Expect the threshold result dataset, with threshold >=2, contains [brown, fox, bear]
    Tasks.waitFor(ImmutableSet.of("brown", "fox", "bear"), () -> {
        // This is to start a new TX
        resultManager.flush();
        LOG.info("Reading from threshold result");
        try (CloseableIterator<KeyValue<byte[], byte[]>> itor = resultTable.scan(null, null)) {
            return ImmutableSet.copyOf(Iterators.transform(itor, input -> {
                String word = Bytes.toString(input.getKey());
                LOG.info("{}, {}", word, Bytes.toInt(input.getValue()));
                return word;
            }));
        }
    }, 3, TimeUnit.MINUTES, 1, TimeUnit.SECONDS);
    sparkManager.stop();
    sparkManager.waitForRun(ProgramRunStatus.KILLED, 60, TimeUnit.SECONDS);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) SparkManager(io.cdap.cdap.test.SparkManager) KeyValue(io.cdap.cdap.api.dataset.lib.KeyValue) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) File(java.io.File) PrintWriter(java.io.PrintWriter) TransactionSpark(io.cdap.cdap.spark.app.TransactionSpark) Test(org.junit.Test)
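
The TransactionSpark program itself is not shown. As a plain-Java sketch of the word-frequency logic the assertion implies, using only java.util types; wordsAtThreshold is an invented name, not CDAP code.

// Split each line into words, count occurrences, and keep words whose
// count meets the threshold -- the contents of the threshold result dataset.
private Set<String> wordsAtThreshold(List<String> lines, int threshold) {
    Map<String, Integer> counts = new HashMap<>();
    for (String line : lines) {
        for (String word : line.split("\\s+")) {
            counts.merge(word, 1, Integer::sum);
        }
    }
    Set<String> result = new HashSet<>();
    for (Map.Entry<String, Integer> entry : counts.entrySet()) {
        if (entry.getValue() >= threshold) {
            result.add(entry.getKey());
        }
    }
    return result;
}

For the five input lines written above and a threshold of 2, this returns [brown, fox, bear]: fox appears three times, brown and bear twice each, which is exactly the set the Tasks.waitFor call polls for.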

Example 5 with KeyValue

Use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.

From the class SparkTest, method validateGetDatasetOutput.

private void validateGetDatasetOutput(KeyValueTable logStatsTable) {
    SparkAppUsingGetDataset.LogKey fredKey1 = new SparkAppUsingGetDataset.LogKey("10.10.10.10", "FRED", "GET http://bar.com/image.jpg HTTP/1.1", 200);
    SparkAppUsingGetDataset.LogKey fredKey2 = new SparkAppUsingGetDataset.LogKey("10.10.10.10", "FRED", "GET http://bar.com/image.jpg HTTP/1.1", 404);
    SparkAppUsingGetDataset.LogKey bradKey1 = new SparkAppUsingGetDataset.LogKey("20.20.20.20", "BRAD", "GET http://bar.com/image.jpg HTTP/1.1", 200);
    SparkAppUsingGetDataset.LogKey bradKey2 = new SparkAppUsingGetDataset.LogKey("20.20.20.20", "BRAD", "GET http://bar.com/image.jpg HTTP/1.1", 404);
    SparkAppUsingGetDataset.LogStats fredStats1 = new SparkAppUsingGetDataset.LogStats(2, 100);
    SparkAppUsingGetDataset.LogStats fredStats2 = new SparkAppUsingGetDataset.LogStats(1, 50);
    SparkAppUsingGetDataset.LogStats bradStats1 = new SparkAppUsingGetDataset.LogStats(1, 50);
    SparkAppUsingGetDataset.LogStats bradStats2 = new SparkAppUsingGetDataset.LogStats(1, 50);
    Map<SparkAppUsingGetDataset.LogKey, SparkAppUsingGetDataset.LogStats> expected = ImmutableMap.of(fredKey1, fredStats1, fredKey2, fredStats2, bradKey1, bradStats1, bradKey2, bradStats2);
    // Reuse one Gson instance; both keys and values are JSON-encoded
    Gson gson = new Gson();
    try (CloseableIterator<KeyValue<byte[], byte[]>> scan = logStatsTable.scan(null, null)) {
        // must have exactly 4 records
        for (int i = 0; i < 4; i++) {
            Assert.assertTrue("Expected next for i = " + i, scan.hasNext());
            KeyValue<byte[], byte[]> next = scan.next();
            SparkAppUsingGetDataset.LogKey logKey = gson.fromJson(Bytes.toString(next.getKey()), SparkAppUsingGetDataset.LogKey.class);
            SparkAppUsingGetDataset.LogStats logStats = gson.fromJson(Bytes.toString(next.getValue()), SparkAppUsingGetDataset.LogStats.class);
            Assert.assertEquals(expected.get(logKey), logStats);
        }
        // no more records
        Assert.assertFalse(scan.hasNext());
    }
}
Also used : KeyValue(io.cdap.cdap.api.dataset.lib.KeyValue) SparkAppUsingGetDataset(io.cdap.cdap.spark.app.SparkAppUsingGetDataset) Gson(com.google.gson.Gson)
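
The entries validated above carry JSON on both key and value. A minimal sketch of the corresponding write side, assuming an invented helper writeLogStats that uses the same Gson encoding the read path decodes with.

// Hypothetical write side: store a LogKey/LogStats pair as JSON strings,
// matching the Gson decoding performed in validateGetDatasetOutput.
private void writeLogStats(KeyValueTable logStatsTable, SparkAppUsingGetDataset.LogKey key, SparkAppUsingGetDataset.LogStats stats) {
    Gson gson = new Gson();
    logStatsTable.write(Bytes.toBytes(gson.toJson(key)), Bytes.toBytes(gson.toJson(stats)));
}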

Aggregations

KeyValue (io.cdap.cdap.api.dataset.lib.KeyValue): 10
KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 6
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 4
Location (org.apache.twill.filesystem.Location): 3
Test (org.junit.Test): 3
Gson (com.google.gson.Gson): 2
CloseableIterator (io.cdap.cdap.api.dataset.lib.CloseableIterator): 2
FileSet (io.cdap.cdap.api.dataset.lib.FileSet): 2
ObjectMappedTable (io.cdap.cdap.api.dataset.lib.ObjectMappedTable): 2
SparkAppUsingGetDataset (io.cdap.cdap.spark.app.SparkAppUsingGetDataset): 2
SparkManager (io.cdap.cdap.test.SparkManager): 2
HashMap (java.util.HashMap): 2
Joiner (com.google.common.base.Joiner): 1
ImmutableMap (com.google.common.collect.ImmutableMap): 1
ImmutableSet (com.google.common.collect.ImmutableSet): 1
Iterables (com.google.common.collect.Iterables): 1
Iterators (com.google.common.collect.Iterators): 1
Maps (com.google.common.collect.Maps): 1
ByteStreams (com.google.common.io.ByteStreams): 1
TypeToken (com.google.gson.reflect.TypeToken): 1