Example 71 with KeyValueTable

Use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.

The class DynamicPartitionerWithAvroTest, method runDynamicPartitionerMR.

private void runDynamicPartitionerMR(final List<? extends GenericRecord> records, boolean allowConcurrentWriters, final boolean precreatePartitions, @Nullable final DynamicPartitioner.PartitionWriteOption partitionWriteOption, boolean expectedStatus) throws Exception {
    ApplicationWithPrograms app = deployApp(AppWithMapReduceUsingAvroDynamicPartitioner.class);
    final long now = System.currentTimeMillis();
    final Multimap<PartitionKey, GenericRecord> keyToRecordsMap = groupByPartitionKey(records, now);
    // write values to the input kvTable
    final KeyValueTable kvTable = datasetCache.getDataset(INPUT_DATASET);
    Transactions.createTransactionExecutor(txExecutorFactory, kvTable).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            // the keys themselves are not used; they only need to be unique
            for (int i = 0; i < records.size(); i++) {
                kvTable.write(Integer.toString(i), records.get(i).toString());
            }
        }
    });
    final PartitionedFileSet pfs = datasetCache.getDataset(OUTPUT_DATASET);
    if (precreatePartitions) {
        Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) pfs).execute(new TransactionExecutor.Subroutine() {

            @Override
            public void apply() throws IOException {
                writeFile(pfs, createKey(now, 95111));
                writeFile(pfs, createKey(now, 98123));
                writeFile(pfs, createKey(now, 84125));
            }
        });
    }
    String allowConcurrencyKey = "dataset." + OUTPUT_DATASET + "." + PartitionedFileSetArguments.DYNAMIC_PARTITIONER_ALLOW_CONCURRENCY;
    // run the partition-writer MapReduce with this output partition time
    Map<String, String> arguments = new HashMap<>();
    arguments.put(OUTPUT_PARTITION_KEY, Long.toString(now));
    arguments.put(allowConcurrencyKey, Boolean.toString(allowConcurrentWriters));
    if (partitionWriteOption != null) {
        arguments.put("partitionWriteOption", partitionWriteOption.name());
    }
    long startTime = System.currentTimeMillis();
    boolean status = runProgram(app, AppWithMapReduceUsingAvroDynamicPartitioner.DynamicPartitioningMapReduce.class, new BasicArguments(arguments));
    Assert.assertEquals(expectedStatus, status);
    if (!expectedStatus) {
        // if we expect the program to fail, no need to check the output data for expected results
        return;
    }
    // Verify notifications
    List<Notification> notifications = getDataNotifications(startTime);
    Assert.assertEquals(1, notifications.size());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(OUTPUT_DATASET), DatasetId.fromString(notifications.get(0).getProperties().get("datasetId")));
    // this should have created a partition in the pfs
    final Location pfsBaseLocation = pfs.getEmbeddedFileSet().getBaseLocation();
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) pfs).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws IOException {
            Map<PartitionKey, PartitionDetail> partitions = new HashMap<>();
            for (PartitionDetail partition : pfs.getPartitions(null)) {
                partitions.put(partition.getPartitionKey(), partition);
                // check that the mapreduce wrote the output partition metadata to all the output partitions
                Assert.assertEquals(getExpectedMetadata(precreatePartitions, partitionWriteOption), partition.getMetadata().asMap());
                // if files were precreated and the option is CREATE_OR_APPEND, the empty file should still exist;
                // if the partition write option overwrites existing partitions, the file should not exist
                Location preexistingFile = partition.getLocation().append("file");
                if (precreatePartitions && partitionWriteOption == DynamicPartitioner.PartitionWriteOption.CREATE_OR_APPEND) {
                    Assert.assertTrue(preexistingFile.exists());
                    try (InputStream inputStream = preexistingFile.getInputStream()) {
                        Assert.assertEquals(-1, inputStream.read());
                    }
                } else {
                    Assert.assertFalse(preexistingFile.exists());
                }
            }
            Assert.assertEquals(3, partitions.size());
            Assert.assertEquals(keyToRecordsMap.keySet(), partitions.keySet());
            // Check relative paths of the partitions. Also check that their location = pfs baseLocation + relativePath
            for (Map.Entry<PartitionKey, PartitionDetail> partitionKeyEntry : partitions.entrySet()) {
                PartitionDetail partitionDetail = partitionKeyEntry.getValue();
                String relativePath = partitionDetail.getRelativePath();
                int zip = (int) partitionKeyEntry.getKey().getField("zip");
                Assert.assertEquals(Long.toString(now) + Path.SEPARATOR + zip, relativePath);
                Assert.assertEquals(pfsBaseLocation.append(relativePath), partitionDetail.getLocation());
            }
            for (Map.Entry<PartitionKey, Collection<GenericRecord>> keyToRecordsEntry : keyToRecordsMap.asMap().entrySet()) {
                Set<GenericRecord> genericRecords = new HashSet<>(keyToRecordsEntry.getValue());
                Assert.assertEquals(genericRecords, readOutput(partitions.get(keyToRecordsEntry.getKey()).getLocation()));
            }
        }
    });
}
Also used: HashSet (java.util.HashSet), PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet), Set (java.util.Set), HashMap (java.util.HashMap), PartitionDetail (io.cdap.cdap.api.dataset.lib.PartitionDetail), Notification (io.cdap.cdap.proto.Notification), ApplicationWithPrograms (io.cdap.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms), BasicArguments (io.cdap.cdap.internal.app.runtime.BasicArguments), GenericRecord (org.apache.avro.generic.GenericRecord), InputStream (java.io.InputStream), TransactionExecutor (org.apache.tephra.TransactionExecutor), IOException (java.io.IOException), KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable), TransactionAware (org.apache.tephra.TransactionAware), PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey), Map (java.util.Map), ImmutableMap (com.google.common.collect.ImmutableMap), Location (org.apache.twill.filesystem.Location)
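
For context, the app under test supplies a DynamicPartitioner that maps each record to a PartitionKey. Below is a minimal sketch of what such a partitioner can look like, assuming the long "time" and int "zip" key fields implied by the relative paths asserted above (now/zip); the class name, the runtime-argument key, and the Hadoop key type are illustrative and not taken from AppWithMapReduceUsingAvroDynamicPartitioner itself.

import io.cdap.cdap.api.dataset.lib.DynamicPartitioner;
import io.cdap.cdap.api.dataset.lib.PartitionKey;
import io.cdap.cdap.api.mapreduce.MapReduceTaskContext;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.io.NullWritable;

public class TimeZipPartitioner extends DynamicPartitioner<NullWritable, GenericRecord> {

    private long time;

    @Override
    public void initialize(MapReduceTaskContext<NullWritable, GenericRecord> context) {
        // hypothetical argument key; the test above passes the time under OUTPUT_PARTITION_KEY
        this.time = Long.parseLong(context.getRuntimeArguments().get("output.partition.time"));
    }

    @Override
    public PartitionKey getPartitionKey(NullWritable key, GenericRecord record) {
        // records sharing the same (time, zip) pair land in the same output partition,
        // which yields relative paths of the form <time>/<zip> as asserted above
        return PartitionKey.builder()
            .addLongField("time", time)
            .addIntField("zip", (int) record.get("zip"))
            .build();
    }
}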

Example 72 with KeyValueTable

Use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.

The class ObjectStoreDatasetTest, method testWithCustomClassLoader.

@Test
public void testWithCustomClassLoader() throws Exception {
    DatasetId kv = DatasetFrameworkTestUtil.NAMESPACE_ID.dataset("kv");
    // create a dummy class loader that records the name of the class it loaded
    final AtomicReference<String> lastClassLoaded = new AtomicReference<>(null);
    ClassLoader loader = new ClassLoader() {

        @Override
        public Class<?> loadClass(String name) throws ClassNotFoundException {
            lastClassLoaded.set(name);
            return super.loadClass(name);
        }
    };
    dsFrameworkUtil.createInstance("keyValueTable", kv, DatasetProperties.EMPTY);
    KeyValueTable kvTable = dsFrameworkUtil.getInstance(kv);
    Type type = Custom.class;
    TypeRepresentation typeRep = new TypeRepresentation(type);
    Schema schema = new ReflectionSchemaGenerator().generate(type);
    final ObjectStoreDataset<Custom> objectStore = new ObjectStoreDataset<>("kv", kvTable, typeRep, schema, loader);
    TransactionExecutor txnl = dsFrameworkUtil.newInMemoryTransactionExecutor(objectStore);
    // need to call this to actually load the Custom class: it is no longer loaded in the
    // ObjectStoreDataset's constructor, but rather lazily, when it's actually needed.
    objectStore.getRecordType();
    txnl.execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            objectStore.write("dummy", new Custom(382, Lists.newArrayList("blah")));
        }
    });
    // verify the class name was recorded (the dummy class loader was used).
    txnl.execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            Assert.assertEquals(Custom.class.getName(), lastClassLoaded.get());
        }
    });
    txnl.execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            deleteAndVerify(objectStore, Bytes.toBytes("dummy"));
        }
    });
    dsFrameworkUtil.deleteInstance(kv);
}
Also used: Schema (io.cdap.cdap.api.data.schema.Schema), AtomicReference (java.util.concurrent.atomic.AtomicReference), TransactionExecutor (org.apache.tephra.TransactionExecutor), ReflectionSchemaGenerator (io.cdap.cdap.internal.io.ReflectionSchemaGenerator), TransactionFailureException (org.apache.tephra.TransactionFailureException), NoSuchElementException (java.util.NoSuchElementException), DatasetId (io.cdap.cdap.proto.id.DatasetId), Type (java.lang.reflect.Type), KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable), TypeRepresentation (io.cdap.cdap.internal.io.TypeRepresentation), Test (org.junit.Test)
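
The recording class loader is the reusable trick in this test and needs nothing from CDAP. A stand-alone sketch of the same pattern (the class name is introduced here for illustration):

import java.util.concurrent.atomic.AtomicReference;

public class RecordingClassLoader extends ClassLoader {

    private final AtomicReference<String> lastClassLoaded = new AtomicReference<>(null);

    public RecordingClassLoader(ClassLoader parent) {
        super(parent);
    }

    @Override
    public Class<?> loadClass(String name) throws ClassNotFoundException {
        // record the request before delegating to the normal parent-first lookup
        lastClassLoaded.set(name);
        return super.loadClass(name);
    }

    public String getLastClassLoaded() {
        return lastClassLoaded.get();
    }
}

A test can then assert that getLastClassLoaded() returns Custom.class.getName(), exactly as the anonymous version above does.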

Example 73 with KeyValueTable

Use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.

The class DataStreamsSparkSinkTest, method testSparkSink.

private void testSparkSink(ApplicationManager appManager, final String output) throws Exception {
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start(ImmutableMap.of("tablename", output));
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            return getDataset(output).get() != null;
        }
    }, 1, TimeUnit.MINUTES);
    final DataSetManager<KeyValueTable> outputManager = getDataset(output);
    final Map<String, String> expectedKeyValues = ImmutableMap.of("0", "samuel", "1", "jackson", "2", "dwayne", "3", "johnson");
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            outputManager.flush();
            Map<String, String> keyValues = io.cdap.cdap.etl.mock.spark.streaming.MockSink.getValues(expectedKeyValues.keySet(), outputManager);
            return expectedKeyValues.equals(keyValues);
        }
    }, 1, TimeUnit.MINUTES);
    sparkManager.stop();
    sparkManager.waitForStopped(10, TimeUnit.SECONDS);
    sparkManager.waitForRun(ProgramRunStatus.KILLED, 10, TimeUnit.SECONDS);
}
Also used: SparkManager (io.cdap.cdap.test.SparkManager), KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable), Map (java.util.Map), ImmutableMap (com.google.common.collect.ImmutableMap)
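
Both waits above are instances of the poll-until-deadline pattern that Tasks.waitFor encapsulates. A plain-Java equivalent with no CDAP test utilities (the helper name and the poll interval are our own choices):

import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public final class Poll {

    private Poll() { }

    public static void until(Callable<Boolean> condition, long timeout, TimeUnit unit) throws Exception {
        long deadline = System.nanoTime() + unit.toNanos(timeout);
        while (System.nanoTime() < deadline) {
            if (Boolean.TRUE.equals(condition.call())) {
                // condition met before the deadline
                return;
            }
            // back off briefly between polls
            TimeUnit.MILLISECONDS.sleep(100);
        }
        throw new TimeoutException("condition not met within " + timeout + " " + unit);
    }
}

With Java 8 lambdas, the first wait above would then read: Poll.until(() -> getDataset(output).get() != null, 1, TimeUnit.MINUTES);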

Example 74 with KeyValueTable

Use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.

The class SparkTest, method testDynamicSpark.

@Test
public void testDynamicSpark() throws Exception {
    ApplicationManager appManager = deploy(TestSparkApp.class);
    // Write some data to a local file
    File inputFile = TEMP_FOLDER.newFile();
    try (BufferedWriter writer = Files.newBufferedWriter(inputFile.toPath(), StandardCharsets.UTF_8)) {
        for (int i = 0; i < 10; i++) {
            writer.write("Line " + (i + 1));
            writer.newLine();
        }
    }
    SparkManager sparkManager = appManager.getSparkManager(ScalaDynamicSpark.class.getSimpleName());
    sparkManager.startAndWaitForGoodRun(ImmutableMap.of("input", inputFile.getAbsolutePath(), "output", "ResultTable", "tmpdir", TMP_FOLDER.newFolder().getAbsolutePath()), ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    // Validate the result written to dataset
    KeyValueTable resultTable = this.<KeyValueTable>getDataset("ResultTable").get();
    // There should be ten occurrences of "Line"
    Assert.assertEquals(10, Bytes.toInt(resultTable.read("Line")));
    // Each number should appear once
    for (int i = 0; i < 10; i++) {
        Assert.assertEquals(1, Bytes.toInt(resultTable.read(Integer.toString(i + 1))));
    }
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), SparkManager (io.cdap.cdap.test.SparkManager), KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable), ScalaDynamicSpark (io.cdap.cdap.spark.app.ScalaDynamicSpark), File (java.io.File), BufferedWriter (java.io.BufferedWriter), Test (org.junit.Test)
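
The assertions above rely on KeyValueTable's byte-level contract: it stores raw byte[] pairs, so a reader must decode values exactly as the writer encoded them (here, ints via Bytes). A minimal fragment showing that round trip, using only calls that appear in these examples ("counts" is a hypothetical dataset name):

KeyValueTable counts = this.<KeyValueTable>getDataset("counts").get();
// write an int value under a string key, encoding both sides explicitly
counts.write(Bytes.toBytes("Line"), Bytes.toBytes(10));
// read(String) returns the raw stored byte[], or null if the key is absent
byte[] raw = counts.read("Line");
int value = (raw == null) ? 0 : Bytes.toInt(raw);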

Example 75 with KeyValueTable

Use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.

The class SparkTest, method testSparkWithGetDataset.

private void testSparkWithGetDataset(Class<? extends Application> appClass, String sparkProgram) throws Exception {
    ApplicationManager applicationManager = deploy(appClass);
    DataSetManager<FileSet> filesetManager = getDataset("logs");
    FileSet fileset = filesetManager.get();
    Location location = fileset.getLocation("nn");
    prepareInputFileSetWithLogData(location);
    Map<String, String> inputArgs = new HashMap<>();
    FileSetArguments.setInputPath(inputArgs, "nn");
    Map<String, String> args = new HashMap<>();
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "logs", inputArgs));
    args.put("input", "logs");
    args.put("output", "logStats");
    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram);
    sparkManager.startAndWaitForGoodRun(args, ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
    DataSetManager<KeyValueTable> logStatsManager = getDataset("logStats");
    KeyValueTable logStatsTable = logStatsManager.get();
    validateGetDatasetOutput(logStatsTable);
    // Cleanup after run
    location.delete(true);
    logStatsManager.flush();
    try (CloseableIterator<KeyValue<byte[], byte[]>> scan = logStatsTable.scan(null, null)) {
        while (scan.hasNext()) {
            logStatsTable.delete(scan.next().getKey());
        }
    }
    logStatsManager.flush();
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), SparkManager (io.cdap.cdap.test.SparkManager), KeyValue (io.cdap.cdap.api.dataset.lib.KeyValue), FileSet (io.cdap.cdap.api.dataset.lib.FileSet), IdentityHashMap (java.util.IdentityHashMap), HashMap (java.util.HashMap), KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable), Location (org.apache.twill.filesystem.Location)
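
The RuntimeArguments.addScope call above is what confines the generic input-path argument to the "logs" dataset: each key is returned prefixed as "dataset.<name>.<key>", the same convention written out by hand for allowConcurrencyKey in Example 71. A small sketch of the effect (the printed key shape reflects that convention, not an exact constant):

Map<String, String> inputArgs = new HashMap<>();
FileSetArguments.setInputPath(inputArgs, "nn");
// every key comes back prefixed with "dataset.logs."
Map<String, String> scoped = RuntimeArguments.addScope(Scope.DATASET, "logs", inputArgs);
for (Map.Entry<String, String> e : scoped.entrySet()) {
    // prints e.g. dataset.logs.<original key> -> nn
    System.out.println(e.getKey() + " -> " + e.getValue());
}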

Aggregations

KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 122
Test (org.junit.Test): 65
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 59
HashMap (java.util.HashMap): 27
SparkManager (io.cdap.cdap.test.SparkManager): 26
Table (io.cdap.cdap.api.dataset.table.Table): 21
TransactionExecutor (org.apache.tephra.TransactionExecutor): 20
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 19
FileSet (io.cdap.cdap.api.dataset.lib.FileSet): 18
ApplicationWithPrograms (io.cdap.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms): 18
KeyValue (io.cdap.cdap.api.dataset.lib.KeyValue): 14
ServiceManager (io.cdap.cdap.test.ServiceManager): 14
IOException (java.io.IOException): 14
ArrayList (java.util.ArrayList): 14
Location (org.apache.twill.filesystem.Location): 14
ImmutableMap (com.google.common.collect.ImmutableMap): 13
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 13
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 13
File (java.io.File): 13
URL (java.net.URL): 12