Use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.
The class FileUploadServiceTestRun, method testFileUploadService.
@Test
public void testFileUploadService() throws Exception {
  ApplicationManager appManager = deployApplication(FileUploadApp.class);

  // Start the service
  ServiceManager serviceManager = appManager.getServiceManager(FileUploadApp.SERVICE_NAME).start();
  serviceManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
  try {
    // The upload URL is "base/upload/pfs/[partition_value]", where the partition value is a long
    URI serviceURI = serviceManager.getServiceURL(10, TimeUnit.SECONDS).toURI();

    // Upload with the wrong MD5; should get 400
    byte[] content = Strings.repeat("0123456789 ", 100).getBytes(Charsets.UTF_8);
    Assert.assertEquals(HttpURLConnection.HTTP_BAD_REQUEST,
                        upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(),
                               content, "123", 30));

    long beforeUploadTime = System.currentTimeMillis();

    // Upload with the right MD5; should get 200
    Assert.assertEquals(HttpURLConnection.HTTP_OK,
                        upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(),
                               content,
                               Base64.getEncoder().encodeToString(Hashing.md5().hashBytes(content).asBytes()),
                               20));

    // Inspect the partitioned file set and verify the content
    PartitionedFileSet pfs = (PartitionedFileSet) getDataset(FileUploadApp.PFS_NAME).get();
    PartitionDetail partition = pfs.getPartition(PartitionKey.builder().addLongField("time", 1).build());
    Assert.assertNotNull(partition);

    // Verify that a notification was published for the new partition
    List<Notification> notifications = getDataNotifications(beforeUploadTime);
    // There should be one message
    Assert.assertEquals(1, notifications.size());
    verifyDataNotification(notifications.get(0),
                           NamespaceId.DEFAULT.dataset(FileUploadApp.PFS_NAME),
                           Collections.singletonList(PartitionKey.builder().addLongField("time", 1L).build()));

    // There should be one file under the partition directory
    List<Location> locations = partition.getLocation().list();
    Assert.assertEquals(1, locations.size());
    Assert.assertArrayEquals(content, ByteStreams.toByteArray(Locations.newInputSupplier(locations.get(0))));

    // Verify the tracking table of chunk sizes
    KeyValueTable trackingTable = (KeyValueTable) getDataset(FileUploadApp.KV_TABLE_NAME).get();
    CloseableIterator<KeyValue<byte[], byte[]>> iter = trackingTable.scan(null, null);
    // Sum up all chunk sizes tracked by the tracking table
    long sum = 0;
    int iterSize = 0;
    while (iter.hasNext()) {
      KeyValue<byte[], byte[]> kv = iter.next();
      sum += Bytes.toInt(kv.getKey()) * Bytes.toLong(kv.getValue());
      iterSize++;
    }
    // The iterator should have size >= 2, since different chunk sizes were used for the two uploads
    Assert.assertTrue(iterSize >= 2);
    // The sum of all chunk sizes should equal the content size * 2
    // (one failed upload and one successful upload)
    Assert.assertEquals(content.length * 2, sum);
  } finally {
    serviceManager.stop();
    serviceManager.waitForRun(ProgramRunStatus.KILLED, 10, TimeUnit.SECONDS);
  }
}
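The upload helper used by this test is not shown in this excerpt. Below is a minimal sketch of what such a helper might look like, assuming it POSTs the content with chunked streaming at the given chunk size and sends the MD5 digest in a Content-MD5 header; the method name and parameters mirror the calls above, but the implementation details are assumptions.

// Hypothetical helper, not part of the original source.
// Requires java.io.IOException, java.io.OutputStream, java.net.HttpURLConnection, java.net.URL.
private int upload(URL url, byte[] content, String md5, int chunkSize) throws IOException {
  HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
  try {
    urlConn.setDoOutput(true);
    urlConn.setRequestMethod("POST");
    // Stream the body in fixed-size chunks so the service sees multiple chunks per upload
    urlConn.setChunkedStreamingMode(chunkSize);
    urlConn.setRequestProperty("Content-MD5", md5);
    try (OutputStream os = urlConn.getOutputStream()) {
      os.write(content);
    }
    return urlConn.getResponseCode();
  } finally {
    urlConn.disconnect();
  }
}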
Use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.
The class Spark2Test, method testSparkWithLocalFiles.
private void testSparkWithLocalFiles(Class<? extends Application> appClass, String sparkProgram,
                                     String prefix) throws Exception {
  ApplicationManager applicationManager = deploy(NamespaceId.DEFAULT, appClass);
  URI localFile = createLocalPropertiesFile(prefix);
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram)
    .start(Collections.singletonMap(SparkAppUsingLocalFiles.LOCAL_FILE_RUNTIME_ARG, localFile.toString()));
  sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
  sparkManager.waitForStopped(120, TimeUnit.SECONDS);

  DataSetManager<KeyValueTable> kvTableManager = getDataset(SparkAppUsingLocalFiles.OUTPUT_DATASET_NAME);
  KeyValueTable kvTable = kvTableManager.get();
  Map<String, String> expected = ImmutableMap.of("a", "1", "b", "2", "c", "3");
  List<byte[]> deleteKeys = new ArrayList<>();
  try (CloseableIterator<KeyValue<byte[], byte[]>> scan = kvTable.scan(null, null)) {
    for (int i = 0; i < 3; i++) {
      KeyValue<byte[], byte[]> next = scan.next();
      Assert.assertEquals(expected.get(Bytes.toString(next.getKey())), Bytes.toString(next.getValue()));
      deleteKeys.add(next.getKey());
    }
    Assert.assertFalse(scan.hasNext());
  }

  // Cleanup after run
  kvTableManager.flush();
  for (byte[] key : deleteKeys) {
    kvTable.delete(key);
  }
  kvTableManager.flush();
}
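The createLocalPropertiesFile helper is not included in this excerpt. A minimal sketch, assuming it simply writes the three key/value pairs the test expects ("a=1", "b=2", "c=3") to a temporary file and returns its URI; the use of java.nio temporary files is an assumption.

// Hypothetical helper, not part of the original source.
// Requires java.nio.file.Files, java.nio.file.Path, java.nio.charset.StandardCharsets, java.util.Arrays.
private URI createLocalPropertiesFile(String prefix) throws IOException {
  Path path = Files.createTempFile(prefix, ".properties");
  // The Spark program is expected to parse these lines and write them to the output KeyValueTable
  Files.write(path, Arrays.asList("a=1", "b=2", "c=3"), StandardCharsets.UTF_8);
  return path.toUri();
}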
Use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.
The class ObjectMappedTableDatasetTest, method testScan.
@Test
public void testScan() throws Exception {
  dsFrameworkUtil.createInstance(ObjectMappedTable.class.getName(), RECORDS_ID,
                                 ObjectMappedTableProperties.builder().setType(Record.class).build());
  try {
    final ObjectMappedTableDataset<Record> records = dsFrameworkUtil.getInstance(RECORDS_ID);
    TransactionExecutor txnl = dsFrameworkUtil.newInMemoryTransactionExecutor((TransactionAware) records);

    Record record1 = new Record(Integer.MAX_VALUE, Long.MAX_VALUE, Float.MAX_VALUE, Double.MAX_VALUE,
                                "foobar", Bytes.toBytes("foobar"), ByteBuffer.wrap(Bytes.toBytes("foobar")),
                                UUID.randomUUID());
    Record record2 = new Record(Integer.MIN_VALUE, Long.MIN_VALUE, Float.MIN_VALUE, Double.MIN_VALUE,
                                "baz", Bytes.toBytes("baz"), ByteBuffer.wrap(Bytes.toBytes("baz")),
                                UUID.randomUUID());
    Record record3 = new Record(1, 0L, 3.14f, 3.14159265358979323846,
                                "hello", Bytes.toBytes("world"), ByteBuffer.wrap(Bytes.toBytes("yo")),
                                UUID.randomUUID());

    final List<KeyValue<byte[], Record>> recordList = Lists.newArrayList();
    recordList.add(new KeyValue<>(Bytes.toBytes("123"), record1));
    recordList.add(new KeyValue<>(Bytes.toBytes("456"), record2));
    recordList.add(new KeyValue<>(Bytes.toBytes("789"), record3));

    // Write each record in its own transaction
    for (final KeyValue<byte[], Record> record : recordList) {
      txnl.execute(new TransactionExecutor.Subroutine() {
        @Override
        public void apply() throws Exception {
          records.write(record.getKey(), record.getValue());
        }
      });
    }

    // Scan the full table and verify that all records come back in key order
    final List<KeyValue<byte[], Record>> actualList = Lists.newArrayList();
    txnl.execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        CloseableIterator<KeyValue<byte[], Record>> results = records.scan((String) null, null);
        while (results.hasNext()) {
          actualList.add(results.next());
        }
        results.close();
      }
    });
    Assert.assertEquals(recordList.size(), actualList.size());
    for (int i = 0; i < actualList.size(); i++) {
      KeyValue<byte[], Record> expected = recordList.get(i);
      KeyValue<byte[], Record> actual = actualList.get(i);
      Assert.assertArrayEquals(expected.getKey(), actual.getKey());
      Assert.assertEquals(expected.getValue(), actual.getValue());
    }

    // Bounded scans: start key only, exclusive stop key just past the first key, and a stop key
    // equal to the first key (which should return nothing)
    txnl.execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        CloseableIterator<KeyValue<byte[], Record>> results = records.scan("789", null);
        KeyValue<byte[], Record> actualRecord = results.next();
        Assert.assertFalse(results.hasNext());
        Assert.assertArrayEquals(actualRecord.getKey(), recordList.get(2).getKey());
        Assert.assertEquals(actualRecord.getValue(), recordList.get(2).getValue());
        results.close();

        results = records.scan(null, "124");
        actualRecord = results.next();
        Assert.assertFalse(results.hasNext());
        Assert.assertArrayEquals(actualRecord.getKey(), recordList.get(0).getKey());
        Assert.assertEquals(actualRecord.getValue(), recordList.get(0).getValue());
        results.close();

        results = records.scan(null, "123");
        Assert.assertFalse(results.hasNext());
        results.close();
      }
    });

    // Scan using a Scan object
    actualList.clear();
    txnl.execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        Scan scan = new Scan(null, null);
        CloseableIterator<KeyValue<byte[], Record>> results = records.scan(scan);
        while (results.hasNext()) {
          actualList.add(results.next());
        }
      }
    });
    Assert.assertEquals(recordList.size(), actualList.size());
  } finally {
    dsFrameworkUtil.deleteInstance(RECORDS_ID);
  }
}
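Since TransactionExecutor.Subroutine declares a single apply() method, the anonymous inner classes above can be written more compactly as lambdas. As a sketch, assuming the project compiles against Java 8 or later, the full-table scan could be expressed with a lambda and try-with-resources:

// Equivalent rewrite of the full-table scan above (assumes Java 8+)
txnl.execute(() -> {
  try (CloseableIterator<KeyValue<byte[], Record>> results = records.scan((String) null, null)) {
    results.forEachRemaining(actualList::add);
  }
});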
Use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.
The class SparkTest, method testSparkWithGetDataset.
private void testSparkWithGetDataset(Class<? extends Application> appClass, String sparkProgram) throws Exception {
  ApplicationManager applicationManager = deploy(appClass);

  DataSetManager<FileSet> filesetManager = getDataset("logs");
  FileSet fileset = filesetManager.get();
  Location location = fileset.getLocation("nn");
  prepareInputFileSetWithLogData(location);

  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "logs", inputArgs));
  args.put("input", "logs");
  args.put("output", "logStats");

  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram);
  sparkManager.startAndWaitForGoodRun(args, ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);

  DataSetManager<KeyValueTable> logStatsManager = getDataset("logStats");
  KeyValueTable logStatsTable = logStatsManager.get();
  validateGetDatasetOutput(logStatsTable);

  // Cleanup after run
  location.delete(true);
  logStatsManager.flush();
  try (CloseableIterator<KeyValue<byte[], byte[]>> scan = logStatsTable.scan(null, null)) {
    while (scan.hasNext()) {
      logStatsTable.delete(scan.next().getKey());
    }
  }
  logStatsManager.flush();
}
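The helpers prepareInputFileSetWithLogData and validateGetDatasetOutput are not shown in this excerpt, so the exact expected statistics are unknown here. As a placeholder sketch only, a validation over the KeyValueTable could scan all entries and check that each stored value decodes to a positive count; the long encoding of the values is an assumption.

// Placeholder sketch, not the original helper: the real validateGetDatasetOutput asserts
// the specific log statistics produced by the Spark program, which are not shown here.
private void validateGetDatasetOutput(KeyValueTable logStatsTable) {
  int entries = 0;
  try (CloseableIterator<KeyValue<byte[], byte[]>> scan = logStatsTable.scan(null, null)) {
    while (scan.hasNext()) {
      KeyValue<byte[], byte[]> entry = scan.next();
      // Assumed: each value is a long count keyed by some log dimension
      Assert.assertTrue(Bytes.toLong(entry.getValue()) > 0);
      entries++;
    }
  }
  Assert.assertTrue(entries > 0);
}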
Use of io.cdap.cdap.api.dataset.lib.KeyValue in project cdap by caskdata.
The class SparkBatchSinkFactory, method writeFromRDD.
/**
 * Write the given RDD using one or more OutputFormats or CDAP datasets.
 * Returns the names of the outputs written using OutputFormatProvider, which need to register lineage.
 */
public <K, V> Set<String> writeFromRDD(JavaPairRDD<K, V> rdd, JavaSparkExecutionContext sec, String sinkName) {
  Set<String> outputNames = sinkOutputs.get(sinkName);
  if (outputNames == null || outputNames.isEmpty()) {
    // should never happen if validation happened correctly at pipeline configure time
    throw new IllegalArgumentException(sinkName + " has no outputs. "
                                         + "Please check that the sink calls addOutput at some point.");
  }
  Set<String> lineageNames = new HashSet<>();
  Map<String, OutputFormatProvider> outputFormats = new HashMap<>();
  for (String outputName : outputNames) {
    NamedOutputFormatProvider outputFormatProvider = outputFormatProviders.get(outputName);
    if (outputFormatProvider != null) {
      outputFormats.put(outputName, outputFormatProvider);
      lineageNames.add(outputFormatProvider.name);
    }
    DatasetInfo datasetInfo = datasetInfos.get(outputName);
    if (datasetInfo != null) {
      sec.saveAsDataset(rdd, datasetInfo.getDatasetName(), datasetInfo.getDatasetArgs());
    }
  }
  if (outputFormats.isEmpty()) {
    return lineageNames;
  }
  if (outputFormats.size() == 1) {
    RDDUtils.saveUsingOutputFormat(outputFormats.values().iterator().next(), rdd);
    return lineageNames;
  }
  Configuration hConf = new Configuration();
  Map<String, Set<String>> sinkOutputs = Collections.singletonMap(sinkName, outputFormats.keySet());
  MultiOutputFormat.addOutputs(hConf, outputFormats, sinkOutputs);
  hConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, MultiOutputFormat.class.getName());
  // MultiOutputFormat requires the key to be the sink name and the value to be the actual key-value
  // to send to the delegate output format.
  JavaPairRDD<String, KeyValue<K, V>> multiRDD =
    rdd.mapToPair(kv -> new Tuple2<>(sinkName, new KeyValue<>(kv._1(), kv._2())));
  RDDUtils.saveHadoopDataset(multiRDD, hConf);
  return lineageNames;
}
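A hedged sketch of how a caller might use writeFromRDD from a Spark pipeline driver; the sinkFactory, outputRDD, and stageName variables are illustrative assumptions, not names from the original source.

// Illustrative only: sinkFactory, outputRDD and stageName are assumed to exist in the caller.
Set<String> outputsNeedingLineage = sinkFactory.writeFromRDD(outputRDD, sec, stageName);
// Per the method contract above, these are the OutputFormatProvider outputs for which
// the caller is still responsible for registering lineage.
for (String outputName : outputsNeedingLineage) {
  // register lineage for outputName here
}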