Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
From the class SparkFileSetTestRun, method testSparkWithPartitionedFileSet:
private void testSparkWithPartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  DataSetManager<PartitionedFileSet> pfsManager = getDataset("pfs");
  PartitionedFileSet pfs = pfsManager.get();
  PartitionOutput partitionOutput = pfs.getPartitionOutput(PartitionKey.builder().addStringField("x", "nn").build());
  Location location = partitionOutput.getLocation();
  prepareFileInput(location);
  partitionOutput.addPartition();
  pfsManager.flush();
  Map<String, String> inputArgs = new HashMap<>();
  PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, PartitionFilter.builder().addRangeCondition("x", "na", "nx").build());
  Map<String, String> outputArgs = new HashMap<>();
  PartitionKey outputKey = PartitionKey.builder().addStringField("x", "xx").build();
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", outputArgs));
  args.put("input", "pfs");
  args.put("output", "pfs");
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
  pfsManager.flush();
  PartitionDetail partition = pfs.getPartition(outputKey);
  Assert.assertNotNull(partition);
  validateFileOutput(partition.getLocation());
  // Clean up after the test completes
  pfs.dropPartition(partitionOutput.getPartitionKey());
  pfs.dropPartition(partition.getPartitionKey());
  pfsManager.flush();
}
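For quick reference, the PartitionDetail accessors exercised throughout these tests can be summarized as follows. This is a minimal sketch, not part of the test code; it assumes a PartitionedFileSet instance pfs has already been obtained (for example via getDataset("pfs").get() as above) and that a partition with the given key exists.

PartitionKey key = PartitionKey.builder().addStringField("x", "xx").build();
PartitionDetail detail = pfs.getPartition(key);   // null if no partition exists for that key
if (detail != null) {
  PartitionKey partitionKey = detail.getPartitionKey();  // the key identifying this partition
  Location partitionLocation = detail.getLocation();     // location of the partition's directory
  String relativePath = detail.getRelativePath();        // path relative to the file set's base location
  // detail.getMetadata() exposes any metadata written for the partition
}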
Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
From the class SparkFileSetTestRun, method testSparkWithTimePartitionedFileSet:
private void testSparkWithTimePartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  long customOutputPartitionKey = 123456789L;
  long customInputPartitionKey = 987654321L;
  DataSetManager<TimePartitionedFileSet> tpfsManager = getDataset("tpfs");
  long inputTime = System.currentTimeMillis();
  long outputTime = inputTime + TimeUnit.HOURS.toMillis(1);
  addTimePartition(tpfsManager, inputTime);
  addTimePartition(tpfsManager, customInputPartitionKey);
  Map<String, String> inputArgs = new HashMap<>();
  TimePartitionedFileSetArguments.setInputStartTime(inputArgs, inputTime - 100);
  TimePartitionedFileSetArguments.setInputEndTime(inputArgs, inputTime + 100);
  Map<String, String> outputArgs = new HashMap<>();
  TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, outputTime);
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", outputArgs));
  args.put("input", "tpfs");
  args.put("output", "tpfs");
  args.put("outputKey", String.valueOf(customOutputPartitionKey));
  args.put("inputKey", String.valueOf(customInputPartitionKey));
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
  tpfsManager.flush();
  TimePartitionedFileSet tpfs = tpfsManager.get();
  PartitionDetail partition = tpfs.getPartitionByTime(outputTime);
  Assert.assertNotNull("Output partition is null when running without custom dataset arguments", partition);
  validateFileOutput(partition.getLocation());
  PartitionDetail customPartition = tpfs.getPartitionByTime(customOutputPartitionKey);
  Assert.assertNotNull("Output partition is null when running with custom dataset arguments", customPartition);
  validateFileOutput(customPartition.getLocation());
  // Clean up after running the test
  tpfs.dropPartition(inputTime);
  tpfs.dropPartition(customInputPartitionKey);
  tpfs.dropPartition(partition.getPartitionKey());
  tpfs.dropPartition(customPartition.getPartitionKey());
  tpfsManager.flush();
}
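The time-partitioned variant resolves partitions by timestamp rather than by an explicit key. A minimal sketch, assuming a TimePartitionedFileSet instance tpfs obtained as above and a partition previously added for partitionTime:

long partitionTime = System.currentTimeMillis();
PartitionDetail detail = tpfs.getPartitionByTime(partitionTime);  // null if no partition exists for that time
if (detail != null) {
  Location partitionLocation = detail.getLocation();  // directory containing the partition's files
  tpfs.dropPartition(partitionTime);                  // cleanup, as in the test above
}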
Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
From the class PartitionConsumingTestRun, method testWordCountOnFileSet:
private void testWordCountOnFileSet(Function<ApplicationManager, ProgramManager> runProgram, boolean produceOutputPartitionEachRun) throws Exception {
  ApplicationManager applicationManager = deployApplication(AppWithPartitionConsumers.class);
  ServiceManager serviceManager = applicationManager.getServiceManager("DatasetService").start();
  serviceManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
  URL serviceURL = serviceManager.getServiceURL();
  // write a file to the file set using the service and run the WordCount MapReduce job on that one partition
  createPartition(serviceURL, LINE1, "1");
  ProgramManager programManager = runProgram.apply(applicationManager);
  programManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
  Assert.assertEquals(new Long(2), getCount(serviceURL, "a"));
  Assert.assertEquals(new Long(1), getCount(serviceURL, "b"));
  Assert.assertEquals(new Long(0), getCount(serviceURL, "c"));
  // create two additional partitions
  createPartition(serviceURL, LINE2, "2");
  createPartition(serviceURL, LINE3, "3");
  // running the program now processes these two new partitions (LINE2 and LINE3) and updates the counts
  // dataset accordingly
  programManager = runProgram.apply(applicationManager);
  programManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
  Assert.assertEquals(new Long(3), getCount(serviceURL, "a"));
  Assert.assertEquals(new Long(3), getCount(serviceURL, "b"));
  Assert.assertEquals(new Long(3), getCount(serviceURL, "c"));
  // running the program without adding new partitions does not affect the counts dataset
  programManager = runProgram.apply(applicationManager);
  programManager.waitForRuns(ProgramRunStatus.COMPLETED, 3, 5, TimeUnit.MINUTES);
  Assert.assertEquals(new Long(3), getCount(serviceURL, "a"));
  Assert.assertEquals(new Long(3), getCount(serviceURL, "b"));
  Assert.assertEquals(new Long(3), getCount(serviceURL, "c"));
  DataSetManager<PartitionedFileSet> outputLines = getDataset("outputLines");
  Set<PartitionDetail> partitions = outputLines.get().getPartitions(PartitionFilter.ALWAYS_MATCH);
  // each of the three MapReduce runs produces an output partition (even if there is no input data);
  // a Worker run, however, does not produce a new output partition if there is no new input partition
  Assert.assertEquals(produceOutputPartitionEachRun ? 3 : 2, partitions.size());
  // we only store the counts in the "outputLines" dataset
  List<String> expectedCounts = Lists.newArrayList("1", "1", "2", "2", "3");
  List<String> outputRecords = getDataFromExplore("outputLines");
  Collections.sort(outputRecords);
  Assert.assertEquals(expectedCounts, outputRecords);
}
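Partitions can also be enumerated in bulk rather than fetched one at a time, as the test above does with PartitionFilter.ALWAYS_MATCH. A minimal sketch for reference, assuming a PartitionedFileSet instance pfs whose partitioning includes a string field "x" (as in the first test above):

// all partitions, regardless of key
Set<PartitionDetail> allPartitions = pfs.getPartitions(PartitionFilter.ALWAYS_MATCH);

// only partitions matching the same range condition on "x" used in the Spark test above
Set<PartitionDetail> filtered = pfs.getPartitions(PartitionFilter.builder().addRangeCondition("x", "na", "nx").build());
for (PartitionDetail detail : filtered) {
  System.out.println(detail.getRelativePath());
}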
Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
From the class FileUploadServiceTestRun, method testFileUploadService:
@Test
public void testFileUploadService() throws Exception {
  ApplicationManager appManager = deployApplication(FileUploadApp.class);
  // Start the service
  ServiceManager serviceManager = appManager.getServiceManager(FileUploadApp.SERVICE_NAME).start();
  serviceManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
  try {
    // The upload URL is "base/upload/pfs/[partition_value]", where the partition value is a long
    URI serviceURI = serviceManager.getServiceURL(10, TimeUnit.SECONDS).toURI();
    // Upload with a wrong MD5; should get 400
    byte[] content = Strings.repeat("0123456789 ", 100).getBytes(Charsets.UTF_8);
    Assert.assertEquals(HttpURLConnection.HTTP_BAD_REQUEST, upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(), content, "123", 30));
    long beforeUploadTime = System.currentTimeMillis();
    // Upload with the right MD5; should get 200
    Assert.assertEquals(HttpURLConnection.HTTP_OK, upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(), content, Base64.getEncoder().encodeToString(Hashing.md5().hashBytes(content).asBytes()), 20));
    // Inspect the partitioned file set and verify the content
    PartitionedFileSet pfs = (PartitionedFileSet) getDataset(FileUploadApp.PFS_NAME).get();
    PartitionDetail partition = pfs.getPartition(PartitionKey.builder().addLongField("time", 1).build());
    Assert.assertNotNull(partition);
    // Verify that a notification was published for the new partition
    List<Notification> notifications = getDataNotifications(beforeUploadTime);
    // There should be one message
    Assert.assertEquals(1, notifications.size());
    verifyDataNotification(notifications.get(0), NamespaceId.DEFAULT.dataset(FileUploadApp.PFS_NAME), Collections.singletonList(PartitionKey.builder().addLongField("time", 1L).build()));
    // There should be one file under the partition directory
    List<Location> locations = partition.getLocation().list();
    Assert.assertEquals(1, locations.size());
    Assert.assertArrayEquals(content, ByteStreams.toByteArray(Locations.newInputSupplier(locations.get(0))));
    // Verify the tracking table of chunk sizes
    KeyValueTable trackingTable = (KeyValueTable) getDataset(FileUploadApp.KV_TABLE_NAME).get();
    CloseableIterator<KeyValue<byte[], byte[]>> iter = trackingTable.scan(null, null);
    // Sum up all chunk sizes tracked by the tracking table
    long sum = 0;
    int iterSize = 0;
    while (iter.hasNext()) {
      KeyValue<byte[], byte[]> kv = iter.next();
      sum += Bytes.toInt(kv.getKey()) * Bytes.toLong(kv.getValue());
      iterSize++;
    }
    // The iterator should have size >= 2, since a different chunk size is used for each of the two uploads
    Assert.assertTrue(iterSize >= 2);
    // The sum of all chunk sizes should be the same as the
    // content size * 2 (since there is one failed and one successful upload)
    Assert.assertEquals(content.length * 2, sum);
  } finally {
    serviceManager.stop();
    serviceManager.waitForRun(ProgramRunStatus.KILLED, 10, TimeUnit.SECONDS);
  }
}
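Reading the files under a partition follows the same Location API used above. A minimal sketch for reference, assuming a non-null PartitionDetail named partition and the same Guava/CDAP helpers the test already uses (ByteStreams, Locations):

for (Location file : partition.getLocation().list()) {
  byte[] bytes = ByteStreams.toByteArray(Locations.newInputSupplier(file));
  // process bytes as needed ...
}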
Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by cdapio.
From the class DynamicPartitionerWithAvroTest, method runDynamicPartitionerMR:
private void runDynamicPartitionerMR(final List<? extends GenericRecord> records, boolean allowConcurrentWriters, final boolean precreatePartitions, @Nullable final DynamicPartitioner.PartitionWriteOption partitionWriteOption, boolean expectedStatus) throws Exception {
  ApplicationWithPrograms app = deployApp(AppWithMapReduceUsingAvroDynamicPartitioner.class);
  final long now = System.currentTimeMillis();
  final Multimap<PartitionKey, GenericRecord> keyToRecordsMap = groupByPartitionKey(records, now);
  // write values to the input kvTable
  final KeyValueTable kvTable = datasetCache.getDataset(INPUT_DATASET);
  Transactions.createTransactionExecutor(txExecutorFactory, kvTable).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      // the keys are not used; it only matters that they are unique
      for (int i = 0; i < records.size(); i++) {
        kvTable.write(Integer.toString(i), records.get(i).toString());
      }
    }
  });
  final PartitionedFileSet pfs = datasetCache.getDataset(OUTPUT_DATASET);
  if (precreatePartitions) {
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) pfs).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws IOException {
        writeFile(pfs, createKey(now, 95111));
        writeFile(pfs, createKey(now, 98123));
        writeFile(pfs, createKey(now, 84125));
      }
    });
  }
  String allowConcurrencyKey = "dataset." + OUTPUT_DATASET + "." + PartitionedFileSetArguments.DYNAMIC_PARTITIONER_ALLOW_CONCURRENCY;
  // run the partition writer MapReduce with this output partition time
  Map<String, String> arguments = new HashMap<>();
  arguments.put(OUTPUT_PARTITION_KEY, Long.toString(now));
  arguments.put(allowConcurrencyKey, Boolean.toString(allowConcurrentWriters));
  if (partitionWriteOption != null) {
    arguments.put("partitionWriteOption", partitionWriteOption.name());
  }
  long startTime = System.currentTimeMillis();
  boolean status = runProgram(app, AppWithMapReduceUsingAvroDynamicPartitioner.DynamicPartitioningMapReduce.class, new BasicArguments(arguments));
  Assert.assertEquals(expectedStatus, status);
  if (!expectedStatus) {
    // if the program is expected to fail, there is no need to check the output data for expected results
    return;
  }
  // Verify notifications
  List<Notification> notifications = getDataNotifications(startTime);
  Assert.assertEquals(1, notifications.size());
  Assert.assertEquals(NamespaceId.DEFAULT.dataset(OUTPUT_DATASET), DatasetId.fromString(notifications.get(0).getProperties().get("datasetId")));
  // this should have created a partition in the pfs
  final Location pfsBaseLocation = pfs.getEmbeddedFileSet().getBaseLocation();
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) pfs).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws IOException {
      Map<PartitionKey, PartitionDetail> partitions = new HashMap<>();
      for (PartitionDetail partition : pfs.getPartitions(null)) {
        partitions.put(partition.getPartitionKey(), partition);
        // check that the MapReduce wrote the output partition metadata to all the output partitions
        Assert.assertEquals(getExpectedMetadata(precreatePartitions, partitionWriteOption), partition.getMetadata().asMap());
        // if files were precreated and the option is to append, expect the empty file to exist;
        // if the partition write option is configured to overwrite, the file is expected not to exist
        Location preexistingFile = partition.getLocation().append("file");
        if (precreatePartitions && partitionWriteOption == DynamicPartitioner.PartitionWriteOption.CREATE_OR_APPEND) {
          Assert.assertTrue(preexistingFile.exists());
          try (InputStream inputStream = preexistingFile.getInputStream()) {
            Assert.assertEquals(-1, inputStream.read());
          }
        } else {
          Assert.assertFalse(preexistingFile.exists());
        }
      }
      Assert.assertEquals(3, partitions.size());
      Assert.assertEquals(keyToRecordsMap.keySet(), partitions.keySet());
      // Check the relative paths of the partitions. Also check that their location = pfs baseLocation + relativePath
      for (Map.Entry<PartitionKey, PartitionDetail> partitionKeyEntry : partitions.entrySet()) {
        PartitionDetail partitionDetail = partitionKeyEntry.getValue();
        String relativePath = partitionDetail.getRelativePath();
        int zip = (int) partitionKeyEntry.getKey().getField("zip");
        Assert.assertEquals(Long.toString(now) + Path.SEPARATOR + zip, relativePath);
        Assert.assertEquals(pfsBaseLocation.append(relativePath), partitionDetail.getLocation());
      }
      for (Map.Entry<PartitionKey, Collection<GenericRecord>> keyToRecordsEntry : keyToRecordsMap.asMap().entrySet()) {
        Set<GenericRecord> genericRecords = new HashSet<>(keyToRecordsEntry.getValue());
        Assert.assertEquals(genericRecords, readOutput(partitions.get(keyToRecordsEntry.getKey()).getLocation()));
      }
    }
  });
}
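The last assertions above rely on the invariant that a partition's absolute location equals the file set's base location plus the partition's relative path. A minimal sketch of that check, assuming a PartitionedFileSet instance pfs and a PartitionDetail named detail obtained from it:

Location baseLocation = pfs.getEmbeddedFileSet().getBaseLocation();
String relativePath = detail.getRelativePath();
Assert.assertEquals(baseLocation.append(relativePath), detail.getLocation());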