Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
In class ExploreExecutorHttpHandler, the method doDropPartition:
private void doDropPartition(HttpRequest request, HttpResponder responder, DatasetId datasetId) {
  Dataset dataset;
  try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
    dataset = datasetInstantiator.getDataset(datasetId);
    if (dataset == null) {
      responder.sendString(HttpResponseStatus.NOT_FOUND, "Cannot load dataset " + datasetId);
      return;
    }
  } catch (IOException e) {
    String classNotFoundMessage = isClassNotFoundException(e);
    if (classNotFoundMessage != null) {
      JsonObject json = new JsonObject();
      json.addProperty("handle", QueryHandle.NO_OP.getHandle());
      responder.sendJson(HttpResponseStatus.OK, json);
      return;
    }
    LOG.error("Exception instantiating dataset {}.", datasetId, e);
    responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR, "Exception instantiating dataset " + datasetId);
    return;
  }
  try {
    if (!(dataset instanceof PartitionedFileSet)) {
      responder.sendString(HttpResponseStatus.BAD_REQUEST, "not a partitioned dataset.");
      return;
    }
    Partitioning partitioning = ((PartitionedFileSet) dataset).getPartitioning();
    Reader reader = new InputStreamReader(new ChannelBufferInputStream(request.getContent()));
    Map<String, String> properties = GSON.fromJson(reader, new TypeToken<Map<String, String>>() { }.getType());
    PartitionKey partitionKey;
    try {
      partitionKey = PartitionedFileSetArguments.getOutputPartitionKey(properties, partitioning);
    } catch (Exception e) {
      responder.sendString(HttpResponseStatus.BAD_REQUEST, "invalid partition key: " + e.getMessage());
      return;
    }
    if (partitionKey == null) {
      responder.sendString(HttpResponseStatus.BAD_REQUEST, "no partition key was given.");
      return;
    }
    QueryHandle handle = exploreTableManager.dropPartition(datasetId, properties, partitionKey);
    JsonObject json = new JsonObject();
    json.addProperty("handle", handle.getHandle());
    responder.sendJson(HttpResponseStatus.OK, json);
  } catch (Throwable e) {
    LOG.error("Got exception:", e);
    responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR, e.getMessage());
  }
}
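
The request body that this handler parses is a flat map of dataset arguments, from which the partition key is reconstructed with PartitionedFileSetArguments.getOutputPartitionKey. A minimal client-side sketch of building such a body; the field names and values are illustrative, and Gson plus the CDAP API classes are assumed to be on the classpath:

// Hypothetical sketch: build the properties map that doDropPartition() reads from the request content.
Map<String, String> properties = new HashMap<>();
PartitionKey key = PartitionKey.builder()
  .addLongField("time", 1431000000000L)   // illustrative partition fields; use the dataset's actual partitioning
  .addIntField("zip", 94105)
  .build();
PartitionedFileSetArguments.setOutputPartitionKey(properties, key);
String requestBody = new Gson().toJson(properties);   // serialized as the JSON map the handler deserializes
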
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
In class DynamicPartitionerWithAvroTest, the method runDynamicPartitionerMapReduce:
private void runDynamicPartitionerMapReduce(final List<? extends GenericRecord> records, boolean allowConcurrentWriters, boolean expectedStatus) throws Exception {
  ApplicationWithPrograms app = deployApp(AppWithMapReduceUsingAvroDynamicPartitioner.class);
  final long now = System.currentTimeMillis();
  final Multimap<PartitionKey, GenericRecord> keyToRecordsMap = groupByPartitionKey(records, now);
  // write values to the input kvTable
  final KeyValueTable kvTable = datasetCache.getDataset(INPUT_DATASET);
  Transactions.createTransactionExecutor(txExecutorFactory, kvTable).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      // the keys are not used; it matters that they're unique though
      for (int i = 0; i < records.size(); i++) {
        kvTable.write(Integer.toString(i), records.get(i).toString());
      }
    }
  });
  String allowConcurrencyKey = "dataset." + OUTPUT_DATASET + "." + PartitionedFileSetArguments.DYNAMIC_PARTITIONER_ALLOW_CONCURRENCY;
  // run the partition writer m/r with this output partition time
  ImmutableMap<String, String> arguments = ImmutableMap.of(OUTPUT_PARTITION_KEY, Long.toString(now), allowConcurrencyKey, Boolean.toString(allowConcurrentWriters));
  long startTime = System.currentTimeMillis();
  boolean status = runProgram(app, AppWithMapReduceUsingAvroDynamicPartitioner.DynamicPartitioningMapReduce.class, new BasicArguments(arguments));
  Assert.assertEquals(expectedStatus, status);
  if (!expectedStatus) {
    // if we expect the program to fail, no need to check the output data for expected results
    return;
  }
  // Verify notifications
  List<Notification> notifications = getDataNotifications(startTime);
  Assert.assertEquals(1, notifications.size());
  Assert.assertEquals(NamespaceId.DEFAULT.dataset(OUTPUT_DATASET), DatasetId.fromString(notifications.get(0).getProperties().get("datasetId")));
  // this should have created a partition in the pfs
  final PartitionedFileSet pfs = datasetCache.getDataset(OUTPUT_DATASET);
  final Location pfsBaseLocation = pfs.getEmbeddedFileSet().getBaseLocation();
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) pfs).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws IOException {
      Map<PartitionKey, PartitionDetail> partitions = new HashMap<>();
      for (PartitionDetail partition : pfs.getPartitions(null)) {
        partitions.put(partition.getPartitionKey(), partition);
        // check that the mapreduce wrote the output partition metadata to all the output partitions
        Assert.assertEquals(AppWithMapReduceUsingAvroDynamicPartitioner.DynamicPartitioningMapReduce.METADATA, partition.getMetadata().asMap());
      }
      Assert.assertEquals(3, partitions.size());
      Assert.assertEquals(keyToRecordsMap.keySet(), partitions.keySet());
      // Check relative paths of the partitions. Also check that their location = pfs baseLocation + relativePath
      for (Map.Entry<PartitionKey, PartitionDetail> partitionKeyEntry : partitions.entrySet()) {
        PartitionDetail partitionDetail = partitionKeyEntry.getValue();
        String relativePath = partitionDetail.getRelativePath();
        int zip = (int) partitionKeyEntry.getKey().getField("zip");
        Assert.assertEquals(Long.toString(now) + Path.SEPARATOR + zip, relativePath);
        Assert.assertEquals(pfsBaseLocation.append(relativePath), partitionDetail.getLocation());
      }
      for (Map.Entry<PartitionKey, Collection<GenericRecord>> keyToRecordsEntry : keyToRecordsMap.asMap().entrySet()) {
        Set<GenericRecord> genericRecords = new HashSet<>(keyToRecordsEntry.getValue());
        Assert.assertEquals(genericRecords, readOutput(partitions.get(keyToRecordsEntry.getKey()).getLocation()));
      }
    }
  });
}
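
The MapReduce driven by this test derives one output partition per record through CDAP's DynamicPartitioner API. A minimal sketch of such a partitioner, assuming a partitioning with a long "time" field and an int "zip" field to match the relative paths checked above; the class name, key/value types, and runtime-argument name are illustrative and not taken from AppWithMapReduceUsingAvroDynamicPartitioner:

// Hypothetical sketch of a dynamic partitioner; the app's real partitioner may differ.
public static final class TimeAndZipPartitioner extends DynamicPartitioner<AvroKey<GenericRecord>, NullWritable> {
  private Long time;

  @Override
  public void initialize(MapReduceTaskContext<AvroKey<GenericRecord>, NullWritable> context) {
    // the argument name is illustrative; the test passes the launch time under OUTPUT_PARTITION_KEY
    this.time = Long.valueOf(context.getRuntimeArguments().get("output.partition.key"));
  }

  @Override
  public PartitionKey getPartitionKey(AvroKey<GenericRecord> key, NullWritable value) {
    return PartitionKey.builder()
      .addLongField("time", time)
      .addIntField("zip", (int) key.datum().get("zip"))
      .build();
  }
}
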
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
In class PartitionRollbackTestRun, the method testPFSRollback:
/*
 * This tests all the following cases:
 *
 * 1. addPartition(location) fails because partition already exists
 * 2. addPartition(location) fails because Hive partition already exists
 * 3. addPartition(location) succeeds but transaction fails
 * 4. getPartitionOutput() fails because partition already exists
 * 5. partitionOutput.addPartition() fails because Hive partition already exists
 * 6. partitionOutput.addPartition() succeeds but transaction fails
 * 7. mapreduce writing partition fails because location already exists
 * 8. mapreduce writing partition fails because partition already exists
 * 9. mapreduce writing partition fails because Hive partition already exists
 * 10. mapreduce writing dynamic partition fails because location already exists
 * 11. mapreduce writing dynamic partition fails because partition already exists
 * 12. mapreduce writing dynamic partition fails because Hive partition already exists
 * 13. multi-output mapreduce writing partition fails because location already exists
 * 13a. first output fails, other output must rollback 0 and 5
 * 13b. second output fails, first output must rollback 0 and 5
 * 14. multi-output mapreduce writing partition fails because partition already exists
 * 14a. first output fails, other output must rollback partition 5
 * 14b. second output fails, first output must rollback partition 5
 * 15. multi-output mapreduce writing partition fails because Hive partition already exists
 * 15a. first output fails, other output must rollback partitions 0 and 5
 * 15b. second output fails, first output must rollback partitions 0 and 5
 *
 * For all these cases, we validate that existing files and partitions are preserved, and newly
 * added files and partitions are rolled back.
 */
@Test
public void testPFSRollback() throws Exception {
  ApplicationManager appManager = deployApplication(AppWritingToPartitioned.class);
  MapReduceManager mrManager = appManager.getMapReduceManager(MAPREDUCE);
  int numRuns = 0;
  Validator pfsValidator = new Validator(PFS);
  Validator otherValidator = new Validator(OTHER);
  final UnitTestManager.UnitTestDatasetManager<PartitionedFileSet> pfsManager = pfsValidator.getPfsManager();
  final PartitionedFileSet pfs = pfsManager.get();
  final PartitionedFileSet other = otherValidator.getPfsManager().get();
  final String path3 = pfsValidator.getRelativePath3();
  // 1. addPartition(location) fails because partition already exists
  try {
    pfsManager.execute(new Runnable() {
      @Override
      public void run() {
        pfs.addPartition(KEY_1, path3);
      }
    });
    Assert.fail("Expected tx to fail because partition for number=1 already exists");
  } catch (TransactionFailureException e) {
    // expected
  }
  pfsValidator.validate();
  // 2. addPartition(location) fails because Hive partition already exists
  try {
    pfsManager.execute(new Runnable() {
      @Override
      public void run() {
        pfs.addPartition(KEY_4, path3);
      }
    });
Assert.fail("Expected tx to fail because hive partition for number=1 already exists");
  } catch (TransactionFailureException e) {
    // expected
  }
  pfsValidator.validate();
  // 3. addPartition(location) succeeds but transaction fails
  try {
    pfsManager.execute(new Runnable() {
      @Override
      public void run() {
        pfs.addPartition(KEY_3, path3);
        throw new RuntimeException("fail the tx");
      }
    });
    Assert.fail("Expected tx to fail because it threw a runtime exception");
  } catch (TransactionFailureException e) {
    // expected
  }
  pfsValidator.validate();
  // 4. getPartitionOutput() fails because the partition already exists
  try {
    pfs.getPartitionOutput(KEY_1);
    Assert.fail("Expected getPartitionOutput to fail, because the partition already exists.");
  } catch (DataSetException expected) {
  }
  pfsValidator.validate();
  // 5. partitionOutput.addPartition() fails because Hive partition already exists
  final PartitionOutput output4x = pfs.getPartitionOutput(KEY_4);
  final Location location4x = output4x.getLocation();
  try (Writer writer = new OutputStreamWriter(location4x.append("file").getOutputStream())) {
    writer.write("4x,4x\n");
  }
  try {
    pfsManager.execute(new Runnable() {
      @Override
      public void run() {
        output4x.addPartition();
      }
    });
    Assert.fail("Expected tx to fail because hive partition for number=4 already exists");
  } catch (TransactionFailureException e) {
    // expected
  }
  pfsValidator.validate();
  Assert.assertFalse(location4x.exists());
  // 6. partitionOutput.addPartition() succeeds but transaction fails
  final PartitionOutput output5x = pfs.getPartitionOutput(KEY_5);
  final Location location5x = output5x.getLocation();
  try (Writer writer = new OutputStreamWriter(location5x.append("file").getOutputStream())) {
    writer.write("5x,5x\n");
  }
  try {
    pfsManager.execute(new Runnable() {
      @Override
      public void run() {
        output5x.addPartition();
        throw new RuntimeException("fail the tx");
      }
    });
    Assert.fail("Expected tx to fail because it threw a runtime exception");
  } catch (TransactionFailureException e) {
    // expected
  }
  pfsValidator.validate();
  Assert.assertFalse(location5x.exists());
  // 7. mapreduce writing partition fails because location already exists
  mrManager.start(ImmutableMap.of(PFS_OUT, "1", "input.text", "1x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  // 8. mapreduce writing partition fails because partition already exists
  mrManager.start(ImmutableMap.of(PFS_OUT, "2", "input.text", "2x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_2).getLocation().exists());
  // 9. mapreduce writing partition fails because Hive partition already exists
  mrManager.start(ImmutableMap.of(PFS_OUT, "4", "input.text", "4x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_4).getLocation().exists());
  // 10. mapreduce writing dynamic partition fails because location already exists
  mrManager.start(ImmutableMap.of("input.text", "3x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
  // 11. mapreduce writing dynamic partition fails because partition already exists
  mrManager.start(ImmutableMap.of("input.text", "2x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_2).getLocation().exists());
  Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
  // 12. mapreduce writing dynamic partition fails because Hive partition already exists
  mrManager.start(ImmutableMap.of("input.text", "0x 4x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_0).getLocation().exists());
  Assert.assertFalse(pfs.getPartitionOutput(KEY_4).getLocation().exists());
  Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
  // 13. multi-output mapreduce writing partition fails because location already exists
  // 13a. first output fails, other output must rollback 0 and 5
  mrManager.start(ImmutableMap.of("output.datasets", BOTH, PFS_OUT, "1", "input.text", "0x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  otherValidator.validate();
  Assert.assertFalse(other.getPartitionOutput(KEY_0).getLocation().exists());
  Assert.assertFalse(other.getPartitionOutput(KEY_5).getLocation().exists());
  // 13b. second output fails, first output must rollback 0 and 5
  mrManager.start(ImmutableMap.of("output.datasets", BOTH, OTHER_OUT, "1", "input.text", "0x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  otherValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_0).getLocation().exists());
  Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
  // 14. multi-output mapreduce writing partition fails because partition already exists
  // 14a. first output fails, other output must rollback partition 5
  mrManager.start(ImmutableMap.of("output.datasets", BOTH, PFS_OUT, "2", OTHER_OUT, "5", "input.text", "2x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  otherValidator.validate();
  Assert.assertFalse(other.getPartitionOutput(KEY_5).getLocation().exists());
  // 14b. second output fails, first output must rollback partition 5
  mrManager.start(ImmutableMap.of("output.datasets", BOTH, PFS_OUT, "5", OTHER_OUT, "2", "input.text", "2x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  otherValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
  // 15. multi-output mapreduce writing partition fails because Hive partition already exists
  // 15a. first output fails, other output must rollback partitions 0 and 5
  mrManager.start(ImmutableMap.of("output.datasets", BOTH, PFS_OUT, "4", "input.text", "0x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  otherValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_4).getLocation().exists());
  Assert.assertFalse(other.getPartitionOutput(KEY_0).getLocation().exists());
  Assert.assertFalse(other.getPartitionOutput(KEY_5).getLocation().exists());
  // 15b. second output fails, first output must rollback partitions 0 and 5
  mrManager.start(ImmutableMap.of("output.datasets", BOTH, OTHER_OUT, "4", "input.text", "0x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  otherValidator.validate();
  Assert.assertFalse(other.getPartitionOutput(KEY_4).getLocation().exists());
  Assert.assertFalse(pfs.getPartitionOutput(KEY_0).getLocation().exists());
  Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
}
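
The "Hive partition already exists" cases above can only occur when the partitioned file set is registered for Explore. A minimal sketch of how such a dataset might be declared in the application's configure() method; the base path, format, and schema are assumptions, not taken from AppWritingToPartitioned:

// Hypothetical sketch; AppWritingToPartitioned may configure its datasets differently.
createDataset(PFS, PartitionedFileSet.class, PartitionedFileSetProperties.builder()
  // a single int partition field, matching the "number=..." failure messages in the test
  .setPartitioning(Partitioning.builder().addIntField("number").build())
  .setBasePath("pfs")
  .setEnableExploreOnCreate(true)
  .setExploreFormat("csv")
  .setExploreSchema("key STRING, value STRING")
  .build());

With Explore enabled, adding or dropping a partition also updates the corresponding Hive table, which is why a pre-existing Hive partition can fail the dataset operation and exercise the rollback paths validated above.
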
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
In class SparkFileSetTestRun, the method testSparkWithPartitionedFileSet:
private void testSparkWithPartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  DataSetManager<PartitionedFileSet> pfsManager = getDataset("pfs");
  PartitionedFileSet pfs = pfsManager.get();
  PartitionOutput partitionOutput = pfs.getPartitionOutput(PartitionKey.builder().addStringField("x", "nn").build());
  Location location = partitionOutput.getLocation();
  prepareFileInput(location);
  partitionOutput.addPartition();
  pfsManager.flush();
  Map<String, String> inputArgs = new HashMap<>();
  PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, PartitionFilter.builder().addRangeCondition("x", "na", "nx").build());
  Map<String, String> outputArgs = new HashMap<>();
  PartitionKey outputKey = PartitionKey.builder().addStringField("x", "xx").build();
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", outputArgs));
  args.put("input", "pfs");
  args.put("output", "pfs");
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
  pfsManager.flush();
  PartitionDetail partition = pfs.getPartition(outputKey);
  Assert.assertNotNull(partition);
  validateFileOutput(partition.getLocation());
  // Cleanup after test completed
  pfs.dropPartition(partitionOutput.getPartitionKey());
  pfs.dropPartition(partition.getPartitionKey());
  pfsManager.flush();
}
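
The input filter above selects partitions by a lexicographic range on the string field x. A short sketch of using the same filter to inspect the matching partitions directly (illustrative only, not part of the test):

// Sketch: list the partitions matched by the same range filter used as the Spark input.
// "nn" falls inside the range "na" to "nx", so the prepared input partition is returned; "xx" is not.
PartitionFilter filter = PartitionFilter.builder().addRangeCondition("x", "na", "nx").build();
for (PartitionDetail detail : pfs.getPartitions(filter)) {
  System.out.println(detail.getPartitionKey() + " -> " + detail.getRelativePath());
}
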
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
In class DynamicPartitioningTestRun, the method validateFiles:
private void validateFiles(String dataset, Location expectedExisting) throws Exception {
  DataSetManager<PartitionedFileSet> pfs = getDataset(testSpace.dataset(dataset));
  Location base = pfs.get().getEmbeddedFileSet().getBaseLocation();
  validateFiles(base, expectedExisting);
}
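
The two-argument overload validateFiles(Location, Location) is not part of this excerpt. A plausible sketch, under the assumption that it recursively walks the file set's base location and asserts that only files under the expected partition remain; the real method in DynamicPartitioningTestRun may differ:

// Hypothetical sketch of the overload not shown above.
private void validateFiles(Location dir, Location expectedExisting) throws IOException {
  for (Location child : dir.list()) {
    if (child.isDirectory()) {
      validateFiles(child, expectedExisting);
    } else {
      // every surviving file must live under the partition expected to remain after rollback
      Assert.assertTrue("Unexpected file " + child,
        expectedExisting != null && child.toURI().getPath().startsWith(expectedExisting.toURI().getPath()));
    }
  }
}
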