Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.
The class PartitionedFileSetTest, method testAddRemoveGetPartitions.
@Test
@Category(SlowTests.class)
public void testAddRemoveGetPartitions() throws Exception {
  final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
  final PartitionKey[][][] keys = new PartitionKey[4][4][4];
  final String[][][] paths = new String[4][4][4];
  final Set<BasicPartition> allPartitionDetails = Sets.newHashSet();
  // add a bunch of partitions
  for (int s = 0; s < 4; s++) {
    for (int i = 0; i < 4; i++) {
      for (int l = 0; l < 4; l++) {
        final PartitionKey key = PartitionKey.builder()
          .addField("s", String.format("%c-%d", 'a' + s, s))
          .addField("i", i * 100)
          .addField("l", 15L - 10 * l)
          .build();
        BasicPartition basicPartition = dsFrameworkUtil.newTransactionExecutor((TransactionAware) dataset)
          .execute(new Callable<BasicPartition>() {
            @Override
            public BasicPartition call() throws Exception {
              PartitionOutput p = dataset.getPartitionOutput(key);
              p.addPartition();
              return new BasicPartition((PartitionedFileSetDataset) dataset, p.getRelativePath(), p.getPartitionKey());
            }
          });
        keys[s][i][l] = key;
        paths[s][i][l] = basicPartition.getRelativePath();
        allPartitionDetails.add(basicPartition);
      }
    }
  }
  // validate getPartition with exact partition key
  for (int s = 0; s < 4; s++) {
    for (int i = 0; i < 4; i++) {
      for (int l = 0; l < 4; l++) {
        final PartitionKey key = keys[s][i][l];
        final String path = paths[s][i][l];
        dsFrameworkUtil.newTransactionExecutor((TransactionAware) dataset)
          .execute(new TransactionExecutor.Subroutine() {
            @Override
            public void apply() throws Exception {
              PartitionDetail partitionDetail = dataset.getPartition(key);
              Assert.assertNotNull(partitionDetail);
              Assert.assertEquals(path, partitionDetail.getRelativePath());
            }
          });
        // also test getPartitionPaths() and getPartitions() for the filter matching this partition key
        @SuppressWarnings({ "unchecked", "unused" })
        boolean success = testFilter(dataset, allPartitionDetails,
                                     PartitionFilter.builder()
                                       .addValueCondition("l", key.getField("l"))
                                       .addValueCondition("s", key.getField("s"))
                                       .addValueCondition("i", key.getField("i"))
                                       .build());
      }
    }
  }
  // test whether the query works without a filter
  testFilter(dataset, allPartitionDetails, null);
  // generate a list of partition filters with exhaustive coverage
  List<PartitionFilter> filters = generateFilters();
  // test all kinds of filters
  testAllFilters(dataset, allPartitionDetails, filters);
  // remove a few of the partitions and test again, repeatedly
  PartitionKey[] keysToRemove = { keys[1][2][3], keys[0][1][0], keys[2][3][2], keys[3][1][2] };
  for (final PartitionKey key : keysToRemove) {
    // remove in a transaction
    dsFrameworkUtil.newTransactionExecutor((TransactionAware) dataset)
      .execute(new TransactionExecutor.Procedure<PartitionKey>() {
        @Override
        public void apply(PartitionKey partitionKey) throws Exception {
          dataset.dropPartition(partitionKey);
        }
      }, key);
    // find the dropped partition, remove it from the expected set, then test all filters again
    BasicPartition toRemove = Iterables.tryFind(allPartitionDetails, new com.google.common.base.Predicate<BasicPartition>() {
      @Override
      public boolean apply(BasicPartition partition) {
        return key.equals(partition.getPartitionKey());
      }
    }).get();
    allPartitionDetails.remove(toRemove);
    testAllFilters(dataset, allPartitionDetails, filters);
  }
}
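The test above goes through the internal BasicPartition helper; stripped of the test scaffolding, the PartitionOutput life cycle it exercises is the usual create/read/drop pattern. The following is a minimal sketch only, assuming a PartitionedFileSet handle named pfs that is invoked inside a transaction (for example via a TransactionExecutor, as above); the helper name, file name, and field values are illustrative and imports mirror those used in the tests on this page.

// Minimal sketch (not part of the test): create, read back, and drop one partition.
// Assumes 'pfs' is a transactionally wrapped PartitionedFileSet.
void addReadDropOnePartition(PartitionedFileSet pfs) throws Exception {
  PartitionKey key = PartitionKey.builder()
    .addStringField("s", "a-0")
    .addIntField("i", 100)
    .addLongField("l", 5L)
    .build();

  // reserve an output location for the new partition and write a file into it
  PartitionOutput output = pfs.getPartitionOutput(key);
  try (Writer writer = new OutputStreamWriter(output.getLocation().append("part-0").getOutputStream())) {
    writer.write("some,content\n");
  }
  // register the partition; only now does it become visible to getPartition()/getPartitions()
  output.addPartition();

  // read it back by its exact key
  PartitionDetail detail = pfs.getPartition(key);
  Assert.assertNotNull(detail);
  Assert.assertEquals(output.getRelativePath(), detail.getRelativePath());

  // drop it again; the partition and its files are removed
  pfs.dropPartition(key);
}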
Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.
The class PartitionedFileSetTest, method testPartitionCreationTime.
@Test
public void testPartitionCreationTime() throws Exception {
  final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
  dsFrameworkUtil.newTransactionExecutor((TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      PartitionOutput partitionOutput = dataset.getPartitionOutput(PARTITION_KEY);
      long beforeTime = System.currentTimeMillis();
      partitionOutput.addPartition();
      long afterTime = System.currentTimeMillis();
      PartitionDetail partitionDetail = dataset.getPartition(PARTITION_KEY);
      Assert.assertNotNull(partitionDetail);
      long creationTime = partitionDetail.getMetadata().getCreationTime();
      long lastModificationTime = partitionDetail.getMetadata().lastModificationTime();
      // lastModificationTime should equal creationTime for a partition that has not been appended to
      Assert.assertEquals(creationTime, lastModificationTime);
      Assert.assertTrue(creationTime >= beforeTime && creationTime <= afterTime);
    }
  });
}
Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.
The class PartitionedFileSetTest, method testUpdateMetadata.
@Test
public void testUpdateMetadata() throws Exception {
  final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
  dsFrameworkUtil.newTransactionExecutor((TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      PartitionOutput partitionOutput = dataset.getPartitionOutput(PARTITION_KEY);
      ImmutableMap<String, String> originalEntries = ImmutableMap.of("key1", "value1", "key2", "value2");
      partitionOutput.setMetadata(originalEntries);
      partitionOutput.addPartition();
      ImmutableMap<String, String> updatedMetadata = ImmutableMap.of("key3", "value3");
      dataset.addMetadata(PARTITION_KEY, updatedMetadata);
      PartitionDetail partitionDetail = dataset.getPartition(PARTITION_KEY);
      Assert.assertNotNull(partitionDetail);
      HashMap<String, String> combinedEntries = Maps.newHashMap();
      combinedEntries.putAll(originalEntries);
      combinedEntries.putAll(updatedMetadata);
      Assert.assertEquals(combinedEntries, partitionDetail.getMetadata().asMap());
      // with the setMetadata API, writing an entry for a key that already exists overwrites the previous value
      dataset.setMetadata(PARTITION_KEY, Collections.singletonMap("key3", "value4"));
      partitionDetail = dataset.getPartition(PARTITION_KEY);
      Assert.assertNotNull(partitionDetail);
      Assert.assertEquals(ImmutableMap.of("key1", "value1", "key2", "value2", "key3", "value4"), partitionDetail.getMetadata().asMap());
      // with addMetadata, adding an entry for a key that already exists throws an exception
      try {
        dataset.addMetadata(PARTITION_KEY, "key2", "value3");
        Assert.fail("Expected not to be able to update an existing metadata entry");
      } catch (DataSetException expected) {
        // expected
      }
      // removing multiple metadata entries is possible; if a key doesn't exist, no error is thrown
      dataset.removeMetadata(PARTITION_KEY, ImmutableSet.of("key2", "key3", "key4"));
      // key2 and key3 were removed
      partitionDetail = dataset.getPartition(PARTITION_KEY);
      Assert.assertNotNull(partitionDetail);
      Assert.assertEquals(ImmutableMap.of("key1", "value1"), partitionDetail.getMetadata().asMap());
      // adding metadata for a nonexistent partition throws an exception
      try {
        PartitionKey nonexistentPartitionKey = PartitionKey.builder()
          .addIntField("i", 42)
          .addLongField("l", 17L)
          .addStringField("s", "nonexistent")
          .build();
        dataset.addMetadata(nonexistentPartitionKey, "key2", "value3");
        Assert.fail("Expected not to be able to add metadata for a nonexistent partition");
      } catch (DataSetException expected) {
        // expected
      }
    }
  });
}
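In short, the contract the test verifies is: addMetadata only adds new keys (an existing key, or a nonexistent partition, raises a DataSetException), setMetadata overwrites existing values, and removeMetadata silently ignores missing keys. A condensed sketch of that contract, assuming the same dataset and PARTITION_KEY as above and execution inside a transaction; the metadata key and value names are illustrative.

// Condensed sketch of the metadata operations exercised above (assumes an existing partition
// registered under PARTITION_KEY).
dataset.addMetadata(PARTITION_KEY, "owner", "alice");                          // ok: new key
dataset.setMetadata(PARTITION_KEY, Collections.singletonMap("owner", "bob"));  // ok: overwrites the value
try {
  dataset.addMetadata(PARTITION_KEY, "owner", "carol");                        // fails: key already exists
} catch (DataSetException e) {
  // expected
}
dataset.removeMetadata(PARTITION_KEY, ImmutableSet.of("owner", "no-such-key")); // missing keys are ignored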
Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.
The class PartitionRollbackTestRun, method testPFSRollback.
/*
 * This tests all the following cases:
 *
 * 1. addPartition(location) fails because partition already exists
 * 2. addPartition(location) fails because Hive partition already exists
 * 3. addPartition(location) succeeds but transaction fails
 * 4. getPartitionOutput() fails because partition already exists
 * 5. partitionOutput.addPartition() fails because Hive partition already exists
 * 6. partitionOutput.addPartition() succeeds but transaction fails
 * 7. mapreduce writing partition fails because location already exists
 * 8. mapreduce writing partition fails because partition already exists
 * 9. mapreduce writing partition fails because Hive partition already exists
 * 10. mapreduce writing dynamic partition fails because location already exists
 * 11. mapreduce writing dynamic partition fails because partition already exists
 * 12. mapreduce writing dynamic partition fails because Hive partition already exists
 * 13. multi-output mapreduce writing partition fails because location already exists
 * 13a. first output fails, other output must rollback partitions 0 and 5
 * 13b. second output fails, first output must rollback partitions 0 and 5
 * 14. multi-output mapreduce writing partition fails because partition already exists
 * 14a. first output fails, other output must rollback partition 5
 * 14b. second output fails, first output must rollback partition 5
 * 15. multi-output mapreduce writing partition fails because Hive partition already exists
 * 15a. first output fails, other output must rollback partitions 0 and 5
 * 15b. second output fails, first output must rollback partitions 0 and 5
 *
 * For all these cases, we validate that existing files and partitions are preserved, and newly
 * added files and partitions are rolled back.
 */
@Test
public void testPFSRollback() throws Exception {
  ApplicationManager appManager = deployApplication(AppWritingToPartitioned.class);
  MapReduceManager mrManager = appManager.getMapReduceManager(MAPREDUCE);
  int numRuns = 0;
  Validator pfsValidator = new Validator(PFS);
  Validator otherValidator = new Validator(OTHER);
  final UnitTestManager.UnitTestDatasetManager<PartitionedFileSet> pfsManager = pfsValidator.getPfsManager();
  final PartitionedFileSet pfs = pfsManager.get();
  final PartitionedFileSet other = otherValidator.getPfsManager().get();
  final String path3 = pfsValidator.getRelativePath3();
  // 1. addPartition(location) fails because partition already exists
  try {
    pfsManager.execute(new Runnable() {
      @Override
      public void run() {
        pfs.addPartition(KEY_1, path3);
      }
    });
    Assert.fail("Expected tx to fail because partition for number=1 already exists");
  } catch (TransactionFailureException e) {
    // expected
  }
  pfsValidator.validate();
  // 2. addPartition(location) fails because Hive partition already exists
  try {
    pfsManager.execute(new Runnable() {
      @Override
      public void run() {
        pfs.addPartition(KEY_4, path3);
      }
    });
    Assert.fail("Expected tx to fail because hive partition for number=4 already exists");
  } catch (TransactionFailureException e) {
    // expected
  }
  pfsValidator.validate();
  // 3. addPartition(location) succeeds but transaction fails
  try {
    pfsManager.execute(new Runnable() {
      @Override
      public void run() {
        pfs.addPartition(KEY_3, path3);
        throw new RuntimeException("fail the tx");
      }
    });
    Assert.fail("Expected tx to fail because it threw a runtime exception");
  } catch (TransactionFailureException e) {
    // expected
  }
  pfsValidator.validate();
  // 4. getPartitionOutput() fails because partition already exists
  try {
    pfs.getPartitionOutput(KEY_1);
    Assert.fail("Expected getPartitionOutput to fail, because the partition already exists.");
  } catch (DataSetException expected) {
    // expected
  }
  pfsValidator.validate();
  // 5. partitionOutput.addPartition() fails because Hive partition already exists
  final PartitionOutput output4x = pfs.getPartitionOutput(KEY_4);
  final Location location4x = output4x.getLocation();
  try (Writer writer = new OutputStreamWriter(location4x.append("file").getOutputStream())) {
    writer.write("4x,4x\n");
  }
  try {
    pfsManager.execute(new Runnable() {
      @Override
      public void run() {
        output4x.addPartition();
      }
    });
    Assert.fail("Expected tx to fail because hive partition for number=4 already exists");
  } catch (TransactionFailureException e) {
    // expected
  }
  pfsValidator.validate();
  Assert.assertFalse(location4x.exists());
  // 6. partitionOutput.addPartition() succeeds but transaction fails
  final PartitionOutput output5x = pfs.getPartitionOutput(KEY_5);
  final Location location5x = output5x.getLocation();
  try (Writer writer = new OutputStreamWriter(location5x.append("file").getOutputStream())) {
    writer.write("5x,5x\n");
  }
  try {
    pfsManager.execute(new Runnable() {
      @Override
      public void run() {
        output5x.addPartition();
        throw new RuntimeException("fail the tx");
      }
    });
    Assert.fail("Expected tx to fail because it threw a runtime exception");
  } catch (TransactionFailureException e) {
    // expected
  }
  pfsValidator.validate();
  Assert.assertFalse(location5x.exists());
  // 7. mapreduce writing partition fails because location already exists
  mrManager.start(ImmutableMap.of(PFS_OUT, "1", "input.text", "1x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  // 8. mapreduce writing partition fails because partition already exists
  mrManager.start(ImmutableMap.of(PFS_OUT, "2", "input.text", "2x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_2).getLocation().exists());
  // 9. mapreduce writing partition fails because Hive partition already exists
  mrManager.start(ImmutableMap.of(PFS_OUT, "4", "input.text", "4x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_4).getLocation().exists());
  // 10. mapreduce writing dynamic partition fails because location already exists
  mrManager.start(ImmutableMap.of("input.text", "3x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
  // 11. mapreduce writing dynamic partition fails because partition already exists
  mrManager.start(ImmutableMap.of("input.text", "2x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_2).getLocation().exists());
  Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
  // 12. mapreduce writing dynamic partition fails because Hive partition already exists
  mrManager.start(ImmutableMap.of("input.text", "0x 4x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_0).getLocation().exists());
  Assert.assertFalse(pfs.getPartitionOutput(KEY_4).getLocation().exists());
  Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
  // 13. multi-output mapreduce writing partition fails because location already exists
  // 13a. first output fails, other output must rollback partitions 0 and 5
  mrManager.start(ImmutableMap.of("output.datasets", BOTH, PFS_OUT, "1", "input.text", "0x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  otherValidator.validate();
  Assert.assertFalse(other.getPartitionOutput(KEY_0).getLocation().exists());
  Assert.assertFalse(other.getPartitionOutput(KEY_5).getLocation().exists());
  // 13b. second output fails, first output must rollback partitions 0 and 5
  mrManager.start(ImmutableMap.of("output.datasets", BOTH, OTHER_OUT, "1", "input.text", "0x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  otherValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_0).getLocation().exists());
  Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
  // 14. multi-output mapreduce writing partition fails because partition already exists
  // 14a. first output fails, other output must rollback partition 5
  mrManager.start(ImmutableMap.of("output.datasets", BOTH, PFS_OUT, "2", OTHER_OUT, "5", "input.text", "2x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  otherValidator.validate();
  Assert.assertFalse(other.getPartitionOutput(KEY_5).getLocation().exists());
  // 14b. second output fails, first output must rollback partition 5
  mrManager.start(ImmutableMap.of("output.datasets", BOTH, PFS_OUT, "5", OTHER_OUT, "2", "input.text", "2x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  otherValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
  // 15. multi-output mapreduce writing partition fails because Hive partition already exists
  // 15a. first output fails, other output must rollback partitions 0 and 5
  mrManager.start(ImmutableMap.of("output.datasets", BOTH, PFS_OUT, "4", "input.text", "0x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  otherValidator.validate();
  Assert.assertFalse(pfs.getPartitionOutput(KEY_4).getLocation().exists());
  Assert.assertFalse(other.getPartitionOutput(KEY_0).getLocation().exists());
  Assert.assertFalse(other.getPartitionOutput(KEY_5).getLocation().exists());
  // 15b. second output fails, first output must rollback partitions 0 and 5
  mrManager.start(ImmutableMap.of("output.datasets", BOTH, OTHER_OUT, "4", "input.text", "0x 5x"));
  mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
  pfsValidator.validate();
  otherValidator.validate();
  Assert.assertFalse(other.getPartitionOutput(KEY_4).getLocation().exists());
  Assert.assertFalse(pfs.getPartitionOutput(KEY_0).getLocation().exists());
  Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
}
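Cases 3 and 6 above illustrate the core rollback guarantee: a partition added, and any file written for it, inside a transaction that subsequently fails leaves neither partition metadata nor files behind, while pre-existing partitions are preserved. The following minimal sketch restates that pattern outside the MapReduce cases, assuming the same pfsManager/pfs handles; KEY_NEW is an illustrative, previously unused key, and the final assertions spell out explicitly what the Validator helper checks in the test.

// Minimal sketch (not part of the test): write into a PartitionOutput, fail the transaction,
// then verify that both the partition and its files were rolled back.
final PartitionOutput outputNew = pfs.getPartitionOutput(KEY_NEW);
final Location locationNew = outputNew.getLocation();
try (Writer writer = new OutputStreamWriter(locationNew.append("file").getOutputStream())) {
  writer.write("new,new\n");
}
try {
  pfsManager.execute(new Runnable() {
    @Override
    public void run() {
      outputNew.addPartition();
      throw new RuntimeException("fail the tx");
    }
  });
  Assert.fail("Expected tx to fail because it threw a runtime exception");
} catch (TransactionFailureException e) {
  // expected
}
// rollback removed the files written for the uncommitted partition ...
Assert.assertFalse(locationNew.exists());
// ... and the partition itself was never registered
pfsManager.execute(new Runnable() {
  @Override
  public void run() {
    Assert.assertNull(pfs.getPartition(KEY_NEW));
  }
});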
Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.
The class SparkFileSetTestRun, method addTimePartition.
private void addTimePartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long inputTime)
  throws IOException, TransactionFailureException, InterruptedException {
  TimePartitionedFileSet tpfs = tpfsManager.get();
  PartitionOutput partitionOutput = tpfs.getPartitionOutput(inputTime);
  Location location = partitionOutput.getLocation();
  prepareFileInput(location);
  partitionOutput.addPartition();
  tpfsManager.flush();
}
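For completeness, a sketch of the read side: once a time partition has been added as above, it can be looked up again by the same timestamp. This assumes the TimePartitionedFileSet getPartitionByTime accessor and a transactional read context; verifyTimePartition is a hypothetical helper, not part of the test.

// Hypothetical companion to addTimePartition: look the partition up again by its timestamp.
private void verifyTimePartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long inputTime) {
  TimePartitionedFileSet tpfs = tpfsManager.get();
  PartitionDetail partition = tpfs.getPartitionByTime(inputTime);
  Assert.assertNotNull(partition);
  // partition.getLocation() is the directory that prepareFileInput() populated above
}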