use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class PartitionedFileSetTest method testRollbackOnTransactionAbort.
@Test
public void testRollbackOnTransactionAbort() throws Exception {
  PartitionedFileSet pfs = dsFrameworkUtil.getInstance(pfsInstance);
  TransactionContext txContext = new TransactionContext(txClient, (TransactionAware) pfs);
  txContext.start();
  Location outputLocation = createPartition(pfs, PARTITION_KEY, "file");
  Assert.assertNotNull(pfs.getPartition(PARTITION_KEY));
  Assert.assertTrue(pfs.getPartition(PARTITION_KEY).getLocation().exists());
  txContext.abort();
  // because the previous transaction aborted, the partition as well as the file will not exist
  txContext.start();
  Assert.assertNull(pfs.getPartition(PARTITION_KEY));
  Assert.assertFalse(outputLocation.exists());
  txContext.finish();
}
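For contrast with the abort path above, a minimal sketch of the commit path follows. It is not part of PartitionedFileSetTest: the class and method names are illustrative, and it assumes a pfs and txClient obtained the same way as in the test. After finish() commits, both the partition metadata and its files persist.

import org.apache.tephra.TransactionAware;
import org.apache.tephra.TransactionContext;
import org.apache.tephra.TransactionSystemClient;

import io.cdap.cdap.api.dataset.lib.PartitionKey;
import io.cdap.cdap.api.dataset.lib.PartitionOutput;
import io.cdap.cdap.api.dataset.lib.PartitionedFileSet;

// Hypothetical helper, not part of PartitionedFileSetTest.
public class CommitPartitionSketch {

  void addAndCommit(PartitionedFileSet pfs, TransactionSystemClient txClient,
                    PartitionKey key) throws Exception {
    TransactionContext txContext = new TransactionContext(txClient, (TransactionAware) pfs);
    txContext.start();
    try {
      // register the partition's metadata; the files are assumed to have been written
      // under the partition output's location beforehand
      PartitionOutput output = pfs.getPartitionOutput(key);
      output.addPartition();
      // finish() commits, so the partition and its files survive, unlike after abort()
      txContext.finish();
    } catch (Exception e) {
      // on failure, roll back so neither the metadata nor the files remain
      txContext.abort();
      throw e;
    }
  }
}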
use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class DynamicPartitioningTestRun method testDynamicPartitioningMRWithFailure.
private void testDynamicPartitioningMRWithFailure(ApplicationManager appManager, String dsWithExistingPartition,
                                                  String... outputs) throws Exception {
  // set up the output datasets
  String outputArg = "";
  for (String dataset : outputs) {
    outputArg += dataset + " ";
    try {
      deleteDatasetInstance(testSpace.dataset(dataset));
    } catch (InstanceNotFoundException e) {
      // may be expected. I wish the test framework had truncate()
    }
    addDatasetInstance(PartitionedFileSet.class.getName(), testSpace.dataset(dataset),
                       PartitionedFileSetProperties.builder()
                         .setPartitioning(PARTITIONING)
                         .setEnableExploreOnCreate(true)
                         .setOutputFormat(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class)
                         .setOutputProperty(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.SEPERATOR, ",")
                         .setExploreFormat("csv")
                         .setExploreSchema("key string, value string")
                         .build());
  }
  outputArg = outputArg.trim();
  // create partition (x="1") in one of the outputs
  DataSetManager<PartitionedFileSet> pfs = getDataset(testSpace.dataset(dsWithExistingPartition));
  Location loc = pfs.get().getEmbeddedFileSet().getLocation("some/path");
  OutputStream os = loc.append("part1").getOutputStream();
  try (Writer writer = new OutputStreamWriter(os)) {
    writer.write("1,x\n");
  }
  pfs.get().addPartition(PartitionKey.builder().addStringField("x", "1").build(), "some/path");
  pfs.flush();
  validatePartitions(dsWithExistingPartition, true);
  Map<String, String> arguments = ImmutableMap.of("outputs", outputArg);
  MapReduceManager mrManager = appManager.getMapReduceManager("DynamicPartitioningMR");
  int numRuns = mrManager.getHistory(ProgramRunStatus.FAILED).size();
  mrManager.start(arguments);
  Tasks.waitFor(numRuns + 1, () -> mrManager.getHistory(ProgramRunStatus.FAILED).size(), 300, TimeUnit.SECONDS);
  for (String dataset : outputs) {
    validatePartitions(dataset, dataset.equals(dsWithExistingPartition));
    validateFiles(dataset, dataset.equals(dsWithExistingPartition) ? loc : null);
  }
}
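The validatePartitions and validateFiles helpers are not shown here. As a rough sketch of what the partition check could look like (a hypothetical helper, not the test's actual code), one can count the partitions matching an ALWAYS_MATCH filter:

import java.util.Set;

import org.junit.Assert;

import io.cdap.cdap.api.dataset.lib.PartitionDetail;
import io.cdap.cdap.api.dataset.lib.PartitionFilter;
import io.cdap.cdap.api.dataset.lib.PartitionedFileSet;

// Hypothetical helper, not part of DynamicPartitioningTestRun.
final class PartitionAssertions {

  // assert that the dataset has exactly one partition if one is expected, otherwise none
  static void assertPartitionCount(PartitionedFileSet pfs, boolean expectExisting) {
    Set<PartitionDetail> partitions = pfs.getPartitions(PartitionFilter.ALWAYS_MATCH);
    Assert.assertEquals(expectExisting ? 1 : 0, partitions.size());
  }
}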
use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class SparkFileSetTestRun method testSparkWithPartitionedFileSet.
private void testSparkWithPartitionedFileSet(ApplicationManager applicationManager, String sparkProgram)
  throws Exception {
  DataSetManager<PartitionedFileSet> pfsManager = getDataset("pfs");
  PartitionedFileSet pfs = pfsManager.get();
  PartitionOutput partitionOutput = pfs.getPartitionOutput(PartitionKey.builder().addStringField("x", "nn").build());
  Location location = partitionOutput.getLocation();
  prepareFileInput(location);
  partitionOutput.addPartition();
  pfsManager.flush();
  Map<String, String> inputArgs = new HashMap<>();
  PartitionedFileSetArguments.setInputPartitionFilter(
    inputArgs, PartitionFilter.builder().addRangeCondition("x", "na", "nx").build());
  Map<String, String> outputArgs = new HashMap<>();
  PartitionKey outputKey = PartitionKey.builder().addStringField("x", "xx").build();
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", outputArgs));
  args.put("input", "pfs");
  args.put("output", "pfs");
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
  pfsManager.flush();
  PartitionDetail partition = pfs.getPartition(outputKey);
  Assert.assertNotNull(partition);
  validateFileOutput(partition.getLocation());
  // Cleanup after test completed
  pfs.dropPartition(partitionOutput.getPartitionKey());
  pfs.dropPartition(partition.getPartitionKey());
  pfsManager.flush();
}
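The partitions created for this test are dropped only after the assertions pass. A small, hypothetical variation (not part of SparkFileSetTestRun) that guarantees cleanup even when validation fails could wrap the checks in try/finally:

import io.cdap.cdap.api.dataset.lib.PartitionKey;
import io.cdap.cdap.api.dataset.lib.PartitionedFileSet;

// Hypothetical helper, not part of SparkFileSetTestRun.
final class PfsCleanup {

  static void validateThenDrop(PartitionedFileSet pfs, PartitionKey inputKey, PartitionKey outputKey,
                               Runnable validation) {
    try {
      validation.run();
    } finally {
      // drop both the input partition created before the run and the output partition written by the Spark program
      pfs.dropPartition(inputKey);
      pfs.dropPartition(outputKey);
    }
  }
}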
use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class PartitionConsumingTestRun method testWordCountOnFileSet.
private void testWordCountOnFileSet(Function<ApplicationManager, ProgramManager> runProgram,
                                    boolean produceOutputPartitionEachRun) throws Exception {
  ApplicationManager applicationManager = deployApplication(AppWithPartitionConsumers.class);
  ServiceManager serviceManager = applicationManager.getServiceManager("DatasetService").start();
  serviceManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
  URL serviceURL = serviceManager.getServiceURL();
  // write a file to the file set using the service and run the WordCount MapReduce job on that one partition
  createPartition(serviceURL, LINE1, "1");
  ProgramManager programManager = runProgram.apply(applicationManager);
  programManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
  Assert.assertEquals(new Long(2), getCount(serviceURL, "a"));
  Assert.assertEquals(new Long(1), getCount(serviceURL, "b"));
  Assert.assertEquals(new Long(0), getCount(serviceURL, "c"));
  // create two additional partitions
  createPartition(serviceURL, LINE2, "2");
  createPartition(serviceURL, LINE3, "3");
  // running the program now processes these two new partitions (LINE2 and LINE3) and updates the counts
  // dataset accordingly
  programManager = runProgram.apply(applicationManager);
  programManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
  Assert.assertEquals(new Long(3), getCount(serviceURL, "a"));
  Assert.assertEquals(new Long(3), getCount(serviceURL, "b"));
  Assert.assertEquals(new Long(3), getCount(serviceURL, "c"));
  // running the program without adding new partitions does not affect the counts dataset
  programManager = runProgram.apply(applicationManager);
  programManager.waitForRuns(ProgramRunStatus.COMPLETED, 3, 5, TimeUnit.MINUTES);
  Assert.assertEquals(new Long(3), getCount(serviceURL, "a"));
  Assert.assertEquals(new Long(3), getCount(serviceURL, "b"));
  Assert.assertEquals(new Long(3), getCount(serviceURL, "c"));
  DataSetManager<PartitionedFileSet> outputLines = getDataset("outputLines");
  Set<PartitionDetail> partitions = outputLines.get().getPartitions(PartitionFilter.ALWAYS_MATCH);
  // each of the three MapReduce runs produces an output partition (even if there's no input data);
  // however, a Worker run doesn't produce a new output partition if there's no new input partition
  Assert.assertEquals(produceOutputPartitionEachRun ? 3 : 2, partitions.size());
  // we only store the counts to the "outputLines" dataset
  List<String> expectedCounts = Lists.newArrayList("1", "1", "2", "2", "3");
  List<String> outputRecords = getDataFromExplore("outputLines");
  Collections.sort(outputRecords);
  Assert.assertEquals(expectedCounts, outputRecords);
}
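A hedged sketch (not part of PartitionConsumingTestRun) of how the contents of each output partition could be inspected directly, assuming the same "outputLines" dataset as above:

import java.util.Set;

import org.apache.twill.filesystem.Location;

import io.cdap.cdap.api.dataset.lib.PartitionDetail;
import io.cdap.cdap.api.dataset.lib.PartitionFilter;
import io.cdap.cdap.api.dataset.lib.PartitionedFileSet;

// Hypothetical helper, not part of PartitionConsumingTestRun.
final class OutputPartitionInspector {

  static void printPartitionFiles(PartitionedFileSet outputLines) throws Exception {
    Set<PartitionDetail> partitions = outputLines.getPartitions(PartitionFilter.ALWAYS_MATCH);
    for (PartitionDetail partition : partitions) {
      // each program run writes its output files under the partition's location
      for (Location file : partition.getLocation().list()) {
        System.out.println(partition.getPartitionKey() + " -> " + file.getName());
      }
    }
  }
}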
use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class PartitionConcatenateTest method testConcatenate.
/**
 * 1. Write 100 small files (orc format) to a Partition of a PartitionedFileSet.
 * 2. Execute a partition concatenate operation.
 * 3. As compared to before the concatenate operation, validate that the number of files is reduced, while
 *    the contents of the files remain the same.
 */
@Test
public void testConcatenate() throws Exception {
  String orcPFS = "orcPFS";
  addDatasetInstance(PartitionedFileSet.class.getName(), orcPFS,
                     PartitionedFileSetProperties.builder()
                       .setPartitioning(Partitioning.builder().addLongField("time").build())
                       .setOutputFormat(OrcNewOutputFormat.class)
                       .setEnableExploreOnCreate(true)
                       .setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde")
                       .setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")
                       .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")
                       .setExploreSchema("record STRING")
                       .build());
  // 1. create 100 small files in the input FileSet
  DataSetManager<PartitionedFileSet> cleanRecordsManager = getDataset(orcPFS);
  PartitionedFileSet cleanRecords = cleanRecordsManager.get();
  PartitionKey outputPartition = PartitionKey.builder().addLongField("time", 5000).build();
  PartitionOutput partitionOutput = cleanRecords.getPartitionOutput(outputPartition);
  Location partitionLocation = partitionOutput.getLocation();
  int numInputFiles = 100;
  List<String> writtenData = writeSmallOrcFiles(partitionLocation, numInputFiles);
  partitionOutput.addPartition();
  Assert.assertEquals(writtenData, getExploreResults(orcPFS));
  // this is a timestamp before concatenating, but after writing the files
  long beforeConcatTime = System.currentTimeMillis();
  List<Location> dataFiles = listFilteredChildren(partitionLocation);
  // each input file will result in one output file, due to the FileInputFormat class and FileOutputFormat class
  // being used
  Assert.assertEquals(numInputFiles, dataFiles.size());
  for (Location dataFile : dataFiles) {
    // all the files should have a lastModified smaller than now
    Assert.assertTrue(dataFile.lastModified() < beforeConcatTime);
  }
  // 2. run the concatenate operation
  cleanRecords.concatenatePartition(outputPartition).get();
  // 3. check that the data files' lastModified timestamp is updated, and that there are fewer of them
  dataFiles = listFilteredChildren(partitionLocation);
  Assert.assertTrue(dataFiles.size() < numInputFiles);
  // should have a lastModified larger than now
  Assert.assertTrue(Iterables.getOnlyElement(dataFiles).lastModified() > beforeConcatTime);
  // even though the files were concatenated, the explore results should be unchanged
  Assert.assertEquals(writtenData, getExploreResults(orcPFS));
}
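listFilteredChildren and writeSmallOrcFiles are helpers of the test class and are not shown above. A plausible sketch of listFilteredChildren, assuming it only skips hidden and bookkeeping files such as _SUCCESS:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.twill.filesystem.Location;

// Assumed implementation, not necessarily identical to the test's helper.
final class LocationUtils {

  static List<Location> listFilteredChildren(Location location) throws IOException {
    List<Location> children = new ArrayList<>();
    for (Location child : location.list()) {
      String name = child.getName();
      // ignore hidden files and markers written by the Hadoop output committer, e.g. "_SUCCESS"
      if (!name.startsWith("_") && !name.startsWith(".")) {
        children.add(child);
      }
    }
    return children;
  }
}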