Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class MapReduceWithMultipleInputsTest, method testSimpleJoin.
@Test
public void testSimpleJoin() throws Exception {
  ApplicationWithPrograms app = deployApp(AppWithMapReduceUsingMultipleInputs.class);
  final FileSet fileSet = datasetCache.getDataset(AppWithMapReduceUsingMultipleInputs.PURCHASES);
  Location inputFile = fileSet.getBaseLocation().append("inputFile");
  inputFile.createNew();
  PrintWriter writer = new PrintWriter(inputFile.getOutputStream());
  // the PURCHASES dataset consists of purchase records in the format: <customerId> <spend>
  writer.println("1 20");
  writer.println("1 25");
  writer.println("1 30");
  writer.println("2 5");
  writer.close();
  // write some of the purchases to the stream
  writeToStream(AppWithMapReduceUsingMultipleInputs.PURCHASES, "2 13");
  writeToStream(AppWithMapReduceUsingMultipleInputs.PURCHASES, "3 60");
  FileSet fileSet2 = datasetCache.getDataset(AppWithMapReduceUsingMultipleInputs.CUSTOMERS);
  inputFile = fileSet2.getBaseLocation().append("inputFile");
  inputFile.createNew();
  // the CUSTOMERS dataset consists of records in the format: <customerId> <customerName>
  writer = new PrintWriter(inputFile.getOutputStream());
  writer.println("1 Bob");
  writer.println("2 Samuel");
  writer.println("3 Joe");
  writer.close();
  // using multiple inputs, this MapReduce joins the two datasets above to compute aggregate results.
  // the output records are expected to be in the form: <customerId> <customerName> <totalSpend>
  runProgram(app, AppWithMapReduceUsingMultipleInputs.ComputeSum.class, new BasicArguments());
  FileSet outputFileSet = datasetCache.getDataset(AppWithMapReduceUsingMultipleInputs.OUTPUT_DATASET);
  // there will only be one part file, due to the small amount of data
  Location outputLocation = outputFileSet.getBaseLocation().append("output").append("part-r-00000");
  List<String> lines = CharStreams.readLines(
    CharStreams.newReaderSupplier(Locations.newInputSupplier(outputLocation), Charsets.UTF_8));
  Assert.assertEquals(ImmutableList.of("1 Bob 75", "2 Samuel 18", "3 Joe 60"), lines);
  // assert that the mapper was initialized and destroyed (this doesn't happen when using Hadoop's MultipleInputs)
  Assert.assertEquals("true", System.getProperty("mapper.initialized"));
  Assert.assertEquals("true", System.getProperty("mapper.destroyed"));
}
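For context, the wiring that makes this join possible happens when the MapReduce registers its inputs. Below is a minimal sketch of how a program like ComputeSum might add the two datasets and the stream as inputs. It assumes a CDAP version where AbstractMapReduce exposes initialize()/getContext(); the mapper class names are hypothetical, not taken from the app above.

import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;

// sketch only: illustrates the multiple-inputs API, not the app's actual source
public class ComputeSumSketch extends AbstractMapReduce {
  @Override
  protected void initialize() throws Exception {
    MapReduceContext context = getContext();
    // each input can name its own mapper class; CDAP runs the matching mapper per input split
    context.addInput(Input.ofDataset("purchases"), PurchasesFileMapper.class);   // FileSet input
    context.addInput(Input.ofStream("purchases"), PurchasesStreamMapper.class);  // stream input
    context.addInput(Input.ofDataset("customers"), CustomersMapper.class);
  }
}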
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class TimePartitionedFileSetDefinition, method getDataset.
@Override
public PartitionedFileSet getDataset(DatasetContext datasetContext, DatasetSpecification spec,
                                     Map<String, String> arguments, ClassLoader classLoader) throws IOException {
  // make any necessary updates to the arguments
  arguments = updateArgumentsIfNeeded(arguments);
  FileSet fileset = filesetDef.getDataset(datasetContext, spec.getSpecification(FILESET_NAME), arguments, classLoader);
  IndexedTable table = indexedTableDef.getDataset(datasetContext, spec.getSpecification(PARTITION_TABLE_NAME), arguments, classLoader);
  return new TimePartitionedFileSetDataset(datasetContext, spec.getName(), fileset, table, spec, arguments,
                                           getExploreProvider());
}
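The arguments map passed through here is how callers select partitions at runtime. A hedged usage sketch with the TimePartitionedFileSetArguments helpers from co.cask.cdap.api.dataset.lib; the lookup mirrors the dsFrameworkUtil style used in the tests below, and "tpfsInstance" is an illustrative id:

Map<String, String> args = new HashMap<>();
long now = System.currentTimeMillis();
// select all partitions whose time falls within the last hour as input
TimePartitionedFileSetArguments.setInputStartTime(args, now - TimeUnit.HOURS.toMillis(1));
TimePartitionedFileSetArguments.setInputEndTime(args, now);
TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(tpfsInstance, args);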
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class PartitionedFileSetDefinition, method getDataset.
@Override
public PartitionedFileSet getDataset(DatasetContext datasetContext, DatasetSpecification spec,
                                     Map<String, String> arguments, ClassLoader classLoader) throws IOException {
  // properties must contain the partitioning
  Partitioning partitioning = PartitionedFileSetProperties.getPartitioning(spec.getProperties());
  // make any necessary updates to the arguments
  arguments = updateArgumentsIfNeeded(arguments, partitioning);
  FileSet fileset = filesetDef.getDataset(datasetContext, spec.getSpecification(FILESET_NAME), arguments, classLoader);
  IndexedTable table = indexedTableDef.getDataset(datasetContext, spec.getSpecification(PARTITION_TABLE_NAME), arguments, classLoader);
  return new PartitionedFileSetDataset(datasetContext, spec.getName(), partitioning, fileset, table, spec, arguments,
                                       getExploreProvider());
}
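For reference, the partitioning that getPartitioning(spec.getProperties()) recovers here must have been set when the dataset was created. A sketch of that creation side, as it would appear in an application's configure() method; the dataset and field names follow CDAP's SportResults example and are illustrative:

// inside AbstractApplication.configure()
createDataset("results", PartitionedFileSet.class, PartitionedFileSetProperties.builder()
  // partition key: league (string) + season (int); stored in the dataset spec's properties
  .setPartitioning(Partitioning.builder().addStringField("league").addIntField("season").build())
  .build());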
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class FileSetTest, method testRollback.
@Test
public void testRollback() throws IOException, TransactionFailureException, DatasetManagementException {
  // test deletion of an empty output directory
  FileSet fileSet1 = createFileset(testFileSetInstance1);
  Location outputLocation = fileSet1.getOutputLocation();
  Assert.assertFalse(outputLocation.exists());
  Assert.assertTrue(outputLocation.mkdirs());
  Assert.assertTrue(outputLocation.exists());
  ((FileSetDataset) fileSet1).onFailure();
  Assert.assertFalse(outputLocation.exists());
}
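The rollback targets whatever getOutputLocation() resolves to, which callers usually steer through runtime arguments. A minimal sketch using the FileSetArguments and dsFrameworkUtil helpers that also appear in the next test; "run42" is an arbitrary example path:

Map<String, String> args = Maps.newHashMap();
FileSetArguments.setOutputPath(args, "run42");
FileSet fs = dsFrameworkUtil.getInstance(testFileSetInstance1, args);
Location out = fs.getOutputLocation(); // resolves to <base path>/run42; onFailure() would delete it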
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class FileSetTest, method testExternalAbsolutePath.
@Test
public void testExternalAbsolutePath() throws IOException, DatasetManagementException {
  // create an external dir and create a file in it
  String absolutePath = tmpFolder.newFolder() + "/absolute/path";
  File absoluteFile = new File(absolutePath);
  absoluteFile.mkdirs();
  File someFile = new File(absoluteFile, "some.file");
  someFile.createNewFile();
  // create an external dataset
  dsFrameworkUtil.createInstance("fileSet", testFileSetInstance5,
    FileSetProperties.builder().setBasePath(absolutePath).setDataExternal(true).build());
  // instantiate the file set with an input and output path
  Map<String, String> fileArgs = Maps.newHashMap();
  FileSetArguments.setInputPath(fileArgs, "some.file");
  FileSetArguments.setOutputPath(fileArgs, "out");
  FileSet fileSet = dsFrameworkUtil.getInstance(testFileSetInstance5, fileArgs);
  Assert.assertNotNull(fileSet);
  // read the existing file
  Location input = fileSet.getInputLocations().iterator().next();
  InputStream in = input.getInputStream();
  in.close();
  // attempt to write an output file
  try {
    fileSet.getOutputLocation();
    Assert.fail("External file set should not allow writing output.");
  } catch (UnsupportedOperationException e) {
    // expected
  }
  // delete the dataset and validate that the files are still there
  dsFrameworkUtil.deleteInstance(testFileSetInstance5);
  Assert.assertTrue(someFile.exists());
}
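For contrast, a hedged sketch of the managed (non-external) case, where dropping the instance is expected to remove the files as well. It reuses the dsFrameworkUtil helpers from above; testFileSetInstance6 and the base path are illustrative:

dsFrameworkUtil.createInstance("fileSet", testFileSetInstance6,
  FileSetProperties.builder().setBasePath("managed/path").build()); // no setDataExternal(true)
FileSet managed = dsFrameworkUtil.getInstance(testFileSetInstance6);
Location base = managed.getBaseLocation();
// ... create files under the base location ...
dsFrameworkUtil.deleteInstance(testFileSetInstance6);
// unlike the external case, the framework owns the data and deletes the base path with the instance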