Search in sources :

Example 6 with FileSet

use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class MapReduceWithMultipleInputsTest method testSimpleJoin.

@Test
public void testSimpleJoin() throws Exception {
    ApplicationWithPrograms app = deployApp(AppWithMapReduceUsingMultipleInputs.class);
    final FileSet fileSet = datasetCache.getDataset(AppWithMapReduceUsingMultipleInputs.PURCHASES);
    Location inputFile = fileSet.getBaseLocation().append("inputFile");
    inputFile.createNew();
    // the PURCHASES dataset consists of purchase records in the format: <customerId> <spend>
    // try-with-resources guarantees the writer (and underlying output stream) is closed
    // even if one of the writes throws; note PrintWriter itself swallows IOExceptions.
    try (PrintWriter writer = new PrintWriter(inputFile.getOutputStream())) {
        writer.println("1 20");
        writer.println("1 25");
        writer.println("1 30");
        writer.println("2 5");
    }
    // write some of the purchases to the stream
    writeToStream(AppWithMapReduceUsingMultipleInputs.PURCHASES, "2 13");
    writeToStream(AppWithMapReduceUsingMultipleInputs.PURCHASES, "3 60");
    FileSet fileSet2 = datasetCache.getDataset(AppWithMapReduceUsingMultipleInputs.CUSTOMERS);
    inputFile = fileSet2.getBaseLocation().append("inputFile");
    inputFile.createNew();
    // the CUSTOMERS dataset consists of records in the format: <customerId> <customerName>
    try (PrintWriter writer = new PrintWriter(inputFile.getOutputStream())) {
        writer.println("1 Bob");
        writer.println("2 Samuel");
        writer.println("3 Joe");
    }
    // Using multiple inputs, this MapReduce will join on the two above datasets to get aggregate results.
    // The records are expected to be in the form: <customerId> <customerName> <totalSpend>
    runProgram(app, AppWithMapReduceUsingMultipleInputs.ComputeSum.class, new BasicArguments());
    FileSet outputFileSet = datasetCache.getDataset(AppWithMapReduceUsingMultipleInputs.OUTPUT_DATASET);
    // will only be 1 part file, due to the small amount of data
    Location outputLocation = outputFileSet.getBaseLocation().append("output").append("part-r-00000");
    List<String> lines = CharStreams.readLines(CharStreams.newReaderSupplier(Locations.newInputSupplier(outputLocation), Charsets.UTF_8));
    Assert.assertEquals(ImmutableList.of("1 Bob 75", "2 Samuel 18", "3 Joe 60"), lines);
    // assert that the mapper was initialized and destroyed (this doesn't happen when using hadoop's MultipleOutputs).
    Assert.assertEquals("true", System.getProperty("mapper.initialized"));
    Assert.assertEquals("true", System.getProperty("mapper.destroyed"));
}
Also used : FileSet(co.cask.cdap.api.dataset.lib.FileSet) ApplicationWithPrograms(co.cask.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms) BasicArguments(co.cask.cdap.internal.app.runtime.BasicArguments) Location(org.apache.twill.filesystem.Location) PrintWriter(java.io.PrintWriter) Test(org.junit.Test)

Example 7 with FileSet

use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class TimePartitionedFileSetDefinition method getDataset.

@Override
public PartitionedFileSet getDataset(DatasetContext datasetContext, DatasetSpecification spec, Map<String, String> arguments, ClassLoader classLoader) throws IOException {
    // let the definition adjust the runtime arguments before instantiating the embedded datasets
    Map<String, String> effectiveArgs = updateArgumentsIfNeeded(arguments);
    // the partition-metadata table, built from its own embedded sub-specification
    IndexedTable partitionTable = indexedTableDef.getDataset(datasetContext, spec.getSpecification(PARTITION_TABLE_NAME), effectiveArgs, classLoader);
    // the file set holding the actual partition data
    FileSet files = filesetDef.getDataset(datasetContext, spec.getSpecification(FILESET_NAME), effectiveArgs, classLoader);
    return new TimePartitionedFileSetDataset(datasetContext, spec.getName(), files, partitionTable, spec, effectiveArgs, getExploreProvider());
}
Also used : PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) FileSet(co.cask.cdap.api.dataset.lib.FileSet) IndexedTable(co.cask.cdap.api.dataset.lib.IndexedTable)

Example 8 with FileSet

use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class PartitionedFileSetDefinition method getDataset.

@Override
public PartitionedFileSet getDataset(DatasetContext datasetContext, DatasetSpecification spec, Map<String, String> arguments, ClassLoader classLoader) throws IOException {
    // the partitioning scheme is mandatory and stored in the dataset's properties
    Partitioning partitioning = PartitionedFileSetProperties.getPartitioning(spec.getProperties());
    // let the definition adjust the runtime arguments (may depend on the partitioning)
    Map<String, String> effectiveArgs = updateArgumentsIfNeeded(arguments, partitioning);
    // the partition-metadata table, built from its own embedded sub-specification
    IndexedTable partitionTable = indexedTableDef.getDataset(datasetContext, spec.getSpecification(PARTITION_TABLE_NAME), effectiveArgs, classLoader);
    // the file set holding the actual partition data
    FileSet files = filesetDef.getDataset(datasetContext, spec.getSpecification(FILESET_NAME), effectiveArgs, classLoader);
    return new PartitionedFileSetDataset(datasetContext, spec.getName(), partitioning, files, partitionTable, spec, effectiveArgs, getExploreProvider());
}
Also used : Partitioning(co.cask.cdap.api.dataset.lib.Partitioning) FileSet(co.cask.cdap.api.dataset.lib.FileSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) IndexedTable(co.cask.cdap.api.dataset.lib.IndexedTable)

Example 9 with FileSet

use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class FileSetTest method testRollback.

@Test
public void testRollback() throws IOException, TransactionFailureException, DatasetManagementException {
    // verify that rolling back removes an empty output directory created during the transaction
    FileSet fs = createFileset(testFileSetInstance1);
    Location outDir = fs.getOutputLocation();
    // the output directory must not pre-exist; create it as the "transaction" would
    Assert.assertFalse(outDir.exists());
    Assert.assertTrue(outDir.mkdirs());
    Assert.assertTrue(outDir.exists());
    // simulate a transaction failure; onFailure() is expected to clean up the directory
    ((FileSetDataset) fs).onFailure();
    Assert.assertFalse(outDir.exists());
}
Also used : FileSet(co.cask.cdap.api.dataset.lib.FileSet) FileSetDataset(co.cask.cdap.data2.dataset2.lib.file.FileSetDataset) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)

Example 10 with FileSet

use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class FileSetTest method testExternalAbsolutePath.

@Test
public void testExternalAbsolutePath() throws IOException, DatasetManagementException {
    // create an external dir and create a file in it; assert on the setup calls so the
    // test fails fast (rather than with a confusing later error) if setup doesn't work
    String absolutePath = tmpFolder.newFolder() + "/absolute/path";
    File absoluteFile = new File(absolutePath);
    Assert.assertTrue(absoluteFile.mkdirs());
    File someFile = new File(absoluteFile, "some.file");
    Assert.assertTrue(someFile.createNewFile());
    // create an external dataset
    dsFrameworkUtil.createInstance("fileSet", testFileSetInstance5, FileSetProperties.builder().setBasePath(absolutePath).setDataExternal(true).build());
    // instantiate the file set with an input and output path
    Map<String, String> fileArgs = Maps.newHashMap();
    FileSetArguments.setInputPath(fileArgs, "some.file");
    FileSetArguments.setOutputPath(fileArgs, "out");
    FileSet fileSet = dsFrameworkUtil.getInstance(testFileSetInstance5, fileArgs);
    Assert.assertNotNull(fileSet);
    // read the existing file; try-with-resources ensures the stream is closed even if opening
    // or reading throws (the original leaked the stream on an exception path)
    Location input = fileSet.getInputLocations().iterator().next();
    try (InputStream in = input.getInputStream()) {
        // contents are irrelevant here; we only verify the external file is readable
    }
    // attempt to write an output file; external file sets are read-only
    try {
        fileSet.getOutputLocation();
        Assert.fail("External file set should not allow writing output.");
    } catch (UnsupportedOperationException e) {
    // expected
    }
    // delete the dataset and validate that the files are still there
    dsFrameworkUtil.deleteInstance(testFileSetInstance5);
    Assert.assertTrue(someFile.exists());
}
Also used : FileSet(co.cask.cdap.api.dataset.lib.FileSet) InputStream(java.io.InputStream) File(java.io.File) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)

Aggregations

FileSet (co.cask.cdap.api.dataset.lib.FileSet)40 Location (org.apache.twill.filesystem.Location)28 Test (org.junit.Test)19 PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet)12 HashMap (java.util.HashMap)11 KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable)8 TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet)8 ApplicationManager (co.cask.cdap.test.ApplicationManager)8 DatasetId (co.cask.cdap.proto.id.DatasetId)7 ColumnDesc (co.cask.cdap.proto.ColumnDesc)6 QueryResult (co.cask.cdap.proto.QueryResult)6 SparkManager (co.cask.cdap.test.SparkManager)5 WorkflowManager (co.cask.cdap.test.WorkflowManager)5 Table (co.cask.cdap.api.dataset.table.Table)4 ServiceManager (co.cask.cdap.test.ServiceManager)4 IOException (java.io.IOException)4 Map (java.util.Map)4 StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord)3 PartitionedFileSetProperties (co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties)3 FileSetDataset (co.cask.cdap.data2.dataset2.lib.file.FileSetDataset)3