Search in sources :

Example 11 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class PartitionedFileSetDefinition method getDataset.

/**
 * Creates the {@link PartitionedFileSet} described by the given specification.
 *
 * <p>The partitioning is read from the specification's properties, the runtime arguments are passed
 * through {@code updateArgumentsIfNeeded}, and the embedded file set and partition table are
 * instantiated from their nested specifications before being combined into a
 * {@link PartitionedFileSetDataset}.
 *
 * @param datasetContext context handed on to the embedded dataset definitions and the result
 * @param spec specification holding the properties and the nested dataset specifications
 * @param arguments runtime arguments; the caller's map reference is not reassigned here
 * @param classLoader class loader handed on to the embedded dataset definitions
 * @return the assembled partitioned file set
 * @throws IOException declared by this method's signature
 */
@Override
public PartitionedFileSet getDataset(DatasetContext datasetContext, DatasetSpecification spec, Map<String, String> arguments, ClassLoader classLoader) throws IOException {
    // the partitioning is taken from the dataset properties
    Partitioning partitioning = PartitionedFileSetProperties.getPartitioning(spec.getProperties());

    // the same (possibly updated) arguments are used for both embedded datasets and the result
    Map<String, String> effectiveArguments = updateArgumentsIfNeeded(arguments, partitioning);

    DatasetSpecification filesetSpec = spec.getSpecification(FILESET_NAME);
    FileSet embeddedFileSet = filesetDef.getDataset(datasetContext, filesetSpec, effectiveArguments, classLoader);

    DatasetSpecification partitionTableSpec = spec.getSpecification(PARTITION_TABLE_NAME);
    IndexedTable partitionTable = indexedTableDef.getDataset(datasetContext, partitionTableSpec, effectiveArguments, classLoader);

    return new PartitionedFileSetDataset(
        datasetContext,
        spec.getName(),
        partitioning,
        embeddedFileSet,
        partitionTable,
        spec,
        effectiveArguments,
        getExploreProvider());
}
Also used : Partitioning(io.cdap.cdap.api.dataset.lib.Partitioning) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) IndexedTable(io.cdap.cdap.api.dataset.lib.IndexedTable)

Example 12 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class PartitionedFileSetTest method testDefaultBasePath.

/**
 * Checks the base location of a partitioned file set that is created without an explicit base path.
 *
 * <p>The base location must be named after the dataset, exist as a directory, share its parent with
 * the base location of a plain file set created with default properties, and no longer exist once
 * the dataset instance has been deleted.
 */
@Test
public void testDefaultBasePath() throws Exception {
    DatasetId id = DatasetFrameworkTestUtil.NAMESPACE_ID.dataset("testDefaultPath");
    dsFrameworkUtil.createInstance("partitionedFileSet", id, PartitionedFileSetProperties.builder().setPartitioning(PARTITIONING_1).build());
    PartitionedFileSet pfs = dsFrameworkUtil.getInstance(id);
    Location baseLocation = pfs.getEmbeddedFileSet().getBaseLocation();
    // expected value first, so that a failure message labels the two values correctly
    Assert.assertEquals(id.getDataset(), baseLocation.getName());
    Assert.assertTrue(baseLocation.exists());
    Assert.assertTrue(baseLocation.isDirectory());
    // a plain file set with default properties is created for comparison of the parent directory
    DatasetId fid = DatasetFrameworkTestUtil.NAMESPACE_ID.dataset("testDefaultPathFileSet");
    dsFrameworkUtil.createInstance("fileSet", fid, FileSetProperties.builder().build());
    FileSet fs = dsFrameworkUtil.getInstance(fid);
    Location fsBaseLocation = fs.getBaseLocation();
    Assert.assertEquals(Locations.getParent(baseLocation), Locations.getParent(fsBaseLocation));
    dsFrameworkUtil.deleteInstance(fid);
    dsFrameworkUtil.deleteInstance(id);
    // after deleting the partitioned file set, its base location must be gone
    Assert.assertFalse(baseLocation.exists());
}
Also used : PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) DatasetId(io.cdap.cdap.proto.id.DatasetId) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)

Example 13 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class FileSetTest method testRollbackOfNonDirectoryOutput.

/**
 * Exercises {@code onFailure()} on a file set whose output location is a plain file rather than a
 * directory, and checks that the file is still present afterwards.
 */
@Test
public void testRollbackOfNonDirectoryOutput() throws IOException, TransactionFailureException, DatasetManagementException, UnauthorizedException {
    FileSet fileSet = createFileset(testFileSetInstance1);
    Location output = fileSet.getOutputLocation();

    // nothing has been written yet
    Assert.assertFalse(output.exists());

    // opening and immediately closing the stream makes the output location a non-directory file
    output.getOutputStream().close();
    Assert.assertTrue(output.exists());

    FileSetDataset dataset = (FileSetDataset) fileSet;
    dataset.onFailure();

    // the non-directory output file is expected to survive the failure handling
    Assert.assertTrue(output.exists());
}
Also used : FileSet(io.cdap.cdap.api.dataset.lib.FileSet) FileSetDataset(io.cdap.cdap.data2.dataset2.lib.file.FileSetDataset) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)

Example 14 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class FileSetTest method testInputOutputFormatClassAtRuntime.

/**
 * Checks where a file set's input and output format class names come from: the dataset properties
 * when no runtime arguments are given, and the runtime arguments when they supply a format.
 */
@Test
public void testInputOutputFormatClassAtRuntime() throws Exception {
    String textInputName = TextInputFormat.class.getName();
    String textOutputName = TextOutputFormat.class.getName();
    String combineInputName = CombineTextInputFormat.class.getName();
    String nullOutputName = NullOutputFormat.class.getName();

    // the dataset itself is configured with the text input and output formats
    DatasetId datasetId = OTHER_NAMESPACE.dataset("testRuntimeFormats");
    dsFrameworkUtil.createInstance(
        "fileSet",
        datasetId,
        FileSetProperties.builder()
            .setInputFormat(TextInputFormat.class)
            .setOutputFormat(TextOutputFormat.class)
            .build());

    // no runtime arguments: both class names are those of the dataset properties
    FileSet fs = dsFrameworkUtil.getInstance(datasetId);
    Assert.assertEquals(textInputName, fs.getInputFormatClassName());
    Assert.assertEquals(textOutputName, fs.getOutputFormatClassName());

    // runtime arguments replace only the input format; the output format is unchanged
    fs = dsFrameworkUtil.getInstance(datasetId, ImmutableMap.of(FileSetProperties.INPUT_FORMAT, combineInputName));
    Assert.assertEquals(combineInputName, fs.getInputFormatClassName());
    Assert.assertEquals(textOutputName, fs.getOutputFormatClassName());

    // runtime arguments replace both the input and the output format
    fs = dsFrameworkUtil.getInstance(
        datasetId,
        ImmutableMap.of(
            FileSetProperties.INPUT_FORMAT, combineInputName,
            FileSetProperties.OUTPUT_FORMAT, nullOutputName));
    Assert.assertEquals(combineInputName, fs.getInputFormatClassName());
    Assert.assertEquals(nullOutputName, fs.getOutputFormatClassName());
}
Also used : CombineTextInputFormat(org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat) TextInputFormat(org.apache.hadoop.mapreduce.lib.input.TextInputFormat) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) TextOutputFormat(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat) NullOutputFormat(org.apache.hadoop.mapreduce.lib.output.NullOutputFormat) DatasetId(io.cdap.cdap.proto.id.DatasetId) Test(org.junit.Test)

Example 15 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class FileSetTest method testWriteRead.

/**
 * Writes one byte to each of two file sets and reads it back through their input locations.
 *
 * <p>Before writing, the test checks that the third-level parent of each output location is named
 * after the expected namespace and that the two file sets do not share input or output paths.
 */
@Test
public void testWriteRead() throws IOException, DatasetManagementException, UnauthorizedException {
    FileSet fileSet1 = createFileset(testFileSetInstance1);
    FileSet fileSet2 = createFileset(testFileSetInstance2);
    Location fileSet1Output = fileSet1.getOutputLocation();
    Location fileSet2Output = fileSet2.getOutputLocation();
    // three levels above the output location is the directory compared against the namespace name
    Location fileSet1NsDir = Locations.getParent(Locations.getParent(Locations.getParent(fileSet1Output)));
    Location fileSet2NsDir = Locations.getParent(Locations.getParent(Locations.getParent(fileSet2Output)));
    Assert.assertNotNull(fileSet1NsDir);
    Assert.assertNotNull(fileSet2NsDir);
    // expected value first, consistent with the read assertions at the end of this test
    Assert.assertEquals(DatasetFrameworkTestUtil.NAMESPACE_ID.getNamespace(), fileSet1NsDir.getName());
    Assert.assertEquals(OTHER_NAMESPACE.getNamespace(), fileSet2NsDir.getName());
    // the two file sets must not resolve to the same input or output path
    Assert.assertNotEquals(fileSet1.getInputLocations().get(0).toURI().getPath(), fileSet2.getInputLocations().get(0).toURI().getPath());
    Assert.assertNotEquals(fileSet1Output.toURI().getPath(), fileSet2Output.toURI().getPath());
    // write a distinct byte to each file set ...
    try (OutputStream out = fileSet1.getOutputLocation().getOutputStream()) {
        out.write(42);
    }
    try (OutputStream out = fileSet2.getOutputLocation().getOutputStream()) {
        out.write(54);
    }
    // ... and verify each file set reads back its own byte
    try (InputStream in = fileSet1.getInputLocations().get(0).getInputStream()) {
        Assert.assertEquals(42, in.read());
    }
    try (InputStream in = fileSet2.getInputLocations().get(0).getInputStream()) {
        Assert.assertEquals(54, in.read());
    }
}
Also used : FileSet(io.cdap.cdap.api.dataset.lib.FileSet) InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)

Aggregations

FileSet (io.cdap.cdap.api.dataset.lib.FileSet)90 Location (org.apache.twill.filesystem.Location)56 Test (org.junit.Test)44 PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet)26 HashMap (java.util.HashMap)26 KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable)24 DatasetId (io.cdap.cdap.proto.id.DatasetId)22 TimePartitionedFileSet (io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet)16 ApplicationManager (io.cdap.cdap.test.ApplicationManager)16 Table (io.cdap.cdap.api.dataset.table.Table)14 WorkflowManager (io.cdap.cdap.test.WorkflowManager)14 ColumnDesc (io.cdap.cdap.proto.ColumnDesc)12 QueryResult (io.cdap.cdap.proto.QueryResult)12 SparkManager (io.cdap.cdap.test.SparkManager)12 File (java.io.File)10 StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)8 ApplicationId (io.cdap.cdap.proto.id.ApplicationId)8 IOException (java.io.IOException)8 PrintStream (java.io.PrintStream)8 PrintWriter (java.io.PrintWriter)8