use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class SparkTestRun method testSparkWithGetDataset.
private void testSparkWithGetDataset(Class<? extends Application> appClass, String sparkProgram) throws Exception {
  ApplicationManager applicationManager = deploy(appClass);

  // Write input data into the "logs" FileSet
  DataSetManager<FileSet> filesetManager = getDataset("logs");
  FileSet fileset = filesetManager.get();
  Location location = fileset.getLocation("nn");
  prepareInputFileSetWithLogData(location);

  // Scope the input path argument to the "logs" dataset
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "logs", inputArgs));
  args.put("input", "logs");
  args.put("output", "logStats");

  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);

  // Verify the results written to the "logStats" KeyValueTable
  DataSetManager<KeyValueTable> logStatsManager = getDataset("logStats");
  KeyValueTable logStatsTable = logStatsManager.get();
  validateGetDatasetOutput(logStatsTable);

  // Cleanup after run
  location.delete(true);
  logStatsManager.flush();
  try (CloseableIterator<KeyValue<byte[], byte[]>> scan = logStatsTable.scan(null, null)) {
    while (scan.hasNext()) {
      logStatsTable.delete(scan.next().getKey());
    }
  }
  logStatsManager.flush();
}
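The prepareInputFileSetWithLogData and validateGetDatasetOutput helpers are defined elsewhere in SparkTestRun and are not shown on this page. A minimal sketch of what the prepare step could look like, writing plain text lines into the FileSet location; the log format here is an assumption for illustration, not the project's actual test data:

private void prepareInputFileSetWithLogData(Location location) throws IOException {
  // Hypothetical: write a few access-log style lines for the Spark program to aggregate
  try (PrintStream out = new PrintStream(location.getOutputStream(), true, "UTF-8")) {
    out.println("10.10.10.10 - FRED [18/Jan/2013:17:56:07 +1100] \"GET http://abc.com HTTP/1.1\" 200 50");
    out.println("10.10.10.10 - FRED [18/Jan/2013:17:56:08 +1100] \"GET http://abc.com HTTP/1.1\" 404 50");
  }
}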
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class SparkTestRun method testStreamFormatSpec.
@Test
public void testStreamFormatSpec() throws Exception {
  ApplicationManager appManager = deploy(TestSparkApp.class);

  // Send people records as CSV events to the stream
  StreamManager stream = getStreamManager("PeopleStream");
  stream.send("Old Man,50");
  stream.send("Baby,1");
  stream.send("Young Guy,18");
  stream.send("Small Kid,5");
  stream.send("Legal Drinker,21");

  // Scope the output path argument to the "PeopleFileSet" dataset
  Map<String, String> outputArgs = new HashMap<>();
  FileSetArguments.setOutputPath(outputArgs, "output");
  Map<String, String> runtimeArgs = new HashMap<>();
  runtimeArgs.putAll(RuntimeArguments.addScope(Scope.DATASET, "PeopleFileSet", outputArgs));
  runtimeArgs.put("stream.name", "PeopleStream");
  runtimeArgs.put("output.dataset", "PeopleFileSet");
  runtimeArgs.put("sql.statement", "SELECT name, age FROM people WHERE age >= 21");

  // Run both the Scala and the Java variant of the program against the same arguments
  List<String> programs = Arrays.asList(ScalaStreamFormatSpecSpark.class.getSimpleName(),
                                        StreamFormatSpecSpark.class.getSimpleName());
  for (String sparkProgramName : programs) {
    // Clean the output before starting
    DataSetManager<FileSet> fileSetManager = getDataset("PeopleFileSet");
    Location outputDir = fileSetManager.get().getLocation("output");
    outputDir.delete(true);

    SparkManager sparkManager = appManager.getSparkManager(sparkProgramName);
    sparkManager.start(runtimeArgs);
    sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 180, TimeUnit.SECONDS);

    // Find the output part file. There is only one because the program repartitions to 1.
    Location outputFile = Iterables.find(outputDir.list(), new Predicate<Location>() {
      @Override
      public boolean apply(Location input) {
        return input.getName().startsWith("part-r-");
      }
    });

    // Verify the result
    List<String> lines = CharStreams.readLines(
      CharStreams.newReaderSupplier(Locations.newInputSupplier(outputFile), Charsets.UTF_8));
    Map<String, Integer> result = new HashMap<>();
    for (String line : lines) {
      String[] parts = line.split(":");
      result.put(parts[0], Integer.parseInt(parts[1]));
    }
    Assert.assertEquals(ImmutableMap.of("Old Man", 50, "Legal Drinker", 21), result);
  }
}
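The addScope call is what ties the output path to one specific dataset: it prefixes every argument key with the scope and dataset name, so the setting applies only to PeopleFileSet. A small illustration of the resulting keys; the exact key strings follow CDAP's documented scoping convention and should be verified against your CDAP version:

Map<String, String> outputArgs = new HashMap<>();
FileSetArguments.setOutputPath(outputArgs, "output");
// outputArgs: {"output.path" -> "output"}
Map<String, String> scoped = RuntimeArguments.addScope(Scope.DATASET, "PeopleFileSet", outputArgs);
// scoped:     {"dataset.PeopleFileSet.output.path" -> "output"}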
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class SparkFileSetTestRun method testSparkWithFileSet.
private void testSparkWithFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  DataSetManager<FileSet> filesetManager = getDataset("fs");
  FileSet fileset = filesetManager.get();
  Location location = fileset.getLocation("nn");
  prepareFileInput(location);

  // Scope both the input and the output path arguments to the "fs" dataset
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> outputArgs = new HashMap<>();
  FileSetArguments.setOutputPath(outputArgs, "xx");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", outputArgs));
  args.put("input", "fs");
  args.put("output", "fs");

  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 1, TimeUnit.MINUTES);
  validateFileOutput(fileset.getLocation("xx"), "custom:");

  // Cleanup paths after running test
  fileset.getLocation("nn").delete(true);
  fileset.getLocation("xx").delete(true);
}
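Note that the original snippet passed inputArgs to FileSetArguments.setOutputPath, which would have left outputArgs empty; the version above sets the output path on outputArgs as intended. The prepareFileInput and validateFileOutput helpers live elsewhere in SparkFileSetTestRun; a hedged sketch of what the validation could look like, assuming the program writes text part files whose lines start with the given prefix (the iteration and assertion details are assumptions):

private void validateFileOutput(Location outputDir, String prefix) throws IOException {
  // Hypothetical: scan every part file in the output directory and check each line's prefix
  for (Location file : outputDir.list()) {
    if (file.getName().startsWith("part-")) {
      try (BufferedReader reader = new BufferedReader(
             new InputStreamReader(file.getInputStream(), StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
          Assert.assertTrue(line.startsWith(prefix));
        }
      }
    }
  }
}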
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class DecisionTreeRegressionAppTest method test.
@Test
public void test() throws Exception {
  // Deploy the Application
  ApplicationManager appManager = deployApplication(DecisionTreeRegressionApp.class);

  // Start the Service
  ServiceManager serviceManager = appManager.getServiceManager(ModelDataService.SERVICE_NAME).start();
  serviceManager.waitForStatus(true, 30, 1);
  URL serviceURL = serviceManager.getServiceURL(15, TimeUnit.SECONDS);
  URL addDataURL = new URL(serviceURL, "labels");
  HttpRequest request = HttpRequest.builder(HttpMethod.PUT, addDataURL).withBody(new InputSupplier<InputStream>() {
    @Override
    public InputStream getInput() throws IOException {
      return getClass().getClassLoader().getResourceAsStream("sample_libsvm_data.txt");
    }
  }).build();
  HttpResponse response = HttpRequests.execute(request);
  Assert.assertEquals(200, response.getResponseCode());

  // Start a Spark Program
  SparkManager sparkManager = appManager.getSparkManager(ModelTrainer.NAME).start();
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 60, TimeUnit.SECONDS);

  // Check that there is a new model
  URL listModelsURL = new URL(serviceURL, "models");
  request = HttpRequest.builder(HttpMethod.GET, listModelsURL).build();
  response = HttpRequests.execute(request);
  Assert.assertEquals(200, response.getResponseCode());
  List<String> models = GSON.fromJson(response.getResponseBodyAsString(),
                                      new TypeToken<List<String>>() { }.getType());
  Assert.assertEquals(1, models.size());

  // Check that there is some model metadata
  String modelId = models.get(0);
  URL modelMetaURL = new URL(serviceURL, "models/" + modelId);
  request = HttpRequest.builder(HttpMethod.GET, modelMetaURL).build();
  response = HttpRequests.execute(request);
  Assert.assertEquals(200, response.getResponseCode());
  ModelMeta meta = GSON.fromJson(response.getResponseBodyAsString(), ModelMeta.class);
  Assert.assertNotNull(meta);
  Assert.assertEquals(0.7, meta.getTrainingPercentage(), 0.000001);
  Assert.assertEquals(692, meta.getNumFeatures());

  // Check that the corresponding model file exists
  DataSetManager<FileSet> modelFiles = getDataset(DecisionTreeRegressionApp.MODEL_DATASET);
  Assert.assertTrue(modelFiles.get().getBaseLocation().append(modelId).exists());
}
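The assertions on meta imply the shape of ModelMeta. A minimal sketch of a GSON-compatible POJO carrying just the two fields this test reads; the real class in the example app may hold additional metadata:

public class ModelMeta {
  private double trainingPercentage; // fraction of the labeled data used for training, e.g. 0.7
  private int numFeatures;           // feature-vector width of the LIBSVM input, e.g. 692

  public double getTrainingPercentage() {
    return trainingPercentage;
  }

  public int getNumFeatures() {
    return numFeatures;
  }
}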
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class PartitionedFileSetDefinition method getDataset.
@Override
public PartitionedFileSet getDataset(DatasetContext datasetContext, DatasetSpecification spec,
                                     Map<String, String> arguments, ClassLoader classLoader) throws IOException {
  // properties must contain the partitioning
  Partitioning partitioning = PartitionedFileSetProperties.getPartitioning(spec.getProperties());
  // make any necessary updates to the arguments
  arguments = updateArgumentsIfNeeded(arguments, partitioning);
  FileSet fileset = filesetDef.getDataset(datasetContext, spec.getSpecification(FILESET_NAME),
                                          arguments, classLoader);
  IndexedTable table = indexedTableDef.getDataset(datasetContext, spec.getSpecification(PARTITION_TABLE_NAME),
                                                  arguments, classLoader);
  return new PartitionedFileSetDataset(datasetContext, spec.getName(), partitioning, fileset, table,
                                       spec, arguments, getExploreProvider());
}
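The definition composes two embedded datasets: a FileSet holding the partition data and an IndexedTable mapping partition keys to paths. A hedged usage sketch of the resulting PartitionedFileSet from program code, assuming a dataset named "results" with a single string partition field "date" (the dataset and field names are illustrative):

PartitionedFileSet pfs = context.getDataset("results");
PartitionKey key = PartitionKey.builder().addStringField("date", "2017-01-01").build();
PartitionOutput output = pfs.getPartitionOutput(key);
Location dir = output.getLocation(); // a path under the embedded FileSet derived from the key
// ... write files under dir, then register the partition in the index table:
output.addPartition();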