use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class SparkTestRun method testSparkWithGetDataset.
private void testSparkWithGetDataset(Class<? extends Application> appClass, String sparkProgram) throws Exception {
  ApplicationManager applicationManager = deploy(appClass);

  // Write input data into the "logs" FileSet
  DataSetManager<FileSet> filesetManager = getDataset("logs");
  FileSet fileset = filesetManager.get();
  Location location = fileset.getLocation("nn");
  prepareInputFileSetWithLogData(location);

  // Scope the input path argument to the "logs" dataset
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "logs", inputArgs));
  args.put("input", "logs");
  args.put("output", "logStats");

  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);

  // Verify the results written to the "logStats" KeyValueTable
  DataSetManager<KeyValueTable> logStatsManager = getDataset("logStats");
  KeyValueTable logStatsTable = logStatsManager.get();
  validateGetDatasetOutput(logStatsTable);

  // Cleanup after run
  location.delete(true);
  logStatsManager.flush();
  try (CloseableIterator<KeyValue<byte[], byte[]>> scan = logStatsTable.scan(null, null)) {
    while (scan.hasNext()) {
      logStatsTable.delete(scan.next().getKey());
    }
  }
  logStatsManager.flush();
}
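The prepareInputFileSetWithLogData and validateGetDatasetOutput helpers are defined elsewhere in SparkTestRun and are not shown on this page. A minimal sketch of what the prepare step could look like, writing plain text lines into the FileSet location; the log format here is an assumption for illustration, not the project's actual test data:

private void prepareInputFileSetWithLogData(Location location) throws IOException {
  // Hypothetical: write a few access-log style lines for the Spark program to aggregate
  try (PrintStream out = new PrintStream(location.getOutputStream(), true, "UTF-8")) {
    out.println("10.10.10.10 - FRED [18/Jan/2013:17:56:07 +1100] \"GET http://abc.com HTTP/1.1\" 200 50");
    out.println("10.10.10.10 - FRED [18/Jan/2013:17:56:08 +1100] \"GET http://abc.com HTTP/1.1\" 404 50");
  }
}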
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class SparkTestRun method testStreamFormatSpec.
@Test
public void testStreamFormatSpec() throws Exception {
  ApplicationManager appManager = deploy(TestSparkApp.class);

  // Send people records as CSV events to the stream
  StreamManager stream = getStreamManager("PeopleStream");
  stream.send("Old Man,50");
  stream.send("Baby,1");
  stream.send("Young Guy,18");
  stream.send("Small Kid,5");
  stream.send("Legal Drinker,21");

  // Scope the output path argument to the "PeopleFileSet" dataset
  Map<String, String> outputArgs = new HashMap<>();
  FileSetArguments.setOutputPath(outputArgs, "output");
  Map<String, String> runtimeArgs = new HashMap<>();
  runtimeArgs.putAll(RuntimeArguments.addScope(Scope.DATASET, "PeopleFileSet", outputArgs));
  runtimeArgs.put("stream.name", "PeopleStream");
  runtimeArgs.put("output.dataset", "PeopleFileSet");
  runtimeArgs.put("sql.statement", "SELECT name, age FROM people WHERE age >= 21");

  // Run both the Scala and the Java variant of the program against the same arguments
  List<String> programs = Arrays.asList(ScalaStreamFormatSpecSpark.class.getSimpleName(),
                                        StreamFormatSpecSpark.class.getSimpleName());
  for (String sparkProgramName : programs) {
    // Clean the output before starting
    DataSetManager<FileSet> fileSetManager = getDataset("PeopleFileSet");
    Location outputDir = fileSetManager.get().getLocation("output");
    outputDir.delete(true);

    SparkManager sparkManager = appManager.getSparkManager(sparkProgramName);
    sparkManager.start(runtimeArgs);
    sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 180, TimeUnit.SECONDS);

    // Find the output part file. There is only one because the program repartitions to 1.
    Location outputFile = Iterables.find(outputDir.list(), new Predicate<Location>() {
      @Override
      public boolean apply(Location input) {
        return input.getName().startsWith("part-r-");
      }
    });

    // Verify the result
    List<String> lines = CharStreams.readLines(
      CharStreams.newReaderSupplier(Locations.newInputSupplier(outputFile), Charsets.UTF_8));
    Map<String, Integer> result = new HashMap<>();
    for (String line : lines) {
      String[] parts = line.split(":");
      result.put(parts[0], Integer.parseInt(parts[1]));
    }
    Assert.assertEquals(ImmutableMap.of("Old Man", 50, "Legal Drinker", 21), result);
  }
}
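The addScope call is what ties the output path to one specific dataset: it prefixes every argument key with the scope and dataset name, so the setting applies only to PeopleFileSet. A small illustration of the resulting keys; the exact key strings follow CDAP's documented scoping convention and should be verified against your CDAP version:

Map<String, String> outputArgs = new HashMap<>();
FileSetArguments.setOutputPath(outputArgs, "output");
// outputArgs: {"output.path" -> "output"}
Map<String, String> scoped = RuntimeArguments.addScope(Scope.DATASET, "PeopleFileSet", outputArgs);
// scoped:     {"dataset.PeopleFileSet.output.path" -> "output"}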
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class SparkFileSetTestRun method testSparkWithFileSet.
private void testSparkWithFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  DataSetManager<FileSet> filesetManager = getDataset("fs");
  FileSet fileset = filesetManager.get();
  Location location = fileset.getLocation("nn");
  prepareFileInput(location);

  // Scope both the input and the output path arguments to the "fs" dataset
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> outputArgs = new HashMap<>();
  FileSetArguments.setOutputPath(outputArgs, "xx");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", outputArgs));
  args.put("input", "fs");
  args.put("output", "fs");

  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 1, TimeUnit.MINUTES);
  validateFileOutput(fileset.getLocation("xx"), "custom:");

  // Cleanup paths after running test
  fileset.getLocation("nn").delete(true);
  fileset.getLocation("xx").delete(true);
}
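Note that the original snippet passed inputArgs to FileSetArguments.setOutputPath, which would have left outputArgs empty; the version above sets the output path on outputArgs as intended. The prepareFileInput and validateFileOutput helpers live elsewhere in SparkFileSetTestRun; a hedged sketch of what the validation could look like, assuming the program writes text part files whose lines start with the given prefix (the iteration and assertion details are assumptions):

private void validateFileOutput(Location outputDir, String prefix) throws IOException {
  // Hypothetical: scan every part file in the output directory and check each line's prefix
  for (Location file : outputDir.list()) {
    if (file.getName().startsWith("part-")) {
      try (BufferedReader reader = new BufferedReader(
             new InputStreamReader(file.getInputStream(), StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
          Assert.assertTrue(line.startsWith(prefix));
        }
      }
    }
  }
}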
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class DecisionTreeRegressionAppTest method test.
@Test
public void test() throws Exception {
  // Deploy the Application
  ApplicationManager appManager = deployApplication(DecisionTreeRegressionApp.class);

  // Start the Service
  ServiceManager serviceManager = appManager.getServiceManager(ModelDataService.SERVICE_NAME).start();
  serviceManager.waitForStatus(true, 30, 1);
  URL serviceURL = serviceManager.getServiceURL(15, TimeUnit.SECONDS);
  URL addDataURL = new URL(serviceURL, "labels");
  HttpRequest request = HttpRequest.builder(HttpMethod.PUT, addDataURL).withBody(new InputSupplier<InputStream>() {
    @Override
    public InputStream getInput() throws IOException {
      return getClass().getClassLoader().getResourceAsStream("sample_libsvm_data.txt");
    }
  }).build();
  HttpResponse response = HttpRequests.execute(request);
  Assert.assertEquals(200, response.getResponseCode());

  // Start a Spark Program
  SparkManager sparkManager = appManager.getSparkManager(ModelTrainer.NAME).start();
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 60, TimeUnit.SECONDS);

  // Check that there is a new model
  URL listModelsURL = new URL(serviceURL, "models");
  request = HttpRequest.builder(HttpMethod.GET, listModelsURL).build();
  response = HttpRequests.execute(request);
  Assert.assertEquals(200, response.getResponseCode());
  List<String> models = GSON.fromJson(response.getResponseBodyAsString(),
                                      new TypeToken<List<String>>() { }.getType());
  Assert.assertEquals(1, models.size());

  // Check that there is some model metadata
  String modelId = models.get(0);
  URL modelMetaURL = new URL(serviceURL, "models/" + modelId);
  request = HttpRequest.builder(HttpMethod.GET, modelMetaURL).build();
  response = HttpRequests.execute(request);
  Assert.assertEquals(200, response.getResponseCode());
  ModelMeta meta = GSON.fromJson(response.getResponseBodyAsString(), ModelMeta.class);
  Assert.assertNotNull(meta);
  Assert.assertEquals(0.7, meta.getTrainingPercentage(), 0.000001);
  Assert.assertEquals(692, meta.getNumFeatures());

  // Check that the corresponding model file exists
  DataSetManager<FileSet> modelFiles = getDataset(DecisionTreeRegressionApp.MODEL_DATASET);
  Assert.assertTrue(modelFiles.get().getBaseLocation().append(modelId).exists());
}
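The assertions on meta imply the shape of ModelMeta. A minimal sketch of a GSON-compatible POJO carrying just the two fields this test reads; the real class in the example app may hold additional metadata:

public class ModelMeta {
  private double trainingPercentage; // fraction of the labeled data used for training, e.g. 0.7
  private int numFeatures;           // feature-vector width of the LIBSVM input, e.g. 692

  public double getTrainingPercentage() {
    return trainingPercentage;
  }

  public int getNumFeatures() {
    return numFeatures;
  }
}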
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class PartitionedFileSetDefinition method getDataset.
@Override
public PartitionedFileSet getDataset(DatasetContext datasetContext, DatasetSpecification spec,
                                     Map<String, String> arguments, ClassLoader classLoader) throws IOException {
  // properties must contain the partitioning
  Partitioning partitioning = PartitionedFileSetProperties.getPartitioning(spec.getProperties());
  // make any necessary updates to the arguments
  arguments = updateArgumentsIfNeeded(arguments, partitioning);
  FileSet fileset = filesetDef.getDataset(datasetContext, spec.getSpecification(FILESET_NAME),
                                          arguments, classLoader);
  IndexedTable table = indexedTableDef.getDataset(datasetContext, spec.getSpecification(PARTITION_TABLE_NAME),
                                                  arguments, classLoader);
  return new PartitionedFileSetDataset(datasetContext, spec.getName(), partitioning, fileset, table,
                                       spec, arguments, getExploreProvider());
}
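The definition composes two embedded datasets: a FileSet holding the partition data and an IndexedTable mapping partition keys to paths. A hedged usage sketch of the resulting PartitionedFileSet from program code, assuming a dataset named "results" with a single string partition field "date" (the dataset and field names are illustrative):

PartitionedFileSet pfs = context.getDataset("results");
PartitionKey key = PartitionKey.builder().addStringField("date", "2017-01-01").build();
PartitionOutput output = pfs.getPartitionOutput(key);
Location dir = output.getLocation(); // a path under the embedded FileSet derived from the key
// ... write files under dir, then register the partition in the index table:
output.addPartition();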