Use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class FileSetTest, method testExternalAbsolutePath:
@Test
public void testExternalAbsolutePath() throws IOException, DatasetManagementException, UnauthorizedException {
  // create an external dir and create a file in it
  String absolutePath = tmpFolder.newFolder() + "/absolute/path";
  File absoluteFile = new File(absolutePath);
  absoluteFile.mkdirs();
  File someFile = new File(absoluteFile, "some.file");
  someFile.createNewFile();
  // create an external dataset
  dsFrameworkUtil.createInstance("fileSet", testFileSetInstance5,
                                 FileSetProperties.builder().setBasePath(absolutePath).setDataExternal(true).build());
  // instantiate the file set with an input and output path
  Map<String, String> fileArgs = Maps.newHashMap();
  FileSetArguments.setInputPath(fileArgs, "some.file");
  FileSetArguments.setOutputPath(fileArgs, "out");
  FileSet fileSet = dsFrameworkUtil.getInstance(testFileSetInstance5, fileArgs);
  Assert.assertNotNull(fileSet);
  // read the existing file
  Location input = fileSet.getInputLocations().iterator().next();
  InputStream in = input.getInputStream();
  in.close();
  // attempt to write an output file
  try {
    fileSet.getOutputLocation();
    Assert.fail("External file set should not allow writing output.");
  } catch (UnsupportedOperationException e) {
    // expected
  }
  // delete the dataset and validate that the files are still there
  dsFrameworkUtil.deleteInstance(testFileSetInstance5);
  Assert.assertTrue(someFile.exists());
}
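
For context, an external FileSet such as the one created by this test could also be declared from an application. The sketch below is illustrative only and is not taken from the cdap repository; the application class and dataset name are made up, but the FileSetProperties calls mirror the ones used above.

public class ExternalFileApp extends AbstractApplication {
  @Override
  public void configure() {
    // Hypothetical declaration of an external FileSet; names are illustrative.
    createDataset("externalFiles", FileSet.class, FileSetProperties.builder()
      // point the dataset at a directory that is managed outside of CDAP
      .setBasePath("/existing/data/dir")
      // mark the data as external: it can be read through the FileSet but is never written or deleted by it
      .setDataExternal(true)
      .build());
  }
}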
Use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class FilesetDeletePostAction, method run:
@Override
public void run(BatchActionContext context) throws Exception {
  if (!context.isSuccessful()) {
    return;
  }
  FileSet fileSet = context.getDataset(config.filesetName);
  Pattern pattern = Pattern.compile(config.deleteRegex);
  for (Location fileLocation : fileSet.getBaseLocation().append(config.directory).list()) {
    if (pattern.matcher(fileLocation.getName()).find()) {
      fileLocation.delete();
    }
  }
}
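
The config object used in run() is not part of this snippet. As a rough sketch, a plugin config carrying the three values referenced above could look like the following; the field names are inferred from the usage, and the class shown here is hypothetical rather than the one in the cdap repository.

public static class Conf extends PluginConfig {
  // name of the FileSet dataset to clean up after the batch run
  private String filesetName;
  // directory within the fileset whose files are inspected
  private String directory;
  // files whose names match this regular expression are deleted
  private String deleteRegex;
}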
Use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class SparkCSVToSpaceProgram, method run:
@Override
public void run(final JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  Map<String, String> fileSetArgs = new HashMap<>();
  final Metrics metrics = sec.getMetrics();
  FileSetArguments.addInputPath(fileSetArgs, sec.getRuntimeArguments().get("input.path"));
  JavaPairRDD<LongWritable, Text> input = sec.fromDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET, fileSetArgs);
  final List<String> converted = input.values().map(new Function<Text, String>() {
    @Override
    public String call(Text input) throws Exception {
      String line = input.toString();
      metrics.count("num.lines", 1);
      return line.replaceAll(",", " ");
    }
  }).collect();
  sec.execute(new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
      Map<String, String> args = sec.getRuntimeArguments();
      String outputPath = args.get("output.path");
      Map<String, String> fileSetArgs = new HashMap<>();
      FileSetArguments.setOutputPath(fileSetArgs, outputPath);
      FileSet fileSet = context.getDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET, fileSetArgs);
      try (PrintWriter writer = new PrintWriter(fileSet.getOutputLocation().getOutputStream())) {
        for (String line : converted) {
          writer.write(line);
          writer.println();
        }
      }
    }
  });
}
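
The program above reads its locations from the runtime arguments input.path and output.path. A hedged example of supplying them when starting the program from a test, using the same SparkManager API that appears elsewhere on this page; the program name, appManager variable, and path values are placeholders:

Map<String, String> runtimeArgs = new HashMap<>();
runtimeArgs.put("input.path", "csv/input.txt");   // placeholder path inside the CSV fileset
runtimeArgs.put("output.path", "spaces/output");  // placeholder path for the converted lines
SparkManager sparkManager = appManager.getSparkManager("SparkCSVToSpaceProgram").start(runtimeArgs);
sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);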
Use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class TestFrameworkTestRun, method testAppWithPlugin:
@Test
public void testAppWithPlugin() throws Exception {
  ArtifactId artifactId = NamespaceId.DEFAULT.artifact("app-with-plugin", "1.0.0-SNAPSHOT");
  addAppArtifact(artifactId, AppWithPlugin.class);
  ArtifactId pluginArtifactId = NamespaceId.DEFAULT.artifact("test-plugin", "1.0.0-SNAPSHOT");
  addPluginArtifact(pluginArtifactId, artifactId, ToStringPlugin.class);
  ApplicationId appId = NamespaceId.DEFAULT.app("AppWithPlugin");
  AppRequest createRequest = new AppRequest(new ArtifactSummary(artifactId.getArtifact(), artifactId.getVersion()));
  ApplicationManager appManager = deployApplication(appId, createRequest);
  final WorkerManager workerManager = appManager.getWorkerManager(AppWithPlugin.WORKER);
  workerManager.start();
  workerManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.SECONDS);
  final ServiceManager serviceManager = appManager.getServiceManager(AppWithPlugin.SERVICE);
  serviceManager.start();
  serviceManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
  URL serviceURL = serviceManager.getServiceURL(5, TimeUnit.SECONDS);
  callServiceGet(serviceURL, "dummy");
  serviceManager.stop();
  serviceManager.waitForStopped(10, TimeUnit.SECONDS);
  WorkflowManager workflowManager = appManager.getWorkflowManager(AppWithPlugin.WORKFLOW);
  workflowManager.start();
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
  List<RunRecord> runRecords = workflowManager.getHistory();
  Assert.assertNotEquals(ProgramRunStatus.FAILED, runRecords.get(0).getStatus());
  DataSetManager<KeyValueTable> workflowTableManager = getDataset(AppWithPlugin.WORKFLOW_TABLE);
  String value = Bytes.toString(workflowTableManager.get().read("val"));
  Assert.assertEquals(AppWithPlugin.TEST, value);
  Map<String, String> workflowTags = ImmutableMap.of(Constants.Metrics.Tag.NAMESPACE, NamespaceId.DEFAULT.getNamespace(),
                                                     Constants.Metrics.Tag.APP, "AppWithPlugin",
                                                     Constants.Metrics.Tag.WORKFLOW, AppWithPlugin.WORKFLOW,
                                                     Constants.Metrics.Tag.RUN_ID, runRecords.get(0).getPid());
  getMetricsManager().waitForTotalMetricCount(workflowTags, String.format("user.destroy.%s", AppWithPlugin.WORKFLOW), 1, 60, TimeUnit.SECONDS);
  // Testing Spark plugins. First send some data to the fileset for the Spark program to process
  DataSetManager<FileSet> fileSetManager = getDataset(AppWithPlugin.SPARK_INPUT);
  FileSet fileSet = fileSetManager.get();
  try (PrintStream out = new PrintStream(fileSet.getLocation("input").append("file.txt").getOutputStream(), true, "UTF-8")) {
    for (int i = 0; i < 5; i++) {
      out.println("Message " + i);
    }
  }
  Map<String, String> sparkArgs = new HashMap<>();
  FileSetArguments.setInputPath(sparkArgs, "input");
  SparkManager sparkManager = appManager.getSparkManager(AppWithPlugin.SPARK).start(sparkArgs);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
  // Verify the Spark result.
  DataSetManager<Table> dataSetManager = getDataset(AppWithPlugin.SPARK_TABLE);
  Table table = dataSetManager.get();
  try (Scanner scanner = table.scan(null, null)) {
    for (int i = 0; i < 5; i++) {
      Row row = scanner.next();
      Assert.assertNotNull(row);
      String expected = "Message " + i + " " + AppWithPlugin.TEST;
      Assert.assertEquals(expected, Bytes.toString(row.getRow()));
      Assert.assertEquals(expected, Bytes.toString(row.get(expected)));
    }
    // There shouldn't be any more rows in the table.
    Assert.assertNull(scanner.next());
  }
}
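
callServiceGet is a helper of the test class and is not shown on this page. A rough, hypothetical stand-in using plain java.net, only to illustrate what the call does:

// Hypothetical equivalent of the callServiceGet helper: issue a GET against the service URL.
HttpURLConnection conn = (HttpURLConnection) new URL(serviceURL, "dummy").openConnection();
try {
  Assert.assertEquals(200, conn.getResponseCode());
  // the response body could be read from conn.getInputStream() if needed
} finally {
  conn.disconnect();
}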
Use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class TestFrameworkTestRun, method testClusterName:
@Test
public void testClusterName() throws Exception {
  String clusterName = getConfiguration().get(Constants.CLUSTER_NAME);
  ApplicationManager appManager = deployApplication(ClusterNameTestApp.class);
  final DataSetManager<KeyValueTable> datasetManager = getDataset(ClusterNameTestApp.CLUSTER_NAME_TABLE);
  final KeyValueTable clusterNameTable = datasetManager.get();
  // A callable for reading the cluster name from the ClusterNameTable.
  // It is used by the Tasks.waitFor calls below.
  final AtomicReference<String> key = new AtomicReference<>();
  Callable<String> readClusterName = new Callable<String>() {
    @Nullable
    @Override
    public String call() throws Exception {
      datasetManager.flush();
      byte[] bytes = clusterNameTable.read(key.get());
      return bytes == null ? null : new String(bytes, StandardCharsets.UTF_8);
    }
  };
  // Service
  ServiceManager serviceManager = appManager.getServiceManager(ClusterNameTestApp.ClusterNameServiceHandler.class.getSimpleName()).start();
  Assert.assertEquals(clusterName, callServiceGet(serviceManager.getServiceURL(10, TimeUnit.SECONDS), "clusterName"));
  serviceManager.stop();
  // Worker
  WorkerManager workerManager = appManager.getWorkerManager(ClusterNameTestApp.ClusterNameWorker.class.getSimpleName()).start();
  key.set("worker.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  // The worker will stop by itself. No need to call stop.
  workerManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.SECONDS);
  // MapReduce
  // Set up the input file used by the MapReduce program
  Location location = this.<FileSet>getDataset(ClusterNameTestApp.INPUT_FILE_SET).get().getLocation("input");
  try (PrintStream printer = new PrintStream(location.getOutputStream(), true, "UTF-8")) {
    for (int i = 0; i < 10; i++) {
      printer.println("Hello World " + i);
    }
  }
  // Set up input and output dataset arguments
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "input");
  Map<String, String> outputArgs = new HashMap<>();
  FileSetArguments.setOutputPath(outputArgs, "output");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, ClusterNameTestApp.INPUT_FILE_SET, inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, ClusterNameTestApp.OUTPUT_FILE_SET, outputArgs));
  MapReduceManager mrManager = appManager.getMapReduceManager(ClusterNameTestApp.ClusterNameMapReduce.class.getSimpleName()).start(args);
  key.set("mr.client.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  key.set("mapper.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  key.set("reducer.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  mrManager.waitForRun(ProgramRunStatus.COMPLETED, 60, TimeUnit.SECONDS);
  // Spark
  SparkManager sparkManager = appManager.getSparkManager(ClusterNameTestApp.ClusterNameSpark.class.getSimpleName()).start();
  key.set("spark.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 60, TimeUnit.SECONDS);
  // Workflow
  // Clean up the output path for the MapReduce job in the workflow first
  this.<FileSet>getDataset(ClusterNameTestApp.OUTPUT_FILE_SET).get().getLocation("output").delete(true);
  args = RuntimeArguments.addScope(Scope.MAPREDUCE, ClusterNameTestApp.ClusterNameMapReduce.class.getSimpleName(), args);
  WorkflowManager workflowManager = appManager.getWorkflowManager(ClusterNameTestApp.ClusterNameWorkflow.class.getSimpleName()).start(args);
  String prefix = ClusterNameTestApp.ClusterNameWorkflow.class.getSimpleName() + ".";
  key.set(prefix + "mr.client.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  key.set(prefix + "mapper.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  key.set(prefix + "reducer.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  key.set(prefix + "spark.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  key.set(prefix + "action.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 120, TimeUnit.SECONDS);
}
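
Note how RuntimeArguments.addScope is used above: the FileSet arguments are first scoped to their respective datasets, and the combined map is later scoped again to the MapReduce program inside the workflow, so each argument reaches only the component it is meant for. A small hedged illustration; the dataset name is made up and the exact key format produced by addScope is an implementation detail, only sketched in the comment:

Map<String, String> inputArgs = new HashMap<>();
FileSetArguments.setInputPath(inputArgs, "input");
// Scoping ties the argument to a single dataset; the resulting key resembles
// "dataset.<datasetName>.input.path" rather than the bare "input.path".
Map<String, String> scoped = RuntimeArguments.addScope(Scope.DATASET, "myInputFileSet", inputArgs);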