use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedTextFile.
// this tests mainly the support for different text formats. Other features (partitioning etc.) are tested above.
private void testPartitionedTextFile(String name, String format, String delim, String fileDelim) throws Exception {
  final DatasetId datasetInstanceId = NAMESPACE_ID.dataset(name);
  final String tableName = getDatasetHiveName(datasetInstanceId);
  // create a partitioned file set
  PartitionedFileSetProperties.Builder builder = (PartitionedFileSetProperties.Builder) PartitionedFileSetProperties.builder()
    .setPartitioning(Partitioning.builder().addIntField("number").build())
    .setBasePath(name)
    .setEnableExploreOnCreate(true)
    .setExploreSchema("key STRING, value INT")
    .setExploreFormat(format);
  if (delim != null) {
    builder.setExploreFormatProperty("delimiter", delim);
  }
  datasetFramework.addInstance("partitionedFileSet", datasetInstanceId, builder.build());
  // verify that the Hive table was created for this file set
  runCommand(NAMESPACE_ID, "show tables", true,
             Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")),
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
  // access the dataset instance to perform data operations
  PartitionedFileSet partitioned = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
  Assert.assertNotNull(partitioned);
  FileSet fileSet = partitioned.getEmbeddedFileSet();
  // add a partition. Beware that Hive expects a partition to be a directory, so we create a dir with one file
  Location location1 = fileSet.getLocation("file1/nn");
  FileWriterHelper.generateTextFile(location1.getOutputStream(), fileDelim, "x", 1, 2);
  PartitionKey key1 = PartitionKey.builder().addIntField("number", 1).build();
  addPartition(partitioned, key1, "file1");
  // verify that the partition was added to Hive
  runCommand(NAMESPACE_ID, "show partitions " + tableName, true,
             Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")),
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("number=1"))));
  // verify that we can query the key-values in the file with Hive
  runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " ORDER BY key", true,
             Lists.newArrayList(new ColumnDesc(tableName + ".key", "STRING", 1, null),
                                new ColumnDesc(tableName + ".value", "INT", 2, null),
                                new ColumnDesc(tableName + ".number", "INT", 3, null)),
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
  // drop the partition
  dropPartition(partitioned, key1);
  // drop the dataset
  datasetFramework.deleteInstance(datasetInstanceId);
  // verify the Hive table is gone
  runCommand(NAMESPACE_ID, "show tables", false,
             Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")),
             Collections.<QueryResult>emptyList());
}
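The test calls addPartition and dropPartition helpers that are not shown in this snippet. A minimal sketch of what they presumably do, assuming they simply delegate to the PartitionedFileSet API (the real test may additionally wrap these calls in a transaction):

private void addPartition(PartitionedFileSet partitioned, PartitionKey key, String path) {
  // register an existing directory under the file set's base path as the partition for the given key
  partitioned.addPartition(key, path);
}

private void dropPartition(PartitionedFileSet partitioned, PartitionKey key) {
  // remove the partition metadata for the given key, so Hive no longer sees it
  partitioned.dropPartition(key);
}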
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class FileSetWordCountTest method testWordCountOnFileSet.
@Test
public void testWordCountOnFileSet() throws Exception {
  // deploy the application
  ApplicationManager applicationManager = deployApplication(FileSetExample.class);
  final String line1 = "a b a";
  final String line2 = "b a b";
  // discover the file set service
  ServiceManager serviceManager = applicationManager.getServiceManager("FileSetService").start();
  serviceManager.waitForStatus(true);
  URL serviceURL = serviceManager.getServiceURL();
  // write a file to the file set using the service
  HttpURLConnection connection = (HttpURLConnection) new URL(serviceURL, "lines?path=nn.1").openConnection();
  try {
    connection.setDoOutput(true);
    connection.setRequestMethod("PUT");
    connection.getOutputStream().write(line1.getBytes(Charsets.UTF_8));
    Assert.assertEquals(HttpURLConnection.HTTP_OK, connection.getResponseCode());
  } finally {
    connection.disconnect();
  }
  // run word count over that file only
  Map<String, String> runtimeArguments = Maps.newHashMap();
  Map<String, String> inputArgs = Maps.newHashMap();
  FileSetArguments.setInputPaths(inputArgs, "nn.1");
  Map<String, String> outputArgs = Maps.newHashMap();
  FileSetArguments.setOutputPath(outputArgs, "out.1");
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, "lines", inputArgs));
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, "counts", outputArgs));
  MapReduceManager mapReduceManager = applicationManager.getMapReduceManager("WordCount").start(runtimeArguments);
  mapReduceManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
  // retrieve the counts through the service and verify
  Map<String, Integer> counts = Maps.newHashMap();
  connection = (HttpURLConnection) new URL(serviceURL, "counts?path=out.1/part-r-00000").openConnection();
  try {
    connection.setRequestMethod("GET");
    Assert.assertEquals(HttpURLConnection.HTTP_OK, connection.getResponseCode());
    readCounts(connection.getInputStream(), counts);
  } finally {
    connection.disconnect();
  }
  // "a b a" should yield "a":2, "b":1
  Assert.assertEquals(2, counts.size());
  Assert.assertEquals(new Integer(2), counts.get("a"));
  Assert.assertEquals(new Integer(1), counts.get("b"));
  // write a file to the file set using the dataset directly
  DataSetManager<FileSet> linesManager = getDataset("lines");
  OutputStream output = linesManager.get().getLocation("nn.2").getOutputStream();
  try {
    output.write(line2.getBytes(Charsets.UTF_8));
  } finally {
    output.close();
  }
  // run word count over both files
  FileSetArguments.setInputPath(inputArgs, "nn.1");
  FileSetArguments.addInputPath(inputArgs, "nn.2");
  FileSetArguments.setOutputPath(outputArgs, "out.2");
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, "lines", inputArgs));
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, "counts", outputArgs));
  mapReduceManager = applicationManager.getMapReduceManager("WordCount").start(runtimeArguments);
  mapReduceManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
  // retrieve the counts through the dataset API and verify
  DataSetManager<FileSet> countsManager = getDataset("counts");
  counts.clear();
  Location resultLocation = countsManager.get().getLocation("out.2");
  Assert.assertTrue(resultLocation.isDirectory());
  List<String> parts = new LinkedList<>();
  for (Location child : resultLocation.list()) {
    if (child.getName().startsWith("part-")) {
      // only read part files, no check sums or done files
      parts.add(child.getName());
      readCounts(child.getInputStream(), counts);
    }
  }
  // "a b a" and "b a b" should yield "a":3, "b":3
  Assert.assertEquals(2, counts.size());
  Assert.assertEquals(new Integer(3), counts.get("a"));
  Assert.assertEquals(new Integer(3), counts.get("b"));
  // retrieve the counts through the service
  counts.clear();
  for (String part : parts) {
    connection = (HttpURLConnection) new URL(serviceURL, "counts?path=out.2/" + part).openConnection();
    try {
      connection.setRequestMethod("GET");
      Assert.assertEquals(HttpURLConnection.HTTP_OK, connection.getResponseCode());
      readCounts(connection.getInputStream(), counts);
    } finally {
      connection.disconnect();
    }
  }
  // "a b a" and "b a b" should yield "a":3, "b":3
  Assert.assertEquals(2, counts.size());
  Assert.assertEquals(new Integer(3), counts.get("a"));
  Assert.assertEquals(new Integer(3), counts.get("b"));
  serviceManager.stop();
}
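The readCounts helper used above is not part of this snippet. A minimal sketch, assuming the MapReduce part files and the service response both use the default TextOutputFormat layout of one word<TAB>count pair per line (the real example may use a different delimiter); it relies on java.io.BufferedReader, java.io.InputStreamReader and Guava's Charsets:

private void readCounts(InputStream in, Map<String, Integer> counts) throws IOException {
  try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, Charsets.UTF_8))) {
    String line;
    while ((line = reader.readLine()) != null) {
      String[] fields = line.split("\t");
      if (fields.length == 2) {
        // accumulate counts across multiple part files or service calls
        counts.merge(fields[0], Integer.parseInt(fields[1]), Integer::sum);
      }
    }
  }
}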
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class FileDeleteAction method run.
@Override
public void run(BatchActionContext context) throws Exception {
  // only delete files if the pipeline run succeeded
  if (!context.isSuccessful()) {
    return;
  }
  FileSet fileSet = context.getDataset(conf.filesetName);
  Pattern pattern = Pattern.compile(conf.deleteRegex);
  // delete every file in the file set's base location whose name matches the configured regex
  for (Location fileLocation : fileSet.getBaseLocation().list()) {
    if (pattern.matcher(fileLocation.getName()).matches()) {
      fileLocation.delete();
    }
  }
}
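The action reads its file set name and delete pattern from a plugin config object (conf). A hypothetical sketch of that config class, assuming only the two fields referenced above and CDAP's standard PluginConfig base class (the real FileDeleteAction may declare more options):

public static class Conf extends PluginConfig {
  @Description("Name of the FileSet whose base location is scanned for files to delete")
  private String filesetName;

  @Description("Files whose names match this regular expression are deleted")
  private String deleteRegex;
}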
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class TestFrameworkTestRun method testCustomActionDatasetAccess.
@Category(SlowTests.class)
@Test
public void testCustomActionDatasetAccess() throws Exception {
  addDatasetInstance("keyValueTable", DatasetWithCustomActionApp.CUSTOM_TABLE);
  addDatasetInstance("fileSet", DatasetWithCustomActionApp.CUSTOM_FILESET);
  ApplicationManager appManager = deployApplication(DatasetWithCustomActionApp.class);
  ServiceManager serviceManager = appManager.getServiceManager(DatasetWithCustomActionApp.CUSTOM_SERVICE).start();
  serviceManager.waitForStatus(true);
  WorkflowManager workflowManager = appManager.getWorkflowManager(DatasetWithCustomActionApp.CUSTOM_WORKFLOW).start();
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
  appManager.stopAll();
  DataSetManager<KeyValueTable> outTableManager = getDataset(DatasetWithCustomActionApp.CUSTOM_TABLE);
  KeyValueTable outputTable = outTableManager.get();
  Assert.assertEquals("world", Bytes.toString(outputTable.read("hello")));
  Assert.assertEquals("service", Bytes.toString(outputTable.read("hi")));
  Assert.assertEquals("another.world", Bytes.toString(outputTable.read("another.hello")));
  DataSetManager<FileSet> outFileSetManager = getDataset(DatasetWithCustomActionApp.CUSTOM_FILESET);
  FileSet fs = outFileSetManager.get();
  try (InputStream in = fs.getLocation("test").getInputStream()) {
    Assert.assertEquals(42, in.read());
  }
}
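For the final assertion to pass, the custom action in DatasetWithCustomActionApp must write the byte 42 to a file named "test" in the file set. A hedged sketch of such a write, assuming the action's context implements Transactional so the dataset can be accessed through a TxRunnable (the real app may structure this differently):

getContext().execute(new TxRunnable() {
  @Override
  public void run(DatasetContext ctx) throws Exception {
    FileSet fs = ctx.getDataset(DatasetWithCustomActionApp.CUSTOM_FILESET);
    try (OutputStream out = fs.getLocation("test").getOutputStream()) {
      out.write(42); // the test reads this single byte back and expects 42
    }
  }
});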
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class AdminAppTestRun method testAdminProgram.
private <T extends ProgramManager<T>> void testAdminProgram(ProgramManager<T> manager) throws Exception {
  // create fileset b; it will be updated by the worker
  addDatasetInstance(FileSet.class.getName(), "b", FileSetProperties.builder()
    .setBasePath("some/path")
    .setInputFormat(TextInputFormat.class)
    .build());
  DataSetManager<FileSet> bManager = getDataset("b");
  String bFormat = bManager.get().getInputFormatClassName();
  String bPath = bManager.get().getBaseLocation().toURI().getPath();
  Assert.assertTrue(bPath.endsWith("some/path/"));
  bManager.flush();
  // create table c and write some data to it; it will be truncated by the worker
  addDatasetInstance("table", "c");
  DataSetManager<Table> cManager = getDataset("c");
  cManager.get().put(new Put("x", "y", "z"));
  cManager.flush();
  // create table d; it will be dropped by the worker
  addDatasetInstance("table", "d");
  // start the worker and wait for it to finish
  File newBasePath = new File(TMP_FOLDER.newFolder(), "extra");
  Assert.assertFalse(newBasePath.exists());
  manager.start(ImmutableMap.of("new.base.path", newBasePath.getPath()));
  manager.waitForRun(ProgramRunStatus.COMPLETED, 30, TimeUnit.SECONDS);
  // validate that the worker created dataset a (it exists and is empty)
  DataSetManager<Table> aManager = getDataset("a");
  Assert.assertNull(aManager.get().scan(null, null).next());
  aManager.flush();
  // validate that the worker updated fileset b; get a new instance of b
  bManager = getDataset("b");
  Assert.assertEquals(bFormat, bManager.get().getInputFormatClassName());
  String newBPath = bManager.get().getBaseLocation().toURI().getPath();
  Assert.assertTrue(newBPath.endsWith("/extra/"));
  // make sure the directory was created by the fileset update (by moving the existing base path)
  Assert.assertTrue(newBasePath.exists());
  bManager.flush();
  // validate that table c was truncated and is now empty
  Assert.assertNull(cManager.get().scan(null, null).next());
  cManager.flush();
  // validate that dataset d is gone
  Assert.assertNull(getDataset("d").get());
  // run the worker again to drop all datasets
  manager.start(ImmutableMap.of("dropAll", "true"));
  manager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 30, TimeUnit.SECONDS);
  Assert.assertNull(getDataset("a").get());
  Assert.assertNull(getDataset("b").get());
  Assert.assertNull(getDataset("c").get());
  Assert.assertNull(getDataset("d").get());
}
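The worker under test is not shown. A sketch of the admin operations it presumably performs, inferred from the assertions above and assuming the worker's context exposes CDAP's Admin interface via getAdmin() (runtime argument names "dropAll" and "new.base.path" come from the test; everything else is illustrative):

Admin admin = getContext().getAdmin();
Map<String, String> args = getContext().getRuntimeArguments();
if (Boolean.parseBoolean(args.get("dropAll"))) {
  // second run: drop all datasets the test checks for
  for (String name : new String[] { "a", "b", "c", "d" }) {
    if (admin.datasetExists(name)) {
      admin.dropDataset(name);
    }
  }
} else {
  // first run: create table "a", move fileset "b" to the new base path, truncate "c", drop "d"
  admin.createDataset("a", "table", DatasetProperties.EMPTY);
  admin.updateDataset("b", FileSetProperties.builder()
    .setBasePath(args.get("new.base.path"))
    .setInputFormat(TextInputFormat.class)
    .build());
  admin.truncateDataset("c");
  admin.dropDataset("d");
}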