use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class SparkTestRun method testStreamFormatSpec.
@Test
public void testStreamFormatSpec() throws Exception {
ApplicationManager appManager = deploy(TestSparkApp.class);
StreamManager stream = getStreamManager("PeopleStream");
stream.send("Old Man,50");
stream.send("Baby,1");
stream.send("Young Guy,18");
stream.send("Small Kid,5");
stream.send("Legal Drinker,21");
Map<String, String> outputArgs = new HashMap<>();
FileSetArguments.setOutputPath(outputArgs, "output");
Map<String, String> runtimeArgs = new HashMap<>();
runtimeArgs.putAll(RuntimeArguments.addScope(Scope.DATASET, "PeopleFileSet", outputArgs));
runtimeArgs.put("stream.name", "PeopleStream");
runtimeArgs.put("output.dataset", "PeopleFileSet");
runtimeArgs.put("sql.statement", "SELECT name, age FROM people WHERE age >= 21");
List<String> programs = Arrays.asList(ScalaStreamFormatSpecSpark.class.getSimpleName(), StreamFormatSpecSpark.class.getSimpleName());
for (String sparkProgramName : programs) {
// Clean the output before starting
DataSetManager<FileSet> fileSetManager = getDataset("PeopleFileSet");
Location outputDir = fileSetManager.get().getLocation("output");
outputDir.delete(true);
SparkManager sparkManager = appManager.getSparkManager(sparkProgramName);
sparkManager.start(runtimeArgs);
sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 180, TimeUnit.SECONDS);
// Find the output part file. There is only one because the program repartitions to 1
Location outputFile = Iterables.find(outputDir.list(), new Predicate<Location>() {
@Override
public boolean apply(Location input) {
return input.getName().startsWith("part-r-");
}
});
// Verify the result
List<String> lines = CharStreams.readLines(CharStreams.newReaderSupplier(Locations.newInputSupplier(outputFile), Charsets.UTF_8));
Map<String, Integer> result = new HashMap<>();
for (String line : lines) {
String[] parts = line.split(":");
result.put(parts[0], Integer.parseInt(parts[1]));
}
Assert.assertEquals(ImmutableMap.of("Old Man", 50, "Legal Drinker", 21), result);
}
}
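The FileSet-specific piece of this test is the argument scoping: the output path is set on a small map of FileSet arguments, which is then scoped to the "PeopleFileSet" dataset so that it affects only that dataset when the Spark program runs. A minimal sketch of that setup, restricted to calls that appear above (the path and dataset name are the ones used in the test):
Map<String, String> outputArgs = new HashMap<>();
// choose where this run writes inside the FileSet
FileSetArguments.setOutputPath(outputArgs, "output");
// scope the arguments to the "PeopleFileSet" dataset only, then pass them when starting the program
Map<String, String> runtimeArgs = new HashMap<>(RuntimeArguments.addScope(Scope.DATASET, "PeopleFileSet", outputArgs));
// sparkManager.start(runtimeArgs);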
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class MapReduceProgramRunnerTest method testMapreduceWithFile.
private void testMapreduceWithFile(String inputDatasetName, String inputPaths, String outputDatasetName, String outputPath, Class appClass, Class mrClass, Map<String, String> extraRuntimeArgs, @Nullable final String counterTableName, @Nullable final String outputSeparator) throws Exception {
final ApplicationWithPrograms app = deployApp(appClass, new AppWithMapReduceUsingFileSet.AppConfig(inputDatasetName, outputDatasetName));
Map<String, String> runtimeArguments = Maps.newHashMap();
Map<String, String> inputArgs = Maps.newHashMap();
Map<String, String> outputArgs = Maps.newHashMap();
FileSetArguments.setInputPaths(inputArgs, inputPaths);
FileSetArguments.setOutputPath(outputArgs, outputPath);
if (outputSeparator != null) {
outputArgs.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + TextOutputFormat.SEPERATOR, "#");
}
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, inputDatasetName, inputArgs));
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, outputDatasetName, outputArgs));
if (extraRuntimeArgs != null) {
runtimeArguments.putAll(extraRuntimeArgs);
}
// clear the counters in case a previous test case left behind some values
if (counterTableName != null) {
Transactions.execute(datasetCache.newTransactionContext(), "countersVerify", new Runnable() {
@Override
public void run() {
KeyValueTable counters = datasetCache.getDataset(counterTableName);
counters.delete(AppWithMapReduceUsingRuntimeDatasets.INPUT_RECORDS);
counters.delete(AppWithMapReduceUsingRuntimeDatasets.REDUCE_KEYS);
}
});
}
// write a handful of numbers to a file; compute their sum, too.
final long[] values = { 15L, 17L, 7L, 3L };
final FileSet input = datasetCache.getDataset(inputDatasetName, inputArgs);
long sum = 0L, count = 1;
long inputRecords = 0;
for (Location inputLocation : input.getInputLocations()) {
final PrintWriter writer = new PrintWriter(inputLocation.getOutputStream());
for (long value : values) {
value *= count;
writer.println(value);
sum += value;
inputRecords++;
}
writer.close();
count++;
}
runProgram(app, mrClass, new BasicArguments(runtimeArguments));
// output location in file system is a directory that contains a part file, a _SUCCESS file, and checksums
// (.<filename>.crc) for these files. Find the actual part file. Its name begins with "part". In this case,
// there should be only one part file (with this small data, we have a single reducer).
final FileSet results = datasetCache.getDataset(outputDatasetName, outputArgs);
Location resultLocation = results.getOutputLocation();
if (resultLocation.isDirectory()) {
for (Location child : resultLocation.list()) {
if (!child.isDirectory() && child.getName().startsWith("part")) {
resultLocation = child;
break;
}
}
}
Assert.assertFalse(resultLocation.isDirectory());
// read output and verify result
String line = CharStreams.readFirstLine(CharStreams.newReaderSupplier(Locations.newInputSupplier(resultLocation), Charsets.UTF_8));
Assert.assertNotNull(line);
String[] fields = line.split(outputSeparator == null ? ":" : outputSeparator);
Assert.assertEquals(2, fields.length);
Assert.assertEquals(AppWithMapReduceUsingFileSet.FileMapper.ONLY_KEY, fields[0]);
Assert.assertEquals(sum, Long.parseLong(fields[1]));
if (counterTableName != null) {
final long totalInputRecords = inputRecords;
Transactions.execute(datasetCache.newTransactionContext(), "countersVerify", new Runnable() {
@Override
public void run() {
KeyValueTable counters = datasetCache.getDataset(counterTableName);
Assert.assertEquals(totalInputRecords, counters.incrementAndGet(AppWithMapReduceUsingRuntimeDatasets.INPUT_RECORDS, 0L));
Assert.assertEquals(1L, counters.incrementAndGet(AppWithMapReduceUsingRuntimeDatasets.REDUCE_KEYS, 0L));
}
});
}
}
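Besides the same argument scoping, this test exercises the FileSet Location API directly: input data is written through the dataset's input locations, and the result is read back from a part file under its output location. A condensed sketch of that I/O pattern, using only methods that appear above (the record written is illustrative):
// obtain the FileSet with the runtime arguments that define its input paths
FileSet input = datasetCache.getDataset(inputDatasetName, inputArgs);
for (Location inputLocation : input.getInputLocations()) {
  PrintWriter writer = new PrintWriter(inputLocation.getOutputStream());
  writer.println("42");  // illustrative record
  writer.close();
}
// after the MapReduce run, the output path is a directory; find the child named "part-..." and read it
FileSet results = datasetCache.getDataset(outputDatasetName, outputArgs);
Location outputDir = results.getOutputLocation();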
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class MapReduceWithMultipleOutputsTest method testMultipleOutputs.
@Test
public void testMultipleOutputs() throws Exception {
ApplicationWithPrograms app = deployApp(AppWithMapReduceUsingMultipleOutputs.class);
final FileSet fileSet = datasetCache.getDataset(AppWithMapReduceUsingMultipleOutputs.PURCHASES);
Location inputFile = fileSet.getBaseLocation().append("inputFile");
inputFile.createNew();
PrintWriter writer = new PrintWriter(inputFile.getOutputStream());
// the PURCHASES dataset consists of purchase records in the format: <customerId> <spend>
writer.println("1 20");
writer.println("1 65");
writer.println("1 30");
writer.println("2 5");
writer.println("2 53");
writer.println("2 45");
writer.println("3 101");
writer.close();
// Using multiple outputs, this MapReduce sends the records to different paths of the same dataset, depending
// on the value in the data (large spend amounts go to one file, while small amounts go to another).
runProgram(app, AppWithMapReduceUsingMultipleOutputs.SeparatePurchases.class, new BasicArguments());
FileSet outputFileSet = datasetCache.getDataset(AppWithMapReduceUsingMultipleOutputs.SEPARATED_PURCHASES);
Assert.assertEquals(ImmutableList.of("1 20", "1 30", "2 5", "2 45"), readFromOutput(outputFileSet, "small_purchases"));
Assert.assertEquals(ImmutableList.of("1 65", "2 53", "3 101"), readFromOutput(outputFileSet, "large_purchases"));
}
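readFromOutput is a helper of this test class and is not shown here; a plausible reconstruction, assuming it simply collects the lines of all part files under the given sub-directory of the FileSet (the body is inferred, not copied from the CDAP sources, and relies on the same Guava and Location classes used above):
private List<String> readFromOutput(FileSet fileSet, String path) throws IOException {
  // gather the lines of every part file the MapReduce wrote under the given directory
  List<String> lines = new ArrayList<>();
  for (Location file : fileSet.getLocation(path).list()) {
    if (file.getName().startsWith("part")) {
      lines.addAll(CharStreams.readLines(CharStreams.newReaderSupplier(Locations.newInputSupplier(file), Charsets.UTF_8)));
    }
  }
  return lines;
}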
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedTextFileUpdate.
@Test
public void testPartitionedTextFileUpdate() throws Exception {
final DatasetId datasetId = NAMESPACE_ID.dataset("txtupd");
final String tableName = getDatasetHiveName(datasetId);
// create a partitioned file set, partitioned by an int field "number"
datasetFramework.addInstance(PartitionedFileSet.class.getName(), datasetId, PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder().addIntField("number").build())
  .setEnableExploreOnCreate(true)
  .setExploreSchema("key STRING, value STRING")
  .setExploreFormat("csv")
  .build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
// Accessing dataset instance to perform data operations
PartitionedFileSet partitioned = datasetFramework.getDataset(datasetId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(partitioned);
FileSet fileSet = partitioned.getEmbeddedFileSet();
// add a partition. Beware that Hive expects a partition to be a directory, so we create a dir with one file
Location location1 = fileSet.getLocation("file1/nn");
FileWriterHelper.generateMultiDelimitersFile(location1.getOutputStream(), ImmutableList.of(",", "\1", ":"), 1, 2);
addPartition(partitioned, PartitionKey.builder().addIntField("number", 1).build(), "file1");
// verify that the partitions were added to Hive
runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("number=1"))));
// verify that we can query the key-values in the file with Hive.
List<ColumnDesc> expectedColumns = Lists.newArrayList(new ColumnDesc(tableName + ".key", "STRING", 1, null), new ColumnDesc(tableName + ".value", "STRING", 2, null), new ColumnDesc(tableName + ".number", "INT", 3, null));
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=1", true, expectedColumns, Lists.newArrayList(// text line has the form 1,x\1x:1, format is csv -> key=1 value=x\1x:1
new QueryResult(Lists.<Object>newArrayList("1", "x\1x:1", 1))));
// update the dataset properties with a different format (text), which implies a different default delimiter
datasetFramework.updateInstance(datasetId, PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder().addIntField("number").build())
  .setEnableExploreOnCreate(true)
  .setExploreSchema("key STRING, value STRING")
  .setExploreFormat("text")
  .build());
// add another partition
Location location2 = fileSet.getLocation("file2/nn");
FileWriterHelper.generateMultiDelimitersFile(location2.getOutputStream(), ImmutableList.of(",", "\1", ":"), 2, 3);
addPartition(partitioned, PartitionKey.builder().addIntField("number", 2).build(), "file2");
// new partition should have new format, validate with query
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=2", true, expectedColumns, Lists.newArrayList(// text line has the form 2,x\1x:2, format is text -> key=2,x value=x:2
new QueryResult(Lists.<Object>newArrayList("2,x", "x:2", 2))));
// update the dataset properties with a different delimiter
datasetFramework.updateInstance(datasetId, PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder().addIntField("number").build())
  .setEnableExploreOnCreate(true)
  .setExploreSchema("key STRING, value STRING")
  .setExploreFormat("text")
  .setExploreFormatProperty("delimiter", ":")
  .build());
// add another partition
Location location3 = fileSet.getLocation("file3/nn");
FileWriterHelper.generateMultiDelimitersFile(location3.getOutputStream(), ImmutableList.of(",", "\1", ":"), 3, 4);
addPartition(partitioned, PartitionKey.builder().addIntField("number", 3).build(), "file3");
// new partition should have new format, validate with query
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=3", true, expectedColumns, Lists.newArrayList(// text line has the form 2,x\1x:2, format is text -> key=3,x\1x value=3
new QueryResult(Lists.<Object>newArrayList("3,x\1x", "3", 3))));
// update the dataset properties with a different format (avro)
datasetFramework.updateInstance(datasetId, PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder().addIntField("number").build())
  .setEnableExploreOnCreate(true)
  .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
  .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
  .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
  .setTableProperty("avro.schema.literal", SCHEMA.toString())
  .build());
// add another partition
Location location4 = fileSet.getLocation("file4/nn");
FileWriterHelper.generateAvroFile(location4.getOutputStream(), "x", 4, 5);
addPartition(partitioned, PartitionKey.builder().addIntField("number", 4).build(), "file4");
// new partition should have new format, validate with query
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=4", true, expectedColumns, Lists.newArrayList(// avro file has key=x4, value=#4
new QueryResult(Lists.<Object>newArrayList("x4", "#4", 4))));
}
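The recurring FileSet pattern in this test is how partitions are created: Hive expects a partition to be a directory, so a data file is first written into a directory of the embedded FileSet and the partition key is then registered for that directory (addPartition is a helper of this test class). A minimal sketch with an illustrative next partition:
FileSet embedded = partitioned.getEmbeddedFileSet();
// write the data file inside the directory "file5" (hypothetical partition number 5)
Location location5 = embedded.getLocation("file5/nn");
FileWriterHelper.generateAvroFile(location5.getOutputStream(), "x", 5, 6);
// register the directory as the partition for number=5
addPartition(partitioned, PartitionKey.builder().addIntField("number", 5).build(), "file5");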
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedTextSchemaUpdate.
@Test
public void testPartitionedTextSchemaUpdate() throws Exception {
final DatasetId datasetId = NAMESPACE_ID.dataset("txtschemaupd");
final String tableName = getDatasetHiveName(datasetId);
// create a partitioned file set, partitioned by an int field "number"
datasetFramework.addInstance(PartitionedFileSet.class.getName(), datasetId, PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder().addIntField("number").build())
  .setEnableExploreOnCreate(true)
  .setExploreSchema("key STRING, value STRING")
  .setExploreFormat("csv")
  .build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
// Accessing dataset instance to perform data operations
PartitionedFileSet partitioned = datasetFramework.getDataset(datasetId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(partitioned);
FileSet fileSet = partitioned.getEmbeddedFileSet();
// add a partition. Beware that Hive expects a partition to be a directory, so we create a dir with one file
Location location1 = fileSet.getLocation("file1/nn");
FileWriterHelper.generateMultiDelimitersFile(location1.getOutputStream(), ImmutableList.of(",", "\1", ":"), 1, 2);
addPartition(partitioned, PartitionKey.builder().addIntField("number", 1).build(), "file1");
// verify that the partitions were added to Hive
runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("number=1"))));
// verify that we can query the key-values in the file with Hive.
List<ColumnDesc> expectedColumns = Lists.newArrayList(new ColumnDesc(tableName + ".key", "STRING", 1, null), new ColumnDesc(tableName + ".value", "STRING", 2, null), new ColumnDesc(tableName + ".number", "INT", 3, null));
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=1", true, expectedColumns, Lists.newArrayList(// text line has the form 1,x\1x:1, format is csv -> key=1 value=x\1x:1
new QueryResult(Lists.<Object>newArrayList("1", "x\1x:1", 1))));
// update the dataset properties with a different schema
datasetFramework.updateInstance(datasetId, PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder().addIntField("number").build())
  .setEnableExploreOnCreate(true)
  .setExploreSchema("str STRING")
  .setExploreFormat("csv")
  .build());
// the existing partition should now be read with the new schema, validate with query
expectedColumns = Lists.newArrayList(new ColumnDesc(tableName + ".str", "STRING", 1, null), new ColumnDesc(tableName + ".number", "INT", 2, null));
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=1", true, expectedColumns, Lists.newArrayList(// text line has the form 1,x\1x:1, format is csv -> key=1 value=x\1x:1
new QueryResult(Lists.<Object>newArrayList("1", 1))));
}
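The point of this test is that updating the explore schema changes the columns Hive exposes for data that is already present; a compact summary of the two queries above:
// before updateInstance (explore schema "key STRING, value STRING"):
//   SELECT * FROM <table> WHERE number=1  ->  ("1", "x\1x:1", 1)   columns: key, value, number
// after updateInstance (explore schema "str STRING"):
//   SELECT * FROM <table> WHERE number=1  ->  ("1", 1)             columns: str, number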