use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class SparkTestRun method testStreamFormatSpec.
@Test
public void testStreamFormatSpec() throws Exception {
ApplicationManager appManager = deploy(TestSparkApp.class);
StreamManager stream = getStreamManager("PeopleStream");
stream.send("Old Man,50");
stream.send("Baby,1");
stream.send("Young Guy,18");
stream.send("Small Kid,5");
stream.send("Legal Drinker,21");
Map<String, String> outputArgs = new HashMap<>();
FileSetArguments.setOutputPath(outputArgs, "output");
Map<String, String> runtimeArgs = new HashMap<>();
runtimeArgs.putAll(RuntimeArguments.addScope(Scope.DATASET, "PeopleFileSet", outputArgs));
runtimeArgs.put("stream.name", "PeopleStream");
runtimeArgs.put("output.dataset", "PeopleFileSet");
runtimeArgs.put("sql.statement", "SELECT name, age FROM people WHERE age >= 21");
List<String> programs = Arrays.asList(ScalaStreamFormatSpecSpark.class.getSimpleName(), StreamFormatSpecSpark.class.getSimpleName());
for (String sparkProgramName : programs) {
// Clean the output before starting
DataSetManager<FileSet> fileSetManager = getDataset("PeopleFileSet");
Location outputDir = fileSetManager.get().getLocation("output");
outputDir.delete(true);
SparkManager sparkManager = appManager.getSparkManager(sparkProgramName);
sparkManager.start(runtimeArgs);
sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 180, TimeUnit.SECONDS);
// Find the output part file. There is only one because the program repartitions to 1
Location outputFile = Iterables.find(outputDir.list(), new Predicate<Location>() {
@Override
public boolean apply(Location input) {
return input.getName().startsWith("part-r-");
}
});
// Verify the result
List<String> lines = CharStreams.readLines(CharStreams.newReaderSupplier(Locations.newInputSupplier(outputFile), Charsets.UTF_8));
Map<String, Integer> result = new HashMap<>();
for (String line : lines) {
String[] parts = line.split(":");
result.put(parts[0], Integer.parseInt(parts[1]));
}
Assert.assertEquals(ImmutableMap.of("Old Man", 50, "Legal Drinker", 21), result);
}
}
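The FileSet-specific piece of this test is the argument scoping: the output path is set on a small map of FileSet arguments, which is then scoped to the "PeopleFileSet" dataset so that it affects only that dataset when the Spark program runs. A minimal sketch of that setup, restricted to calls that appear above (the path and dataset name are the ones used in the test):
Map<String, String> outputArgs = new HashMap<>();
// choose where this run writes inside the FileSet
FileSetArguments.setOutputPath(outputArgs, "output");
// scope the arguments to the "PeopleFileSet" dataset only, then pass them when starting the program
Map<String, String> runtimeArgs = new HashMap<>(RuntimeArguments.addScope(Scope.DATASET, "PeopleFileSet", outputArgs));
// sparkManager.start(runtimeArgs);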
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class MapReduceProgramRunnerTest method testMapreduceWithFile.
private void testMapreduceWithFile(String inputDatasetName, String inputPaths, String outputDatasetName, String outputPath, Class appClass, Class mrClass, Map<String, String> extraRuntimeArgs, @Nullable final String counterTableName, @Nullable final String outputSeparator) throws Exception {
final ApplicationWithPrograms app = deployApp(appClass, new AppWithMapReduceUsingFileSet.AppConfig(inputDatasetName, outputDatasetName));
Map<String, String> runtimeArguments = Maps.newHashMap();
Map<String, String> inputArgs = Maps.newHashMap();
Map<String, String> outputArgs = Maps.newHashMap();
FileSetArguments.setInputPaths(inputArgs, inputPaths);
FileSetArguments.setOutputPath(outputArgs, outputPath);
if (outputSeparator != null) {
outputArgs.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + TextOutputFormat.SEPERATOR, "#");
}
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, inputDatasetName, inputArgs));
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, outputDatasetName, outputArgs));
if (extraRuntimeArgs != null) {
runtimeArguments.putAll(extraRuntimeArgs);
}
// clear the counters in case a previous test case left behind some values
if (counterTableName != null) {
Transactions.execute(datasetCache.newTransactionContext(), "countersVerify", new Runnable() {
@Override
public void run() {
KeyValueTable counters = datasetCache.getDataset(counterTableName);
counters.delete(AppWithMapReduceUsingRuntimeDatasets.INPUT_RECORDS);
counters.delete(AppWithMapReduceUsingRuntimeDatasets.REDUCE_KEYS);
}
});
}
// write a handful of numbers to a file; compute their sum, too.
final long[] values = { 15L, 17L, 7L, 3L };
final FileSet input = datasetCache.getDataset(inputDatasetName, inputArgs);
long sum = 0L, count = 1;
long inputRecords = 0;
for (Location inputLocation : input.getInputLocations()) {
final PrintWriter writer = new PrintWriter(inputLocation.getOutputStream());
for (long value : values) {
value *= count;
writer.println(value);
sum += value;
inputRecords++;
}
writer.close();
count++;
}
runProgram(app, mrClass, new BasicArguments(runtimeArguments));
// output location in file system is a directory that contains a part file, a _SUCCESS file, and checksums
// (.<filename>.crc) for these files. Find the actual part file. Its name begins with "part". In this case,
// there should be only one part file (with this small data, we have a single reducer).
final FileSet results = datasetCache.getDataset(outputDatasetName, outputArgs);
Location resultLocation = results.getOutputLocation();
if (resultLocation.isDirectory()) {
for (Location child : resultLocation.list()) {
if (!child.isDirectory() && child.getName().startsWith("part")) {
resultLocation = child;
break;
}
}
}
Assert.assertFalse(resultLocation.isDirectory());
// read output and verify result
String line = CharStreams.readFirstLine(CharStreams.newReaderSupplier(Locations.newInputSupplier(resultLocation), Charsets.UTF_8));
Assert.assertNotNull(line);
String[] fields = line.split(outputSeparator == null ? ":" : outputSeparator);
Assert.assertEquals(2, fields.length);
Assert.assertEquals(AppWithMapReduceUsingFileSet.FileMapper.ONLY_KEY, fields[0]);
Assert.assertEquals(sum, Long.parseLong(fields[1]));
if (counterTableName != null) {
final long totalInputRecords = inputRecords;
Transactions.execute(datasetCache.newTransactionContext(), "countersVerify", new Runnable() {
@Override
public void run() {
KeyValueTable counters = datasetCache.getDataset(counterTableName);
Assert.assertEquals(totalInputRecords, counters.incrementAndGet(AppWithMapReduceUsingRuntimeDatasets.INPUT_RECORDS, 0L));
Assert.assertEquals(1L, counters.incrementAndGet(AppWithMapReduceUsingRuntimeDatasets.REDUCE_KEYS, 0L));
}
});
}
}
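Besides the same argument scoping, this test exercises the FileSet Location API directly: input data is written through the dataset's input locations, and the result is read back from a part file under its output location. A condensed sketch of that I/O pattern, using only methods that appear above (the record written is illustrative):
// obtain the FileSet with the runtime arguments that define its input paths
FileSet input = datasetCache.getDataset(inputDatasetName, inputArgs);
for (Location inputLocation : input.getInputLocations()) {
  PrintWriter writer = new PrintWriter(inputLocation.getOutputStream());
  writer.println("42");  // illustrative record
  writer.close();
}
// after the MapReduce run, the output path is a directory; find the child named "part-..." and read it
FileSet results = datasetCache.getDataset(outputDatasetName, outputArgs);
Location outputDir = results.getOutputLocation();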
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class MapReduceWithMultipleOutputsTest method testMultipleOutputs.
@Test
public void testMultipleOutputs() throws Exception {
ApplicationWithPrograms app = deployApp(AppWithMapReduceUsingMultipleOutputs.class);
final FileSet fileSet = datasetCache.getDataset(AppWithMapReduceUsingMultipleOutputs.PURCHASES);
Location inputFile = fileSet.getBaseLocation().append("inputFile");
inputFile.createNew();
PrintWriter writer = new PrintWriter(inputFile.getOutputStream());
// the PURCHASES dataset consists of purchase records in the format: <customerId> <spend>
writer.println("1 20");
writer.println("1 65");
writer.println("1 30");
writer.println("2 5");
writer.println("2 53");
writer.println("2 45");
writer.println("3 101");
writer.close();
// Using multiple outputs, this MapReduce sends the records to different paths of the same dataset, depending
// on the value in the data (large spend amounts go to one file, while small amounts go to another).
runProgram(app, AppWithMapReduceUsingMultipleOutputs.SeparatePurchases.class, new BasicArguments());
FileSet outputFileSet = datasetCache.getDataset(AppWithMapReduceUsingMultipleOutputs.SEPARATED_PURCHASES);
Assert.assertEquals(ImmutableList.of("1 20", "1 30", "2 5", "2 45"), readFromOutput(outputFileSet, "small_purchases"));
Assert.assertEquals(ImmutableList.of("1 65", "2 53", "3 101"), readFromOutput(outputFileSet, "large_purchases"));
}
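readFromOutput is a helper of this test class and is not shown here; a plausible reconstruction, assuming it simply collects the lines of all part files under the given sub-directory of the FileSet (the body is inferred, not copied from the CDAP sources, and relies on the same Guava and Location classes used above):
private List<String> readFromOutput(FileSet fileSet, String path) throws IOException {
  // gather the lines of every part file the MapReduce wrote under the given directory
  List<String> lines = new ArrayList<>();
  for (Location file : fileSet.getLocation(path).list()) {
    if (file.getName().startsWith("part")) {
      lines.addAll(CharStreams.readLines(CharStreams.newReaderSupplier(Locations.newInputSupplier(file), Charsets.UTF_8)));
    }
  }
  return lines;
}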
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedTextFileUpdate.
@Test
public void testPartitionedTextFileUpdate() throws Exception {
final DatasetId datasetId = NAMESPACE_ID.dataset("txtupd");
final String tableName = getDatasetHiveName(datasetId);
// create a partitioned file set, partitioned by an int field "number"
datasetFramework.addInstance(PartitionedFileSet.class.getName(), datasetId, PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder().addIntField("number").build())
  .setEnableExploreOnCreate(true)
  .setExploreSchema("key STRING, value STRING")
  .setExploreFormat("csv")
  .build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
// Accessing dataset instance to perform data operations
PartitionedFileSet partitioned = datasetFramework.getDataset(datasetId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(partitioned);
FileSet fileSet = partitioned.getEmbeddedFileSet();
// add a partition. Beware that Hive expects a partition to be a directory, so we create a dir with one file
Location location1 = fileSet.getLocation("file1/nn");
FileWriterHelper.generateMultiDelimitersFile(location1.getOutputStream(), ImmutableList.of(",", "\1", ":"), 1, 2);
addPartition(partitioned, PartitionKey.builder().addIntField("number", 1).build(), "file1");
// verify that the partitions were added to Hive
runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("number=1"))));
// verify that we can query the key-values in the file with Hive.
List<ColumnDesc> expectedColumns = Lists.newArrayList(new ColumnDesc(tableName + ".key", "STRING", 1, null), new ColumnDesc(tableName + ".value", "STRING", 2, null), new ColumnDesc(tableName + ".number", "INT", 3, null));
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=1", true, expectedColumns, Lists.newArrayList(// text line has the form 1,x\1x:1, format is csv -> key=1 value=x\1x:1
new QueryResult(Lists.<Object>newArrayList("1", "x\1x:1", 1))));
// update the dataset properties with a different format (text), which implies a different default delimiter
datasetFramework.updateInstance(datasetId, PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder().addIntField("number").build())
  .setEnableExploreOnCreate(true)
  .setExploreSchema("key STRING, value STRING")
  .setExploreFormat("text")
  .build());
// add another partition
Location location2 = fileSet.getLocation("file2/nn");
FileWriterHelper.generateMultiDelimitersFile(location2.getOutputStream(), ImmutableList.of(",", "\1", ":"), 2, 3);
addPartition(partitioned, PartitionKey.builder().addIntField("number", 2).build(), "file2");
// new partition should have new format, validate with query
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=2", true, expectedColumns, Lists.newArrayList(// text line has the form 2,x\1x:2, format is text -> key=2,x value=x:2
new QueryResult(Lists.<Object>newArrayList("2,x", "x:2", 2))));
// update the dataset properties with a different delimiter
datasetFramework.updateInstance(datasetId, PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder().addIntField("number").build())
  .setEnableExploreOnCreate(true)
  .setExploreSchema("key STRING, value STRING")
  .setExploreFormat("text")
  .setExploreFormatProperty("delimiter", ":")
  .build());
// add another partition
Location location3 = fileSet.getLocation("file3/nn");
FileWriterHelper.generateMultiDelimitersFile(location3.getOutputStream(), ImmutableList.of(",", "\1", ":"), 3, 4);
addPartition(partitioned, PartitionKey.builder().addIntField("number", 3).build(), "file3");
// new partition should have new format, validate with query
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=3", true, expectedColumns, Lists.newArrayList(// text line has the form 2,x\1x:2, format is text -> key=3,x\1x value=3
new QueryResult(Lists.<Object>newArrayList("3,x\1x", "3", 3))));
// update the dataset properties with a different format (avro)
datasetFramework.updateInstance(datasetId, PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder().addIntField("number").build())
  .setEnableExploreOnCreate(true)
  .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
  .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
  .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
  .setTableProperty("avro.schema.literal", SCHEMA.toString())
  .build());
// add another partition
Location location4 = fileSet.getLocation("file4/nn");
FileWriterHelper.generateAvroFile(location4.getOutputStream(), "x", 4, 5);
addPartition(partitioned, PartitionKey.builder().addIntField("number", 4).build(), "file4");
// new partition should have new format, validate with query
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=4", true, expectedColumns, Lists.newArrayList(// avro file has key=x4, value=#4
new QueryResult(Lists.<Object>newArrayList("x4", "#4", 4))));
}
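The recurring FileSet pattern in this test is how partitions are created: Hive expects a partition to be a directory, so a data file is first written into a directory of the embedded FileSet and the partition key is then registered for that directory (addPartition is a helper of this test class). A minimal sketch with an illustrative next partition:
FileSet embedded = partitioned.getEmbeddedFileSet();
// write the data file inside the directory "file5" (hypothetical partition number 5)
Location location5 = embedded.getLocation("file5/nn");
FileWriterHelper.generateAvroFile(location5.getOutputStream(), "x", 5, 6);
// register the directory as the partition for number=5
addPartition(partitioned, PartitionKey.builder().addIntField("number", 5).build(), "file5");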
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedTextSchemaUpdate.
@Test
public void testPartitionedTextSchemaUpdate() throws Exception {
final DatasetId datasetId = NAMESPACE_ID.dataset("txtschemaupd");
final String tableName = getDatasetHiveName(datasetId);
// create a partitioned file set, partitioned by an int field "number"
datasetFramework.addInstance(PartitionedFileSet.class.getName(), datasetId, PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder().addIntField("number").build())
  .setEnableExploreOnCreate(true)
  .setExploreSchema("key STRING, value STRING")
  .setExploreFormat("csv")
  .build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
// Accessing dataset instance to perform data operations
PartitionedFileSet partitioned = datasetFramework.getDataset(datasetId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(partitioned);
FileSet fileSet = partitioned.getEmbeddedFileSet();
// add a partition. Beware that Hive expects a partition to be a directory, so we create a dir with one file
Location location1 = fileSet.getLocation("file1/nn");
FileWriterHelper.generateMultiDelimitersFile(location1.getOutputStream(), ImmutableList.of(",", "\1", ":"), 1, 2);
addPartition(partitioned, PartitionKey.builder().addIntField("number", 1).build(), "file1");
// verify that the partitions were added to Hive
runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("number=1"))));
// verify that we can query the key-values in the file with Hive.
List<ColumnDesc> expectedColumns = Lists.newArrayList(new ColumnDesc(tableName + ".key", "STRING", 1, null), new ColumnDesc(tableName + ".value", "STRING", 2, null), new ColumnDesc(tableName + ".number", "INT", 3, null));
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=1", true, expectedColumns, Lists.newArrayList(// text line has the form 1,x\1x:1, format is csv -> key=1 value=x\1x:1
new QueryResult(Lists.<Object>newArrayList("1", "x\1x:1", 1))));
// update the dataset properties with a different schema
datasetFramework.updateInstance(datasetId, PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder().addIntField("number").build())
  .setEnableExploreOnCreate(true)
  .setExploreSchema("str STRING")
  .setExploreFormat("csv")
  .build());
// the existing partition should now be read with the new schema, validate with query
expectedColumns = Lists.newArrayList(new ColumnDesc(tableName + ".str", "STRING", 1, null), new ColumnDesc(tableName + ".number", "INT", 2, null));
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=1", true, expectedColumns, Lists.newArrayList(// text line has the form 1,x\1x:1, format is csv -> key=1 value=x\1x:1
new QueryResult(Lists.<Object>newArrayList("1", 1))));
}
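The point of this test is that updating the explore schema changes the columns Hive exposes for data that is already present; a compact summary of the two queries above:
// before updateInstance (explore schema "key STRING, value STRING"):
//   SELECT * FROM <table> WHERE number=1  ->  ("1", "x\1x:1", 1)   columns: key, value, number
// after updateInstance (explore schema "str STRING"):
//   SELECT * FROM <table> WHERE number=1  ->  ("1", 1)             columns: str, number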