
Example 21 with FileSet

Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

The class HiveExploreServiceFileSetTestRun, method testPartitionedTextFile:

// this mainly tests support for different text formats; other features (partitioning etc.) are covered by the other tests in this class
private void testPartitionedTextFile(String name, String format, String delim, String fileDelim) throws Exception {
    final DatasetId datasetInstanceId = NAMESPACE_ID.dataset(name);
    final String tableName = getDatasetHiveName(datasetInstanceId);
    // create a partitioned file set, partitioned by a single int field and explorable as text
    PartitionedFileSetProperties.Builder builder = (PartitionedFileSetProperties.Builder)
        PartitionedFileSetProperties.builder()
            .setPartitioning(Partitioning.builder().addIntField("number").build())
            .setBasePath(name)
            .setEnableExploreOnCreate(true)
            .setExploreSchema("key STRING, value INT")
            .setExploreFormat(format);
    if (delim != null) {
        builder.setExploreFormatProperty("delimiter", delim);
    }
    datasetFramework.addInstance("partitionedFileSet", datasetInstanceId, builder.build());
    // verify that the hive table was created for this file set
    runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
    // Accessing dataset instance to perform data operations
    PartitionedFileSet partitioned = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
    Assert.assertNotNull(partitioned);
    FileSet fileSet = partitioned.getEmbeddedFileSet();
    // add a partition. Hive expects a partition to be a directory, so we create a directory with one file
    Location location1 = fileSet.getLocation("file1/nn");
    FileWriterHelper.generateTextFile(location1.getOutputStream(), fileDelim, "x", 1, 2);
    PartitionKey key1 = PartitionKey.builder().addIntField("number", 1).build();
    addPartition(partitioned, key1, "file1");
    // verify that the partitions were added to Hive
    runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("number=1"))));
    // verify that we can query the key-values in the file with Hive
    runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " ORDER BY key", true, Lists.newArrayList(new ColumnDesc(tableName + ".key", "STRING", 1, null), new ColumnDesc(tableName + ".value", "INT", 2, null), new ColumnDesc(tableName + ".number", "INT", 3, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
    // drop the partition
    dropPartition(partitioned, key1);
    // drop the dataset
    datasetFramework.deleteInstance(datasetInstanceId);
    // verify the Hive table is gone
    runCommand(NAMESPACE_ID, "show tables", false, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Collections.<QueryResult>emptyList());
}
Also used: QueryResult(co.cask.cdap.proto.QueryResult) TimePartitionedFileSet(co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) FileSet(co.cask.cdap.api.dataset.lib.FileSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) PartitionedFileSetProperties(co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) ColumnDesc(co.cask.cdap.proto.ColumnDesc) DatasetId(co.cask.cdap.proto.id.DatasetId) Location(org.apache.twill.filesystem.Location)
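
The addPartition and dropPartition helpers used above are not part of this excerpt. A minimal sketch of what they plausibly do, assuming they simply wrap PartitionedFileSet's own partition API (in the real test they would also run inside a transaction):

// hypothetical reconstruction of the test helpers used above; PartitionedFileSet
// exposes addPartition(PartitionKey, String) and dropPartition(PartitionKey) directly
private void addPartition(PartitionedFileSet partitioned, PartitionKey key, String path) {
    // register the directory written above (e.g. "file1") as the data for this key
    partitioned.addPartition(key, path);
}

private void dropPartition(PartitionedFileSet partitioned, PartitionKey key) {
    // remove the partition's metadata (and, unless the fileset is external, its files)
    partitioned.dropPartition(key);
}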

Example 22 with FileSet

Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

The class FileSetWordCountTest, method testWordCountOnFileSet:

@Test
public void testWordCountOnFileSet() throws Exception {
    // deploy the application
    ApplicationManager applicationManager = deployApplication(FileSetExample.class);
    final String line1 = "a b a";
    final String line2 = "b a b";
    // discover the file set service
    ServiceManager serviceManager = applicationManager.getServiceManager("FileSetService").start();
    serviceManager.waitForStatus(true);
    URL serviceURL = serviceManager.getServiceURL();
    // write a file to the file set using the service
    HttpURLConnection connection = (HttpURLConnection) new URL(serviceURL, "lines?path=nn.1").openConnection();
    try {
        connection.setDoOutput(true);
        connection.setRequestMethod("PUT");
        connection.getOutputStream().write(line1.getBytes(Charsets.UTF_8));
        Assert.assertEquals(HttpURLConnection.HTTP_OK, connection.getResponseCode());
    } finally {
        connection.disconnect();
    }
    // run word count over that file only
    Map<String, String> runtimeArguments = Maps.newHashMap();
    Map<String, String> inputArgs = Maps.newHashMap();
    FileSetArguments.setInputPaths(inputArgs, "nn.1");
    Map<String, String> outputArgs = Maps.newHashMap();
    FileSetArguments.setOutputPath(outputArgs, "out.1");
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, "lines", inputArgs));
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, "counts", outputArgs));
    MapReduceManager mapReduceManager = applicationManager.getMapReduceManager("WordCount").start(runtimeArguments);
    mapReduceManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    // retrieve the counts through the service and verify
    Map<String, Integer> counts = Maps.newHashMap();
    connection = (HttpURLConnection) new URL(serviceURL, "counts?path=out.1/part-r-00000").openConnection();
    try {
        connection.setRequestMethod("GET");
        Assert.assertEquals(HttpURLConnection.HTTP_OK, connection.getResponseCode());
        readCounts(connection.getInputStream(), counts);
    } finally {
        connection.disconnect();
    }
    // "a b a" should yield "a":2, "b":1
    Assert.assertEquals(2, counts.size());
    Assert.assertEquals(Integer.valueOf(2), counts.get("a"));
    Assert.assertEquals(Integer.valueOf(1), counts.get("b"));
    // write a file to the file set using the dataset directly
    DataSetManager<FileSet> linesManager = getDataset("lines");
    try (OutputStream output = linesManager.get().getLocation("nn.2").getOutputStream()) {
        output.write(line2.getBytes(Charsets.UTF_8));
    }
    // run word count over both files
    FileSetArguments.setInputPath(inputArgs, "nn.1");
    FileSetArguments.addInputPath(inputArgs, "nn.2");
    FileSetArguments.setOutputPath(outputArgs, "out.2");
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, "lines", inputArgs));
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, "counts", outputArgs));
    mapReduceManager = applicationManager.getMapReduceManager("WordCount").start(runtimeArguments);
    mapReduceManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
    // retrieve the counts through the dataset API and verify
    DataSetManager<FileSet> countsManager = getDataset("counts");
    counts.clear();
    Location resultLocation = countsManager.get().getLocation("out.2");
    Assert.assertTrue(resultLocation.isDirectory());
    List<String> parts = new LinkedList<>();
    for (Location child : resultLocation.list()) {
        if (child.getName().startsWith("part-")) {
            // only read part files, skipping checksum and _SUCCESS marker files
            parts.add(child.getName());
            readCounts(child.getInputStream(), counts);
        }
    }
    // "a b a" and "b a b" should yield "a":3, "b":3
    Assert.assertEquals(2, counts.size());
    Assert.assertEquals(Integer.valueOf(3), counts.get("a"));
    Assert.assertEquals(Integer.valueOf(3), counts.get("b"));
    // retrieve the counts through the service
    counts.clear();
    for (String part : parts) {
        connection = (HttpURLConnection) new URL(serviceURL, "counts?path=out.2/" + part).openConnection();
        try {
            connection.setRequestMethod("GET");
            Assert.assertEquals(HttpURLConnection.HTTP_OK, connection.getResponseCode());
            readCounts(connection.getInputStream(), counts);
        } finally {
            connection.disconnect();
        }
    }
    // "a b a" and "b a b" should yield "a":3, "b":3
    Assert.assertEquals(2, counts.size());
    Assert.assertEquals(Integer.valueOf(3), counts.get("a"));
    Assert.assertEquals(Integer.valueOf(3), counts.get("b"));
    serviceManager.stop();
}
Also used: ApplicationManager(co.cask.cdap.test.ApplicationManager) MapReduceManager(co.cask.cdap.test.MapReduceManager) FileSet(co.cask.cdap.api.dataset.lib.FileSet) OutputStream(java.io.OutputStream) URL(java.net.URL) LinkedList(java.util.LinkedList) HttpURLConnection(java.net.HttpURLConnection) ServiceManager(co.cask.cdap.test.ServiceManager) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)
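
The readCounts helper is not shown in this excerpt. A minimal sketch of a plausible implementation, assuming each output line holds one word and its count separated by a tab or a colon (the actual delimiter depends on the WordCount program's output format):

// hypothetical reconstruction: parse "<word><separator><count>" lines into the map;
// the separator pattern is an assumption, not taken from the original source
private void readCounts(InputStream input, Map<String, Integer> counts) throws IOException {
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(input, Charsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            String[] fields = line.split("[\t:]", 2);
            if (fields.length == 2) {
                counts.put(fields[0], Integer.parseInt(fields[1].trim()));
            }
        }
    }
}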

Example 23 with FileSet

Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

The class FileDeleteAction, method run:

@Override
public void run(BatchActionContext context) throws Exception {
    // do nothing unless the pipeline run completed successfully
    if (!context.isSuccessful()) {
        return;
    }
    FileSet fileSet = context.getDataset(conf.filesetName);
    Pattern pattern = Pattern.compile(conf.deleteRegex);
    // delete every file in the fileset's base location whose name matches the regex
    for (Location fileLocation : fileSet.getBaseLocation().list()) {
        if (pattern.matcher(fileLocation.getName()).matches()) {
            fileLocation.delete();
        }
    }
}
Also used: Pattern(java.util.regex.Pattern) FileSet(co.cask.cdap.api.dataset.lib.FileSet) Location(org.apache.twill.filesystem.Location)
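
The conf field referenced in run() is the plugin's configuration object; its class is not part of this excerpt. A minimal sketch, assuming standard CDAP PluginConfig conventions, with the field names taken from the usage above:

// hypothetical config class for FileDeleteAction; only the field names (filesetName,
// deleteRegex) are grounded in the run() method above
public static class Conf extends PluginConfig {
    @Description("Name of the FileSet whose base location is scanned for files to delete.")
    private String filesetName;

    @Description("Files whose names match this regular expression are deleted.")
    private String deleteRegex;
}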

Example 24 with FileSet

Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

The class TestFrameworkTestRun, method testCustomActionDatasetAccess:

@Category(SlowTests.class)
@Test
public void testCustomActionDatasetAccess() throws Exception {
    addDatasetInstance("keyValueTable", DatasetWithCustomActionApp.CUSTOM_TABLE);
    addDatasetInstance("fileSet", DatasetWithCustomActionApp.CUSTOM_FILESET);
    ApplicationManager appManager = deployApplication(DatasetWithCustomActionApp.class);
    ServiceManager serviceManager = appManager.getServiceManager(DatasetWithCustomActionApp.CUSTOM_SERVICE).start();
    serviceManager.waitForStatus(true);
    WorkflowManager workflowManager = appManager.getWorkflowManager(DatasetWithCustomActionApp.CUSTOM_WORKFLOW).start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
    appManager.stopAll();
    // verify what the custom action and the service wrote to the table and the fileset
    DataSetManager<KeyValueTable> outTableManager = getDataset(DatasetWithCustomActionApp.CUSTOM_TABLE);
    KeyValueTable outputTable = outTableManager.get();
    Assert.assertEquals("world", Bytes.toString(outputTable.read("hello")));
    Assert.assertEquals("service", Bytes.toString(outputTable.read("hi")));
    Assert.assertEquals("another.world", Bytes.toString(outputTable.read("another.hello")));
    DataSetManager<FileSet> outFileSetManager = getDataset(DatasetWithCustomActionApp.CUSTOM_FILESET);
    FileSet fs = outFileSetManager.get();
    try (InputStream in = fs.getLocation("test").getInputStream()) {
        Assert.assertEquals(42, in.read());
    }
}
Also used: ApplicationManager(co.cask.cdap.test.ApplicationManager) FileSet(co.cask.cdap.api.dataset.lib.FileSet) ServiceManager(co.cask.cdap.test.ServiceManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) InputStream(java.io.InputStream) WorkflowManager(co.cask.cdap.test.WorkflowManager) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
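
DatasetWithCustomActionApp itself is not shown; judging from the final assertion, the workflow's custom action writes the single byte 42 to the location "test" of the fileset. A minimal sketch of that step, assuming the action accesses the dataset through CDAP's Transactional/TxRunnable API:

// hypothetical body of the workflow's custom action, reconstructed from the assertions
getContext().execute(new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
        FileSet fs = context.getDataset(DatasetWithCustomActionApp.CUSTOM_FILESET);
        try (OutputStream out = fs.getLocation("test").getOutputStream()) {
            // the single byte that testCustomActionDatasetAccess reads back
            out.write(42);
        }
    }
});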

Example 25 with FileSet

Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

The class AdminAppTestRun, method testAdminProgram:

private <T extends ProgramManager<T>> void testAdminProgram(ProgramManager<T> manager) throws Exception {
    // create fileset b; it will be updated by the worker
    addDatasetInstance(FileSet.class.getName(), "b", FileSetProperties.builder().setBasePath("some/path").setInputFormat(TextInputFormat.class).build());
    DataSetManager<FileSet> bManager = getDataset("b");
    String bFormat = bManager.get().getInputFormatClassName();
    String bPath = bManager.get().getBaseLocation().toURI().getPath();
    Assert.assertTrue(bPath.endsWith("some/path/"));
    bManager.flush();
    // create table c and write some data to it; it will be truncated by the worker
    addDatasetInstance("table", "c");
    DataSetManager<Table> cManager = getDataset("c");
    cManager.get().put(new Put("x", "y", "z"));
    cManager.flush();
    // create table d; it will be dropped by the worker
    addDatasetInstance("table", "d");
    // start the worker and wait for it to finish
    File newBasePath = new File(TMP_FOLDER.newFolder(), "extra");
    Assert.assertFalse(newBasePath.exists());
    manager.start(ImmutableMap.of("new.base.path", newBasePath.getPath()));
    manager.waitForRun(ProgramRunStatus.COMPLETED, 30, TimeUnit.SECONDS);
    // validate that the worker created dataset a (it exists and is empty)
    DataSetManager<Table> aManager = getDataset("a");
    Assert.assertNull(aManager.get().scan(null, null).next());
    aManager.flush();
    // validate that the worker updated fileset b; get a new instance of b
    bManager = getDataset("b");
    Assert.assertEquals(bFormat, bManager.get().getInputFormatClassName());
    String newBPath = bManager.get().getBaseLocation().toURI().getPath();
    Assert.assertTrue(newBPath.endsWith("/extra/"));
    // make sure the directory was created by fileset update (by moving the existing base path)
    Assert.assertTrue(newBasePath.exists());
    bManager.flush();
    // validate that dataset c is empty
    Assert.assertNull(cManager.get().scan(null, null).next());
    cManager.flush();
    // validate that dataset d is gone
    Assert.assertNull(getDataset("d").get());
    // run the worker again to drop all datasets
    manager.start(ImmutableMap.of("dropAll", "true"));
    manager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 30, TimeUnit.SECONDS);
    Assert.assertNull(getDataset("a").get());
    Assert.assertNull(getDataset("b").get());
    Assert.assertNull(getDataset("c").get());
    Assert.assertNull(getDataset("d").get());
}
Also used: Table(co.cask.cdap.api.dataset.table.Table) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) FileSet(co.cask.cdap.api.dataset.lib.FileSet) File(java.io.File) Put(co.cask.cdap.api.dataset.table.Put)
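
The worker exercised by this test is not shown; the validations imply it drives everything through the program context's Admin API. A minimal sketch of the admin calls the test expects on the first run, assuming co.cask.cdap.api.Admin obtained from the runtime context:

// hypothetical sketch of the worker's first run, reconstructed from the test's checks
Admin admin = getContext().getAdmin();
// create (empty) table a
admin.createDataset("a", Table.class.getName(), DatasetProperties.EMPTY);
// update fileset b to a new base path taken from the runtime arguments
String newPath = getContext().getRuntimeArguments().get("new.base.path");
admin.updateDataset("b", FileSetProperties.builder()
    .setBasePath(newPath)
    .setInputFormat(TextInputFormat.class)
    .build());
// truncate table c and drop table d
admin.truncateDataset("c");
admin.dropDataset("d");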

Aggregations

FileSet (co.cask.cdap.api.dataset.lib.FileSet): 40 usages
Location (org.apache.twill.filesystem.Location): 28 usages
Test (org.junit.Test): 19 usages
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 12 usages
HashMap (java.util.HashMap): 11 usages
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 8 usages
TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet): 8 usages
ApplicationManager (co.cask.cdap.test.ApplicationManager): 8 usages
DatasetId (co.cask.cdap.proto.id.DatasetId): 7 usages
ColumnDesc (co.cask.cdap.proto.ColumnDesc): 6 usages
QueryResult (co.cask.cdap.proto.QueryResult): 6 usages
SparkManager (co.cask.cdap.test.SparkManager): 5 usages
WorkflowManager (co.cask.cdap.test.WorkflowManager): 5 usages
Table (co.cask.cdap.api.dataset.table.Table): 4 usages
ServiceManager (co.cask.cdap.test.ServiceManager): 4 usages
IOException (java.io.IOException): 4 usages
Map (java.util.Map): 4 usages
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 3 usages
PartitionedFileSetProperties (co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties): 3 usages
FileSetDataset (co.cask.cdap.data2.dataset2.lib.file.FileSetDataset): 3 usages