Examples with FileSet - co.cask.cdap.api.dataset.lib.FileSet

Example 41 with FileSet

use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class TextFileSetSource method onRunFinish.

/**
 * Invoked at the end of the pipeline run by the client that submitted the batch job.
 *
 * <p>If the run succeeded and the plugin is configured to delete its input on success,
 * every input location of the configured FileSet is deleted recursively. Nothing is
 * done otherwise.
 *
 * @param succeeded whether the pipeline run succeeded
 * @param context   context used to look up the input FileSet
 * @throws RuntimeException wrapping the {@link IOException} if deleting a location fails
 */
@Override
public void onRunFinish(boolean succeeded, BatchSourceContext context) {
    // nothing to clean up unless the run succeeded and deletion was requested
    if (!succeeded || !config.deleteInputOnSuccess) {
        return;
    }
    // resolve the FileSet with the same input paths that were configured for this run
    Map<String, String> datasetArgs = new HashMap<>();
    FileSetArguments.setInputPaths(datasetArgs, config.files);
    FileSet inputFileSet = context.getDataset(config.fileSetName, datasetArgs);
    for (Location consumed : inputFileSet.getInputLocations()) {
        try {
            consumed.delete(true);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}

Also used : FileSet(co.cask.cdap.api.dataset.lib.FileSet) HashMap(java.util.HashMap) IOException(java.io.IOException) Location(org.apache.twill.filesystem.Location)

Example 42 with FileSet

use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class PipelineTest method testTextFileSinkAndDeletePostAction.

/**
 * Runs a pipeline that reads records from a mock source and writes them through the text
 * fileset sink, with a post action configured to delete the '.crc' and '_SUCCESS' files
 * from the output directory. Verifies that none of those files remain and that the
 * remaining output files contain exactly the records that were written to the source.
 */
@Test
public void testTextFileSinkAndDeletePostAction() throws Exception {
    // create the pipeline config
    String inputName = "sinkTestInput";
    String outputName = "sinkTestOutput";
    String outputDirName = "users";
    ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
    Map<String, String> sinkProperties = new HashMap<>();
    sinkProperties.put(TextFileSetSink.Conf.FILESET_NAME, outputName);
    sinkProperties.put(TextFileSetSink.Conf.FIELD_SEPARATOR, "|");
    // the output directory is a macro that is resolved from the runtime arguments below
    sinkProperties.put(TextFileSetSink.Conf.OUTPUT_DIR, "${dir}");
    ETLStage sink = new ETLStage("sink", new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE, sinkProperties, null));
    Map<String, String> actionProperties = new HashMap<>();
    actionProperties.put(FilesetDeletePostAction.Conf.FILESET_NAME, outputName);
    // mapreduce writes multiple files to the output directory. Along with the actual output,
    // there are various .crc files that do not contain any of the output content.
    actionProperties.put(FilesetDeletePostAction.Conf.DELETE_REGEX, ".*\\.crc|_SUCCESS");
    actionProperties.put(FilesetDeletePostAction.Conf.DIRECTORY, outputDirName);
    ETLStage postAction = new ETLStage("cleanup", new ETLPlugin(FilesetDeletePostAction.NAME, PostAction.PLUGIN_TYPE, actionProperties, null));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *").addStage(source).addStage(sink).addPostAction(postAction).addConnection(source.getName(), sink.getName()).build();
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSinkTestPipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write some data to the input fileset
    Schema inputSchema = Schema.recordOf("test", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("item", Schema.of(Schema.Type.STRING)));
    Map<String, String> users = new HashMap<>();
    users.put("samuel", "wallet");
    users.put("dwayne", "rock");
    users.put("christopher", "cowbell");
    List<StructuredRecord> inputRecords = new ArrayList<>();
    for (Map.Entry<String, String> userEntry : users.entrySet()) {
        String name = userEntry.getKey();
        String item = userEntry.getValue();
        inputRecords.add(StructuredRecord.builder(inputSchema).set("name", name).set("item", item).build());
    }
    DataSetManager<Table> inputManager = getDataset(inputName);
    MockSource.writeInput(inputManager, inputRecords);
    // run the pipeline
    Map<String, String> runtimeArgs = new HashMap<>();
    // the ${dir} macro will be substituted with "users" for our pipeline run
    runtimeArgs.put("dir", outputDirName);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start(runtimeArgs);
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);
    // check the pipeline output
    DataSetManager<FileSet> outputManager = getDataset(outputName);
    FileSet output = outputManager.get();
    Location outputDir = output.getBaseLocation().append(outputDirName);
    Map<String, String> actual = new HashMap<>();
    for (Location outputFile : outputDir.list()) {
        // any file matching the delete regex should have been removed by the post action
        if (outputFile.getName().endsWith(".crc") || "_SUCCESS".equals(outputFile.getName())) {
            Assert.fail("Post action did not delete file " + outputFile.getName());
        }
        // decode with an explicit charset instead of the platform default so the test does not
        // depend on the JVM's locale settings (UTF-8 assumed to match the sink; test data is ASCII)
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(outputFile.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // each line is "<name>|<item>", using the field separator configured on the sink
                String[] parts = line.split("\\|");
                actual.put(parts[0], parts[1]);
            }
        }
    }
    // JUnit expects (expected, actual); this order keeps failure messages correctly labelled
    Assert.assertEquals(users, actual);
}

Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) Table(co.cask.cdap.api.dataset.table.Table) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) FileSet(co.cask.cdap.api.dataset.lib.FileSet) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) Schema(co.cask.cdap.api.data.schema.Schema) WorkflowManager(co.cask.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) BufferedReader(java.io.BufferedReader) ApplicationId(co.cask.cdap.proto.id.ApplicationId) Map(java.util.Map) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)

Example 43 with FileSet

use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class PipelineTest method testTextFileSourceAndMoveAction.

/**
 * Runs a pipeline in which a move action first moves files from one fileset into the input
 * fileset, and the text fileset source then reads the moved file and deletes it on success.
 * Verifies the records written to the mock sink and that the input file no longer exists in
 * either fileset after the run.
 */
@Test
public void testTextFileSourceAndMoveAction() throws Exception {
    // create the pipeline config
    String moveFromName = "sourceTestMoveFrom";
    String inputName = "sourceTestInput";
    String outputName = "sourceTestOutput";
    // name of the file that is written to the moveFrom fileset and selected through the ${file} macro
    String inputFileName = "inputFile";
    Map<String, String> actionProperties = new HashMap<>();
    // reuse the declared name so the action and the fileset created below cannot drift apart
    actionProperties.put(FilesetMoveAction.Conf.SOURCE_FILESET, moveFromName);
    actionProperties.put(FilesetMoveAction.Conf.DEST_FILESET, inputName);
    ETLStage moveAction = new ETLStage("moveInput", new ETLPlugin(FilesetMoveAction.NAME, Action.PLUGIN_TYPE, actionProperties, null));
    Map<String, String> sourceProperties = new HashMap<>();
    sourceProperties.put(TextFileSetSource.Conf.FILESET_NAME, inputName);
    sourceProperties.put(TextFileSetSource.Conf.CREATE_IF_NOT_EXISTS, "true");
    sourceProperties.put(TextFileSetSource.Conf.DELETE_INPUT_ON_SUCCESS, "true");
    sourceProperties.put(TextFileSetSource.Conf.FILES, "${file}");
    ETLStage source = new ETLStage("source", new ETLPlugin(TextFileSetSource.NAME, BatchSource.PLUGIN_TYPE, sourceProperties, null));
    ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputName));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *").addStage(source).addStage(sink).addStage(moveAction).addConnection(moveAction.getName(), source.getName()).addConnection(source.getName(), sink.getName()).build();
    // create the move from fileset
    addDatasetInstance(FileSet.class.getName(), moveFromName);
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSourceTestPipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write some files that will be moved to the input fileset
    DataSetManager<FileSet> moveFromManager = getDataset(moveFromName);
    // this file starts with '.' and should be ignored.
    Location invisibleFile = moveFromManager.get().getBaseLocation().append(".hidden");
    try (OutputStream outputStream = invisibleFile.getOutputStream()) {
        outputStream.write(Bytes.toBytes("this should not be read"));
    }
    // this file should be moved
    String line1 = "Hello World!";
    String line2 = "Good to meet you";
    String line3 = "My name is Hal";
    String inputText = line1 + "\n" + line2 + "\n" + line3;
    Location inputFile = moveFromManager.get().getBaseLocation().append(inputFileName);
    try (OutputStream outputStream = inputFile.getOutputStream()) {
        outputStream.write(Bytes.toBytes(inputText));
    }
    // run the pipeline
    Map<String, String> runtimeArgs = new HashMap<>();
    // the ${file} macro will be substituted with "inputFile" for our pipeline run
    runtimeArgs.put("file", inputFileName);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start(runtimeArgs);
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);
    // check the pipeline output
    DataSetManager<Table> outputManager = getDataset(outputName);
    Set<StructuredRecord> outputRecords = new HashSet<>();
    outputRecords.addAll(MockSink.readOutput(outputManager));
    // each expected record carries the offset of the line within the input text and the line itself
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA).set("position", (long) inputText.indexOf(line1)).set("text", line1).build());
    expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA).set("position", (long) inputText.indexOf(line2)).set("text", line2).build());
    expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA).set("position", (long) inputText.indexOf(line3)).set("text", line3).build());
    Assert.assertEquals(expected, outputRecords);
    // check that the input file does not exist in the moveFrom fileSet,
    // and was deleted by the source in the input fileSet
    Assert.assertFalse(moveFromManager.get().getBaseLocation().append(inputFileName).exists());
    DataSetManager<FileSet> inputManager = getDataset(inputName);
    Assert.assertFalse(inputManager.get().getBaseLocation().append(inputFileName).exists());
}

Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) Table(co.cask.cdap.api.dataset.table.Table) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) FileSet(co.cask.cdap.api.dataset.lib.FileSet) HashMap(java.util.HashMap) OutputStream(java.io.OutputStream) WorkflowManager(co.cask.cdap.test.WorkflowManager) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId) Location(org.apache.twill.filesystem.Location) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 44 with FileSet

use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class DataStreamsSparkLauncher method initialize.

/**
 * Prepares the Spark configuration for a data streams pipeline and, unless checkpoints are
 * disabled, makes sure the pipeline's checkpoint directory exists.
 *
 * <p>Steps, in order: log the start of the pipeline, read the pipeline spec from the program
 * specification, add up the executors required by all streaming sources, build the
 * {@link SparkConf} and hand it to the context, then set up the checkpoint directory.
 *
 * @throws IOException if the checkpoint base directory or the pipeline's checkpoint directory
 *                     cannot be created
 * @throws Exception   anything else thrown while reading the spec, instantiating plugins or
 *                     accessing the checkpoint fileset
 */
@TransactionPolicy(TransactionControl.EXPLICIT)
@Override
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    // render the runtime arguments as "key=value, key=value" for the log message
    String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
    WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}", context.getApplicationSpecification().getName(), UserGroupInformation.getCurrentUser().getShortUserName(), arguments);
    // the pipeline spec is stored as JSON in a property of the program specification
    DataStreamsPipelineSpec spec = GSON.fromJson(context.getSpecification().getProperty(Constants.PIPELINEID), DataStreamsPipelineSpec.class);
    PipelinePluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), true, true);
    // total number of executors required by all streaming sources in the pipeline
    int numSources = 0;
    for (StageSpec stageSpec : spec.getStages()) {
        if (StreamingSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
            StreamingSource<Object> streamingSource = pluginContext.newPluginInstance(stageSpec.getName());
            numSources = numSources + streamingSource.getRequiredExecutors();
        }
    }
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.streaming.backpressure.enabled", "true");
    // properties from the pipeline spec are applied here, before the fixed settings below,
    // so a later set(...) of the same key replaces the value taken from the spec
    for (Map.Entry<String, String> property : spec.getProperties().entrySet()) {
        sparkConf.set(property.getKey(), property.getValue());
    }
    // spark... makes you set this to at least the number of receivers (streaming sources)
    // because it holds one thread per receiver, or one core in distributed mode.
    // so... we have to set this hacky master variable based on the isUnitTest setting in the config
    // (this note refers to the setMaster calls further down)
    String extraOpts = spec.getExtraJavaOpts();
    if (extraOpts != null && !extraOpts.isEmpty()) {
        // the same extra JVM options are used for both the driver and the executors
        sparkConf.set("spark.driver.extraJavaOptions", extraOpts);
        sparkConf.set("spark.executor.extraJavaOptions", extraOpts);
    }
    // without this, stopping will hang on machines with few cores.
    sparkConf.set("spark.rpc.netty.dispatcher.numThreads", String.valueOf(numSources + 2));
    sparkConf.set("spark.executor.instances", String.valueOf(numSources + 2));
    // default master is local[numSources + 2]; unit tests replace it with local[numSources + 1]
    sparkConf.setMaster(String.format("local[%d]", numSources + 2));
    if (spec.isUnitTest()) {
        sparkConf.setMaster(String.format("local[%d]", numSources + 1));
    }
    context.setSparkConf(sparkConf);
    if (!spec.isCheckpointsDisabled()) {
        // Each pipeline has its own checkpoint directory within the checkpoint fileset.
        // Ideally, when a pipeline is deleted, we would be able to delete that checkpoint directory.
        // This is because we don't want another pipeline created with the same name to pick up the old checkpoint.
        // Since CDAP has no way to run application logic on deletion, we instead generate a unique pipeline id
        // and use that as the checkpoint directory as a subdirectory inside the pipeline name directory.
        // On start, we check for any other pipeline ids for that pipeline name, and delete them if they exist.
        FileSet checkpointFileSet = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
        String pipelineName = context.getApplicationSpecification().getName();
        String checkpointDir = spec.getCheckpointDirectory();
        // layout: <fileset base>/<pipeline name>/<checkpoint directory from the spec>
        Location pipelineCheckpointBase = checkpointFileSet.getBaseLocation().append(pipelineName);
        Location pipelineCheckpointDir = pipelineCheckpointBase.append(checkpointDir);
        if (!ensureDirExists(pipelineCheckpointBase)) {
            throw new IOException(String.format("Unable to create checkpoint base directory '%s' for the pipeline.", pipelineCheckpointBase));
        }
        // cleanup of directories left by older pipelines is best effort: failures are logged
        // as warnings and do not stop the pipeline from starting
        try {
            for (Location child : pipelineCheckpointBase.list()) {
                if (!child.equals(pipelineCheckpointDir) && !child.delete(true)) {
                    LOG.warn("Unable to delete checkpoint directory {} from an old pipeline.", child);
                }
            }
        } catch (Exception e) {
            LOG.warn("Unable to clean up old checkpoint directories from old pipelines.", e);
        }
        if (!ensureDirExists(pipelineCheckpointDir)) {
            throw new IOException(String.format("Unable to create checkpoint directory '%s' for the pipeline.", pipelineCheckpointDir));
        }
    }
    WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}

Also used : FileSet(co.cask.cdap.api.dataset.lib.FileSet) SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) IOException(java.io.IOException) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) StageSpec(co.cask.cdap.etl.spec.StageSpec) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext) Location(org.apache.twill.filesystem.Location) TransactionPolicy(co.cask.cdap.api.annotation.TransactionPolicy)

Example 45 with FileSet

use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class SparkStreamingPipelineDriver method run.

/**
 * Creates and starts the streaming context for the pipeline, then blocks until it terminates.
 *
 * <p>When checkpoints are enabled, the checkpoint location is resolved as
 * {@code <checkpoint fileset base>/<application name>/<checkpoint directory from the spec>}
 * and passed on as a URI string; otherwise {@code null} is passed.
 *
 * @param sec the Spark execution context of this program
 * @throws Exception if the pipeline cannot be set up or fails while running
 */
@Override
public void run(final JavaSparkExecutionContext sec) throws Exception {
    String specJson = sec.getSpecification().getProperty(Constants.PIPELINEID);
    final DataStreamsPipelineSpec pipelineSpec = GSON.fromJson(specJson, DataStreamsPipelineSpec.class);
    final PipelinePhase pipelinePhase = PipelinePhase.builder(SUPPORTED_PLUGIN_TYPES)
        .addConnections(pipelineSpec.getConnections())
        .addStages(pipelineSpec.getStages())
        .build();

    String checkpointDir = null;
    if (!pipelineSpec.isCheckpointsDisabled()) {
        String appName = sec.getApplicationSpecification().getName();
        String relativeDir = pipelineSpec.getCheckpointDirectory();
        // the fileset can only be instantiated inside a TxRunnable, so its base location
        // is passed back out through a holder.
        final AtomicReference<Location> baseLocationHolder = new AtomicReference<>();
        Transactionals.execute(sec, new TxRunnable() {

            @Override
            public void run(DatasetContext context) throws Exception {
                FileSet checkpoints = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
                baseLocationHolder.set(checkpoints.getBaseLocation());
            }
        }, Exception.class);
        checkpointDir = baseLocationHolder.get().append(appName).append(relativeDir).toURI().toString();
    }

    JavaStreamingContext streamingContext = run(pipelineSpec, pipelinePhase, sec, checkpointDir);
    streamingContext.start();
    boolean terminated = false;
    try {
        // normally the program runs forever. When CDAP stops it, an interrupted exception is
        // raised here, and the context must be stopped explicitly or the program would hang.
        terminated = streamingContext.awaitTerminationOrTimeout(Long.MAX_VALUE);
    } finally {
        if (!terminated) {
            streamingContext.stop(true, pipelineSpec.isStopGracefully());
        }
    }
}

Also used : FileSet(co.cask.cdap.api.dataset.lib.FileSet) AtomicReference(java.util.concurrent.atomic.AtomicReference) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) TxRunnable(co.cask.cdap.api.TxRunnable) DatasetContext(co.cask.cdap.api.data.DatasetContext) Location(org.apache.twill.filesystem.Location)

Aggregations

FileSet (co.cask.cdap.api.dataset.lib.FileSet): 45, Location (org.apache.twill.filesystem.Location): 32, Test (org.junit.Test): 23, PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 13, HashMap (java.util.HashMap): 13, ApplicationManager (co.cask.cdap.test.ApplicationManager): 10, KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 9, DatasetId (co.cask.cdap.proto.id.DatasetId): 9, TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet): 8, SparkManager (co.cask.cdap.test.SparkManager): 8, ColumnDesc (co.cask.cdap.proto.ColumnDesc): 6, QueryResult (co.cask.cdap.proto.QueryResult): 6, WorkflowManager (co.cask.cdap.test.WorkflowManager): 5, File (java.io.File): 5, Table (co.cask.cdap.api.dataset.table.Table): 4, ServiceManager (co.cask.cdap.test.ServiceManager): 4, IOException (java.io.IOException): 4, IdentityHashMap (java.util.IdentityHashMap): 4, Map (java.util.Map): 4, StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 3