Examples with FileSet - io.cdap.cdap.api.dataset.lib.FileSet

Example 6 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class ExploreTableManager method generateDisableStatement.

private String generateDisableStatement(DatasetId datasetId, DatasetSpecification spec) throws ExploreException {
    String tableName = tableNaming.getTableName(datasetId, spec.getProperties());
    String databaseName = ExploreProperties.getExploreDatabaseName(spec.getProperties());
    // If table does not exist, nothing to be done
    try {
        exploreService.getTableInfo(datasetId.getNamespace(), databaseName, tableName);
    } catch (TableNotFoundException e) {
        // Ignore exception, since this means table was not found.
        return null;
    }
    try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
        Dataset dataset = datasetInstantiator.getDataset(datasetId);
        try {
            if (dataset instanceof FileSet || dataset instanceof PartitionedFileSet) {
                // do not drop the explore table that dataset is reusing an existing table
                if (FileSetProperties.isUseExisting(spec.getProperties())) {
                    return null;
                }
            }
            return generateDeleteStatement(dataset, databaseName, tableName);
        } finally {
            Closeables.closeQuietly(dataset);
        }
    } catch (IOException e) {
        LOG.error("Exception creating dataset classLoaderProvider for dataset {}.", datasetId, e);
        throw new ExploreException("Exception instantiating dataset " + datasetId);
    }
}

Also used : PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) SystemDatasetInstantiator(io.cdap.cdap.data.dataset.SystemDatasetInstantiator) Dataset(io.cdap.cdap.api.dataset.Dataset) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) IOException(java.io.IOException)

Example 7 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class MapReduceProgramRunnerTest method testMapreduceWithFile.

private void testMapreduceWithFile(String inputDatasetName, String inputPaths, String outputDatasetName, String outputPath, Class appClass, Class mrClass, Map<String, String> extraRuntimeArgs, @Nullable final String counterTableName, @Nullable final String outputSeparator) throws Exception {
    final ApplicationWithPrograms app = deployApp(appClass, new AppWithMapReduceUsingFileSet.AppConfig(inputDatasetName, outputDatasetName));
    Map<String, String> runtimeArguments = Maps.newHashMap();
    Map<String, String> inputArgs = Maps.newHashMap();
    Map<String, String> outputArgs = Maps.newHashMap();
    FileSetArguments.setInputPaths(inputArgs, inputPaths);
    FileSetArguments.setOutputPath(outputArgs, outputPath);
    if (outputSeparator != null) {
        outputArgs.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + TextOutputFormat.SEPERATOR, "#");
    }
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, inputDatasetName, inputArgs));
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, outputDatasetName, outputArgs));
    if (extraRuntimeArgs != null) {
        runtimeArguments.putAll(extraRuntimeArgs);
    }
    // clear the counters in case a previous test case left behind some values
    if (counterTableName != null) {
        Transactions.execute(datasetCache.newTransactionContext(), "countersVerify", () -> {
            KeyValueTable counters = datasetCache.getDataset(counterTableName);
            counters.delete(AppWithMapReduceUsingRuntimeDatasets.INPUT_RECORDS);
            counters.delete(AppWithMapReduceUsingRuntimeDatasets.REDUCE_KEYS);
        });
    }
    // write a handful of numbers to a file; compute their sum, too.
    final long[] values = { 15L, 17L, 7L, 3L };
    final FileSet input = datasetCache.getDataset(inputDatasetName, inputArgs);
    long sum = 0L, count = 1;
    long inputRecords = 0;
    for (Location inputLocation : input.getInputLocations()) {
        final PrintWriter writer = new PrintWriter(inputLocation.getOutputStream());
        for (long value : values) {
            value *= count;
            writer.println(value);
            sum += value;
            inputRecords++;
        }
        writer.close();
        count++;
    }
    runProgram(app, mrClass, new BasicArguments(runtimeArguments));
    // output location in file system is a directory that contains a part file, a _SUCCESS file, and checksums
    // (.<filename>.crc) for these files. Find the actual part file. Its name begins with "part". In this case,
    // there should be only one part file (with this small data, we have a single reducer).
    final FileSet results = datasetCache.getDataset(outputDatasetName, outputArgs);
    Location resultLocation = results.getOutputLocation();
    if (resultLocation.isDirectory()) {
        for (Location child : resultLocation.list()) {
            if (!child.isDirectory() && child.getName().startsWith("part")) {
                resultLocation = child;
                break;
            }
        }
    }
    Assert.assertFalse(resultLocation.isDirectory());
    // read output and verify result
    String line = CharStreams.readFirstLine(CharStreams.newReaderSupplier(Locations.newInputSupplier(resultLocation), Charsets.UTF_8));
    Assert.assertNotNull(line);
    String[] fields = line.split(outputSeparator == null ? ":" : outputSeparator);
    Assert.assertEquals(2, fields.length);
    Assert.assertEquals(AppWithMapReduceUsingFileSet.FileMapper.ONLY_KEY, fields[0]);
    Assert.assertEquals(sum, Long.parseLong(fields[1]));
    if (counterTableName != null) {
        final long totalInputRecords = inputRecords;
        Transactions.execute(datasetCache.newTransactionContext(), "countersVerify", () -> {
            KeyValueTable counters = datasetCache.getDataset(counterTableName);
            Assert.assertEquals(totalInputRecords, counters.incrementAndGet(AppWithMapReduceUsingRuntimeDatasets.INPUT_RECORDS, 0L));
            Assert.assertEquals(1L, counters.incrementAndGet(AppWithMapReduceUsingRuntimeDatasets.REDUCE_KEYS, 0L));
        });
    }
}

Also used : FileSet(io.cdap.cdap.api.dataset.lib.FileSet) ApplicationWithPrograms(io.cdap.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) BasicArguments(io.cdap.cdap.internal.app.runtime.BasicArguments) Location(org.apache.twill.filesystem.Location) PrintWriter(java.io.PrintWriter)

Example 8 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class MapReduceWithMultipleOutputsTest method testMultipleOutputs.

@Test
public void testMultipleOutputs() throws Exception {
    ApplicationWithPrograms app = deployApp(AppWithMapReduceUsingMultipleOutputs.class);
    final FileSet fileSet = datasetCache.getDataset(AppWithMapReduceUsingMultipleOutputs.PURCHASES);
    Location inputFile = fileSet.getBaseLocation().append("inputFile");
    inputFile.createNew();
    PrintWriter writer = new PrintWriter(inputFile.getOutputStream());
    // the PURCHASES dataset consists of purchase records in the format: <customerId> <spend>
    writer.println("1 20");
    writer.println("1 65");
    writer.println("1 30");
    writer.println("2 5");
    writer.println("2 53");
    writer.println("2 45");
    writer.println("3 101");
    writer.close();
    // Using multiple outputs, this MapReduce send the records to a different path of the same dataset, depending
    // on the value in the data (large spend amounts will go to one file, while small will go to another file.
    runProgram(app, AppWithMapReduceUsingMultipleOutputs.SeparatePurchases.class, new BasicArguments());
    FileSet outputFileSet = datasetCache.getDataset(AppWithMapReduceUsingMultipleOutputs.SEPARATED_PURCHASES);
    Assert.assertEquals(ImmutableList.of("1 20", "1 30", "2 5", "2 45"), readFromOutput(outputFileSet, "small_purchases"));
    Assert.assertEquals(ImmutableList.of("1 65", "2 53", "3 101"), readFromOutput(outputFileSet, "large_purchases"));
}

Also used : FileSet(io.cdap.cdap.api.dataset.lib.FileSet) ApplicationWithPrograms(io.cdap.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms) BasicArguments(io.cdap.cdap.internal.app.runtime.BasicArguments) Location(org.apache.twill.filesystem.Location) PrintWriter(java.io.PrintWriter) Test(org.junit.Test)

Example 9 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class ReportGenerationAppTest method createAndInitializeDataset.

private DatasetId createAndInitializeDataset(NamespaceId namespaceId, long currentTimeMillis) throws Exception {
    DatasetId metaFileset = namespaceId.dataset(ReportGenerationApp.RUN_META_FILESET);
    addDatasetInstance(metaFileset, FileSet.class.getName());
    // TODO: [CDAP-13216] temporarily create the run meta fileset and generate mock program run meta files here.
    // Will remove once the TMS subscriber writing to the run meta fileset is implemented.
    DataSetManager<FileSet> fileSet = getDataset(metaFileset);
    populateMetaFiles(fileSet.get().getBaseLocation(), currentTimeMillis);
    return metaFileset;
}

Also used : FileSet(io.cdap.cdap.api.dataset.lib.FileSet) DatasetId(io.cdap.cdap.proto.id.DatasetId)

Example 10 with FileSet

use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.

the class SmartWorkflow method destroy.

@Override
public void destroy() {
    WorkflowContext workflowContext = getContext();
    PipelineRuntime pipelineRuntime = new PipelineRuntime(workflowContext, workflowMetrics);
    // Execute the post actions only if pipeline is not running in preview mode.
    if (!workflowContext.getDataTracer(PostAction.PLUGIN_TYPE).isEnabled()) {
        for (Map.Entry<String, PostAction> endingActionEntry : postActions.entrySet()) {
            String name = endingActionEntry.getKey();
            PostAction action = endingActionEntry.getValue();
            StageSpec stageSpec = stageSpecs.get(name);
            BatchActionContext context = new WorkflowBackedActionContext(workflowContext, pipelineRuntime, stageSpec);
            try {
                action.run(context);
            } catch (Throwable t) {
                LOG.error("Error while running post action {}.", name, t);
            }
        }
    }
    Map<String, String> connectorDatasets = GSON.fromJson(workflowContext.getWorkflowSpecification().getProperty(Constants.CONNECTOR_DATASETS), STAGE_DATASET_MAP);
    // publish all alerts
    for (Map.Entry<String, AlertPublisher> alertPublisherEntry : alertPublishers.entrySet()) {
        String stageName = alertPublisherEntry.getKey();
        AlertPublisher alertPublisher = alertPublisherEntry.getValue();
        FileSet alertConnector = workflowContext.getDataset(connectorDatasets.get(stageName));
        try (CloseableIterator<Alert> alerts = new AlertReader(alertConnector)) {
            if (!alerts.hasNext()) {
                continue;
            }
            StageMetrics stageMetrics = new DefaultStageMetrics(workflowMetrics, stageName);
            StageSpec stageSpec = stageSpecs.get(stageName);
            AlertPublisherContext alertContext = new DefaultAlertPublisherContext(pipelineRuntime, stageSpec, workflowContext, workflowContext.getAdmin());
            alertPublisher.initialize(alertContext);
            TrackedIterator<Alert> trackedIterator = new TrackedIterator<>(alerts, stageMetrics, Constants.Metrics.RECORDS_IN);
            alertPublisher.publish(trackedIterator);
        } catch (Exception e) {
            LOG.warn("Stage {} had errors publishing alerts. Alerts may not have been published.", stageName, e);
        } finally {
            try {
                alertPublisher.destroy();
            } catch (Exception e) {
                LOG.warn("Error destroying alert publisher for stage {}", stageName, e);
            }
        }
    }
    ProgramStatus status = getContext().getState().getStatus();
    if (status == ProgramStatus.FAILED) {
        WRAPPERLOGGER.error("Pipeline '{}' failed.", getContext().getApplicationSpecification().getName());
    } else {
        WRAPPERLOGGER.info("Pipeline '{}' {}.", getContext().getApplicationSpecification().getName(), status == ProgramStatus.COMPLETED ? "succeeded" : status.name().toLowerCase());
    }
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), workflowContext.getLogicalStartTime(), workflowContext, workflowContext, workflowContext.getNamespace());
    // Get resolved plugin properties
    Map<String, Map<String, String>> resolvedProperties = new HashMap<>();
    for (StageSpec spec : stageSpecs.values()) {
        String stageName = spec.getName();
        resolvedProperties.put(stageName, workflowContext.getPluginProperties(stageName, macroEvaluator).getProperties());
    }
    // Add resolved plugin properties to workflow token as a JSON String
    workflowContext.getToken().put(RESOLVED_PLUGIN_PROPERTIES_MAP, GSON.toJson(resolvedProperties));
    // record only if the Workflow is successful
    if (status != ProgramStatus.COMPLETED) {
        return;
    }
    // Collect field operations from each phase
    WorkflowToken token = workflowContext.getToken();
    List<NodeValue> allNodeValues = token.getAll(Constants.FIELD_OPERATION_KEY_IN_WORKFLOW_TOKEN);
    if (allNodeValues.isEmpty()) {
        // no field lineage recorded by any stage
        return;
    }
    Map<String, List<FieldOperation>> allStageOperations = new HashMap<>();
    for (StageSpec stageSpec : stageSpecs.values()) {
        allStageOperations.put(stageSpec.getName(), new ArrayList<>());
    }
    for (NodeValue nodeValue : allNodeValues) {
        Map<String, List<FieldOperation>> stageOperations = GSON.fromJson(nodeValue.getValue().toString(), STAGE_OPERATIONS_MAP);
        for (Map.Entry<String, List<FieldOperation>> entry : stageOperations.entrySet()) {
            // ignore them
            if (allStageOperations.containsKey(entry.getKey())) {
                allStageOperations.get(entry.getKey()).addAll(entry.getValue());
            }
        }
    }
    FieldLineageProcessor processor = new FieldLineageProcessor(spec);
    Set<Operation> processedOperations = processor.validateAndConvert(allStageOperations);
    if (!processedOperations.isEmpty()) {
        workflowContext.record(processedOperations);
    }
}

Also used : NodeValue(io.cdap.cdap.api.workflow.NodeValue) PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) BatchActionContext(io.cdap.cdap.etl.api.batch.BatchActionContext) WorkflowBackedActionContext(io.cdap.cdap.etl.batch.WorkflowBackedActionContext) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) WorkflowToken(io.cdap.cdap.api.workflow.WorkflowToken) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) AlertReader(io.cdap.cdap.etl.batch.connector.AlertReader) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) List(java.util.List) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) StageMetrics(io.cdap.cdap.etl.api.StageMetrics) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) AlertPublisherContext(io.cdap.cdap.etl.api.AlertPublisherContext) DefaultAlertPublisherContext(io.cdap.cdap.etl.common.DefaultAlertPublisherContext) FieldLineageProcessor(io.cdap.cdap.etl.lineage.FieldLineageProcessor) AlertPublisher(io.cdap.cdap.etl.api.AlertPublisher) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) TrackedIterator(io.cdap.cdap.etl.common.TrackedIterator) WorkflowContext(io.cdap.cdap.api.workflow.WorkflowContext) DisjointConnectionsException(io.cdap.cdap.etl.planner.DisjointConnectionsException) ValidationException(io.cdap.cdap.etl.api.validation.ValidationException) Alert(io.cdap.cdap.etl.api.Alert) PostAction(io.cdap.cdap.etl.api.batch.PostAction) DefaultAlertPublisherContext(io.cdap.cdap.etl.common.DefaultAlertPublisherContext) Map(java.util.Map) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) ProgramStatus(io.cdap.cdap.api.ProgramStatus)

Aggregations

FileSet (io.cdap.cdap.api.dataset.lib.FileSet)90 Location (org.apache.twill.filesystem.Location)56 Test (org.junit.Test)44 PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet)26 HashMap (java.util.HashMap)26 KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable)24 DatasetId (io.cdap.cdap.proto.id.DatasetId)22 TimePartitionedFileSet (io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet)16 ApplicationManager (io.cdap.cdap.test.ApplicationManager)16 Table (io.cdap.cdap.api.dataset.table.Table)14 WorkflowManager (io.cdap.cdap.test.WorkflowManager)14 ColumnDesc (io.cdap.cdap.proto.ColumnDesc)12 QueryResult (io.cdap.cdap.proto.QueryResult)12 SparkManager (io.cdap.cdap.test.SparkManager)12 File (java.io.File)10 StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)8 ApplicationId (io.cdap.cdap.proto.id.ApplicationId)8 IOException (java.io.IOException)8 PrintStream (java.io.PrintStream)8 PrintWriter (java.io.PrintWriter)8