Search in sources :

Example 1 with OutputFormatProvider

use of io.cdap.cdap.api.data.batch.OutputFormatProvider in project cdap by caskdata.

the class ExternalDatasets method makeTrackable.

/**
 * Returns an output that can be tracked for lineage purposes.
 * An output that is not backed by an {@link OutputFormatProvider}, or whose provider is itself a
 * {@link Dataset}, is already trackable and is returned unchanged. Any other provider-backed output is
 * treated as an external sink: an external dataset named after the output is created when it does not
 * exist yet, and a dataset output referring to it (carrying the original alias) is returned instead.
 *
 * @param admin {@link Admin} used to look up and create the external dataset
 * @param output output to be tracked
 * @return the same output if it is already trackable, otherwise a dataset output for the external dataset
 * @throws IllegalArgumentException if a dataset with the same name exists but is not of the external type
 */
public static Output makeTrackable(Admin admin, Output output) {
    // Anything that is not backed by an OutputFormatProvider can be tracked as it is.
    if (!(output instanceof Output.OutputFormatProviderOutput)) {
        return output;
    }
    String datasetName = output.getName();
    Output.OutputFormatProviderOutput providerOutput = (Output.OutputFormatProviderOutput) output;
    OutputFormatProvider provider = providerOutput.getOutputFormatProvider();
    Map<String, String> providerConfiguration = provider.getOutputFormatConfiguration();
    // A provider that is a dataset is trackable by itself; no external dataset is needed.
    if (provider instanceof Dataset) {
        return output;
    }

    // The output format class plus its configuration become the arguments of the tracked dataset output.
    Map<String, String> arguments = new HashMap<>();
    arguments.put("output.format.class", provider.getOutputFormatClassName());
    arguments.putAll(providerConfiguration);

    try {
        if (admin.datasetExists(datasetName)) {
            // The name is taken already; that is only acceptable if it belongs to an external dataset.
            String existingType = admin.getDatasetType(datasetName);
            if (!EXTERNAL_DATASET_TYPE.equals(existingType)) {
                throw new IllegalArgumentException("An external sink cannot have the same name as an existing CDAP Dataset instance " + datasetName);
            }
        } else {
            // Properties and arguments live in one configuration object and cannot be told apart
            // (CDAP-5674), so the dataset is created with the arguments as its properties. For the same
            // reason the created dataset's properties will also contain runtime arguments.
            admin.createDataset(datasetName, EXTERNAL_DATASET_TYPE, DatasetProperties.of(arguments));
        }
        return Output.ofDataset(datasetName, Collections.unmodifiableMap(arguments)).alias(output.getAlias());
    } catch (DatasetManagementException e) {
        throw Throwables.propagate(e);
    }
}
Also used : DatasetManagementException(io.cdap.cdap.api.dataset.DatasetManagementException) HashMap(java.util.HashMap) Dataset(io.cdap.cdap.api.dataset.Dataset) OutputFormatProvider(io.cdap.cdap.api.data.batch.OutputFormatProvider)

Example 2 with OutputFormatProvider

use of io.cdap.cdap.api.data.batch.OutputFormatProvider in project cdap by caskdata.

the class MultiOutputFormat method addOutputs.

/**
 * Records the given outputs in the Hadoop configuration.
 * The set of output names and the sink-to-outputs mapping are stored as JSON. For every output, its
 * output format class name and its JSON-serialized output format configuration are stored under keys
 * derived from the output name.
 *
 * @param hConf configuration to write the settings into
 * @param outputs output format providers keyed by output name
 * @param sinkOutputs output names keyed by the sink they belong to
 */
public static void addOutputs(Configuration hConf, Map<String, OutputFormatProvider> outputs, Map<String, Set<String>> sinkOutputs) {
    hConf.set(NAMES, GSON.toJson(outputs.keySet()));
    hConf.set(SINK_OUTPUTS, GSON.toJson(sinkOutputs));
    outputs.forEach((outputName, provider) -> {
        hConf.set(getClassNameKey(outputName), provider.getOutputFormatClassName());
        hConf.set(getPropertiesKey(outputName), GSON.toJson(provider.getOutputFormatConfiguration()));
    });
}
Also used : HashMap(java.util.HashMap) Map(java.util.Map) OutputFormatProvider(io.cdap.cdap.api.data.batch.OutputFormatProvider)

Example 3 with OutputFormatProvider

use of io.cdap.cdap.api.data.batch.OutputFormatProvider in project cdap by caskdata.

the class BasicMapReduceContext method addOutput.

/**
 * Registers an output for this MapReduce program, keyed by the output's alias.
 *
 * <p>Dataset outputs and {@link OutputFormatProvider} outputs are registered. Any other output whose class
 * name starts with the CDAP package prefix is logged and ignored. Every other output type is rejected.
 *
 * @param output the output to add
 * @throws IllegalArgumentException if the output is in the system namespace while the program is not,
 *     if an output with the same alias is already configured, if the output format provider is a
 *     {@link DatasetOutputCommitter}, or if the output class is not recognized
 */
@Override
public void addOutput(Output output) {
    if (output.getNamespace() != null && output.getNamespace().equals(NamespaceId.SYSTEM.getNamespace()) && !getProgram().getNamespaceId().equals(NamespaceId.SYSTEM.getNamespace())) {
        // trying to access system namespace from a program outside system namespace is not allowed
        throw new IllegalArgumentException(String.format("Accessing Output %s in system namespace " + "is not allowed from the namespace %s", output.getName(), getProgram().getNamespaceId()));
    }
    String alias = output.getAlias();
    if (this.outputs.containsKey(alias)) {
        throw new IllegalArgumentException("Output already configured: " + alias);
    }

    // Class#getCanonicalName() returns null for anonymous and local classes. Fall back to the binary
    // name so that such outputs are classified and reported instead of causing a NullPointerException.
    Class<?> outputClass = output.getClass();
    String outputClassName = outputClass.getCanonicalName();
    if (outputClassName == null) {
        outputClassName = outputClass.getName();
    }

    ProvidedOutput providedOutput;
    if (output instanceof Output.DatasetOutput) {
        providedOutput = Outputs.transform((Output.DatasetOutput) output, this);
    } else if (output instanceof Output.OutputFormatProviderOutput) {
        OutputFormatProvider outputFormatProvider = ((Output.OutputFormatProviderOutput) output).getOutputFormatProvider();
        if (outputFormatProvider instanceof DatasetOutputCommitter) {
            // A DatasetOutputCommitter must not be added as an OutputFormatProviderOutput, because its
            // methods could not be called in MainOutputCommitter. It needs to be a DatasetOutput.
            throw new IllegalArgumentException("Cannot add a DatasetOutputCommitter as an OutputFormatProviderOutput. " + "Add the output as a DatasetOutput.");
        }
        providedOutput = new ProvidedOutput(output, outputFormatProvider);
    } else if (outputClassName.startsWith(CDAP_PACKAGE_PREFIX)) {
        // Skip unsupported outputs from within CDAP packages.
        // This is used to ignore unsupported outputs in MapReduce (such as the SQL Engine Output for Spark).
        LOG.info("Unsupported output in MapReduce: {}", outputClassName);
        return;
    } else {
        // shouldn't happen unless user defines their own Output class
        throw new IllegalArgumentException(String.format("Output %s has unknown output class %s", output.getName(), outputClassName));
    }
    this.outputs.put(alias, providedOutput);
}
Also used : ProvidedOutput(io.cdap.cdap.internal.app.runtime.batch.dataset.output.ProvidedOutput) Output(io.cdap.cdap.api.data.batch.Output) ProvidedOutput(io.cdap.cdap.internal.app.runtime.batch.dataset.output.ProvidedOutput) DatasetOutputCommitter(io.cdap.cdap.api.data.batch.DatasetOutputCommitter) OutputFormatProvider(io.cdap.cdap.api.data.batch.OutputFormatProvider)

Example 4 with OutputFormatProvider

use of io.cdap.cdap.api.data.batch.OutputFormatProvider in project cdap by caskdata.

the class SparkBatchSinkFactory method writeCombinedRDD.

/**
 * Writes a combined RDD through a single {@link MultiOutputFormat} that fans out to the
 * OutputFormatProviders registered for the given sinks.
 * Outputs registered as SQL engine outputs are skipped.
 *
 * @param combinedRDD the RDD to write
 * @param sec the Spark execution context
 * @param sinkNames names of the sinks whose outputs should be written
 * @return the names of the outputs that were written, which still require dataset lineage to be recorded
 * @throws IllegalStateException if a sink has no outputs, or one of its outputs is neither backed by an
 *     OutputFormatProvider nor a SQL engine output
 */
public <K, V> Set<String> writeCombinedRDD(JavaPairRDD<String, KeyValue<K, V>> combinedRDD, JavaSparkExecutionContext sec, Set<String> sinkNames) {
    Map<String, OutputFormatProvider> providersByOutput = new HashMap<>();
    Set<String> writtenNames = new HashSet<>();

    // First pass: resolve the provider of every output of every sink, failing fast on anything unexpected.
    for (String sinkName : sinkNames) {
        Set<String> outputsOfSink = sinkOutputs.get(sinkName);
        if (outputsOfSink == null || outputsOfSink.isEmpty()) {
            // should never happen if validation happened correctly at pipeline configure time
            throw new IllegalStateException(sinkName + " has no outputs. " + "Please check that the sink calls addOutput at some point.");
        }
        for (String outputName : outputsOfSink) {
            NamedOutputFormatProvider provider = this.outputFormatProviders.get(outputName);
            if (provider != null) {
                writtenNames.add(provider.name);
                providersByOutput.put(outputName, provider);
                continue;
            }
            // No provider: this is only acceptable for a SQL engine output, which is simply skipped.
            if (sqlOutputs.get(outputName) == null) {
                throw new IllegalStateException(String.format(
                    "sink '%s' does not use an OutputFormatProvider. "
                        + "This indicates that there is a planner bug. "
                        + "Please report the issue and turn off stage consolidation by setting '%s'"
                        + " to false in the runtime arguments.",
                    sinkName, Constants.CONSOLIDATE_STAGES));
            }
        }
    }

    Configuration hConf = new Configuration();

    // Second pass: for each sink keep only the outputs that have a provider; sinks left with none are omitted.
    Map<String, Set<String>> outputsBySink = new HashMap<>();
    for (String sinkName : sinkNames) {
        Set<String> providerBackedOutputs = new HashSet<>();
        for (String outputName : sinkOutputs.get(sinkName)) {
            if (providersByOutput.containsKey(outputName)) {
                providerBackedOutputs.add(outputName);
            }
        }
        if (!providerBackedOutputs.isEmpty()) {
            outputsBySink.put(sinkName, providerBackedOutputs);
        }
    }

    MultiOutputFormat.addOutputs(hConf, providersByOutput, outputsBySink);
    hConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, MultiOutputFormat.class.getName());
    RDDUtils.saveHadoopDataset(combinedRDD, hConf);
    return writtenNames;
}
Also used : Set(java.util.Set) HashSet(java.util.HashSet) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) MultiOutputFormat(io.cdap.cdap.etl.common.output.MultiOutputFormat) SQLEngineOutput(io.cdap.cdap.etl.api.engine.sql.SQLEngineOutput) OutputFormatProvider(io.cdap.cdap.api.data.batch.OutputFormatProvider) HashSet(java.util.HashSet)

Example 5 with OutputFormatProvider

use of io.cdap.cdap.api.data.batch.OutputFormatProvider in project cdap by caskdata.

the class MockExternalSink method prepareRun.

/**
 * Adds the outputs of this mock sink to the context.
 * The first output is always added: it is named {@code config.name} and aliased with {@code config.alias}
 * when a name is configured, otherwise it is named {@code config.alias}. A second output is added only
 * when {@code config.name2} or {@code config.alias2} is configured, following the same naming rule.
 *
 * @param context the batch sink context to add outputs to
 */
@Override
public void prepareRun(BatchSinkContext context) {
    OutputFormatProvider firstProvider = new Provider(config.dirName);
    boolean firstHasName = config.name != null;
    Output firstOutput = Output.of(firstHasName ? config.name : config.alias, firstProvider);
    if (firstHasName) {
        firstOutput.alias(config.alias);
    }
    context.addOutput(firstOutput);

    if (config.name2 != null) {
        context.addOutput(Output.of(config.name2, new Provider(config.dirName2)).alias(config.alias2));
        return;
    }
    if (config.alias2 != null) {
        context.addOutput(Output.of(config.alias2, new Provider(config.dirName2)));
    }
}
Also used : Output(io.cdap.cdap.api.data.batch.Output) OutputFormatProvider(io.cdap.cdap.api.data.batch.OutputFormatProvider) OutputFormatProvider(io.cdap.cdap.api.data.batch.OutputFormatProvider)

Aggregations

OutputFormatProvider (io.cdap.cdap.api.data.batch.OutputFormatProvider)8 HashMap (java.util.HashMap)5 Output (io.cdap.cdap.api.data.batch.Output)3 MultiOutputFormat (io.cdap.cdap.etl.common.output.MultiOutputFormat)2 HashSet (java.util.HashSet)2 Map (java.util.Map)2 Set (java.util.Set)2 Configuration (org.apache.hadoop.conf.Configuration)2 DatasetOutputCommitter (io.cdap.cdap.api.data.batch.DatasetOutputCommitter)1 Dataset (io.cdap.cdap.api.dataset.Dataset)1 DatasetManagementException (io.cdap.cdap.api.dataset.DatasetManagementException)1 KeyValue (io.cdap.cdap.api.dataset.lib.KeyValue)1 SQLEngineOutput (io.cdap.cdap.etl.api.engine.sql.SQLEngineOutput)1 BasicOutputFormatProvider (io.cdap.cdap.etl.batch.BasicOutputFormatProvider)1 NullOutputFormatProvider (io.cdap.cdap.etl.batch.preview.NullOutputFormatProvider)1 BasicOutputFormatProvider (io.cdap.cdap.internal.app.runtime.batch.BasicOutputFormatProvider)1 ProvidedOutput (io.cdap.cdap.internal.app.runtime.batch.dataset.output.ProvidedOutput)1 LinkedHashMap (java.util.LinkedHashMap)1