Use of io.cdap.cdap.api.data.batch.OutputFormatProvider in project cdap by caskdata.
The class ExternalDatasets, method makeTrackable.
/**
* If the output is an external sink, an external dataset is created for tracking purposes and returned.
* If the output is a regular dataset, it is already trackable, hence the same output is returned.
*
* @param admin {@link Admin} used to create external dataset
* @param output output to be tracked
* @return an external dataset if output is an external sink, otherwise the same output is returned
*/
public static Output makeTrackable(Admin admin, Output output) {
  // If the output is not an external sink, return it unchanged: it can be tracked by itself.
  if (!(output instanceof Output.OutputFormatProviderOutput)) {
    return output;
  }
  String outputName = output.getName();
  OutputFormatProvider outputFormatProvider = ((Output.OutputFormatProviderOutput) output).getOutputFormatProvider();
  Map<String, String> outputFormatConfiguration = outputFormatProvider.getOutputFormatConfiguration();
  // If the provider is itself a Dataset, it can be tracked without creating an external dataset.
  if (outputFormatProvider instanceof Dataset) {
    return output;
  }
  // The output is an external sink: create an external dataset so that it can be tracked.
  try {
    // Create an external dataset for the output format for lineage tracking
    Map<String, String> arguments = new HashMap<>();
    arguments.put("output.format.class", outputFormatProvider.getOutputFormatClassName());
    arguments.putAll(outputFormatConfiguration);
    if (!admin.datasetExists(outputName)) {
      // Note: the dataset properties are the same as the arguments because the two cannot be
      // distinguished; they are mixed together in a single configuration object (CDAP-5674).
      // For the same reason, the properties of the created external dataset will contain runtime arguments.
      admin.createDataset(outputName, EXTERNAL_DATASET_TYPE, DatasetProperties.of(arguments));
    } else {
      // Check that the external dataset name does not clash with an existing CDAP dataset
      String datasetType = admin.getDatasetType(outputName);
      if (!EXTERNAL_DATASET_TYPE.equals(datasetType)) {
        throw new IllegalArgumentException(
          "An external sink cannot have the same name as an existing CDAP Dataset instance " + outputName);
      }
    }
    return Output.ofDataset(outputName, Collections.unmodifiableMap(arguments)).alias(output.getAlias());
  } catch (DatasetManagementException e) {
    throw Throwables.propagate(e);
  }
}
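For context, here is a minimal sketch of how makeTrackable might be called when registering a file-based external sink. It is an illustration, not code from CDAP: the class name, method name, output name, and path parameter are made up, and the Admin instance is assumed to be obtained from the program context. ExternalDatasets is the helper class shown above and is assumed to be visible.

import io.cdap.cdap.api.Admin;
import io.cdap.cdap.api.data.batch.Output;
import io.cdap.cdap.api.data.batch.OutputFormatProvider;
import java.util.Collections;
import java.util.Map;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class TrackableOutputExample {

  // Builds a trackable Output around a plain Hadoop TextOutputFormat.
  // The Admin instance is assumed to come from the program context (hypothetical setup).
  static Output trackableTextOutput(Admin admin, String outputName, String path) {
    OutputFormatProvider provider = new OutputFormatProvider() {
      @Override
      public String getOutputFormatClassName() {
        return TextOutputFormat.class.getName();
      }

      @Override
      public Map<String, String> getOutputFormatConfiguration() {
        // standard Hadoop key for the output directory
        return Collections.singletonMap("mapreduce.output.fileoutputformat.outputdir", path);
      }
    };
    // Wrapping the output creates (or reuses) an external dataset named after the output,
    // so that lineage can be recorded for it.
    return ExternalDatasets.makeTrackable(admin, Output.of(outputName, provider));
  }
}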
Use of io.cdap.cdap.api.data.batch.OutputFormatProvider in project cdap by caskdata.
The class MultiOutputFormat, method addOutputs.
public static void addOutputs(Configuration hConf, Map<String, OutputFormatProvider> outputs,
                              Map<String, Set<String>> sinkOutputs) {
  hConf.set(NAMES, GSON.toJson(outputs.keySet()));
  hConf.set(SINK_OUTPUTS, GSON.toJson(sinkOutputs));
  for (Map.Entry<String, OutputFormatProvider> entry : outputs.entrySet()) {
    OutputFormatProvider outputFormatProvider = entry.getValue();
    hConf.set(getClassNameKey(entry.getKey()), outputFormatProvider.getOutputFormatClassName());
    hConf.set(getPropertiesKey(entry.getKey()), GSON.toJson(outputFormatProvider.getOutputFormatConfiguration()));
  }
}
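As a rough illustration of the two maps addOutputs expects, the sketch below assembles a single OutputFormatProvider and a sink-to-output grouping and stores them in a Hadoop Configuration. The names filesOut, fileSinkStage, and the output directory are invented for the example; MultiOutputFormat is assumed to be on the classpath.

import io.cdap.cdap.api.data.batch.OutputFormatProvider;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class AddOutputsExample {

  static Configuration configureOutputs() {
    OutputFormatProvider textProvider = new OutputFormatProvider() {
      @Override
      public String getOutputFormatClassName() {
        return TextOutputFormat.class.getName();
      }

      @Override
      public Map<String, String> getOutputFormatConfiguration() {
        // directory is illustrative only
        return Collections.singletonMap("mapreduce.output.fileoutputformat.outputdir", "/tmp/files-out");
      }
    };
    // output name -> provider
    Map<String, OutputFormatProvider> outputs = new HashMap<>();
    outputs.put("filesOut", textProvider);
    // sink stage name -> the set of output names that stage writes to
    Map<String, Set<String>> sinkOutputs = new HashMap<>();
    sinkOutputs.put("fileSinkStage", Collections.singleton("filesOut"));
    Configuration hConf = new Configuration();
    MultiOutputFormat.addOutputs(hConf, outputs, sinkOutputs);
    return hConf;
  }
}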
Use of io.cdap.cdap.api.data.batch.OutputFormatProvider in project cdap by caskdata.
The class BasicMapReduceContext, method addOutput.
@Override
public void addOutput(Output output) {
  if (output.getNamespace() != null && output.getNamespace().equals(NamespaceId.SYSTEM.getNamespace())
      && !getProgram().getNamespaceId().equals(NamespaceId.SYSTEM.getNamespace())) {
    // accessing the system namespace from a program outside the system namespace is not allowed
    throw new IllegalArgumentException(String.format(
      "Accessing Output %s in system namespace is not allowed from the namespace %s",
      output.getName(), getProgram().getNamespaceId()));
  }
  String alias = output.getAlias();
  if (this.outputs.containsKey(alias)) {
    throw new IllegalArgumentException("Output already configured: " + alias);
  }
  ProvidedOutput providedOutput;
  if (output instanceof Output.DatasetOutput) {
    providedOutput = Outputs.transform((Output.DatasetOutput) output, this);
  } else if (output instanceof Output.OutputFormatProviderOutput) {
    OutputFormatProvider outputFormatProvider = ((Output.OutputFormatProviderOutput) output).getOutputFormatProvider();
    if (outputFormatProvider instanceof DatasetOutputCommitter) {
      // A DatasetOutputCommitter must be added as a DatasetOutput so that MainOutputCommitter
      // is able to call its methods.
      throw new IllegalArgumentException("Cannot add a DatasetOutputCommitter as an OutputFormatProviderOutput. "
                                           + "Add the output as a DatasetOutput.");
    }
    providedOutput = new ProvidedOutput(output, outputFormatProvider);
  } else if (output.getClass().getCanonicalName().startsWith(CDAP_PACKAGE_PREFIX)) {
    // Skip unsupported outputs from within CDAP packages.
    // This is used to ignore unsupported outputs in MapReduce (such as the SQL Engine Output for Spark).
    LOG.info("Unsupported output in MapReduce: {}", output.getClass().getCanonicalName());
    return;
  } else {
    // shouldn't happen unless the user defines their own Output class
    throw new IllegalArgumentException(String.format("Output %s has unknown output class %s",
                                                     output.getName(), output.getClass().getCanonicalName()));
  }
  this.outputs.put(alias, providedOutput);
}
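A short sketch of how a program typically reaches this code path, assuming the standard AbstractMapReduce lifecycle: addOutput is called from a MapReduce program's initialize method via the context. The class name, dataset name, and MyOutputFormatProvider are hypothetical stand-ins, not CDAP classes.

import io.cdap.cdap.api.data.batch.Output;
import io.cdap.cdap.api.mapreduce.AbstractMapReduce;
import io.cdap.cdap.api.mapreduce.MapReduceContext;

public class MultiSinkMapReduce extends AbstractMapReduce {

  @Override
  public void initialize() throws Exception {
    MapReduceContext context = getContext();
    // A dataset output is trackable as-is; its alias defaults to the dataset name.
    context.addOutput(Output.ofDataset("cleanRecords"));
    // An OutputFormatProvider-based output with an explicit alias. Adding a second output
    // with the same alias would fail with "Output already configured".
    // MyOutputFormatProvider is a hypothetical OutputFormatProvider implementation.
    context.addOutput(Output.of("rawFiles", new MyOutputFormatProvider()).alias("raw"));
  }
}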
Use of io.cdap.cdap.api.data.batch.OutputFormatProvider in project cdap by caskdata.
The class SparkBatchSinkFactory, method writeCombinedRDD.
/**
* Writes a combined RDD using multiple OutputFormatProviders.
* Returns the set of output names that were written, which still require dataset lineage to be recorded.
*/
public <K, V> Set<String> writeCombinedRDD(JavaPairRDD<String, KeyValue<K, V>> combinedRDD,
                                           JavaSparkExecutionContext sec, Set<String> sinkNames) {
  Map<String, OutputFormatProvider> outputFormatProviders = new HashMap<>();
  Set<String> lineageNames = new HashSet<>();
  for (String sinkName : sinkNames) {
    Set<String> sinkOutputNames = sinkOutputs.get(sinkName);
    if (sinkOutputNames == null || sinkOutputNames.isEmpty()) {
      // should never happen if validation happened correctly at pipeline configure time
      throw new IllegalStateException(sinkName + " has no outputs. "
                                        + "Please check that the sink calls addOutput at some point.");
    }
    for (String outputName : sinkOutputNames) {
      NamedOutputFormatProvider outputFormatProvider = this.outputFormatProviders.get(outputName);
      if (outputFormatProvider == null) {
        // Check if this is a SQL engine output. If this is the case, skip this output.
        SQLEngineOutput sqlEngineOutput = sqlOutputs.get(outputName);
        if (sqlEngineOutput != null) {
          continue;
        }
        // The sink does not use an OutputFormatProvider and should never have been
        // grouped with other sinks.
        throw new IllegalStateException(String.format(
          "sink '%s' does not use an OutputFormatProvider. This indicates that there is a planner bug. "
            + "Please report the issue and turn off stage consolidation by setting '%s' to false "
            + "in the runtime arguments.", sinkName, Constants.CONSOLIDATE_STAGES));
      }
      lineageNames.add(outputFormatProvider.name);
      outputFormatProviders.put(outputName, outputFormatProvider);
    }
  }
  Configuration hConf = new Configuration();
  Map<String, Set<String>> groupSinkOutputs = new HashMap<>();
  for (String sink : sinkNames) {
    Set<String> outputFormatProvidersForSink = sinkOutputs.get(sink).stream()
      .filter(outputFormatProviders::containsKey)
      .collect(Collectors.toSet());
    if (!outputFormatProvidersForSink.isEmpty()) {
      groupSinkOutputs.put(sink, outputFormatProvidersForSink);
    }
  }
  MultiOutputFormat.addOutputs(hConf, outputFormatProviders, groupSinkOutputs);
  hConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, MultiOutputFormat.class.getName());
  RDDUtils.saveHadoopDataset(combinedRDD, hConf);
  return lineageNames;
}
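A hypothetical call site, to show how the returned names are meant to be used. The sink names and the lineage-recording step are illustrative, not actual CDAP ETL code; SparkBatchSinkFactory is the class above and is assumed to be visible.

import io.cdap.cdap.api.dataset.lib.KeyValue;
import io.cdap.cdap.api.spark.JavaSparkExecutionContext;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.spark.api.java.JavaPairRDD;

public class WriteCombinedExample {

  static <K, V> void writeConsolidatedSinks(SparkBatchSinkFactory sinkFactory,
                                            JavaPairRDD<String, KeyValue<K, V>> combinedRdd,
                                            JavaSparkExecutionContext sec) {
    // Two sink stages that were consolidated into a single write (names are illustrative).
    Set<String> sinkNames = new HashSet<>(Arrays.asList("fileSink1", "fileSink2"));
    Set<String> lineageNames = sinkFactory.writeCombinedRDD(combinedRdd, sec, sinkNames);
    // The returned names still require dataset lineage to be recorded by the caller,
    // e.g. by registering each one as an external dataset write (that step is not shown here).
  }
}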
Use of io.cdap.cdap.api.data.batch.OutputFormatProvider in project cdap by caskdata.
The class MockExternalSink, method prepareRun.
@Override
public void prepareRun(BatchSinkContext context) {
  OutputFormatProvider outputFormatProvider = new Provider(config.dirName);
  if (config.name != null) {
    Output output = Output.of(config.name, outputFormatProvider);
    output.alias(config.alias);
    context.addOutput(output);
  } else {
    context.addOutput(Output.of(config.alias, outputFormatProvider));
  }
  if (config.name2 != null) {
    context.addOutput(Output.of(config.name2, new Provider(config.dirName2)).alias(config.alias2));
  } else if (config.alias2 != null) {
    context.addOutput(Output.of(config.alias2, new Provider(config.dirName2)));
  }
}
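The Provider class referenced above is not included in this snippet. The sketch below is an assumed shape for such a provider: an OutputFormatProvider that writes text files to the configured directory. The actual MockExternalSink.Provider may differ.

import io.cdap.cdap.api.data.batch.OutputFormatProvider;
import java.util.Collections;
import java.util.Map;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// Assumed shape of the Provider used above (hypothetical sketch, not the CDAP test class).
public class Provider implements OutputFormatProvider {
  private final String dirName;

  public Provider(String dirName) {
    this.dirName = dirName;
  }

  @Override
  public String getOutputFormatClassName() {
    return TextOutputFormat.class.getName();
  }

  @Override
  public Map<String, String> getOutputFormatConfiguration() {
    // standard Hadoop key for the output directory
    return Collections.singletonMap("mapreduce.output.fileoutputformat.outputdir", dirName);
  }
}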