Example use of io.cdap.cdap.etl.common.output.MultiOutputFormat in the cdap project by caskdata.
The example is the writeFromRDD method of the SparkBatchSinkFactory class.
/**
 * Writes the given RDD to every output registered for a sink, where an output is either an
 * OutputFormat (supplied by an OutputFormatProvider) or a CDAP dataset.
 *
 * <p>Dataset outputs are saved through the execution context while the outputs are scanned.
 * OutputFormat outputs are collected first: a single one is written directly, and several are
 * written together through {@link MultiOutputFormat}.
 *
 * @param rdd the key-value pairs to write
 * @param sec the Spark execution context used to save to CDAP datasets
 * @param sinkName the name of the sink whose outputs should be written
 * @param <K> the type of the keys in the RDD
 * @param <V> the type of the values in the RDD
 * @return the names of the outputs written using an OutputFormatProvider, which need to register lineage
 * @throws IllegalArgumentException if no outputs are registered for the given sink
 */
public <K, V> Set<String> writeFromRDD(JavaPairRDD<K, V> rdd, JavaSparkExecutionContext sec, String sinkName) {
  Set<String> registeredOutputs = sinkOutputs.get(sinkName);
  if (registeredOutputs == null || registeredOutputs.isEmpty()) {
    // Pipeline validation at configure time is expected to rule this out.
    throw new IllegalArgumentException(
        sinkName + " has no outputs. Please check that the sink calls addOutput at some point.");
  }

  Set<String> namesForLineage = new HashSet<>();
  Map<String, OutputFormatProvider> formatsByOutput = new HashMap<>();
  for (String output : registeredOutputs) {
    // An output name may map to an output format provider, a dataset, or both; handle each independently.
    NamedOutputFormatProvider namedProvider = outputFormatProviders.get(output);
    if (namedProvider != null) {
      formatsByOutput.put(output, namedProvider);
      namesForLineage.add(namedProvider.name);
    }

    DatasetInfo dataset = datasetInfos.get(output);
    if (dataset != null) {
      sec.saveAsDataset(rdd, dataset.getDatasetName(), dataset.getDatasetArgs());
    }
  }

  int formatCount = formatsByOutput.size();
  if (formatCount == 0) {
    // Only datasets were registered, and they have already been saved above.
    return namesForLineage;
  }
  if (formatCount == 1) {
    // A lone output format can be used directly, without the multi-output wrapper.
    OutputFormatProvider onlyProvider = formatsByOutput.values().iterator().next();
    RDDUtils.saveUsingOutputFormat(onlyProvider, rdd);
    return namesForLineage;
  }

  // Several output formats: register them all under this sink and delegate through MultiOutputFormat.
  Configuration hConf = new Configuration();
  Map<String, Set<String>> outputsForSink = Collections.singletonMap(sinkName, formatsByOutput.keySet());
  MultiOutputFormat.addOutputs(hConf, formatsByOutput, outputsForSink);
  hConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, MultiOutputFormat.class.getName());

  // MultiOutputFormat requires the key to be the sink name and the value to be the actual
  // key-value pair that is sent to the delegate output format.
  JavaPairRDD<String, KeyValue<K, V>> taggedRDD =
      rdd.mapToPair(pair -> new Tuple2<>(sinkName, new KeyValue<>(pair._1(), pair._2())));
  RDDUtils.saveHadoopDataset(taggedRDD, hConf);
  return namesForLineage;
}
Aggregations