Search in sources :

Example 1 with MultiOutputFormat

use of io.cdap.cdap.etl.common.output.MultiOutputFormat in project cdap by caskdata.

The method writeFromRDD of the class SparkBatchSinkFactory.

/**
 * Writes the supplied RDD to every output registered under the given sink name.
 *
 * <p>Outputs backed by a dataset are saved through the execution context as they are
 * encountered. Outputs backed by an OutputFormatProvider are collected first and then
 * written either directly (exactly one provider) or through {@link MultiOutputFormat}
 * (more than one provider).
 *
 * @param rdd the key-value pairs to write
 * @param sec the Spark execution context used to save to datasets
 * @param sinkName the name of the sink whose outputs should be written
 * @param <K> the key type of the RDD
 * @param <V> the value type of the RDD
 * @return the names of the outputs written using an OutputFormatProvider, which need to register lineage
 * @throws IllegalArgumentException if no outputs are registered for the sink
 */
public <K, V> Set<String> writeFromRDD(JavaPairRDD<K, V> rdd, JavaSparkExecutionContext sec, String sinkName) {
    Set<String> registeredOutputs = sinkOutputs.get(sinkName);
    boolean hasOutputs = registeredOutputs != null && !registeredOutputs.isEmpty();
    if (!hasOutputs) {
        // pipeline configure-time validation is expected to rule this out
        throw new IllegalArgumentException(sinkName + " has no outputs. " + "Please check that the sink calls addOutput at some point.");
    }

    Set<String> namesNeedingLineage = new HashSet<>();
    Map<String, OutputFormatProvider> providersByOutput = new HashMap<>();
    for (String output : registeredOutputs) {
        // An output may be backed by an output format provider ...
        NamedOutputFormatProvider namedProvider = outputFormatProviders.get(output);
        if (namedProvider != null) {
            providersByOutput.put(output, namedProvider);
            namesNeedingLineage.add(namedProvider.name);
        }
        // ... and/or by a dataset, which is saved right away.
        DatasetInfo dataset = datasetInfos.get(output);
        if (dataset != null) {
            sec.saveAsDataset(rdd, dataset.getDatasetName(), dataset.getDatasetArgs());
        }
    }

    int providerCount = providersByOutput.size();
    if (providerCount == 1) {
        // A single output format can be written without the multi-output wrapper.
        OutputFormatProvider onlyProvider = providersByOutput.values().iterator().next();
        RDDUtils.saveUsingOutputFormat(onlyProvider, rdd);
    } else if (providerCount > 1) {
        Configuration hConf = new Configuration();
        Map<String, Set<String>> outputsBySink = Collections.singletonMap(sinkName, providersByOutput.keySet());
        MultiOutputFormat.addOutputs(hConf, providersByOutput, outputsBySink);
        hConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, MultiOutputFormat.class.getName());
        // MultiOutputFormat expects each record keyed by the sink name, with the original
        // key-value pair wrapped as the value that is handed to the delegate output format.
        JavaPairRDD<String, KeyValue<K, V>> wrappedRDD =
            rdd.mapToPair(pair -> new Tuple2<>(sinkName, new KeyValue<>(pair._1(), pair._2())));
        RDDUtils.saveHadoopDataset(wrappedRDD, hConf);
    }
    return namesNeedingLineage;
}
Also used : Set(java.util.Set) HashSet(java.util.HashSet) KeyValue(io.cdap.cdap.api.dataset.lib.KeyValue) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) MultiOutputFormat(io.cdap.cdap.etl.common.output.MultiOutputFormat) OutputFormatProvider(io.cdap.cdap.api.data.batch.OutputFormatProvider)

Aggregations

OutputFormatProvider (io.cdap.cdap.api.data.batch.OutputFormatProvider)1 KeyValue (io.cdap.cdap.api.dataset.lib.KeyValue)1 MultiOutputFormat (io.cdap.cdap.etl.common.output.MultiOutputFormat)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 Set (java.util.Set)1 Configuration (org.apache.hadoop.conf.Configuration)1