Use of org.apache.spark.api.java.JavaPairRDD in project beam by apache.
The class SparkCompat, method extractOutput.
/**
* Extracts the output for a given collection of WindowedAccumulators.
*
* <p>This is required because the API of JavaPairRDD.flatMapValues is different among Spark
* versions. See https://issues.apache.org/jira/browse/SPARK-19287
*/
public static <K, InputT, AccumT, OutputT> JavaPairRDD<K, WindowedValue<OutputT>> extractOutput(
    JavaPairRDD<K, SparkCombineFn.WindowedAccumulator<KV<K, InputT>, InputT, AccumT, ?>> accumulatePerKey,
    SparkCombineFn<KV<K, InputT>, InputT, AccumT, OutputT> sparkCombineFn) {
  try {
    if (accumulatePerKey.context().version().startsWith("3")) {
      FlatMapFunction<SparkCombineFn.WindowedAccumulator<KV<K, InputT>, InputT, AccumT, ?>, WindowedValue<OutputT>> flatMapFunction =
          (FlatMapFunction<SparkCombineFn.WindowedAccumulator<KV<K, InputT>, InputT, AccumT, ?>, WindowedValue<OutputT>>)
              windowedAccumulator -> sparkCombineFn.extractOutputStream(windowedAccumulator).iterator();
      // This invokes by reflection the equivalent of:
      // return accumulatePerKey.flatMapValues(flatMapFunction);
      Method method = accumulatePerKey.getClass().getDeclaredMethod("flatMapValues", FlatMapFunction.class);
      Object result = method.invoke(accumulatePerKey, flatMapFunction);
      return (JavaPairRDD<K, WindowedValue<OutputT>>) result;
    }
    Function<SparkCombineFn.WindowedAccumulator<KV<K, InputT>, InputT, AccumT, ?>, Iterable<WindowedValue<OutputT>>> flatMapFunction =
        windowedAccumulator -> sparkCombineFn.extractOutputStream(windowedAccumulator).collect(Collectors.toList());
    // This invokes by reflection the equivalent of:
    // return accumulatePerKey.flatMapValues(flatMapFunction);
    Method method = accumulatePerKey.getClass().getDeclaredMethod("flatMapValues", Function.class);
    Object result = method.invoke(accumulatePerKey, flatMapFunction);
    return (JavaPairRDD<K, WindowedValue<OutputT>>) result;
  } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) {
    throw new RuntimeException("Error invoking Spark flatMapValues", e);
  }
}
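For context, here is a minimal, self-contained sketch (not taken from Beam) of the Spark 3.x call that the reflection above reproduces. It assumes Spark 3.x on the classpath, where flatMapValues takes a FlatMapFunction<V, U>; under Spark 2.x the same call site needs a Function<V, Iterable<U>> instead, which is exactly the incompatibility tracked in SPARK-19287. The local master, application name, and sample data are placeholders.

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

import scala.Tuple2;

public class FlatMapValuesSketch {
  public static void main(String[] args) {
    JavaSparkContext jsc = new JavaSparkContext("local[*]", "flatMapValues-sketch");

    JavaPairRDD<String, Integer> pairs =
        jsc.parallelizePairs(Arrays.asList(new Tuple2<>("a", 2), new Tuple2<>("b", 3)));

    // Spark 3.x signature: flatMapValues takes a FlatMapFunction<V, U> whose call
    // returns an Iterator, mirroring the branch guarded by version "3" above.
    FlatMapFunction<Integer, Integer> expand = v -> Arrays.asList(v, v * 10).iterator();
    JavaPairRDD<String, Integer> expanded = pairs.flatMapValues(expand);

    expanded.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2()));
    jsc.stop();
  }
}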
Use of org.apache.spark.api.java.JavaPairRDD in project hive by apache.
The class SparkPlanGenerator, method generateMapInput.
@SuppressWarnings("unchecked")
private MapInput generateMapInput(SparkPlan sparkPlan, MapWork mapWork) throws Exception {
  JobConf jobConf = cloneJobConf(mapWork);
  Class ifClass = getInputFormat(jobConf, mapWork);
  sc.sc().setCallSite(CallSite.apply(mapWork.getName(), ""));
  JavaPairRDD<WritableComparable, Writable> hadoopRDD;
  if (mapWork.getNumMapTasks() != null) {
    jobConf.setNumMapTasks(mapWork.getNumMapTasks());
    hadoopRDD = sc.hadoopRDD(jobConf, ifClass, WritableComparable.class, Writable.class,
        mapWork.getNumMapTasks());
  } else {
    hadoopRDD = sc.hadoopRDD(jobConf, ifClass, WritableComparable.class, Writable.class);
  }
  boolean toCache = false;
  String tables = mapWork.getAllRootOperators().stream()
      .filter(op -> op instanceof TableScanOperator)
      .map(ts -> ((TableScanDesc) ts.getConf()).getAlias())
      .collect(Collectors.joining(", "));
  String rddName = mapWork.getName() + " (" + tables + ", " + hadoopRDD.getNumPartitions()
      + (toCache ? ", cached)" : ")");
  // Caching is disabled for MapInput due to HIVE-8920
  MapInput result = new MapInput(sparkPlan, hadoopRDD, toCache, rddName, mapWork);
  return result;
}
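The snippet below is a hedged, self-contained sketch (not from Hive) of the underlying JavaSparkContext.hadoopRDD call that generateMapInput relies on, using a plain TextInputFormat instead of the input format resolved from the MapWork; the input path and local master are placeholders.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class HadoopRddSketch {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local[*]", "hadoopRDD-sketch");

    // Configure the input path on a JobConf, standing in for what
    // cloneJobConf(mapWork) produces for a real Hive MapWork.
    JobConf jobConf = new JobConf();
    FileInputFormat.setInputPaths(jobConf, new Path("/tmp/input")); // placeholder path

    // hadoopRDD yields one (key, value) pair per record, typed by the InputFormat.
    JavaPairRDD<LongWritable, Text> lines =
        sc.hadoopRDD(jobConf, TextInputFormat.class, LongWritable.class, Text.class);

    System.out.println("partitions: " + lines.getNumPartitions());
    sc.stop();
  }
}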
Use of org.apache.spark.api.java.JavaPairRDD in project incubator-systemml by apache.
The class MLContextConversionUtil, method matrixObjectToBinaryBlockMatrix.
/**
* Convert a {@code MatrixObject} to a {@code BinaryBlockMatrix}.
*
* @param matrixObject
* the {@code MatrixObject}
* @param sparkExecutionContext
* the Spark execution context
* @return the {@code MatrixObject} converted to a {@code BinaryBlockMatrix}
*/
public static BinaryBlockMatrix matrixObjectToBinaryBlockMatrix(MatrixObject matrixObject,
    SparkExecutionContext sparkExecutionContext) {
  try {
    @SuppressWarnings("unchecked")
    JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlock =
        (JavaPairRDD<MatrixIndexes, MatrixBlock>) sparkExecutionContext
            .getRDDHandleForMatrixObject(matrixObject, InputInfo.BinaryBlockInputInfo);
    MatrixCharacteristics matrixCharacteristics = matrixObject.getMatrixCharacteristics();
    return new BinaryBlockMatrix(binaryBlock, matrixCharacteristics);
  } catch (DMLRuntimeException e) {
    throw new MLContextException("DMLRuntimeException while converting matrix object to BinaryBlockMatrix", e);
  }
}
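A short usage sketch, not from SystemML itself: it assumes a MatrixObject (matrixObject) and SparkExecutionContext (sec) are already available from a running MLContext session, and that BinaryBlockMatrix exposes its wrapped pair RDD via getBinaryBlocks(), as in the MLContext API of that SystemML generation.

// Hedged sketch: matrixObject and sec are assumed to come from an existing
// MLContext session; getBinaryBlocks() is assumed to return the wrapped RDD.
BinaryBlockMatrix bbm =
    MLContextConversionUtil.matrixObjectToBinaryBlockMatrix(matrixObject, sec);
JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = bbm.getBinaryBlocks();
System.out.println("binary blocks: " + blocks.count()); // one entry per (rowBlock, colBlock) index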
Use of org.apache.spark.api.java.JavaPairRDD in project incubator-systemml by apache.
The class ResultMergeRemoteSpark, method executeMerge.
@SuppressWarnings("unchecked")
protected RDDObject executeMerge(MatrixObject compare, MatrixObject[] inputs, String varname,
    long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException {
  String jobname = "ParFor-RMSP";
  long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
  SparkExecutionContext sec = (SparkExecutionContext) _ec;
  boolean withCompare = (compare != null);
  RDDObject ret = null;
  //determine degree of parallelism
  int numRed = (int) determineNumReducers(rlen, clen, brlen, bclen, _numReducers);
  //sanity check for empty src files
  if (inputs == null || inputs.length == 0)
    throw new DMLRuntimeException("Execute merge should never be called with no inputs.");
  try {
    //note: initial implementation via union over all result rdds discarded due to
    //stack overflow errors with many parfor tasks, and thus many rdds

    //Step 1: construct input rdd from all result files of parfor workers
    //a) construct job conf with all files
    InputInfo ii = InputInfo.BinaryBlockInputInfo;
    JobConf job = new JobConf(ResultMergeRemoteMR.class);
    job.setJobName(jobname);
    job.setInputFormat(ii.inputFormatClass);
    Path[] paths = new Path[inputs.length];
    for (int i = 0; i < paths.length; i++) {
      //ensure input exists on hdfs (e.g., if in-memory or RDD)
      inputs[i].exportData();
      paths[i] = new Path(inputs[i].getFileName());
      //update rdd handle to allow lazy evaluation by guarding
      //against cleanup of temporary result files
      setRDDHandleForMerge(inputs[i], sec);
    }
    FileInputFormat.setInputPaths(job, paths);

    //b) create rdd from input files w/ deep copy of keys and blocks
    JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = sec.getSparkContext()
        .hadoopRDD(job, ii.inputFormatClass, ii.inputKeyClass, ii.inputValueClass)
        .mapPartitionsToPair(new CopyBlockPairFunction(true), true);

    //Step 2a: merge with compare
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
    if (withCompare) {
      JavaPairRDD<MatrixIndexes, MatrixBlock> compareRdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
          sec.getRDDHandleForMatrixObject(compare, InputInfo.BinaryBlockInputInfo);
      //merge values which differ from compare values
      ResultMergeRemoteSparkWCompare cfun = new ResultMergeRemoteSparkWCompare();
      out = rdd.groupByKey(numRed) //group all result blocks per key
          .join(compareRdd)        //join compare block and result blocks
          .mapToPair(cfun);        //merge result blocks w/ compare
    } else { //Step 2b: merge without compare
      //direct merge in any order (disjointness guaranteed)
      out = RDDAggregateUtils.mergeByKey(rdd, false);
    }

    //Step 3: create output rdd handle w/ lineage
    ret = new RDDObject(out, varname);
    for (int i = 0; i < paths.length; i++)
      ret.addLineageChild(inputs[i].getRDDHandle());
    if (withCompare)
      ret.addLineageChild(compare.getRDDHandle());
  } catch (Exception ex) {
    throw new DMLRuntimeException(ex);
  }

  //maintain statistics
  Statistics.incrementNoOfCompiledSPInst();
  Statistics.incrementNoOfExecutedSPInst();
  if (DMLScript.STATISTICS) {
    Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
  }
  return ret;
}
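Below is a minimal, self-contained sketch (not from SystemML) of the Step 2a dataflow, using String/Integer pairs in place of MatrixIndexes/MatrixBlock. The merge rule shown, keeping the first result value that differs from the compare value, is a simplification standing in for ResultMergeRemoteSparkWCompare; all names and sample data are made up.

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

public class MergeWithCompareSketch {
  public static void main(String[] args) {
    JavaSparkContext jsc = new JavaSparkContext("local[*]", "merge-with-compare-sketch");

    // Worker results: several candidate values per key.
    JavaPairRDD<String, Integer> results = jsc.parallelizePairs(Arrays.asList(
        new Tuple2<>("b1", 5), new Tuple2<>("b1", 0), new Tuple2<>("b2", 7)));
    // Compare values: the original value per key before the parallel update.
    JavaPairRDD<String, Integer> compare = jsc.parallelizePairs(Arrays.asList(
        new Tuple2<>("b1", 0), new Tuple2<>("b2", 0)));

    // Same shape as Step 2a above: group result blocks per key, join the compare
    // block, then keep the first value that differs from the compare value.
    PairFunction<Tuple2<String, Tuple2<Iterable<Integer>, Integer>>, String, Integer> merge =
        t -> {
          Integer cmp = t._2()._2();
          Integer merged = cmp;
          for (Integer v : t._2()._1()) {
            if (!v.equals(cmp)) {
              merged = v;
              break;
            }
          }
          return new Tuple2<>(t._1(), merged);
        };

    JavaPairRDD<String, Integer> out =
        results.groupByKey(2).join(compare).mapToPair(merge);

    out.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2()));
    jsc.stop();
  }
}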
Use of org.apache.spark.api.java.JavaPairRDD in project tdi-studio-se by Talend.
The class TalendDStreamPairRDD, method saveAsHadoopDataset.
@Override
public void saveAsHadoopDataset(JobConf conf) {
  final JobConf config = conf;
  this.rdd.foreachRDD(new Function<JavaPairRDD<K, V>, Void>() {

    private static final long serialVersionUID = 1L;

    public Void call(JavaPairRDD<K, V> v1) throws Exception {
      v1.saveAsHadoopDataset(config);
      return null;
    }
  });
}
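As a complement, here is a hedged, self-contained sketch (not from the Talend code base) of the same saveAsHadoopDataset call on a batch JavaPairRDD. The JobConf must carry the output format, key/value classes, and output path before the call, just as the conf handed to this streaming wrapper would; the output path and local master are placeholders.

import java.util.Arrays;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class SaveAsHadoopDatasetSketch {
  public static void main(String[] args) {
    JavaSparkContext jsc = new JavaSparkContext("local[*]", "saveAsHadoopDataset-sketch");

    // The JobConf carries everything the old mapred API needs: output format,
    // key/value classes, and the output path.
    JobConf conf = new JobConf();
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(conf, new Path("/tmp/hadoop-dataset-out")); // placeholder path

    JavaPairRDD<Text, IntWritable> pairs = jsc
        .parallelizePairs(Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2)))
        .mapToPair(t -> new Tuple2<>(new Text(t._1()), new IntWritable(t._2())));

    // Same call that TalendDStreamPairRDD issues once per micro-batch.
    pairs.saveAsHadoopDataset(conf);
    jsc.stop();
  }
}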