Use of org.apache.spark.broadcast.Broadcast in project bunsen by cerner.
The class BroadcastableMappings, method broadcast.
/**
* Broadcast the mappings stored in the given conceptMaps instance that match the given
* concept map URIs and versions.
*
* @param conceptMaps the {@link ConceptMaps} instance with the content to broadcast
* @param conceptMapUriToVersion map from each concept map URI to broadcast to the version to load.
* @return a broadcast variable containing a mappings object usable in UDFs.
*/
public static Broadcast<BroadcastableMappings> broadcast(ConceptMaps conceptMaps,
    Map<String, String> conceptMapUriToVersion) {

  Map<String, ConceptMap> mapsToLoad = conceptMaps.getMaps()
      .collectAsList()
      .stream()
      .filter(conceptMap ->
          conceptMap.getVersion().equals(conceptMapUriToVersion.get(conceptMap.getUrl())))
      .collect(Collectors.toMap(ConceptMap::getUrl, Function.identity()));

  // Expand the concept maps to load and sort them so dependencies are before
  // their dependents in the list.
  List<String> sortedMapsToLoad = sortMapsToLoad(conceptMapUriToVersion.keySet(), mapsToLoad);

  // Since this is used to map from one system to another, we use only targets
  // that don't introduce inaccurate meanings. (For instance, we can't map a
  // general condition code to a more specific type, since that is not
  // representative of the source data.)
  Dataset<Mapping> mappings = conceptMaps.getMappings(conceptMapUriToVersion)
      .filter("equivalence in ('equivalent', 'equals', 'wider', 'subsumes')");

  // Group the mappings by their concept map URI.
  Map<String, List<Mapping>> groupedMappings = mappings.collectAsList()
      .stream()
      .collect(Collectors.groupingBy(Mapping::getConceptMapUri));

  Map<String, BroadcastableConceptMap> broadcastableMaps = new HashMap<>();

  // Maps are sorted so each map's children are already built when we reach it.
  for (String conceptMapUri : sortedMapsToLoad) {

    ConceptMap map = mapsToLoad.get(conceptMapUri);

    Set<String> children = getMapChildren(map);

    List<BroadcastableConceptMap> childMaps = children.stream()
        .map(broadcastableMaps::get)
        .collect(Collectors.toList());

    BroadcastableConceptMap broadcastableConceptMap = new BroadcastableConceptMap(conceptMapUri,
        groupedMappings.getOrDefault(conceptMapUri, Collections.emptyList()),
        childMaps);

    broadcastableMaps.put(conceptMapUri, broadcastableConceptMap);
  }

  JavaSparkContext ctx =
      new JavaSparkContext(conceptMaps.getMaps().sparkSession().sparkContext());

  return ctx.broadcast(new BroadcastableMappings(broadcastableMaps));
}
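For context, executors read a broadcast variable back with Broadcast#value(), which is what makes the returned object "usable in UDFs" as the Javadoc says. Below is a minimal consumption sketch; it assumes conceptMaps and conceptMapUriToVersion are in scope, and the getBroadcastConceptMap and getTarget accessors plus the map URI are hypothetical names for illustration, not necessarily Bunsen's actual API.

import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF2;
import org.apache.spark.sql.types.DataTypes;

SparkSession spark = SparkSession.builder().getOrCreate();

Broadcast<BroadcastableMappings> mappings =
    BroadcastableMappings.broadcast(conceptMaps, conceptMapUriToVersion);

// Register a UDF that resolves codes against the broadcast mappings. The
// value() call is cheap on executors: each holds a single cached copy.
spark.udf().register("translate_code",
    (UDF2<String, String, String>) (system, code) ->
        mappings.value()
            .getBroadcastConceptMap("urn:example:map") // hypothetical accessor and URI
            .getTarget(system, code),                  // hypothetical lookup method
    DataTypes.StringType);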
Use of org.apache.spark.broadcast.Broadcast in project pyramid by cheng-li.
The class SparkCBMOptimizer, method updateBinaryClassifiers.
private void updateBinaryClassifiers() {
  if (logger.isDebugEnabled()) {
    logger.debug("start updateBinaryClassifiers");
  }

  // Copy instance fields into local variables so the Spark lambda below
  // captures only these locals rather than serializing the whole optimizer.
  Classifier.ProbabilityEstimator[][] localBinaryClassifiers = cbm.binaryClassifiers;
  double[][] localGammasT = gammasT;
  Broadcast<MultiLabelClfDataSet> localDataSetBroadcast = dataSetBroadCast;
  Broadcast<double[][][]> localTargetsBroadcast = targetDisBroadCast;
  double localVariance = priorVarianceBinary;

  // Build one task per (component, label) pair.
  List<BinaryTask> binaryTaskList = new ArrayList<>();
  for (int k = 0; k < cbm.numComponents; k++) {
    for (int l = 0; l < cbm.numLabels; l++) {
      LogisticRegression logisticRegression = (LogisticRegression) localBinaryClassifiers[k][l];
      double[] weights = localGammasT[k];
      binaryTaskList.add(new BinaryTask(k, l, logisticRegression, weights));
    }
  }

  // One partition per task so each binary problem is trained in parallel.
  JavaRDD<BinaryTask> binaryTaskRDD =
      sparkContext.parallelize(binaryTaskList, binaryTaskList.size());

  List<BinaryTaskResult> results = binaryTaskRDD.map(binaryTask -> {
    int labelIndex = binaryTask.classIndex;
    // Each element in the RDD should contain its full information.
    return updateBinaryLogisticRegression(binaryTask.componentIndex, binaryTask.classIndex,
        binaryTask.logisticRegression, localDataSetBroadcast.value(), binaryTask.weights,
        localTargetsBroadcast.value()[labelIndex], localVariance);
  }).collect();

  for (BinaryTaskResult result : results) {
    cbm.binaryClassifiers[result.componentIndex][result.classIndex] = result.binaryClassifier;
  }

  // IntStream.range(0, cbm.numComponents).forEach(this::updateBinaryClassifiers);
  if (logger.isDebugEnabled()) {
    logger.debug("finish updateBinaryClassifiers");
  }
}
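A note on the pattern above: the Broadcast handles captured by the lambda are lightweight references, so each task calls value() to read the executor's cached copy of the dataset and targets instead of receiving them inside every serialized closure. The broadcast fields themselves would have been created once up front with JavaSparkContext#broadcast; a plausible sketch, where dataSet and targetsDistribution are assumed names for the driver-side data:

// Presumed one-time setup on the driver (variable names on the right-hand
// side are assumptions): broadcast the training data once so every task
// reads a per-executor cached copy.
Broadcast<MultiLabelClfDataSet> dataSetBroadCast = sparkContext.broadcast(dataSet);
Broadcast<double[][][]> targetDisBroadCast = sparkContext.broadcast(targetsDistribution);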
Use of org.apache.spark.broadcast.Broadcast in project beam by apache.
The class SparkBatchPortablePipelineTranslator, method broadcastSideInputs.
/**
* Broadcast the side inputs of an executable stage. *This can be expensive.*
*
* @return Map from PCollection ID to Spark broadcast variable and coder to decode its contents.
*/
private static <SideInputT>
    ImmutableMap<String, Tuple2<Broadcast<List<byte[]>>, WindowedValueCoder<SideInputT>>>
    broadcastSideInputs(
        RunnerApi.ExecutableStagePayload stagePayload, SparkTranslationContext context) {

  Map<String, Tuple2<Broadcast<List<byte[]>>, WindowedValueCoder<SideInputT>>>
      broadcastVariables = new HashMap<>();

  for (SideInputId sideInputId : stagePayload.getSideInputsList()) {
    RunnerApi.Components stagePayloadComponents = stagePayload.getComponents();
    String collectionId =
        stagePayloadComponents
            .getTransformsOrThrow(sideInputId.getTransformId())
            .getInputsOrThrow(sideInputId.getLocalName());
    if (broadcastVariables.containsKey(collectionId)) {
      // This PCollection has already been broadcast.
      continue;
    }
    Tuple2<Broadcast<List<byte[]>>, WindowedValueCoder<SideInputT>> tuple2 =
        broadcastSideInput(collectionId, stagePayloadComponents, context);
    broadcastVariables.put(collectionId, tuple2);
  }
  return ImmutableMap.copyOf(broadcastVariables);
}
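The per-collection helper broadcastSideInput(...) is not shown on this page. Stripped of Beam's plumbing, broadcasting one side input comes down to collecting the PCollection's coder-encoded elements on the driver and handing the list to JavaSparkContext#broadcast; the coder travels alongside in the Tuple2 so executors can decode the bytes. A rough, self-contained sketch of just that step, with a hypothetical method name:

import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

// Rough sketch of the broadcast step only; Beam's real helper also returns
// the WindowedValueCoder needed to decode these bytes on the executors.
static Broadcast<List<byte[]>> broadcastEncodedElements(
    JavaSparkContext jsc, JavaRDD<byte[]> encodedElements) {
  // Materializing the elements on the driver is the expensive part the
  // Javadoc warns about.
  List<byte[]> collected = encodedElements.collect();
  return jsc.broadcast(collected);
}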