use of scala.Tuple2 in project beam by apache.
the class SparkGroupAlsoByWindowViaWindowSet method groupAlsoByWindow.
public static <K, InputT, W extends BoundedWindow> JavaDStream<WindowedValue<KV<K, Iterable<InputT>>>> groupAlsoByWindow(JavaDStream<WindowedValue<KV<K, Iterable<WindowedValue<InputT>>>>> inputDStream, final Coder<K> keyCoder, final Coder<WindowedValue<InputT>> wvCoder, final WindowingStrategy<?, W> windowingStrategy, final SparkRuntimeContext runtimeContext, final List<Integer> sourceIds) {
final IterableCoder<WindowedValue<InputT>> itrWvCoder = IterableCoder.of(wvCoder);
final Coder<InputT> iCoder = ((FullWindowedValueCoder<InputT>) wvCoder).getValueCoder();
final Coder<? extends BoundedWindow> wCoder = ((FullWindowedValueCoder<InputT>) wvCoder).getWindowCoder();
final Coder<WindowedValue<KV<K, Iterable<InputT>>>> wvKvIterCoder = FullWindowedValueCoder.of(KvCoder.of(keyCoder, IterableCoder.of(iCoder)), wCoder);
final TimerInternals.TimerDataCoder timerDataCoder = TimerInternals.TimerDataCoder.of(windowingStrategy.getWindowFn().windowCoder());
long checkpointDurationMillis = runtimeContext.getPipelineOptions().as(SparkPipelineOptions.class).getCheckpointDurationMillis();
// we have to switch to Scala API to avoid Optional in the Java API, see: SPARK-4819.
// we also have a broader API for Scala (access to the actual key and entire iterator).
// we use coders to convert objects in the PCollection to byte arrays, so they
// can be transferred over the network for the shuffle and be in serialized form
// for checkpointing.
// for readability, we add comments with actual type next to byte[].
// to shorten line length, we use:
//---- WV: WindowedValue
//---- Iterable: Itr
//---- AccumT: A
//---- InputT: I
DStream<Tuple2<ByteArray, byte[]>> /*Itr<WV<I>>*/
pairDStream = inputDStream.transformToPair(new Function<JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<InputT>>>>>, JavaPairRDD<ByteArray, byte[]>>() {
// we use mapPartitions with the RDD API because its the only available API
// that allows to preserve partitioning.
@Override
public JavaPairRDD<ByteArray, byte[]> call(JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<InputT>>>>> rdd) throws Exception {
return rdd.mapPartitions(TranslationUtils.functionToFlatMapFunction(WindowingHelpers.<KV<K, Iterable<WindowedValue<InputT>>>>unwindowFunction()), true).mapPartitionsToPair(TranslationUtils.<K, Iterable<WindowedValue<InputT>>>toPairFlatMapFunction(), true).mapPartitionsToPair(TranslationUtils.pairFunctionToPairFlatMapFunction(CoderHelpers.toByteFunction(keyCoder, itrWvCoder)), true);
}
}).dstream();
PairDStreamFunctions<ByteArray, byte[]> pairDStreamFunctions = DStream.toPairDStreamFunctions(pairDStream, JavaSparkContext$.MODULE$.<ByteArray>fakeClassTag(), JavaSparkContext$.MODULE$.<byte[]>fakeClassTag(), null);
int defaultNumPartitions = pairDStreamFunctions.defaultPartitioner$default$1();
Partitioner partitioner = pairDStreamFunctions.defaultPartitioner(defaultNumPartitions);
// use updateStateByKey to scan through the state and update elements and timers.
DStream<Tuple2<ByteArray, Tuple2<StateAndTimers, List<byte[]>>>> /*WV<KV<K, Itr<I>>>*/
firedStream = pairDStreamFunctions.updateStateByKey(new SerializableFunction1<scala.collection.Iterator<Tuple3</*K*/
ByteArray, Seq<byte[]>, Option<Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>>>>, scala.collection.Iterator<Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>>>>() {
@Override
public scala.collection.Iterator<Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>>> apply(final scala.collection.Iterator<Tuple3</*K*/
ByteArray, Seq<byte[]>, Option<Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>>>> iter) {
//--- ACTUAL STATEFUL OPERATION:
//
// Input Iterator: the partition (~bundle) of a cogrouping of the input
// and the previous state (if exists).
//
// Output Iterator: the output key, and the updated state.
//
// possible input scenarios for (K, Seq, Option<S>):
// (1) Option<S>.isEmpty: new data with no previous state.
// (2) Seq.isEmpty: no new data, but evaluating previous state (timer-like behaviour).
// (3) Seq.nonEmpty && Option<S>.isDefined: new data with previous state.
final SystemReduceFn<K, InputT, Iterable<InputT>, Iterable<InputT>, W> reduceFn = SystemReduceFn.buffering(((FullWindowedValueCoder<InputT>) wvCoder).getValueCoder());
final OutputWindowedValueHolder<K, InputT> outputHolder = new OutputWindowedValueHolder<>();
// use in memory Aggregators since Spark Accumulators are not resilient
// in stateful operators, once done with this partition.
final MetricsContainerImpl cellProvider = new MetricsContainerImpl("cellProvider");
final CounterCell droppedDueToClosedWindow = cellProvider.getCounter(MetricName.named(SparkGroupAlsoByWindowViaWindowSet.class, GroupAlsoByWindowsAggregators.DROPPED_DUE_TO_CLOSED_WINDOW_COUNTER));
final CounterCell droppedDueToLateness = cellProvider.getCounter(MetricName.named(SparkGroupAlsoByWindowViaWindowSet.class, GroupAlsoByWindowsAggregators.DROPPED_DUE_TO_LATENESS_COUNTER));
AbstractIterator<Tuple2<ByteArray, Tuple2<StateAndTimers, List<byte[]>>>> /*WV<KV<K, Itr<I>>>*/
outIter = new AbstractIterator<Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>>>() {
@Override
protected Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>> computeNext() {
// (possibly) previous-state and (possibly) new data.
while (iter.hasNext()) {
// for each element in the partition:
Tuple3<ByteArray, Seq<byte[]>, Option<Tuple2<StateAndTimers, List<byte[]>>>> next = iter.next();
ByteArray encodedKey = next._1();
K key = CoderHelpers.fromByteArray(encodedKey.getValue(), keyCoder);
Seq<byte[]> seq = next._2();
Option<Tuple2<StateAndTimers, List<byte[]>>> prevStateAndTimersOpt = next._3();
SparkStateInternals<K> stateInternals;
SparkTimerInternals timerInternals = SparkTimerInternals.forStreamFromSources(sourceIds, GlobalWatermarkHolder.get());
// get state(internals) per key.
if (prevStateAndTimersOpt.isEmpty()) {
// no previous state.
stateInternals = SparkStateInternals.forKey(key);
} else {
// with pre-existing state.
StateAndTimers prevStateAndTimers = prevStateAndTimersOpt.get()._1();
stateInternals = SparkStateInternals.forKeyAndState(key, prevStateAndTimers.getState());
Collection<byte[]> serTimers = prevStateAndTimers.getTimers();
timerInternals.addTimers(SparkTimerInternals.deserializeTimers(serTimers, timerDataCoder));
}
ReduceFnRunner<K, InputT, Iterable<InputT>, W> reduceFnRunner = new ReduceFnRunner<>(key, windowingStrategy, ExecutableTriggerStateMachine.create(TriggerStateMachines.stateMachineForTrigger(TriggerTranslation.toProto(windowingStrategy.getTrigger()))), stateInternals, timerInternals, outputHolder, new UnsupportedSideInputReader("GroupAlsoByWindow"), reduceFn, runtimeContext.getPipelineOptions());
// clear before potential use.
outputHolder.clear();
if (!seq.isEmpty()) {
// new input for key.
try {
Iterable<WindowedValue<InputT>> elementsIterable = CoderHelpers.fromByteArray(seq.head(), itrWvCoder);
Iterable<WindowedValue<InputT>> validElements = LateDataUtils.dropExpiredWindows(key, elementsIterable, timerInternals, windowingStrategy, droppedDueToLateness);
reduceFnRunner.processElements(validElements);
} catch (Exception e) {
throw new RuntimeException("Failed to process element with ReduceFnRunner", e);
}
} else if (stateInternals.getState().isEmpty()) {
// no input and no state -> GC evict now.
continue;
}
try {
// advance the watermark to HWM to fire by timers.
timerInternals.advanceWatermark();
// call on timers that are ready.
reduceFnRunner.onTimers(timerInternals.getTimersReadyToProcess());
} catch (Exception e) {
throw new RuntimeException("Failed to process ReduceFnRunner onTimer.", e);
}
// this is mostly symbolic since actual persist is done by emitting output.
reduceFnRunner.persist();
// obtain output, if fired.
List<WindowedValue<KV<K, Iterable<InputT>>>> outputs = outputHolder.get();
if (!outputs.isEmpty() || !stateInternals.getState().isEmpty()) {
StateAndTimers updated = new StateAndTimers(stateInternals.getState(), SparkTimerInternals.serializeTimers(timerInternals.getTimers(), timerDataCoder));
// persist Spark's state by outputting.
List<byte[]> serOutput = CoderHelpers.toByteArrays(outputs, wvKvIterCoder);
return new Tuple2<>(encodedKey, new Tuple2<>(updated, serOutput));
}
// an empty state with no output, can be evicted completely - do nothing.
}
return endOfData();
}
};
// log if there's something to log.
long lateDropped = droppedDueToLateness.getCumulative();
if (lateDropped > 0) {
LOG.info(String.format("Dropped %d elements due to lateness.", lateDropped));
droppedDueToLateness.inc(-droppedDueToLateness.getCumulative());
}
long closedWindowDropped = droppedDueToClosedWindow.getCumulative();
if (closedWindowDropped > 0) {
LOG.info(String.format("Dropped %d elements due to closed window.", closedWindowDropped));
droppedDueToClosedWindow.inc(-droppedDueToClosedWindow.getCumulative());
}
return scala.collection.JavaConversions.asScalaIterator(outIter);
}
}, partitioner, true, JavaSparkContext$.MODULE$.<Tuple2<StateAndTimers, List<byte[]>>>fakeClassTag());
if (checkpointDurationMillis > 0) {
firedStream.checkpoint(new Duration(checkpointDurationMillis));
}
// go back to Java now.
JavaPairDStream<ByteArray, Tuple2<StateAndTimers, List<byte[]>>> /*WV<KV<K, Itr<I>>>*/
javaFiredStream = JavaPairDStream.fromPairDStream(firedStream, JavaSparkContext$.MODULE$.<ByteArray>fakeClassTag(), JavaSparkContext$.MODULE$.<Tuple2<StateAndTimers, List<byte[]>>>fakeClassTag());
// filter state-only output (nothing to fire) and remove the state from the output.
return javaFiredStream.filter(new Function<Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>>, Boolean>() {
@Override
public Boolean call(Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>> t2) throws Exception {
// filter output if defined.
return !t2._2()._2().isEmpty();
}
}).flatMap(new FlatMapFunction<Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>>, WindowedValue<KV<K, Iterable<InputT>>>>() {
@Override
public Iterable<WindowedValue<KV<K, Iterable<InputT>>>> call(Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>> t2) throws Exception {
// return in serialized form.
return CoderHelpers.fromByteArrays(t2._2()._2(), wvKvIterCoder);
}
});
}
use of scala.Tuple2 in project incubator-systemml by apache.
the class DataTransform method spDataTransform.
public static void spDataTransform(ParameterizedBuiltinSPInstruction inst, FrameObject[] inputs, MatrixObject[] outputs, ExecutionContext ec) throws Exception {
SparkExecutionContext sec = (SparkExecutionContext) ec;
// Parse transform instruction (the first instruction) to obtain relevant fields
TransformOperands oprnds = new TransformOperands(inst.getParams(), inputs[0]);
JobConf job = new JobConf();
FileSystem fs = IOUtilFunctions.getFileSystem(inputs[0].getFileName());
checkIfOutputOverlapsWithTxMtd(oprnds.txMtdPath, outputs[0].getFileName(), fs);
// find the first file in alphabetical ordering of partfiles in directory inputPath
String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
// find column names and construct output header
String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
int numColumns = colNamesToIds.size();
String outHeader = getOutputHeader(fs, headerLine, oprnds);
String tmpPath = MRJobConfiguration.constructTempOutputFilename();
// Construct RDD for input data
@SuppressWarnings("unchecked") JavaPairRDD<LongWritable, Text> inputData = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForFrameObject(inputs[0], InputInfo.CSVInputInfo);
JavaRDD<Tuple2<LongWritable, Text>> csvLines = JavaPairRDD.toRDD(inputData).toJavaRDD();
long numRowsTf = 0, numColumnsTf = 0;
JavaPairRDD<Long, String> tfPairRDD = null;
if (!oprnds.isApply) {
// build specification file with column IDs insteadof column names
String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
// enable GC on colNamesToIds
colNamesToIds = null;
// Build transformation metadata, including recode maps, bin definitions, etc.
// Also, generate part offsets file (counters file), which is to be used in csv-reblock (if needed)
String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
numRowsTf = GenTfMtdSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, partOffsetsFile, oprnds.inputCSVProperties, numColumns, outHeader);
// store the specFileWithIDs as transformation metadata
MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
} else {
// enable GC on colNamesToIds
colNamesToIds = null;
// copy given transform metadata (applyTxPath) to specified location (txMtdPath)
MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
// path to specification file
String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
// Apply transformation metadata, and perform actual transformation
tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
}
// copy auxiliary data (old and new header lines) from temporary location to txMtdPath
moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);
// convert to csv output format (serialized longwritable/text)
JavaPairRDD<LongWritable, Text> outtfPairRDD = RDDConverterUtils.stringToSerializableText(tfPairRDD);
if (outtfPairRDD != null) {
MatrixObject outMO = outputs[0];
String outVar = outMO.getVarName();
outMO.setRDDHandle(new RDDObject(outtfPairRDD, outVar));
sec.addLineageRDD(outVar, inst.getParams().get("target"));
//update output statistics (required for correctness)
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(outVar);
mcOut.setDimension(numRowsTf, numColumnsTf);
mcOut.setNonZeros(-1);
}
}
use of scala.Tuple2 in project incubator-systemml by apache.
the class ApplyTfCSVSPARK method runSparkJob.
/**
* Apply transformation metadata and generate the result in CSV format, as a
* JavaRDD of Strings.
*
* @param sec spark execution context
* @param inputRDD input rdd
* @param tfMtdPath transform metadata path
* @param spec transform specification as json string
* @param tmpPath temporary file path
* @param prop csv file format properties
* @param numCols number of columns
* @param headerLine header line
* @return JavaPairRDD of long-strings
* @throws IOException if IOException occurs
* @throws ClassNotFoundException if ClassNotFoundException occurs
* @throws InterruptedException if InterruptedException occurs
* @throws IllegalArgumentException if IllegalArgumentException occurs
* @throws JSONException if JSONException occurs
*/
public static JavaPairRDD<Long, String> runSparkJob(SparkExecutionContext sec, JavaRDD<Tuple2<LongWritable, Text>> inputRDD, String tfMtdPath, String spec, String tmpPath, CSVFileFormatProperties prop, int numCols, String headerLine) throws IOException, ClassNotFoundException, InterruptedException, IllegalArgumentException, JSONException {
// Load transformation metadata and broadcast it
String[] naStrings = TfUtils.parseNAStrings(prop.getNAStrings());
JSONObject jspec = new JSONObject(spec);
TfUtils _tfmapper = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), naStrings, jspec, numCols, tfMtdPath, null, tmpPath);
_tfmapper.loadTfMetadata();
Broadcast<TfUtils> bcast_tf = sec.getSparkContext().broadcast(_tfmapper);
/*
* Construct transformation metadata (map-side) -- the logic is similar
* to GTFMTDMapper
*
* Note: The result of mapPartitionsWithIndex is cached so that the
* transformed data is not redundantly computed multiple times
*/
JavaPairRDD<Long, String> applyRDD = inputRDD.mapPartitionsWithIndex(new ApplyTfCSVMap(bcast_tf), true).mapToPair(new PairFunction<String, Long, String>() {
private static final long serialVersionUID = 3868143093999082931L;
@Override
public Tuple2<Long, String> call(String t) throws Exception {
return new Tuple2<Long, String>(new Long(1), t);
}
}).cache();
/*
* An action to force execution of apply()
*
* We need to trigger the execution of this RDD so as to ensure the
* creation of a few metadata files (headers, dummycoded information,
* etc.), which are referenced in the caller function.
*/
applyRDD.count();
return applyRDD;
}
use of scala.Tuple2 in project geode by apache.
the class RDDSaveJavaDemo method main.
public static void main(String[] argv) {
if (argv.length != 1) {
System.err.printf("Usage: RDDSaveJavaDemo <locators>\n");
return;
}
SparkConf conf = new SparkConf().setAppName("RDDSaveJavaDemo");
conf.set(GeodeLocatorPropKey, argv[0]);
JavaSparkContext sc = new JavaSparkContext(conf);
List<String> data = new ArrayList<String>();
data.add("abcdefg");
data.add("abcdefgh");
data.add("abcdefghi");
JavaRDD<String> rdd = sc.parallelize(data);
GeodeConnectionConf connConf = GeodeConnectionConf.apply(conf);
PairFunction<String, String, Integer> func = new PairFunction<String, String, Integer>() {
@Override
public Tuple2<String, Integer> call(String s) throws Exception {
return new Tuple2<String, Integer>(s, s.length());
}
};
javaFunctions(rdd).saveToGeode("str_int_region", func, connConf);
sc.stop();
}
use of scala.Tuple2 in project gora by apache.
the class SparkWordCount method wordCount.
public int wordCount(DataStore<String, WebPage> inStore, DataStore<String, TokenDatum> outStore) throws IOException {
//Spark engine initialization
GoraSparkEngine<String, WebPage> goraSparkEngine = new GoraSparkEngine<>(String.class, WebPage.class);
SparkConf sparkConf = new SparkConf().setAppName("Gora Spark Word Count Application").setMaster("local");
Class[] c = new Class[1];
c[0] = inStore.getPersistentClass();
sparkConf.registerKryoClasses(c);
//
JavaSparkContext sc = new JavaSparkContext(sparkConf);
JavaPairRDD<String, WebPage> goraRDD = goraSparkEngine.initialize(sc, inStore);
long count = goraRDD.count();
log.info("Total Web page count: {}", count);
JavaRDD<Tuple2<String, Long>> mappedGoraRdd = goraRDD.values().map(mapFunc);
JavaPairRDD<String, Long> reducedGoraRdd = JavaPairRDD.fromJavaRDD(mappedGoraRdd).reduceByKey(redFunc);
//Print output for debug purpose
log.info("SparkWordCount debug purpose TokenDatum print starts:");
Map<String, Long> tokenDatumMap = reducedGoraRdd.collectAsMap();
for (String key : tokenDatumMap.keySet()) {
log.info(key);
log.info(tokenDatumMap.get(key).toString());
}
log.info("SparkWordCount debug purpose TokenDatum print ends:");
//
//write output to datastore
Configuration sparkHadoopConf = goraSparkEngine.generateOutputConf(outStore);
reducedGoraRdd.saveAsNewAPIHadoopDataset(sparkHadoopConf);
return 1;
}
Aggregations