Use of org.apache.spark.api.java.JavaPairRDD in project incubator-systemml by apache.
The class WriteSPInstruction, method processFrameWriteInstruction.
@SuppressWarnings("unchecked")
protected void processFrameWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi, ValueType[] schema)
    throws IOException
{
    // get input rdd
    JavaPairRDD<Long, FrameBlock> in1 = (JavaPairRDD<Long, FrameBlock>)
        sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());

    if (oi == OutputInfo.TextCellOutputInfo) {
        JavaRDD<String> out = FrameRDDConverterUtils.binaryBlockToTextCell(in1, mc);
        customSaveTextFile(out, fname, false);
    }
    else if (oi == OutputInfo.CSVOutputInfo) {
        CSVFileFormatProperties props = (CSVFileFormatProperties) formatProperties; // may be null
        JavaRDD<String> out = FrameRDDConverterUtils.binaryBlockToCsv(in1, mc, props, true);
        customSaveTextFile(out, fname, false);
    }
    else if (oi == OutputInfo.BinaryBlockOutputInfo) {
        JavaPairRDD<LongWritable, FrameBlock> out = in1.mapToPair(new LongFrameToLongWritableFrameFunction());
        out.saveAsHadoopFile(fname, LongWritable.class, FrameBlock.class, SequenceFileOutputFormat.class);
    }
    else {
        // unsupported formats: binarycell (not externalized)
        throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
    }

    // write meta data file
    MapReduceTool.writeMetaDataFile(fname + ".mtd", input1.getValueType(), schema, DataType.FRAME, mc, oi, formatProperties);
}
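For reference, a minimal hedged sketch of the binary-block branch in isolation; the helper name, the frames RDD, and the output path are illustrative assumptions, while LongWritable, SequenceFileOutputFormat, and SystemML's FrameBlock are the same types used above.

// Hypothetical helper, not from the project: wraps the Long keys as LongWritable
// (the job LongFrameToLongWritableFrameFunction does above) and writes a sequence file.
static void writeFrameBinaryBlocks(JavaPairRDD<Long, FrameBlock> frames, String fname) {
    JavaPairRDD<LongWritable, FrameBlock> out =
        frames.mapToPair(t -> new Tuple2<>(new LongWritable(t._1()), t._2()));
    out.saveAsHadoopFile(fname, LongWritable.class, FrameBlock.class, SequenceFileOutputFormat.class);
}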
Use of org.apache.spark.api.java.JavaPairRDD in project incubator-systemml by apache.
The class SparkUtils, method getEmptyBlockRDD.
/**
 * Creates an RDD of empty blocks according to the given matrix characteristics. This is
 * done in a scalable manner by parallelizing the block ranges and generating the empty
 * blocks in a distributed fashion, while respecting preferred output partition sizes.
 *
 * @param sc spark context
 * @param mc matrix characteristics
 * @return pair rdd of empty matrix blocks
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> getEmptyBlockRDD(JavaSparkContext sc, MatrixCharacteristics mc) {
    // compute degree of parallelism and block ranges
    long size = mc.getNumBlocks() * OptimizerUtils.estimateSizeEmptyBlock(
        Math.min(Math.max(mc.getRows(), 1), mc.getRowsPerBlock()),
        Math.min(Math.max(mc.getCols(), 1), mc.getColsPerBlock()));
    int par = (int) Math.min(Math.max(SparkExecutionContext.getDefaultParallelism(true),
        Math.ceil(size / InfrastructureAnalyzer.getHDFSBlockSize())), mc.getNumBlocks());
    long pNumBlocks = (long) Math.ceil((double) mc.getNumBlocks() / par);

    // generate block offsets per partition
    List<Long> offsets = LongStream.iterate(0, n -> n + pNumBlocks)
        .limit(par).boxed().collect(Collectors.toList());

    // parallelize offsets and generate all empty blocks
    return (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.parallelize(offsets, par)
        .flatMapToPair(new GenerateEmptyBlocks(mc, pNumBlocks));
}
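As a hedged usage sketch (sc is assumed to be an existing JavaSparkContext and the dimensions are arbitrary), requesting empty blocks for a 10,000 x 10,000 matrix in 1,000 x 1,000 blocks should yield a 10 x 10 grid of blocks.

// Illustrative driver code, not from the project.
MatrixCharacteristics mc = new MatrixCharacteristics(10000, 10000, 1000, 1000);
JavaPairRDD<MatrixIndexes, MatrixBlock> empty = SparkUtils.getEmptyBlockRDD(sc, mc);
System.out.println(empty.count()); // expected: 100 empty blocks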
Use of org.apache.spark.api.java.JavaPairRDD in project mmtf-spark by sbl-sdsc.
The class StructureToBioJavaTest, method test.
@Test
public void test() throws IOException {
    List<String> pdbIds = Arrays.asList("1STP", "4HHB", "1JLP", "5X6H", "5L2G", "2MK1");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();

    // 1STP: 1 L-protein chain
    // 4HHB: 4 polymer chains
    // 1JLP: 1 L-protein chain with non-polymer capping group (NH2)
    // 5X6H: 1 L-protein and 1 DNA chain
    // 5L2G: 2 DNA chains
    // 2MK1: 0 polymer chains
    // --------------------
    // tot : 10 polymer chains
    JavaDoubleRDD chainCounts = pdb
        .mapValues(new StructureToBioJava())
        .values()
        .mapToDouble(v -> v.getPolyChains().size());
    assertEquals(10, Math.round(chainCounts.sum()));

    // extract polymer chains and count chains again
    chainCounts = pdb
        .flatMapToPair(new StructureToPolymerChains())
        .mapValues(new StructureToBioJava())
        .values()
        .mapToDouble(v -> v.getChains().size());
    assertEquals(10, Math.round(chainCounts.sum()));
}
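As a hedged follow-up (not part of the original test), the per-chain keys produced by StructureToPolymerChains can be inspected directly; in mmtf-spark these keys typically combine the structure and chain identifiers.

// Illustrative only: print the chain-level keys, e.g. "1STP.A", "4HHB.A", "4HHB.B", ...
JavaPairRDD<String, StructureDataInterface> chains =
    pdb.flatMapToPair(new StructureToPolymerChains());
chains.keys().collect().forEach(System.out::println);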
Use of org.apache.spark.api.java.JavaPairRDD in project java_study by aloyschen.
The class RDD, method java_pair.
/*
 * Spark pairRDD example
 */
public void java_pair() {
    JavaSparkContext sc = getSc();
    sc.setLogLevel("ERROR");
    JavaRDD<String> lines = sc.parallelize(Arrays.asList("I am boy", "you are cold", "I am learning"));
    // key each line by its first word
    JavaPairRDD<String, String> pairRDD = lines.mapToPair(s -> new Tuple2<>(s.split(" ")[0], s));
    pairRDD.foreach(line -> System.out.println("key is " + line._1));
}
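A natural hedged extension of this example is to aggregate by key with standard JavaPairRDD operations; the sketch below would count how many sentences start with each word if appended inside the method above.

// Sketch: mapValues/reduceByKey are stock JavaPairRDD operations.
JavaPairRDD<String, Integer> counts = pairRDD.mapValues(v -> 1).reduceByKey(Integer::sum);
counts.collect().forEach(t -> System.out.println(t._1 + ": " + t._2)); // e.g. "I: 2", "you: 1"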
Use of org.apache.spark.api.java.JavaPairRDD in project cdap by caskdata.
The class SparkBatchSourceFactory, method createInputRDD.
@SuppressWarnings("unchecked")
private <K, V> JavaPairRDD<K, V> createInputRDD(JavaSparkExecutionContext sec, JavaSparkContext jsc, String inputName, Class<K> keyClass, Class<V> valueClass) {
    if (streams.containsKey(inputName)) {
        Input.StreamInput streamInput = streams.get(inputName);
        FormatSpecification formatSpec = streamInput.getBodyFormatSpec();
        if (formatSpec != null) {
            return (JavaPairRDD<K, V>) sec.fromStream(streamInput.getName(), formatSpec,
                streamInput.getStartTime(), streamInput.getEndTime(), StructuredRecord.class);
        }
        String decoderType = streamInput.getDecoderType();
        if (decoderType == null) {
            return (JavaPairRDD<K, V>) sec.fromStream(streamInput.getName(),
                streamInput.getStartTime(), streamInput.getEndTime(), valueClass);
        } else {
            try {
                Class<StreamEventDecoder<K, V>> decoderClass = (Class<StreamEventDecoder<K, V>>)
                    Thread.currentThread().getContextClassLoader().loadClass(decoderType);
                return sec.fromStream(streamInput.getName(), streamInput.getStartTime(),
                    streamInput.getEndTime(), decoderClass, keyClass, valueClass);
            } catch (Exception e) {
                throw Throwables.propagate(e);
            }
        }
    }
    if (inputFormatProviders.containsKey(inputName)) {
        InputFormatProvider inputFormatProvider = inputFormatProviders.get(inputName);
        Configuration hConf = new Configuration();
        hConf.clear();
        for (Map.Entry<String, String> entry : inputFormatProvider.getInputFormatConfiguration().entrySet()) {
            hConf.set(entry.getKey(), entry.getValue());
        }
        ClassLoader classLoader = Objects.firstNonNull(Thread.currentThread().getContextClassLoader(),
            getClass().getClassLoader());
        try {
            @SuppressWarnings("unchecked")
            Class<InputFormat> inputFormatClass = (Class<InputFormat>) classLoader.loadClass(inputFormatProvider.getInputFormatClassName());
            return jsc.newAPIHadoopRDD(hConf, inputFormatClass, keyClass, valueClass);
        } catch (ClassNotFoundException e) {
            throw Throwables.propagate(e);
        }
    }
    if (datasetInfos.containsKey(inputName)) {
        DatasetInfo datasetInfo = datasetInfos.get(inputName);
        return sec.fromDataset(datasetInfo.getDatasetName(), datasetInfo.getDatasetArgs());
    }
    // should never happen: the static create() methods guarantee that exactly one source type is specified
    throw new IllegalStateException("Unknown source type");
}
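To see the Hadoop-input branch in isolation, here is a minimal hedged sketch that feeds Spark's newAPIHadoopRDD a plain TextInputFormat (from org.apache.hadoop.mapreduce.lib.input); the input path and the jsc context are illustrative assumptions, not part of the project.

// Hypothetical standalone usage of the same Spark API the method calls above.
Configuration hConf = new Configuration();
hConf.set(FileInputFormat.INPUT_DIR, "hdfs:///data/input"); // placeholder path
JavaPairRDD<LongWritable, Text> lines =
    jsc.newAPIHadoopRDD(hConf, TextInputFormat.class, LongWritable.class, Text.class);
lines.values().take(5).forEach(t -> System.out.println(t.toString()));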