Use of org.apache.spark.sql.Dataset in project systemml by apache.
The class FrameConverterTest, method runConverter.
@SuppressWarnings("unchecked")
private static void runConverter(ConvType type, MatrixCharacteristics mc, MatrixCharacteristics mcMatrix, List<ValueType> schema, String fnameIn, String fnameOut) throws IOException {
SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
JavaSparkContext sc = sec.getSparkContext();
ValueType[] lschema = schema.toArray(new ValueType[0]);
MapReduceTool.deleteFileIfExistOnHDFS(fnameOut);
switch(type) {
case CSV2BIN:
{
InputInfo iinfo = InputInfo.CSVInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.csvToBinaryBlock(sc, rddIn, mc, null, false, separator, false, 0).mapToPair(new LongFrameToLongWritableFrameFunction());
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
case BIN2CSV:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
CSVFileFormatProperties fprop = new CSVFileFormatProperties();
JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToCsv(rddIn2, mc, fprop, true);
rddOut.saveAsTextFile(fnameOut);
break;
}
case TXTCELL2BIN:
{
InputInfo iinfo = InputInfo.TextCellInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.textCellToBinaryBlock(sc, rddIn, mc, lschema).mapToPair(new LongFrameToLongWritableFrameFunction());
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
case BIN2TXTCELL:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToTextCell(rddIn2, mc);
rddOut.saveAsTextFile(fnameOut);
break;
}
case MAT2BIN:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<MatrixIndexes, MatrixBlock> rddIn = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.matrixBlockToBinaryBlock(sc, rddIn, mcMatrix);
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
case BIN2MAT:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
JavaPairRDD<MatrixIndexes, MatrixBlock> rddOut = FrameRDDConverterUtils.binaryBlockToMatrixBlock(rddIn, mc, mcMatrix);
rddOut.saveAsHadoopFile(fnameOut, MatrixIndexes.class, MatrixBlock.class, oinfo.outputFormatClass);
break;
}
case DFRM2BIN:
{
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
// Create DataFrame
SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(lschema, false);
JavaRDD<Row> rowRDD = FrameRDDConverterUtils.csvToRowRDD(sc, fnameIn, separator, lschema);
Dataset<Row> df = sparkSession.createDataFrame(rowRDD, dfSchema);
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, false).mapToPair(new LongFrameToLongWritableFrameFunction());
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
case BIN2DFRM:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(sparkSession, rddIn, mc, lschema);
// Convert back DataFrame to binary block for comparison using original binary to converted DF and back to binary
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, true).mapToPair(new LongFrameToLongWritableFrameFunction());
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
default:
throw new RuntimeException("Unsuported converter type: " + type.toString());
}
sec.close();
}
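For context, a minimal sketch of how a test might drive this converter from within the same class; the dimensions, schema, and file paths below are illustrative assumptions, not taken from the SystemML test harness.

// Hypothetical invocation of the converter above (paths, dimensions, and schema are illustrative).
MatrixCharacteristics mc = new MatrixCharacteristics(1000, 10, 1000, 1000);
MatrixCharacteristics mcMatrix = new MatrixCharacteristics(1000, 10, 1000, 1000);
List<ValueType> schema = Collections.nCopies(10, ValueType.STRING);
runConverter(ConvType.CSV2BIN, mc, mcMatrix, schema, "/tmp/frame_in.csv", "/tmp/frame_out.bin");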
Use of org.apache.spark.sql.Dataset in project bunsen by cerner.
The class ConceptMaps, method getMappings.
/**
 * Returns a dataset with the mappings for each uri and version.
 *
 * @param uriToVersion a map of concept map URI to the version to load
 * @return a dataset of mappings for the given URIs and versions.
 */
public Dataset<Mapping> getMappings(Map<String, String> uriToVersion) {
    JavaSparkContext context = new JavaSparkContext(this.spark.sparkContext());
    Broadcast<Map<String, String>> broadcastMaps = context.broadcast(uriToVersion);
    // Keep only mappings whose concept map URI was requested and whose version matches.
    return this.mappings.filter((FilterFunction<Mapping>) mapping -> {
        String latestVersion = broadcastMaps.getValue().get(mapping.getConceptMapUri());
        return latestVersion != null && latestVersion.equals(mapping.getConceptMapVersion());
    });
}
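A brief usage sketch, assuming an existing ConceptMaps instance named conceptMaps; the URI and version strings are illustrative, not taken from the project.

// Hypothetical lookup: load mappings for a single concept map at a specific version.
Map<String, String> versions = new HashMap<>();
versions.put("http://example.org/fhir/ConceptMap/gender", "1.0.0");
Dataset<Mapping> genderMappings = conceptMaps.getMappings(versions);
genderMappings.show();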
Use of org.apache.spark.sql.Dataset in project bunsen by cerner.
The class ValueSets, method getValues.
/**
 * Returns a dataset with the values for each element in the map of uri to version.
 *
 * @param uriToVersion a map of value set URI to the version to load
 * @return a dataset of values for the given URIs and versions.
 */
public Dataset<Value> getValues(Map<String, String> uriToVersion) {
    JavaSparkContext context = new JavaSparkContext(this.spark.sparkContext());
    Broadcast<Map<String, String>> broadcastUrisToVersion = context.broadcast(uriToVersion);
    // Keep only values whose value set URI was requested and whose version matches.
    return this.values.filter((FilterFunction<Value>) value -> {
        String latestVersion = broadcastUrisToVersion.getValue().get(value.getValueSetUri());
        return latestVersion != null && latestVersion.equals(value.getValueSetVersion());
    });
}
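A brief usage sketch, assuming an existing ValueSets instance named valueSets; the URI and version strings are illustrative, not taken from the project.

// Hypothetical lookup: load values for a single value set at a specific version.
Map<String, String> versions = new HashMap<>();
versions.put("http://example.org/fhir/ValueSet/observation-codes", "1.0.0");
Dataset<Value> observationValues = valueSets.getValues(versions);
observationValues.show();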
Use of org.apache.spark.sql.Dataset in project bunsen by cerner.
The class BroadcastableMappings, method broadcast.
/**
 * Broadcast mappings stored in the given conceptMaps instance that match the given
 * conceptMapUris.
 *
 * @param conceptMaps the {@link ConceptMaps} instance with the content to broadcast
 * @param conceptMapUriToVersion map of the concept map URIs to broadcast to their versions.
 * @return a broadcast variable containing a mappings object usable in UDFs.
 */
public static Broadcast<BroadcastableMappings> broadcast(ConceptMaps conceptMaps,
        Map<String, String> conceptMapUriToVersion) {
    // Collect the concept maps whose versions match the requested ones, keyed by URL.
    Map<String, ConceptMap> mapsToLoad = conceptMaps.getMaps()
        .collectAsList()
        .stream()
        .filter(conceptMap ->
            conceptMap.getVersion().equals(conceptMapUriToVersion.get(conceptMap.getUrl())))
        .collect(Collectors.toMap(ConceptMap::getUrl, Function.identity()));
    // Expand the concept maps to load and sort them so dependencies are before
    // their dependents in the list.
    List<String> sortedMapsToLoad = sortMapsToLoad(conceptMapUriToVersion.keySet(), mapsToLoad);
    // Since this is used to map from one system to another, we use only targets
    // that don't introduce inaccurate meanings. (For instance, we can't map a
    // general condition code to a more specific type, since that is not
    // representative of the source data.)
    Dataset<Mapping> mappings = conceptMaps.getMappings(conceptMapUriToVersion)
        .filter("equivalence in ('equivalent', 'equals', 'wider', 'subsumes')");
    // Group mappings by their concept map URI.
    Map<String, List<Mapping>> groupedMappings = mappings.collectAsList()
        .stream()
        .collect(Collectors.groupingBy(Mapping::getConceptMapUri));
    // Build the broadcastable maps in dependency order so child maps are available
    // when their parents are constructed.
    Map<String, BroadcastableConceptMap> broadcastableMaps = new HashMap<>();
    for (String conceptMapUri : sortedMapsToLoad) {
        ConceptMap map = mapsToLoad.get(conceptMapUri);
        Set<String> children = getMapChildren(map);
        List<BroadcastableConceptMap> childMaps = children.stream()
            .map(child -> broadcastableMaps.get(child))
            .collect(Collectors.toList());
        BroadcastableConceptMap broadcastableConceptMap = new BroadcastableConceptMap(conceptMapUri,
            groupedMappings.getOrDefault(conceptMapUri, Collections.emptyList()), childMaps);
        broadcastableMaps.put(conceptMapUri, broadcastableConceptMap);
    }
    JavaSparkContext ctx = new JavaSparkContext(conceptMaps.getMaps().sparkSession().sparkContext());
    return ctx.broadcast(new BroadcastableMappings(broadcastableMaps));
}
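A brief usage sketch, assuming an existing ConceptMaps instance named conceptMaps; the concept map URI and version are illustrative, not taken from the project.

// Hypothetical: broadcast the mappings for one concept map so they can be read on executors.
Map<String, String> mapVersions = new HashMap<>();
mapVersions.put("http://example.org/fhir/ConceptMap/gender", "1.0.0");
Broadcast<BroadcastableMappings> broadcastMappings =
    BroadcastableMappings.broadcast(conceptMaps, mapVersions);
// Inside Spark functions the content is then available via broadcastMappings.getValue().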
Use of org.apache.spark.sql.Dataset in project gaffer-doc by gchq.
The class OperationExample, method printResult.
public <RESULT_TYPE> void printResult(final RESULT_TYPE result) {
    print("Result:");
    print("\n{% codetabs name=\"Java\", type=\"java\" -%}");
    if (result instanceof Iterable) {
        for (final Object item : (Iterable) result) {
            if (item instanceof Walk) {
                final Walk walk = (Walk) item;
                print(Walk.class.getName() + walk.getVerticesOrdered()
                    .stream()
                    .map(Object::toString)
                    .collect(Collectors.joining(" --> ", "[ ", " ]")));
            } else {
                print(item.toString());
            }
        }
    } else if (result instanceof Map) {
        final Map<?, ?> resultMap = (Map) result;
        for (final Map.Entry<?, ?> entry : resultMap.entrySet()) {
            print(entry.getKey() + ":");
            if (entry.getValue() instanceof Iterable) {
                for (final Object item : (Iterable) entry.getValue()) {
                    print(" " + item.toString());
                }
            } else {
                print(" " + entry.getValue().toString());
            }
        }
    } else if (result instanceof Stream) {
        final Stream stream = (Stream) result;
        stream.forEach(item -> print(item.toString()));
    } else if (result instanceof Object[]) {
        final Object[] array = (Object[]) result;
        for (int i = 0; i < array.length; i++) {
            print(array[i].toString());
        }
    } else if (result instanceof JavaRDD) {
        final List<Element> elements = ((JavaRDD) result).collect();
        for (final Element e : elements) {
            print(e.toString());
        }
    } else if (result instanceof Dataset) {
        final Dataset<Row> dataset = ((Dataset) result);
        final String resultStr = dataset.showString(100, 20);
        print(resultStr.substring(0, resultStr.length() - 2));
    } else if (result instanceof Schema) {
        print(DocUtil.getJson(result));
    } else if (null != result) {
        print(result.toString());
    } else {
        throw new RuntimeException("Operation result was null");
    }
    try {
        final String json = DocUtil.getFullJson(result);
        print(WalkthroughStrSubstitutor.JSON_CODE_MARKER);
        print(json);
    } catch (final Exception e) {
        // ignore error - just don't display the json
    }
    print("{%- endcodetabs %}\n");
}
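A hedged illustration of the Dataset branch above, assuming a configured Gaffer graph and user; the GetDataFrameOfElements operation from Gaffer's Spark library is one way to obtain a Dataset<Row> of elements, and the call site is illustrative.

// Hypothetical call: a Dataset<Row> result is rendered via showString in the branch above.
Dataset<Row> rows = graph.execute(
    new GetDataFrameOfElements.Builder().build(), user);
printResult(rows);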