Use of org.apache.spark.sql.Row in project incubator-systemml by apache.
From the class RDDConverterUtils, method binaryBlockToDataFrame.
public static Dataset<Row> binaryBlockToDataFrame(SparkSession sparkSession,
        JavaPairRDD<MatrixIndexes, MatrixBlock> in, MatrixCharacteristics mc, boolean toVector) {
    if (!mc.colsKnown())
        throw new RuntimeException("Number of columns needed to convert binary block to data frame.");

    // slice blocks into rows, align and convert into data frame rows
    JavaRDD<Row> rowsRDD = in
        .flatMapToPair(new SliceBinaryBlockToRowsFunction(mc.getRowsPerBlock()))
        .groupByKey()
        .map(new ConvertRowBlocksToRows((int) mc.getCols(), mc.getColsPerBlock(), toVector));

    // create data frame schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(DF_ID_COLUMN, DataTypes.DoubleType, false));
    if (toVector) {
        fields.add(DataTypes.createStructField("C1", new VectorUDT(), false));
    } else {
        // one DoubleType column per matrix column: C1..Cn
        for (int i = 1; i <= mc.getCols(); i++)
            fields.add(DataTypes.createStructField("C" + i, DataTypes.DoubleType, false));
    }

    // rdd to data frame conversion
    return sparkSession.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));
}
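For context, a minimal calling sketch (not from the project source): it assumes SystemML's RDDConverterUtils and matrix classes are on the classpath under the incubator-systemml package paths, and the block contents, dimensions, and session settings are purely illustrative.

import java.util.Collections;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import scala.Tuple2;

public class BinaryBlockToDataFrameSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("binaryBlockToDataFrame-sketch").master("local[*]").getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        // a single dense 2x2 block at block index (1,1)
        MatrixBlock block = new MatrixBlock(2, 2, false);
        block.quickSetValue(0, 0, 1.0);
        block.quickSetValue(1, 1, 2.0);
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = jsc.parallelizePairs(
                Collections.singletonList(new Tuple2<>(new MatrixIndexes(1, 1), block)));
        // 2x2 matrix with 1000x1000 blocking, so everything fits in one block
        MatrixCharacteristics mc = new MatrixCharacteristics(2, 2, 1000, 1000);
        // toVector=false: one DoubleType column per matrix column (C1, C2)
        Dataset<Row> df = RDDConverterUtils.binaryBlockToDataFrame(spark, in, mc, false);
        df.show();
        spark.stop();
    }
}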
Use of org.apache.spark.sql.Row in project incubator-systemml by apache.
From the class RDDConverterUtilsExt, method stringDataFrameToVectorDataFrame.
/**
* Convert a dataframe of comma-separated string rows to a dataframe of
* ml.linalg.Vector rows.
*
* <p>
* Example input rows:<br>
*
* <code>
* ((1.2, 4.3, 3.4))<br>
* (1.2, 3.4, 2.2)<br>
* [[1.2, 34.3, 1.2, 1.25]]<br>
* [1.2, 3.4]<br>
* </code>
*
* @param sparkSession
* Spark Session
* @param inputDF
* dataframe of comma-separated row strings to convert to
* dataframe of ml.linalg.Vector rows
* @return dataframe of ml.linalg.Vector rows
*/
public static Dataset<Row> stringDataFrameToVectorDataFrame(SparkSession sparkSession, Dataset<Row> inputDF) {
    StructField[] oldSchema = inputDF.schema().fields();
    StructField[] newSchema = new StructField[oldSchema.length];
    for (int i = 0; i < oldSchema.length; i++) {
        String colName = oldSchema[i].name();
        newSchema[i] = DataTypes.createStructField(colName, new VectorUDT(), true);
    }

    // converter
    class StringToVector implements Function<Tuple2<Row, Long>, Row> {

        private static final long serialVersionUID = -4733816995375745659L;

        @Override
        public Row call(Tuple2<Row, Long> arg0) throws Exception {
            Row oldRow = arg0._1;
            int oldNumCols = oldRow.length();
            if (oldNumCols > 1) {
                throw new DMLRuntimeException("The row must have at most one column");
            }
            // parse the various string forms, e.g.
            // ((1.2, 4.3, 3.4)) or (1.2, 3.4, 2.2)
            // [[1.2, 34.3, 1.2, 1.2]] or [1.2, 3.4]
            ArrayList<Object> fieldsArr = new ArrayList<>();
            for (int i = 0; i < oldRow.length(); i++) {
                Object ci = oldRow.get(i);
                if (ci == null) {
                    fieldsArr.add(null);
                } else if (ci instanceof String) {
                    String cis = (String) ci;
                    StringBuilder sb = new StringBuilder(cis.trim());
                    // strip up to two levels of nesting: ((...)) or [[...]]
                    for (int nid = 0; nid < 2; nid++) {
                        if ((sb.charAt(0) == '(' && sb.charAt(sb.length() - 1) == ')')
                                || (sb.charAt(0) == '[' && sb.charAt(sb.length() - 1) == ']')) {
                            sb.deleteCharAt(0);
                            sb.setLength(sb.length() - 1);
                        }
                    }
                    // normalize separators and wrap in [ ] for NumericParser
                    String ncis = "[" + sb.toString().replaceAll(" *, *", ",") + "]";
                    try {
                        // a [ ] literal always parses to a double array
                        double[] doubles = (double[]) NumericParser.parse(ncis);
                        Vector dense = Vectors.dense(doubles);
                        fieldsArr.add(dense);
                    } catch (Exception e) {
                        // can't catch SparkException here in Java apparently
                        throw new DMLRuntimeException("Error converting to double array. " + e.getMessage(), e);
                    }
                } else {
                    throw new DMLRuntimeException("Only String is supported");
                }
            }
            return RowFactory.create(fieldsArr.toArray());
        }
    }

    // output DF
    JavaRDD<Row> newRows = inputDF.rdd().toJavaRDD().zipWithIndex().map(new StringToVector());
    return sparkSession.createDataFrame(newRows.rdd(), DataTypes.createStructType(newSchema));
}
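A minimal usage sketch (the session settings and column name are hypothetical; it assumes RDDConverterUtilsExt is on the classpath under the incubator-systemml package path):

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt;

public class StringToVectorSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("stringDataFrameToVectorDataFrame-sketch").master("local[*]").getOrCreate();
        // one string column; each row holds a comma-separated vector literal
        List<Row> rows = Arrays.asList(
                RowFactory.create("((1.2, 4.3, 3.4))"),
                RowFactory.create("[1.2, 3.4]"));
        StructType schema = DataTypes.createStructType(Collections.singletonList(
                DataTypes.createStructField("C1", DataTypes.StringType, true)));
        Dataset<Row> strDF = spark.createDataFrame(rows, schema);
        // each output row now holds an ml.linalg dense Vector in column C1
        Dataset<Row> vecDF = RDDConverterUtilsExt.stringDataFrameToVectorDataFrame(spark, strDF);
        vecDF.show(false);
        spark.stop();
    }
}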
Use of org.apache.spark.sql.Row in project Gaffer by gchq.
From the class GetDataFrameOfElementsExample, method getDataFrameOfElementsWithEdgeGroup.
public void getDataFrameOfElementsWithEdgeGroup(final SQLContext sqlc, final Graph graph) throws OperationException {
    ROOT_LOGGER.setLevel(Level.INFO);
    log("#### " + getMethodNameAsSentence(0) + "\n");
    printGraph();
    ROOT_LOGGER.setLevel(Level.OFF);
    final GetDataFrameOfElements operation = new GetDataFrameOfElements.Builder()
            .view(new View.Builder()
                    .edge("edge")
                    .build())
            .sqlContext(sqlc)
            .build();
    final Dataset<Row> df = graph.execute(operation, new User("user01"));

    // Show
    String result = df.showString(100, 20);
    ROOT_LOGGER.setLevel(Level.INFO);
    printJava("GetDataFrameOfElements operation = new GetDataFrameOfElements.Builder()\n"
            + "        .view(new View.Builder()\n"
            + "                .edge(\"edge\")\n"
            + "                .build())\n"
            + "        .sqlContext(sqlc)\n"
            + "        .build();\n"
            + "Dataset<Row> df = getGraph().execute(operation, new User(\"user01\"));\n"
            + "df.show();");
    log("The results are:");
    log("```");
    log(result.substring(0, result.length() - 2));
    log("```");
    ROOT_LOGGER.setLevel(Level.OFF);

    // Restrict to edges involving given vertices
    final Dataset<Row> seeded = df.filter("src = 1 OR src = 3");
    result = seeded.showString(100, 20);
    ROOT_LOGGER.setLevel(Level.INFO);
    printJava("df.filter(\"src = 1 OR src = 3\").show();");
    log("The results are:");
    log("```");
    log(result.substring(0, result.length() - 2));
    log("```");
    ROOT_LOGGER.setLevel(Level.OFF);

    // Filter by property
    final Dataset<Row> filtered = df.filter("count > 1");
    result = filtered.showString(100, 20);
    ROOT_LOGGER.setLevel(Level.INFO);
    printJava("df.filter(\"count > 1\").show();");
    log("The results are:");
    log("```");
    log(result.substring(0, result.length() - 2));
    log("```");
    ROOT_LOGGER.setLevel(Level.OFF);
}
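As an aside, the two string-expression filters above can equally be written against Spark's typed Column API; this is plain Spark SQL and assumes nothing Gaffer-specific beyond the df from the example:

import static org.apache.spark.sql.functions.col;

final Dataset<Row> seededTyped = df.filter(col("src").equalTo(1).or(col("src").equalTo(3)));
final Dataset<Row> filteredTyped = df.filter(col("count").gt(1));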
Use of org.apache.spark.sql.Row in project Gaffer by gchq.
From the class AccumuloStoreRelationTest, method testBuildScanSpecifyColumnsAndFiltersWithView.
private void testBuildScanSpecifyColumnsAndFiltersWithView(final String name, final View view,
        final String[] requiredColumns, final Filter[] filters, final Predicate<Element> returnElement)
        throws OperationException, StoreException {
    // Given
    final SQLContext sqlContext = getSqlContext(name);
    final Schema schema = getSchema();
    final AccumuloProperties properties = AccumuloProperties
            .loadStoreProperties(getClass().getResourceAsStream("/store.properties"));
    final SingleUseMockAccumuloStore store = new SingleUseMockAccumuloStore();
    store.initialise(schema, properties);
    addElements(store);

    // When
    final AccumuloStoreRelation relation = new AccumuloStoreRelation(sqlContext,
            Collections.emptyList(), view, store, new User());
    final RDD<Row> rdd = relation.buildScan(requiredColumns, filters);
    final Row[] returnedElements = (Row[]) rdd.collect();

    // Then
    // - Actual results are:
    final Set<Row> results = new HashSet<>();
    for (int i = 0; i < returnedElements.length; i++) {
        results.add(returnedElements[i]);
    }
    // - Expected results are:
    final SchemaToStructTypeConverter schemaConverter =
            new SchemaToStructTypeConverter(schema, view, new ArrayList<>());
    final ConvertElementToRow elementConverter = new ConvertElementToRow(
            new LinkedHashSet<>(Arrays.asList(requiredColumns)),
            schemaConverter.getPropertyNeedsConversion(),
            schemaConverter.getConverterByProperty());
    final Set<Row> expectedRows = new HashSet<>();
    StreamSupport.stream(getElements().spliterator(), false)
            .filter(returnElement)
            .map(elementConverter::apply)
            .forEach(expectedRows::add);
    assertEquals(expectedRows, results);
    sqlContext.sparkContext().stop();
}
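A hypothetical invocation of this helper might look as follows; the test name, group, columns, and predicate are illustrative, and GreaterThan is the filter class from Spark's org.apache.spark.sql.sources package:

import org.apache.spark.sql.sources.Filter;
import org.apache.spark.sql.sources.GreaterThan;

final View view = new View.Builder().edge("edge").build();
final String[] requiredColumns = new String[]{"group", "count"};
final Filter[] filters = new Filter[]{new GreaterThan("count", 1)};
testBuildScanSpecifyColumnsAndFiltersWithView("buildScanWithFilters", view, requiredColumns,
        filters, element -> ((Integer) element.getProperty("count")) > 1);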
Use of org.apache.spark.sql.Row in project Gaffer by gchq.
From the class AccumuloStoreRelationTest, method testBuildScanSpecifyColumnsWithView.
private void testBuildScanSpecifyColumnsWithView(final String name, final View view,
        final String[] requiredColumns, final Predicate<Element> returnElement)
        throws OperationException, StoreException {
    // Given
    final SQLContext sqlContext = getSqlContext(name);
    final Schema schema = getSchema();
    final AccumuloProperties properties = AccumuloProperties
            .loadStoreProperties(getClass().getResourceAsStream("/store.properties"));
    final SingleUseMockAccumuloStore store = new SingleUseMockAccumuloStore();
    store.initialise(schema, properties);
    addElements(store);

    // When
    final AccumuloStoreRelation relation = new AccumuloStoreRelation(sqlContext,
            Collections.emptyList(), view, store, new User());
    final RDD<Row> rdd = relation.buildScan(requiredColumns);
    final Row[] returnedElements = (Row[]) rdd.collect();

    // Then
    // - Actual results are:
    final Set<Row> results = new HashSet<>();
    for (int i = 0; i < returnedElements.length; i++) {
        results.add(returnedElements[i]);
    }
    // - Expected results are:
    final SchemaToStructTypeConverter schemaConverter =
            new SchemaToStructTypeConverter(schema, view, new ArrayList<>());
    final ConvertElementToRow elementConverter = new ConvertElementToRow(
            new LinkedHashSet<>(Arrays.asList(requiredColumns)),
            schemaConverter.getPropertyNeedsConversion(),
            schemaConverter.getConverterByProperty());
    final Set<Row> expectedRows = new HashSet<>();
    StreamSupport.stream(getElements().spliterator(), false)
            .filter(returnElement)
            .map(elementConverter::apply)
            .forEach(expectedRows::add);
    assertEquals(expectedRows, results);
    sqlContext.sparkContext().stop();
}
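Both test methods copy the collected Row[] into a Set by hand; a small shared helper could remove that duplication (a sketch, the name is illustrative):

private static Set<Row> collectToSet(final RDD<Row> rdd) {
    final Row[] rows = (Row[]) rdd.collect();
    return new HashSet<>(Arrays.asList(rows));
}

// in both tests:
final Set<Row> results = collectToSet(rdd);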