Example 96 with Row

use of org.apache.spark.sql.Row in project incubator-systemml by apache.

the class RDDConverterUtils method binaryBlockToDataFrame.

public static Dataset<Row> binaryBlockToDataFrame(SparkSession sparkSession, JavaPairRDD<MatrixIndexes, MatrixBlock> in, MatrixCharacteristics mc, boolean toVector) {
    if (!mc.colsKnown())
        throw new RuntimeException("Number of columns must be known to convert binary block to data frame.");
    // slice blocks into rows, align, and convert into data frame rows
    JavaRDD<Row> rowsRDD = in
            .flatMapToPair(new SliceBinaryBlockToRowsFunction(mc.getRowsPerBlock()))
            .groupByKey()
            .map(new ConvertRowBlocksToRows((int) mc.getCols(), mc.getColsPerBlock(), toVector));
    // create data frame schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(DF_ID_COLUMN, DataTypes.DoubleType, false));
    if (toVector)
        fields.add(DataTypes.createStructField("C1", new VectorUDT(), false));
    else {
        // one double column per matrix column
        for (int i = 1; i <= mc.getCols(); i++)
            fields.add(DataTypes.createStructField("C" + i, DataTypes.DoubleType, false));
    }
    // rdd to data frame conversion
    return sparkSession.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));
}
Also used : VectorUDT (org.apache.spark.ml.linalg.VectorUDT), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException), StructField (org.apache.spark.sql.types.StructField), ArrayList (java.util.ArrayList), Row (org.apache.spark.sql.Row), LabeledPoint (org.apache.spark.ml.feature.LabeledPoint)
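
For orientation, here is a minimal sketch of how this converter might be driven. It is a sketch under assumptions, not part of the SystemML source: the variable names, the use of MatrixBlock.randOperations to get a small non-empty block, and the local master are all illustrative, and the SystemML and Spark jars are assumed to be on the classpath.

// imports assumed: java.util.Arrays, scala.Tuple2, org.apache.spark.api.java.*,
// org.apache.sysml.runtime.matrix.MatrixCharacteristics, org.apache.sysml.runtime.matrix.data.*
SparkSession spark = SparkSession.builder().appName("BinaryBlockToDF").master("local[*]").getOrCreate();
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
// a single 2x2 block at matrix index (1,1); the matrix dimensions must be known for the conversion
MatrixBlock mb = MatrixBlock.randOperations(2, 2, 1.0, 0, 10, "uniform", 7);
JavaPairRDD<MatrixIndexes, MatrixBlock> blocks =
        jsc.parallelizePairs(Arrays.asList(new Tuple2<>(new MatrixIndexes(1, 1), mb)));
MatrixCharacteristics mc = new MatrixCharacteristics(2, 2, 1000, 1000);
Dataset<Row> df = RDDConverterUtils.binaryBlockToDataFrame(spark, blocks, mc, false);
df.printSchema(); // the row-index column plus double columns C1, C2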

Example 97 with Row

use of org.apache.spark.sql.Row in project incubator-systemml by apache.

the class RDDConverterUtilsExt method stringDataFrameToVectorDataFrame.

/**
 * Convert a dataframe of comma-separated string rows to a dataframe of
 * ml.linalg.Vector rows.
 *
 * <p>
 * Example input rows:<br>
 *
 * <code>
 * ((1.2, 4.3, 3.4))<br>
 * (1.2, 3.4, 2.2)<br>
 * [[1.2, 34.3, 1.2, 1.25]]<br>
 * [1.2, 3.4]<br>
 * </code>
 *
 * @param sparkSession
 *            Spark Session
 * @param inputDF
 *            dataframe of comma-separated row strings to convert to
 *            dataframe of ml.linalg.Vector rows
 * @return dataframe of ml.linalg.Vector rows
 */
public static Dataset<Row> stringDataFrameToVectorDataFrame(SparkSession sparkSession, Dataset<Row> inputDF) {
    StructField[] oldSchema = inputDF.schema().fields();
    StructField[] newSchema = new StructField[oldSchema.length];
    for (int i = 0; i < oldSchema.length; i++) {
        String colName = oldSchema[i].name();
        newSchema[i] = DataTypes.createStructField(colName, new VectorUDT(), true);
    }
    // converter
    class StringToVector implements Function<Tuple2<Row, Long>, Row> {

        private static final long serialVersionUID = -4733816995375745659L;

        @Override
        public Row call(Tuple2<Row, Long> arg0) throws Exception {
            Row oldRow = arg0._1;
            int oldNumCols = oldRow.length();
            if (oldNumCols > 1) {
                throw new DMLRuntimeException("The row must have at most one column");
            }
            // parse the accepted string forms, e.g.
            // ((1.2, 4.3, 3.4)) or (1.2, 3.4, 2.2)
            // [[1.2, 34.3, 1.2, 1.2]] or [1.2, 3.4]
            ArrayList<Object> fieldsArr = new ArrayList<>();
            for (int i = 0; i < oldRow.length(); i++) {
                Object ci = oldRow.get(i);
                if (ci == null) {
                    fieldsArr.add(null);
                } else if (ci instanceof String) {
                    String cis = (String) ci;
                    StringBuilder sb = new StringBuilder(cis.trim());
                    // strip up to two levels of wrapping parentheses/brackets, e.g. ((..)) or [[..]]
                    for (int nid = 0; nid < 2; nid++) {
                        if (sb.length() > 1 && ((sb.charAt(0) == '(' && sb.charAt(sb.length() - 1) == ')') || (sb.charAt(0) == '[' && sb.charAt(sb.length() - 1) == ']'))) {
                            sb.deleteCharAt(0);
                            sb.setLength(sb.length() - 1);
                        }
                    }
                    // normalize the separators and re-wrap in brackets so the string parses as one list
                    String ncis = "[" + sb.toString().replaceAll(" *, *", ",") + "]";
                    try {
                        // the surrounding [ ] makes NumericParser always return a double[]
                        double[] doubles = (double[]) NumericParser.parse(ncis);
                        Vector dense = Vectors.dense(doubles);
                        fieldsArr.add(dense);
                    } catch (Exception e) {
                        // NumericParser throws Spark's SparkException, which is awkward to catch directly from Java, so catch Exception instead
                        throw new DMLRuntimeException("Error converting to double array. " + e.getMessage(), e);
                    }
                } else {
                    throw new DMLRuntimeException("Only String is supported");
                }
            }
            return RowFactory.create(fieldsArr.toArray());
        }
    }
    // output DF: convert each (row, index) pair; the index from zipWithIndex is unused by the converter
    JavaRDD<Row> newRows = inputDF.rdd().toJavaRDD().zipWithIndex().map(new StringToVector());
    Dataset<Row> outDF = sparkSession.createDataFrame(newRows.rdd(), DataTypes.createStructType(newSchema));
    return outDF;
}
Also used : VectorUDT (org.apache.spark.ml.linalg.VectorUDT), ArrayList (java.util.ArrayList), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException), IOException (java.io.IOException), PairFlatMapFunction (org.apache.spark.api.java.function.PairFlatMapFunction), Function (org.apache.spark.api.java.function.Function), StructField (org.apache.spark.sql.types.StructField), Tuple2 (scala.Tuple2), Row (org.apache.spark.sql.Row), Vector (org.apache.spark.ml.linalg.Vector)
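
A short usage sketch, assuming SystemML on the classpath and a SparkSession named sparkSession: build a one-column DataFrame of delimited strings in the formats listed in the javadoc, then convert it. The data values and column name are illustrative only.

// imports assumed: java.util.*, org.apache.spark.sql.*, org.apache.spark.sql.types.*
List<Row> data = Arrays.asList(
        RowFactory.create("(1.2, 3.4, 2.2)"),
        RowFactory.create("[[1.2, 34.3, 1.2, 1.25]]"));
StructType strSchema = DataTypes.createStructType(
        Collections.singletonList(DataTypes.createStructField("C1", DataTypes.StringType, true)));
Dataset<Row> strDF = sparkSession.createDataFrame(data, strSchema);
Dataset<Row> vecDF = RDDConverterUtilsExt.stringDataFrameToVectorDataFrame(sparkSession, strDF);
vecDF.show(false); // each row now holds an ml.linalg dense vector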

Example 98 with Row

use of org.apache.spark.sql.Row in project Gaffer by gchq.

the class GetDataFrameOfElementsExample method getDataFrameOfElementsWithEdgeGroup.

public void getDataFrameOfElementsWithEdgeGroup(final SQLContext sqlc, final Graph graph) throws OperationException {
    ROOT_LOGGER.setLevel(Level.INFO);
    log("#### " + getMethodNameAsSentence(0) + "\n");
    printGraph();
    ROOT_LOGGER.setLevel(Level.OFF);
    final GetDataFrameOfElements operation = new GetDataFrameOfElements.Builder().view(new View.Builder().edge("edge").build()).sqlContext(sqlc).build();
    final Dataset<Row> df = graph.execute(operation, new User("user01"));
    // Show
    String result = df.showString(100, 20);
    ROOT_LOGGER.setLevel(Level.INFO);
    printJava("GetDataFrameOfElements operation = new GetDataFrameOfElements.Builder()\n" + "                .view(new View.Builder()\n" + "                        .entity(\"edge\")\n" + "                        .build()).\n" + "                .sqlContext(sqlc)\n" + "                .build();\n" + "Dataset<Row> df = getGraph().execute(operation, new User(\"user01\"));\n" + "df.show();");
    log("The results are:");
    log("```");
    log(result.substring(0, result.length() - 2));
    log("```");
    ROOT_LOGGER.setLevel(Level.OFF);
    // Restrict to edges involving given vertices
    final Dataset<Row> seeded = df.filter("src = 1 OR src = 3");
    result = seeded.showString(100, 20);
    ROOT_LOGGER.setLevel(Level.INFO);
    printJava("df.filter(\"src = 1 OR src = 3\").show();");
    log("The results are:");
    log("```");
    log(result.substring(0, result.length() - 2));
    log("```");
    ROOT_LOGGER.setLevel(Level.OFF);
    // Filter by property
    final Dataset<Row> filtered = df.filter("count > 1");
    result = filtered.showString(100, 20);
    ROOT_LOGGER.setLevel(Level.INFO);
    printJava("df.filter(\"count > 1\").show();");
    log("The results are:");
    log("```");
    log(result.substring(0, result.length() - 2));
    log("```");
    ROOT_LOGGER.setLevel(Level.OFF);
}
Also used : GetDataFrameOfElements (uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements), User (uk.gov.gchq.gaffer.user.User), Row (org.apache.spark.sql.Row), View (uk.gov.gchq.gaffer.data.elementdefinition.view.View)
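
Since the operation returns an ordinary Dataset<Row>, the standard Spark SQL machinery also applies. The following is a hedged equivalent of the two filters above using a temp view; the view name "edges" is arbitrary, and the column names src and count are taken from the filter strings in the example.

// register the Gaffer result as a temp view and query it with SQL
df.createOrReplaceTempView("edges");
Dataset<Row> bySql = sqlc.sql("SELECT * FROM edges WHERE count > 1 AND (src = 1 OR src = 3)");
bySql.show();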

Example 99 with Row

use of org.apache.spark.sql.Row in project Gaffer by gchq.

the class AccumuloStoreRelationTest method testBuildScanSpecifyColumnsAndFiltersWithView.

private void testBuildScanSpecifyColumnsAndFiltersWithView(final String name, final View view, final String[] requiredColumns, final Filter[] filters, final Predicate<Element> returnElement) throws OperationException, StoreException {
    // Given
    final SQLContext sqlContext = getSqlContext(name);
    final Schema schema = getSchema();
    final AccumuloProperties properties = AccumuloProperties.loadStoreProperties(getClass().getResourceAsStream("/store.properties"));
    final SingleUseMockAccumuloStore store = new SingleUseMockAccumuloStore();
    store.initialise(schema, properties);
    addElements(store);
    // When
    final AccumuloStoreRelation relation = new AccumuloStoreRelation(sqlContext, Collections.emptyList(), view, store, new User());
    final RDD<Row> rdd = relation.buildScan(requiredColumns, filters);
    final Row[] returnedElements = (Row[]) rdd.collect();
    // Then
    //  - Actual results are:
    final Set<Row> results = new HashSet<>();
    Collections.addAll(results, returnedElements);
    //  - Expected results are:
    final SchemaToStructTypeConverter schemaConverter = new SchemaToStructTypeConverter(schema, view, new ArrayList<>());
    final ConvertElementToRow elementConverter = new ConvertElementToRow(new LinkedHashSet<>(Arrays.asList(requiredColumns)), schemaConverter.getPropertyNeedsConversion(), schemaConverter.getConverterByProperty());
    final Set<Row> expectedRows = new HashSet<>();
    StreamSupport.stream(getElements().spliterator(), false).filter(returnElement).map(elementConverter::apply).forEach(expectedRows::add);
    assertEquals(expectedRows, results);
    sqlContext.sparkContext().stop();
}
Also used : SingleUseMockAccumuloStore (uk.gov.gchq.gaffer.accumulostore.SingleUseMockAccumuloStore), User (uk.gov.gchq.gaffer.user.User), AccumuloProperties (uk.gov.gchq.gaffer.accumulostore.AccumuloProperties), Schema (uk.gov.gchq.gaffer.store.schema.Schema), ConvertElementToRow (uk.gov.gchq.gaffer.spark.operation.dataframe.ConvertElementToRow), Row (org.apache.spark.sql.Row), SchemaToStructTypeConverter (uk.gov.gchq.gaffer.spark.operation.dataframe.converter.schema.SchemaToStructTypeConverter), SQLContext (org.apache.spark.sql.SQLContext), HashSet (java.util.HashSet), LinkedHashSet (java.util.LinkedHashSet)
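
For reference, the filters argument here uses Spark's data source predicate classes from org.apache.spark.sql.sources. Below is a hedged sketch of how a test method might drive this private helper; the test name, group, column names, and values are hypothetical, not taken from the Gaffer source.

// imports assumed: org.apache.spark.sql.sources.Filter, org.apache.spark.sql.sources.GreaterThan
final View view = new View.Builder().edge("edge").build();
final String[] requiredColumns = new String[] { "src", "dst", "count" };
final Filter[] filters = new Filter[] { new GreaterThan("count", 1) };
// keep only the elements the filtered scan is expected to return
testBuildScanSpecifyColumnsAndFiltersWithView("edge-count-scan", view, requiredColumns, filters,
        element -> "edge".equals(element.getGroup()));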

Example 100 with Row

use of org.apache.spark.sql.Row in project Gaffer by gchq.

the class AccumuloStoreRelationTest method testBuildScanSpecifyColumnsWithView.

private void testBuildScanSpecifyColumnsWithView(final String name, final View view, final String[] requiredColumns, final Predicate<Element> returnElement) throws OperationException, StoreException {
    // Given
    final SQLContext sqlContext = getSqlContext(name);
    final Schema schema = getSchema();
    final AccumuloProperties properties = AccumuloProperties.loadStoreProperties(getClass().getResourceAsStream("/store.properties"));
    final SingleUseMockAccumuloStore store = new SingleUseMockAccumuloStore();
    store.initialise(schema, properties);
    addElements(store);
    // When
    final AccumuloStoreRelation relation = new AccumuloStoreRelation(sqlContext, Collections.emptyList(), view, store, new User());
    final RDD<Row> rdd = relation.buildScan(requiredColumns);
    final Row[] returnedElements = (Row[]) rdd.collect();
    // Then
    //  - Actual results are:
    final Set<Row> results = new HashSet<>();
    Collections.addAll(results, returnedElements);
    //  - Expected results are:
    final SchemaToStructTypeConverter schemaConverter = new SchemaToStructTypeConverter(schema, view, new ArrayList<>());
    final ConvertElementToRow elementConverter = new ConvertElementToRow(new LinkedHashSet<>(Arrays.asList(requiredColumns)), schemaConverter.getPropertyNeedsConversion(), schemaConverter.getConverterByProperty());
    final Set<Row> expectedRows = new HashSet<>();
    StreamSupport.stream(getElements().spliterator(), false).filter(returnElement).map(elementConverter::apply).forEach(expectedRows::add);
    assertEquals(expectedRows, results);
    sqlContext.sparkContext().stop();
}
Also used : SingleUseMockAccumuloStore (uk.gov.gchq.gaffer.accumulostore.SingleUseMockAccumuloStore), User (uk.gov.gchq.gaffer.user.User), AccumuloProperties (uk.gov.gchq.gaffer.accumulostore.AccumuloProperties), Schema (uk.gov.gchq.gaffer.store.schema.Schema), ConvertElementToRow (uk.gov.gchq.gaffer.spark.operation.dataframe.ConvertElementToRow), Row (org.apache.spark.sql.Row), SchemaToStructTypeConverter (uk.gov.gchq.gaffer.spark.operation.dataframe.converter.schema.SchemaToStructTypeConverter), SQLContext (org.apache.spark.sql.SQLContext), HashSet (java.util.HashSet), LinkedHashSet (java.util.LinkedHashSet)
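
And a matching hedged sketch for this columns-only variant, under the same hypothetical names as the previous example:

final View view = new View.Builder().edge("edge").build();
final String[] requiredColumns = new String[] { "src", "dst", "count" };
testBuildScanSpecifyColumnsWithView("edge-columns-scan", view, requiredColumns,
        element -> "edge".equals(element.getGroup()));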

Aggregations

Row (org.apache.spark.sql.Row): 129 usages
Test (org.junit.Test): 60 usages
Script (org.apache.sysml.api.mlcontext.Script): 53 usages
StructType (org.apache.spark.sql.types.StructType): 50 usages
ArrayList (java.util.ArrayList): 48 usages
StructField (org.apache.spark.sql.types.StructField): 46 usages
SparkSession (org.apache.spark.sql.SparkSession): 43 usages
VectorUDT (org.apache.spark.ml.linalg.VectorUDT): 19 usages
MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata): 19 usages
MLResults (org.apache.sysml.api.mlcontext.MLResults): 18 usages
DenseVector (org.apache.spark.ml.linalg.DenseVector): 16 usages
Vector (org.apache.spark.ml.linalg.Vector): 16 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 15 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 12 usages
SQLContext (org.apache.spark.sql.SQLContext): 12 usages
User (uk.gov.gchq.gaffer.user.User): 12 usages
HashSet (java.util.HashSet): 10 usages
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 9 usages
Tuple2 (scala.Tuple2): 9 usages
GetDataFrameOfElements (uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements): 9 usages