Search in sources:

Example 1 with GetDataFrameOfElements

use of uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements in project Gaffer by gchq.

In the class GetDataFrameOfElementsExample, the method getDataFrameOfElementsWithEntityGroup:

/**
 * Generates documentation for a {@code GetDataFrameOfElements} operation restricted to
 * the "entity" group: logs the example Java snippet, executes the operation and logs
 * the resulting DataFrame, then demonstrates a seed-style filter and a property filter.
 *
 * @param sqlc  the SQLContext the operation's DataFrame is created against
 * @param graph the Graph to execute the operation on
 * @throws OperationException if the operation fails to execute
 */
public void getDataFrameOfElementsWithEntityGroup(final SQLContext sqlc, final Graph graph) throws OperationException {
    ROOT_LOGGER.setLevel(Level.INFO);
    log("#### " + getMethodNameAsSentence(0) + "\n");
    printGraph();
    ROOT_LOGGER.setLevel(Level.OFF);
    final GetDataFrameOfElements operation = new GetDataFrameOfElements.Builder()
            .view(new View.Builder()
                    .entity("entity")
                    .build())
            .sqlContext(sqlc)
            .build();
    final Dataset<Row> df = graph.execute(operation, new User("user01"));
    // Show
    String result = df.showString(100, 20);
    ROOT_LOGGER.setLevel(Level.INFO);
    // Fixed: the snippet previously printed ".build())." with a stray trailing dot
    // before ".sqlContext(sqlc)", producing invalid Java in the generated docs.
    printJava("GetDataFrameOfElements operation = new GetDataFrameOfElements.Builder()\n"
            + "                .view(new View.Builder()\n"
            + "                        .entity(\"entity\")\n"
            + "                        .build())\n"
            + "                .sqlContext(sqlc)\n"
            + "                .build();\n"
            + "Dataset<Row> df = getGraph().execute(operation, new User(\"user01\"));\n"
            + "df.show();");
    log("The results are:");
    log("```");
    // Strips the last two characters of the rendered table — presumably a trailing
    // newline pair emitted by showString; TODO confirm against the Spark version used.
    log(result.substring(0, result.length() - 2));
    log("```");
    ROOT_LOGGER.setLevel(Level.OFF);
    // Restrict to entities involving certain vertices
    final Dataset<Row> seeded = df.filter("vertex = 1 OR vertex = 2");
    result = seeded.showString(100, 20);
    ROOT_LOGGER.setLevel(Level.INFO);
    printJava("df.filter(\"vertex = 1 OR vertex = 2\").show();");
    log("The results are:");
    log("```");
    log(result.substring(0, result.length() - 2));
    log("```");
    ROOT_LOGGER.setLevel(Level.OFF);
    // Filter by property
    final Dataset<Row> filtered = df.filter("count > 1");
    result = filtered.showString(100, 20);
    ROOT_LOGGER.setLevel(Level.INFO);
    printJava("df.filter(\"count > 1\").show();");
    log("The results are:");
    log("```");
    log(result.substring(0, result.length() - 2));
    log("```");
    ROOT_LOGGER.setLevel(Level.OFF);
}
Also used : GetDataFrameOfElements(uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements) User(uk.gov.gchq.gaffer.user.User) Row(org.apache.spark.sql.Row) View(uk.gov.gchq.gaffer.data.elementdefinition.view.View)

Example 2 with GetDataFrameOfElements

use of uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements in project gaffer-doc by gchq.

In the class GetDataFrameOfElementsExample, the method getDataFrameOfElementsWithEdgeGroup:

/**
 * Documents a {@code GetDataFrameOfElements} operation restricted to the "edge"
 * group, then shows the returned DataFrame filtered by source vertex and by the
 * "count" property.
 */
public void getDataFrameOfElementsWithEdgeGroup() {
    // ---------------------------------------------------------
    final GetDataFrameOfElements operation = new GetDataFrameOfElements.Builder().view(new View.Builder().edge("edge").build()).build();
    // ---------------------------------------------------------
    final Dataset<Row> dataFrame = runExample(operation, null);
    // Restrict to edges involving given vertices
    final Dataset<Row> bySource = dataFrame.filter("src = 1 OR src = 3");
    printJava("df.filter(\"src = 1 OR src = 3\").show();");
    printResultTable(bySource.showString(100, 20));
    // Filter by property
    final Dataset<Row> byCount = dataFrame.filter("count > 1");
    printJava("df.filter(\"count > 1\").show();");
    printResultTable(byCount.showString(100, 20));
}

/**
 * Prints a rendered DataFrame table as a fenced markdown block, dropping the two
 * trailing characters produced by {@code showString}.
 */
private void printResultTable(final String rendered) {
    print("The results are:\n");
    print("```");
    print(rendered.substring(0, rendered.length() - 2));
    print("```");
}
Also used : GetDataFrameOfElements(uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements) Row(org.apache.spark.sql.Row)

Example 3 with GetDataFrameOfElements

use of uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements in project Gaffer by gchq.

In the class GetGraphFrameOfElementsHandler, the method doOperation:

/**
 * Builds a GraphFrame for the requested view by first fetching all elements as a
 * single {@code Dataset<Row>} (via a delegated {@link GetDataFrameOfElements}),
 * registering it as a temp view, and then splitting it into edge and entity
 * DataFrames with Spark SQL.
 *
 * @param operation the GetGraphFrameOfElements operation, supplying view, converters and options
 * @param context   the operation context, used to obtain the SparkSession
 * @param store     the store to execute the delegated operation against
 * @return a GraphFrame built from the entity and edge DataFrames
 * @throws OperationException if the delegated DataFrame operation fails
 */
@Override
public GraphFrame doOperation(final GetGraphFrameOfElements operation, final Context context, final Store store) throws OperationException {
    // Delegate to the DataFrame handler, forwarding the caller's view, converters and options.
    final GetDataFrameOfElements getDataFrame = new GetDataFrameOfElements.Builder().converters(operation.getConverters()).view(operation.getView()).options(operation.getOptions()).build();
    Dataset<Row> elements = store.execute(getDataFrame, context);
    elements = renameColumns(elements);
    // Register as a temp view so the SQL queries below can select from "elements".
    elements.createOrReplaceTempView("elements");
    final String edgeGroups = groupsToString(operation.getView().getEdgeGroups());
    final String entityGroups = groupsToString(operation.getView().getEntityGroups());
    final SparkSession sparkSession = SparkContextUtil.getSparkSession(context, store.getProperties());
    // Create a DataFrame of Edges - must add an "id" column which we fill with
    // the row number. We add a partitionBy on group to avoid creating a single
    // partition for all data.
    // NOTE(review): this query uses the literal "group" while the entity query below
    // uses SchemaToStructTypeConverter.GROUP — confirm the constant equals "group".
    Dataset<Row> edges = sparkSession.sql("select * from elements where group in " + edgeGroups).withColumn(SchemaToStructTypeConverter.ID, functions.row_number().over(Window.orderBy(SchemaToStructTypeConverter.GROUP).partitionBy(SchemaToStructTypeConverter.GROUP)));
    // Create a DataFrame of Entities
    Dataset<Row> entities = sparkSession.sql("select * from elements where " + SchemaToStructTypeConverter.GROUP + " in " + entityGroups);
    // NOTE(review): rdd().isEmpty() forces evaluation of the edges query here.
    if (!edges.rdd().isEmpty()) {
        // We also add dummy entities for all vertices present in the edge dataset,
        // in case there are no corresponding Entities
        final Dataset<Row> sources = sparkSession.sql("select " + SchemaToStructTypeConverter.SRC_COL_NAME + " as " + SchemaToStructTypeConverter.VERTEX_COL_NAME + " from elements where " + SchemaToStructTypeConverter.GROUP + " in " + edgeGroups);
        final Dataset<Row> destinations = sparkSession.sql("select " + SchemaToStructTypeConverter.DST_COL_NAME + " as " + SchemaToStructTypeConverter.VERTEX_COL_NAME + " from elements where " + SchemaToStructTypeConverter.GROUP + " in " + edgeGroups);
        final Dataset<Row> vertices = sources.union(destinations).distinct();
        entities = DataFrameUtil.union(vertices, entities);
    } else {
        // If there are no edges, add an empty DataFrame
        // (presumably with the edge schema GraphFrame expects — confirm DataFrameUtil.emptyEdges)
        edges = DataFrameUtil.emptyEdges(sparkSession);
    }
    // GraphFrame expects the vertex column to be named "id".
    return GraphFrame.apply(entities.withColumnRenamed(SchemaToStructTypeConverter.VERTEX_COL_NAME, SchemaToStructTypeConverter.ID), edges);
}
Also used : GetDataFrameOfElements(uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements) SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row)

Example 4 with GetDataFrameOfElements

use of uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements in project Gaffer by gchq.

In the class GetDataFrameOfElementsHandlerTest, the method checkGetExceptionIfIncompatibleSchemas:

@Test
public void checkGetExceptionIfIncompatibleSchemas() throws OperationException {
    // A schema whose element definitions cannot be merged into one DataFrame schema.
    final Graph graph = getGraph("/schema-DataFrame/elementsIncompatible.json", Collections.<Element>emptyList());
    // Use entity and edges group - check get correct data
    final GetDataFrameOfElements dfOperation = new GetDataFrameOfElements.Builder()
            .view(new View.Builder()
                    .entity(ENTITY_GROUP)
                    .edge(EDGE_GROUP)
                    .build())
            .build();
    // Executing against incompatible schemas must raise IllegalArgumentException.
    boolean exceptionThrown = false;
    try {
        graph.execute(dfOperation, new User());
    } catch (final IllegalArgumentException expected) {
        exceptionThrown = true;
    }
    if (!exceptionThrown) {
        fail("IllegalArgumentException should have been thrown");
    }
}
Also used : GetDataFrameOfElements(uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements) Graph(uk.gov.gchq.gaffer.graph.Graph) User(uk.gov.gchq.gaffer.user.User) View(uk.gov.gchq.gaffer.data.elementdefinition.view.View) Test(org.junit.jupiter.api.Test)

Example 5 with GetDataFrameOfElements

use of uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements in project Gaffer by gchq.

In the class GetDataFrameOfElementsHandlerTest, the method checkGetCorrectElementsInDataFrameWithProjectionAndFiltering:

@Test
public void checkGetCorrectElementsInDataFrameWithProjectionAndFiltering() throws OperationException {
    final Graph graph = getGraph("/schema-DataFrame/elements.json", getElements());
    // Build a DataFrame over the edge group only
    final GetDataFrameOfElements dfOperation = new GetDataFrameOfElements.Builder()
            .view(new View.Builder()
                    .edge(EDGE_GROUP)
                    .build())
            .build();
    final Dataset<Row> dataFrame = graph.execute(dfOperation, new User());
    // Check get correct rows when ask for all columns but only rows where property2 > 4.0
    final Set<Row> filteredResults = new HashSet<>(dataFrame.filter("property2 > 4.0").collectAsList());
    final Set<Row> expectedRows = new HashSet<>();
    for (int i = 0; i < NUM_ELEMENTS; i++) {
        final MutableList<Object> rowFields = new MutableList<>();
        rowFields.appendElem(EDGE_GROUP);
        rowFields.appendElem("" + i);
        rowFields.appendElem("C");
        rowFields.appendElem(true);
        rowFields.appendElem(null);
        rowFields.appendElem(6);
        rowFields.appendElem(7);
        rowFields.appendElem(8.0F);
        rowFields.appendElem(9.0D);
        rowFields.appendElem(10L);
        rowFields.appendElem(i * 200L);
        expectedRows.add(Row$.MODULE$.fromSeq(rowFields));
    }
    assertEquals(expectedRows, filteredResults);
    // Check get correct rows when ask for columns property2 and property3 but only rows where property2 > 4.0
    final Set<Row> projectedResults =
            new HashSet<>(dataFrame.select("property2", "property3").filter("property2 > 4.0").collectAsList());
    expectedRows.clear();
    for (int i = 0; i < NUM_ELEMENTS; i++) {
        // Every element yields the same projected row, so the set collapses to one entry.
        final MutableList<Object> rowFields = new MutableList<>();
        rowFields.appendElem(8.0F);
        rowFields.appendElem(9.0D);
        expectedRows.add(Row$.MODULE$.fromSeq(rowFields));
    }
    assertEquals(expectedRows, projectedResults);
}
Also used : GetDataFrameOfElements(uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements) User(uk.gov.gchq.gaffer.user.User) Graph(uk.gov.gchq.gaffer.graph.Graph) MutableList(scala.collection.mutable.MutableList) Row(org.apache.spark.sql.Row) HashSet(java.util.HashSet) Test(org.junit.jupiter.api.Test)

Aggregations

GetDataFrameOfElements (uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements)13 Row (org.apache.spark.sql.Row)12 User (uk.gov.gchq.gaffer.user.User)10 Test (org.junit.jupiter.api.Test)8 View (uk.gov.gchq.gaffer.data.elementdefinition.view.View)8 Graph (uk.gov.gchq.gaffer.graph.Graph)8 HashSet (java.util.HashSet)7 MutableList (scala.collection.mutable.MutableList)7 HyperLogLogPlus (com.clearspring.analytics.stream.cardinality.HyperLogLogPlus)2 ArrayList (java.util.ArrayList)1 SparkSession (org.apache.spark.sql.SparkSession)1 ElementFilter (uk.gov.gchq.gaffer.data.element.function.ElementFilter)1 ViewElementDefinition (uk.gov.gchq.gaffer.data.elementdefinition.view.ViewElementDefinition)1 Converter (uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.Converter)1 IsMoreThan (uk.gov.gchq.koryphe.impl.predicate.IsMoreThan)1