Example 1 with GetRDDOfAllElements

Use of uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfAllElements in project Gaffer by gchq.

From the class ImportKeyValuePairRDDToAccumuloHandlerTest, the method checkImportRDDOfElements:

@Test
public void checkImportRDDOfElements() throws OperationException, IOException {
    final Graph graph1 = new Graph.Builder()
            .addSchema(getClass().getResourceAsStream("/schema/dataSchema.json"))
            .addSchema(getClass().getResourceAsStream("/schema/dataTypes.json"))
            .addSchema(getClass().getResourceAsStream("/schema/storeTypes.json"))
            .addSchema(getClass().getResourceAsStream("/schema/storeSchema.json"))
            .storeProperties(getClass().getResourceAsStream("/store.properties"))
            .build();
    final ArrayBuffer<Element> elements = new ArrayBuffer<>();
    for (int i = 0; i < 10; i++) {
        final Entity entity = new Entity(TestGroups.ENTITY);
        entity.setVertex("" + i);
        final Edge edge1 = new Edge(TestGroups.EDGE);
        edge1.setSource("" + i);
        edge1.setDestination("B");
        edge1.setDirected(false);
        edge1.putProperty(TestPropertyNames.COUNT, 2);
        final Edge edge2 = new Edge(TestGroups.EDGE);
        edge2.setSource("" + i);
        edge2.setDestination("C");
        edge2.setDirected(false);
        edge2.putProperty(TestPropertyNames.COUNT, 4);
        elements.$plus$eq(edge1);
        elements.$plus$eq(edge2);
        elements.$plus$eq(entity);
    }
    final User user = new User();
    final SparkConf sparkConf = new SparkConf()
            .setMaster("local")
            .setAppName("tests")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .set("spark.kryo.registrator", "uk.gov.gchq.gaffer.spark.serialisation.kryo.Registrator")
            .set("spark.driver.allowMultipleContexts", "true");
    final SparkContext sparkContext = new SparkContext(sparkConf);
    // Create Hadoop configuration and serialise to a string
    final Configuration configuration = new Configuration();
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    configuration.write(new DataOutputStream(baos));
    final String configurationString = new String(baos.toByteArray(), CommonConstants.UTF_8);
    final String outputPath = this.getClass().getResource("/").getPath() + "load";
    final String failurePath = this.getClass().getResource("/").getPath() + "failure";
    final File file = new File(outputPath);
    if (file.exists()) {
        FileUtils.forceDelete(file);
    }
    final ElementConverterFunction func = new ElementConverterFunction(sparkContext.broadcast(
            new ByteEntityAccumuloElementConverter(graph1.getSchema()), ACCUMULO_ELEMENT_CONVERTER_CLASS_TAG));
    final RDD<Tuple2<Key, Value>> elementRDD = sparkContext
            .parallelize(elements, 1, ELEMENT_CLASS_TAG)
            .flatMap(func, TUPLE2_CLASS_TAG);
    final ImportKeyValuePairRDDToAccumulo addRdd = new ImportKeyValuePairRDDToAccumulo.Builder()
            .input(elementRDD)
            .outputPath(outputPath)
            .failurePath(failurePath)
            .build();
    graph1.execute(addRdd, user);
    FileUtils.forceDelete(file);
    // Check all elements were added
    final GetRDDOfAllElements rddQuery = new GetRDDOfAllElements.Builder()
            .sparkContext(sparkContext)
            .option(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, configurationString)
            .build();
    final RDD<Element> rdd = graph1.execute(rddQuery, user);
    if (rdd == null) {
        fail("No RDD returned");
    }
    final Set<Element> results = new HashSet<>();
    final Element[] returnedElements = (Element[]) rdd.collect();
    Collections.addAll(results, returnedElements);
    assertEquals(elements.size(), results.size());
    sparkContext.stop();
}
Also used : Entity(uk.gov.gchq.gaffer.data.element.Entity) User(uk.gov.gchq.gaffer.user.User) Configuration(org.apache.hadoop.conf.Configuration) DataOutputStream(java.io.DataOutputStream) Element(uk.gov.gchq.gaffer.data.element.Element) GetRDDOfAllElements(uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfAllElements) ImportKeyValuePairRDDToAccumulo(uk.gov.gchq.gaffer.sparkaccumulo.operation.scalardd.ImportKeyValuePairRDDToAccumulo) HashSet(java.util.HashSet) ByteArrayOutputStream(org.apache.commons.io.output.ByteArrayOutputStream) Graph(uk.gov.gchq.gaffer.graph.Graph) SparkContext(org.apache.spark.SparkContext) Tuple2(scala.Tuple2) ArrayBuffer(scala.collection.mutable.ArrayBuffer) ByteEntityAccumuloElementConverter(uk.gov.gchq.gaffer.accumulostore.key.core.impl.byteEntity.ByteEntityAccumuloElementConverter) ElementConverterFunction(uk.gov.gchq.gaffer.sparkaccumulo.operation.utils.scala.ElementConverterFunction) Edge(uk.gov.gchq.gaffer.data.element.Edge) SparkConf(org.apache.spark.SparkConf) File(java.io.File) Test(org.junit.Test)
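A step repeated in these import tests is serialising a Hadoop Configuration to a string so it can travel inside an operation option. A minimal sketch of just that step, lifted from the test above and made self-contained (the class name is illustrative, and java.nio's StandardCharsets stands in for Gaffer's CommonConstants.UTF_8):

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;

public final class ConfigurationStrings {

    private ConfigurationStrings() {
    }

    // Writes the configuration in Hadoop's Writable wire format, then wraps
    // the bytes as a UTF-8 string suitable for passing via
    // AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, as the tests above do.
    public static String serialise(final Configuration configuration) throws IOException {
        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
        configuration.write(new DataOutputStream(baos));
        return new String(baos.toByteArray(), StandardCharsets.UTF_8);
    }
}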

Example 2 with GetRDDOfAllElements

Use of uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfAllElements in project Gaffer by gchq.

From the class ImportRDDOfElementsHandlerTest, the method checkImportRDDOfElements:

@Test
public void checkImportRDDOfElements() throws OperationException, IOException {
    final Graph graph1 = new Graph.Builder()
            .addSchema(getClass().getResourceAsStream("/schema/dataSchema.json"))
            .addSchema(getClass().getResourceAsStream("/schema/dataTypes.json"))
            .addSchema(getClass().getResourceAsStream("/schema/storeTypes.json"))
            .storeProperties(getClass().getResourceAsStream("/store.properties"))
            .build();
    final ArrayBuffer<Element> elements = new ArrayBuffer<>();
    for (int i = 0; i < 10; i++) {
        final Entity entity = new Entity(TestGroups.ENTITY);
        entity.setVertex("" + i);
        final Edge edge1 = new Edge(TestGroups.EDGE);
        edge1.setSource("" + i);
        edge1.setDestination("B");
        edge1.setDirected(false);
        edge1.putProperty(TestPropertyNames.COUNT, 2);
        final Edge edge2 = new Edge(TestGroups.EDGE);
        edge2.setSource("" + i);
        edge2.setDestination("C");
        edge2.setDirected(false);
        edge2.putProperty(TestPropertyNames.COUNT, 4);
        elements.$plus$eq(edge1);
        elements.$plus$eq(edge2);
        elements.$plus$eq(entity);
    }
    final User user = new User();
    final SparkConf sparkConf = new SparkConf()
            .setMaster("local")
            .setAppName("tests")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .set("spark.kryo.registrator", "uk.gov.gchq.gaffer.spark.serialisation.kryo.Registrator")
            .set("spark.driver.allowMultipleContexts", "true");
    final SparkContext sparkContext = new SparkContext(sparkConf);
    // Create Hadoop configuration and serialise to a string
    final Configuration configuration = new Configuration();
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    configuration.write(new DataOutputStream(baos));
    final String configurationString = new String(baos.toByteArray(), CommonConstants.UTF_8);
    final String outputPath = this.getClass().getResource("/").getPath() + "load";
    final String failurePath = this.getClass().getResource("/").getPath() + "failure";
    final File file = new File(outputPath);
    if (file.exists()) {
        FileUtils.forceDelete(file);
    }
    final RDD<Element> elementRDD = sparkContext.parallelize(elements, 8, ELEMENT_CLASS_TAG);
    final ImportRDDOfElements addRdd = new ImportRDDOfElements.Builder()
            .sparkContext(sparkContext)
            .input(elementRDD)
            .option("outputPath", outputPath)
            .option("failurePath", failurePath)
            .build();
    graph1.execute(addRdd, user);
    FileUtils.forceDelete(file);
    // Check all elements were added
    final GetRDDOfAllElements rddQuery = new GetRDDOfAllElements.Builder()
            .sparkContext(sparkContext)
            .option(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, configurationString)
            .build();
    final RDD<Element> rdd = graph1.execute(rddQuery, user);
    if (rdd == null) {
        fail("No RDD returned");
    }
    final Set<Element> results = new HashSet<>();
    final Element[] returnedElements = (Element[]) rdd.collect();
    Collections.addAll(results, returnedElements);
    assertEquals(elements.size(), results.size());
    sparkContext.stop();
}
Also used : Entity(uk.gov.gchq.gaffer.data.element.Entity) User(uk.gov.gchq.gaffer.user.User) Configuration(org.apache.hadoop.conf.Configuration) DataOutputStream(java.io.DataOutputStream) Element(uk.gov.gchq.gaffer.data.element.Element) ByteArrayOutputStream(org.apache.commons.io.output.ByteArrayOutputStream) Graph(uk.gov.gchq.gaffer.graph.Graph) SparkContext(org.apache.spark.SparkContext) ImportRDDOfElements(uk.gov.gchq.gaffer.spark.operation.scalardd.ImportRDDOfElements) GetRDDOfAllElements(uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfAllElements) ArrayBuffer(scala.collection.mutable.ArrayBuffer) Edge(uk.gov.gchq.gaffer.data.element.Edge) SparkConf(org.apache.spark.SparkConf) File(java.io.File) HashSet(java.util.HashSet) Test(org.junit.Test)
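Both import tests finish with the same read-back check. A condensed sketch of that pattern, assuming the graph1, sparkContext, configurationString, elements and user variables set up above, plus java.util.Arrays and JUnit's static asserts:

// Read everything back and confirm the import reached the store.
final GetRDDOfAllElements rddQuery = new GetRDDOfAllElements.Builder()
        .sparkContext(sparkContext)
        .option(AbstractGetRDDHandler.HADOOP_CONFIGURATION_KEY, configurationString)
        .build();
final RDD<Element> rdd = graph1.execute(rddQuery, user);
assertNotNull("No RDD returned", rdd);
// collect() hands back an array; gather it into a Set before comparing
// sizes, exactly as the tests above do.
final Set<Element> results = new HashSet<>(Arrays.asList((Element[]) rdd.collect()));
assertEquals(elements.size(), results.size());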

Example 3 with GetRDDOfAllElements

Use of uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfAllElements in project Gaffer by gchq.

From the class AccumuloStoreRelation, the method buildScan:

/**
     * Creates a <code>DataFrame</code> of all {@link Element}s from the specified groups, with any columns that
     * are not required filtered out.
     * <p>
     * Currently the projection is not pushed down to the store (i.e. it is applied in the transform rather than
     * in an iterator). Issue 320 tracks this.
     *
     * @param requiredColumns The columns to return.
     * @return An {@link RDD} of {@link Row}s containing the requested columns.
     */
@Override
public RDD<Row> buildScan(final String[] requiredColumns) {
    try {
        LOGGER.info("Building scan with required columns: {}", StringUtils.join(requiredColumns, ','));
        LOGGER.info("Building GetRDDOfAllElements with view set to groups {}", StringUtils.join(groups, ','));
        final GetRDDOfAllElements operation = new GetRDDOfAllElements(sqlContext.sparkContext());
        operation.setView(view);
        final RDD<Element> rdd = store.execute(operation, user);
        return rdd.map(new ConvertElementToRow(new LinkedHashSet<>(Arrays.asList(requiredColumns)),
                propertyNeedsConversion, converterByProperty), ClassTagConstants.ROW_CLASS_TAG);
    } catch (final OperationException e) {
        LOGGER.error("OperationException while executing operation", e);
        return null;
    }
}
Also used : LinkedHashSet(java.util.LinkedHashSet) GetRDDOfAllElements(uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfAllElements) Element(uk.gov.gchq.gaffer.data.element.Element) OperationException(uk.gov.gchq.gaffer.operation.OperationException) ConvertElementToRow(uk.gov.gchq.gaffer.spark.operation.dataframe.ConvertElementToRow)
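buildScan is not usually called directly: Spark SQL invokes it with the pruned column list once a query plans a projection, as the buildScan(String[]) signature (Spark's PrunedScan contract) suggests. A hedged sketch of driving the relation from a SQLContext, assuming an AccumuloStoreRelation instance has already been constructed (the column names are illustrative):

// Wrapping the relation as a DataFrame lets Spark SQL plan the scan;
// selecting columns makes it call buildScan with only those names.
final DataFrame dataFrame = sqlContext.baseRelationToDataFrame(relation);
dataFrame.select("group", "vertex").show();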

Example 4 with GetRDDOfAllElements

Use of uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfAllElements in project Gaffer by gchq.

From the class FilterToOperationConverterTest, the method testSingleGroup:

@Test
public void testSingleGroup() throws OperationException {
    final Schema schema = getSchema();
    final SQLContext sqlContext = getSqlContext("testSingleGroup");
    final Filter[] filters = new Filter[1];
    filters[0] = new EqualTo(SchemaToStructTypeConverter.GROUP, ENTITY_GROUP);
    final FiltersToOperationConverter converter = new FiltersToOperationConverter(sqlContext, getViewFromSchema(schema), schema, filters);
    final AbstractGetRDD<?> operation = converter.getOperation();
    assertTrue(operation instanceof GetRDDOfAllElements);
    assertEquals(Collections.singleton(ENTITY_GROUP), operation.getView().getEntityGroups());
    assertEquals(0, operation.getView().getEdgeGroups().size());
    sqlContext.sparkContext().stop();
}
Also used : Filter(org.apache.spark.sql.sources.Filter) GetRDDOfAllElements(uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfAllElements) Schema(uk.gov.gchq.gaffer.store.schema.Schema) SQLContext(org.apache.spark.sql.SQLContext) EqualTo(org.apache.spark.sql.sources.EqualTo) Test(org.junit.Test)
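In normal use the Filter array is produced by Spark SQL rather than built by hand. A hedged sketch of a query that would yield the same EqualTo filter on the group column, assuming the relation has been registered as a temporary table named "elements" (the table name and group value are illustrative):

// The WHERE clause reaches the relation as new EqualTo("group", "BasicEntity"),
// which the converter narrows to a view containing only that entity group.
// Backticks escape "group", which is otherwise a SQL keyword.
final DataFrame entities = sqlContext.sql("SELECT * FROM elements WHERE `group` = 'BasicEntity'");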

Example 5 with GetRDDOfAllElements

Use of uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfAllElements in project Gaffer by gchq.

From the class FilterToOperationConverterTest, the method testSpecifyPropertyFilters:

@Test
public void testSpecifyPropertyFilters() throws OperationException {
    final Schema schema = getSchema();
    final SQLContext sqlContext = getSqlContext("testSpecifyPropertyFilters");
    final Filter[] filters = new Filter[1];
    // GreaterThan
    filters[0] = new GreaterThan("property1", 5);
    FiltersToOperationConverter converter = new FiltersToOperationConverter(sqlContext, getViewFromSchema(schema), schema, filters);
    AbstractGetRDD<?> operation = converter.getOperation();
    assertTrue(operation instanceof GetRDDOfAllElements);
    View opView = operation.getView();
    List<ConsumerFunctionContext<String, FilterFunction>> entityPostAggFilters = opView.getEntity(ENTITY_GROUP).getPostAggregationFilterFunctions();
    assertEquals(1, entityPostAggFilters.size());
    assertEquals(new ArrayList<>(Collections.singleton("property1")), entityPostAggFilters.get(0).getSelection());
    assertEquals(new IsMoreThan(5, false), entityPostAggFilters.get(0).getFunction());
    for (final String edgeGroup : EDGE_GROUPS) {
        final List<ConsumerFunctionContext<String, FilterFunction>> edgePostAggFilters = opView.getEdge(edgeGroup).getPostAggregationFilterFunctions();
        assertEquals(1, edgePostAggFilters.size());
        assertEquals(new ArrayList<>(Collections.singleton("property1")), edgePostAggFilters.get(0).getSelection());
        assertEquals(new IsMoreThan(5, false), edgePostAggFilters.get(0).getFunction());
    }
    // LessThan
    filters[0] = new LessThan("property4", 8L);
    converter = new FiltersToOperationConverter(sqlContext, getViewFromSchema(schema), schema, filters);
    operation = converter.getOperation();
    assertTrue(operation instanceof GetRDDOfAllElements);
    // Only groups ENTITY_GROUP and EDGE_GROUP should be in the view as only they have property4
    opView = operation.getView();
    entityPostAggFilters = opView.getEntity(ENTITY_GROUP).getPostAggregationFilterFunctions();
    assertEquals(1, entityPostAggFilters.size());
    assertEquals(new ArrayList<>(Collections.singleton("property4")), entityPostAggFilters.get(0).getSelection());
    assertEquals(new IsLessThan(8L, false), entityPostAggFilters.get(0).getFunction());
    List<ConsumerFunctionContext<String, FilterFunction>> edgePostAggFilters = opView.getEdge(EDGE_GROUP).getPostAggregationFilterFunctions();
    assertEquals(1, edgePostAggFilters.size());
    assertEquals(new ArrayList<>(Collections.singleton("property4")), edgePostAggFilters.get(0).getSelection());
    assertEquals(new IsLessThan(8L, false), edgePostAggFilters.get(0).getFunction());
    // And
    final Filter left = new GreaterThan("property1", 5);
    final Filter right = new GreaterThan("property4", 8L);
    filters[0] = new And(left, right);
    converter = new FiltersToOperationConverter(sqlContext, getViewFromSchema(schema), schema, filters);
    operation = converter.getOperation();
    assertTrue(operation instanceof GetRDDOfAllElements);
    // Only groups ENTITY_GROUP and EDGE_GROUP should be in the view as only they have property1 and property4
    opView = operation.getView();
    entityPostAggFilters = opView.getEntity(ENTITY_GROUP).getPostAggregationFilterFunctions();
    assertEquals(2, entityPostAggFilters.size());
    final ArrayList<String> expectedProperties = new ArrayList<>();
    expectedProperties.add("property1");
    expectedProperties.add("property4");
    assertEquals(1, entityPostAggFilters.get(0).getSelection().size());
    assertEquals(expectedProperties.get(0), entityPostAggFilters.get(0).getSelection().get(0));
    assertEquals(1, entityPostAggFilters.get(1).getSelection().size());
    assertEquals(expectedProperties.get(1), entityPostAggFilters.get(1).getSelection().get(0));
    final ArrayList<FilterFunction> expectedFunctions = new ArrayList<>();
    expectedFunctions.add(new IsMoreThan(5, false));
    expectedFunctions.add(new IsMoreThan(8L, false));
    assertEquals(expectedFunctions.get(0), entityPostAggFilters.get(0).getFunction());
    assertEquals(expectedFunctions.get(1), entityPostAggFilters.get(1).getFunction());
    edgePostAggFilters = opView.getEdge(EDGE_GROUP).getPostAggregationFilterFunctions();
    assertEquals(2, edgePostAggFilters.size());
    assertEquals(1, edgePostAggFilters.get(0).getSelection().size());
    assertEquals(expectedProperties.get(0), edgePostAggFilters.get(0).getSelection().get(0));
    assertEquals(1, edgePostAggFilters.get(1).getSelection().size());
    assertEquals(expectedProperties.get(1), edgePostAggFilters.get(1).getSelection().get(0));
    sqlContext.sparkContext().stop();
}
Also used : FilterFunction(uk.gov.gchq.gaffer.function.FilterFunction) Schema(uk.gov.gchq.gaffer.store.schema.Schema) ArrayList(java.util.ArrayList) View(uk.gov.gchq.gaffer.data.elementdefinition.view.View) LessThan(org.apache.spark.sql.sources.LessThan) IsLessThan(uk.gov.gchq.gaffer.function.filter.IsLessThan) ConsumerFunctionContext(uk.gov.gchq.gaffer.function.context.ConsumerFunctionContext) IsLessThan(uk.gov.gchq.gaffer.function.filter.IsLessThan) Filter(org.apache.spark.sql.sources.Filter) GreaterThan(org.apache.spark.sql.sources.GreaterThan) GetRDDOfAllElements(uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfAllElements) And(org.apache.spark.sql.sources.And) IsMoreThan(uk.gov.gchq.gaffer.function.filter.IsMoreThan) SQLContext(org.apache.spark.sql.SQLContext) Test(org.junit.Test)
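The second constructor argument in the assertions above is the or-equal flag: the converter passes false because Spark's GreaterThan and LessThan are strict comparisons. A minimal usage sketch of the distinction, based on the constructor calls in this test:

// Strict: matches property values > 5 only, mirroring Spark's GreaterThan.
final IsMoreThan strict = new IsMoreThan(5, false);
// Inclusive: matches property values >= 5; Spark would express this as a
// GreaterThanOrEqual filter, which this test does not exercise.
final IsMoreThan inclusive = new IsMoreThan(5, true);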

Aggregations

GetRDDOfAllElements (uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfAllElements): 9 uses
Test (org.junit.Test): 6 uses
Filter (org.apache.spark.sql.sources.Filter): 5 uses
SQLContext (org.apache.spark.sql.SQLContext): 4 uses
Element (uk.gov.gchq.gaffer.data.element.Element): 4 uses
Schema (uk.gov.gchq.gaffer.store.schema.Schema): 4 uses
EqualTo (org.apache.spark.sql.sources.EqualTo): 3 uses
View (uk.gov.gchq.gaffer.data.elementdefinition.view.View): 3 uses
DataOutputStream (java.io.DataOutputStream): 2 uses
File (java.io.File): 2 uses
ArrayList (java.util.ArrayList): 2 uses
HashSet (java.util.HashSet): 2 uses
ByteArrayOutputStream (org.apache.commons.io.output.ByteArrayOutputStream): 2 uses
Configuration (org.apache.hadoop.conf.Configuration): 2 uses
SparkConf (org.apache.spark.SparkConf): 2 uses
SparkContext (org.apache.spark.SparkContext): 2 uses
GreaterThan (org.apache.spark.sql.sources.GreaterThan): 2 uses
LessThan (org.apache.spark.sql.sources.LessThan): 2 uses
ArrayBuffer (scala.collection.mutable.ArrayBuffer): 2 uses
Edge (uk.gov.gchq.gaffer.data.element.Edge): 2 uses