Search in sources :

Example 1 with Converter

Use of uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.Converter in project Gaffer by gchq.

In the class GetDataFrameOfElementsHandlerTest, method checkCanDealWithUserDefinedConversion.

/**
 * Runs {@link GetDataFrameOfElements} with a user-supplied {@link Converter}
 * ({@code MyPropertyConverter}) and checks the rows returned, first for a view
 * containing only {@code EDGE_GROUP} and then for a view containing only
 * {@code ENTITY_GROUP}.
 *
 * @throws OperationException if executing the operation on the graph fails
 */
@Test
public void checkCanDealWithUserDefinedConversion() throws OperationException {
    final Graph graph = getGraph("/schema-DataFrame/dataSchemaUserDefinedConversion.json", getElementsForUserDefinedConversion());
    final SQLContext sqlContext = getSqlContext("checkCanDealWithUserDefinedConversion");
    try {
        // Edges group - check get correct edges
        final List<Converter> converters = new ArrayList<>();
        converters.add(new MyPropertyConverter());
        GetDataFrameOfElements dfOperation = new GetDataFrameOfElements.Builder().sqlContext(sqlContext).view(new View.Builder().edge(EDGE_GROUP).build()).converters(converters).build();
        Dataset<Row> dataFrame = graph.execute(dfOperation, new User());
        Set<Row> results = new HashSet<>(dataFrame.collectAsList());
        final Set<Row> expectedRows = new HashSet<>();
        final MutableList<Object> fields1 = new MutableList<>();
        Map<String, Long> freqMap = Map$.MODULE$.empty();
        freqMap.put("Y", 1000L);
        freqMap.put("Z", 10000L);
        // Expected edge row, field by field, in column order
        fields1.appendElem(EDGE_GROUP);
        fields1.appendElem("B");
        fields1.appendElem("C");
        fields1.appendElem(freqMap);
        final HyperLogLogPlus hllpp = new HyperLogLogPlus(5, 5);
        hllpp.offer("AAA");
        hllpp.offer("BBB");
        fields1.appendElem(hllpp.cardinality());
        // NOTE(review): presumably the value produced by MyPropertyConverter - confirm against the test data
        fields1.appendElem(50);
        expectedRows.add(Row$.MODULE$.fromSeq(fields1));
        assertEquals(expectedRows, results);
        // Entities group - check get correct entities
        dfOperation = new GetDataFrameOfElements.Builder().sqlContext(sqlContext).view(new View.Builder().entity(ENTITY_GROUP).build()).converters(converters).build();
        dataFrame = graph.execute(dfOperation, new User());
        // Reuse the same collections for the second check
        results.clear();
        results.addAll(dataFrame.collectAsList());
        expectedRows.clear();
        fields1.clear();
        freqMap.clear();
        freqMap.put("W", 10L);
        freqMap.put("X", 100L);
        fields1.appendElem(ENTITY_GROUP);
        fields1.appendElem("A");
        fields1.appendElem(freqMap);
        final HyperLogLogPlus hllpp2 = new HyperLogLogPlus(5, 5);
        hllpp2.offer("AAA");
        fields1.appendElem(hllpp2.cardinality());
        // NOTE(review): presumably the value produced by MyPropertyConverter - confirm against the test data
        fields1.appendElem(10);
        expectedRows.add(Row$.MODULE$.fromSeq(fields1));
        assertEquals(expectedRows, results);
    } finally {
        // Stop the Spark context even if an assertion or the operation fails,
        // so a failing run does not leave a live context behind.
        sqlContext.sparkContext().stop();
    }
}
Also used : GetDataFrameOfElements(uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements) User(uk.gov.gchq.gaffer.user.User) ArrayList(java.util.ArrayList) View(uk.gov.gchq.gaffer.data.elementdefinition.view.View) Graph(uk.gov.gchq.gaffer.graph.Graph) MutableList(scala.collection.mutable.MutableList) HyperLogLogPlus(com.clearspring.analytics.stream.cardinality.HyperLogLogPlus) Converter(uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.Converter) Row(org.apache.spark.sql.Row) SQLContext(org.apache.spark.sql.SQLContext) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 2 with Converter

Use of uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.Converter in project Gaffer by gchq.

In the class SchemaToStructTypeConverter, method buildSchema.

/**
 * Builds the Spark SQL schema for the configured {@code groups}.
 *
 * <p>For each group a {@link StructType} is created containing the vertex column
 * (entity groups) or the source and destination columns (edge groups), followed by
 * one column per property whose class either maps directly to a Spark
 * {@link DataType} via {@code getType} or is accepted by one of the supplied
 * {@code converters}. Properties that match neither are left out of the schema and
 * a warning is logged. The per-group schemas are stored in
 * {@code structTypeByGroup}, checked for consistency, and then merged into
 * {@code structType}, with the {@code GROUP} column first and the remaining fields
 * in the order the groups were provided.
 *
 * <p>Also populates {@code entityOrEdgeByGroup}, {@code propertyNeedsConversion},
 * {@code converterByProperty} and {@code usedProperties}.
 *
 * @throws RuntimeException if a vertex, source or destination class has no Spark type
 * @throws IllegalArgumentException if a field name has differing definitions across groups
 */
private void buildSchema() {
    LOGGER.info("Building Spark SQL schema for groups {}", StringUtils.join(groups, ','));
    for (final String group : groups) {
        final SchemaElementDefinition elementDefn = schema.getElement(group);
        final List<StructField> structFieldList = new ArrayList<>();
        if (elementDefn instanceof SchemaEntityDefinition) {
            entityOrEdgeByGroup.put(group, EntityOrEdge.ENTITY);
            final SchemaEntityDefinition entityDefinition = (SchemaEntityDefinition) elementDefn;
            final String vertexClass = schema.getType(entityDefinition.getVertex()).getClassString();
            final DataType vertexType = getType(vertexClass);
            if (vertexType == null) {
                throw new RuntimeException("Vertex must be a recognised type: found " + vertexClass);
            }
            LOGGER.info("Group {} is an entity group - {} is of type {}", group, VERTEX_COL_NAME, vertexType);
            structFieldList.add(new StructField(VERTEX_COL_NAME, vertexType, true, Metadata.empty()));
        } else {
            // Anything that is not an entity definition is treated as an edge definition
            entityOrEdgeByGroup.put(group, EntityOrEdge.EDGE);
            final SchemaEdgeDefinition edgeDefinition = (SchemaEdgeDefinition) elementDefn;
            final String srcClass = schema.getType(edgeDefinition.getSource()).getClassString();
            final String dstClass = schema.getType(edgeDefinition.getDestination()).getClassString();
            final DataType srcType = getType(srcClass);
            final DataType dstType = getType(dstClass);
            if (srcType == null || dstType == null) {
                throw new RuntimeException("Both source and destination must be recognised types: source was " + srcClass + " destination was " + dstClass);
            }
            LOGGER.info("Group {} is an edge group - {} is of type {}, {} is of type {}", group, SRC_COL_NAME, srcType, DST_COL_NAME, dstType);
            structFieldList.add(new StructField(SRC_COL_NAME, srcType, true, Metadata.empty()));
            structFieldList.add(new StructField(DST_COL_NAME, dstType, true, Metadata.empty()));
        }
        final Set<String> properties = elementDefn.getProperties();
        for (final String property : properties) {
            // Check if property is of a known type that can be handled by default
            final Class<?> propertyClazz = elementDefn.getPropertyClass(property);
            final String propertyClass = propertyClazz.getCanonicalName();
            DataType propertyType = getType(propertyClass);
            if (propertyType != null) {
                propertyNeedsConversion.put(property, needsConversion(propertyClass));
                structFieldList.add(new StructField(property, propertyType, true, Metadata.empty()));
                LOGGER.info("Property {} is of type {}", property, propertyType);
                continue;
            }
            // Check if any of the provided converters can handle it; the first match wins
            if (converters != null) {
                for (final Converter converter : converters) {
                    if (converter.canHandle(propertyClazz)) {
                        propertyNeedsConversion.put(property, true);
                        propertyType = converter.convertedType();
                        converterByProperty.put(property, converter);
                        structFieldList.add(new StructField(property, propertyType, true, Metadata.empty()));
                        LOGGER.info("Property {} of type {} will be converted by {} to {}", property, propertyClass, converter.getClass().getName(), propertyType);
                        break;
                    }
                }
            }
            // Warn whenever the property is dropped, including when no converters were supplied at all
            if (propertyType == null) {
                LOGGER.warn("Ignoring property {} as it is not a recognised type and none of the provided " + "converters can handle it", property);
            }
        }
        structTypeByGroup.put(group, new StructType(structFieldList.toArray(new StructField[structFieldList.size()])));
    }
    // Create reverse map of field name to StructField
    final Map<String, Set<StructField>> fieldToStructs = new HashMap<>();
    for (final String group : groups) {
        final StructType groupSchema = structTypeByGroup.get(group);
        for (final String field : groupSchema.fieldNames()) {
            Set<StructField> structsForField = fieldToStructs.get(field);
            if (structsForField == null) {
                structsForField = new HashSet<>();
                fieldToStructs.put(field, structsForField);
            }
            structsForField.add(groupSchema.apply(field));
        }
    }
    // Check consistency, i.e. if the same field appears in multiple groups then the types are consistent
    for (final Entry<String, Set<StructField>> entry : fieldToStructs.entrySet()) {
        final Set<StructField> schemas = entry.getValue();
        if (schemas.size() > 1) {
            throw new IllegalArgumentException("Inconsistent fields: the field " + entry.getKey() + " has more than one definition: " + StringUtils.join(schemas, ','));
        }
    }
    // Merge schemas for groups together - fields should appear in the order the groups were provided
    final LinkedHashSet<StructField> fields = new LinkedHashSet<>();
    fields.add(new StructField(GROUP, DataTypes.StringType, false, Metadata.empty()));
    usedProperties.add(GROUP);
    for (final String group : groups) {
        final StructType groupSchema = structTypeByGroup.get(group);
        for (final String field : groupSchema.fieldNames()) {
            final StructField struct = groupSchema.apply(field);
            // Add struct to fields unless it has already been added
            if (!fields.contains(struct)) {
                fields.add(struct);
                usedProperties.add(field);
            }
        }
    }
    structType = new StructType(fields.toArray(new StructField[fields.size()]));
    LOGGER.info("Schema is {}", structType);
    LOGGER.debug("properties -> conversion: {}", StringUtils.join(propertyNeedsConversion.entrySet(), ','));
}
Also used : LinkedHashSet(java.util.LinkedHashSet) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Set(java.util.Set) StructType(org.apache.spark.sql.types.StructType) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) SchemaEntityDefinition(uk.gov.gchq.gaffer.store.schema.SchemaEntityDefinition) StructField(org.apache.spark.sql.types.StructField) DataType(org.apache.spark.sql.types.DataType) SchemaEdgeDefinition(uk.gov.gchq.gaffer.store.schema.SchemaEdgeDefinition) Converter(uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.Converter) HyperLogLogPlusConverter(uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.impl.HyperLogLogPlusConverter) FreqMapConverter(uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.impl.FreqMapConverter) UnionConverter(uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.impl.datasketches.theta.UnionConverter) SchemaElementDefinition(uk.gov.gchq.gaffer.store.schema.SchemaElementDefinition)

Aggregations

ArrayList (java.util.ArrayList)2 HashSet (java.util.HashSet)2 Converter (uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.Converter)2 HyperLogLogPlus (com.clearspring.analytics.stream.cardinality.HyperLogLogPlus)1 HashMap (java.util.HashMap)1 LinkedHashSet (java.util.LinkedHashSet)1 Set (java.util.Set)1 Row (org.apache.spark.sql.Row)1 SQLContext (org.apache.spark.sql.SQLContext)1 DataType (org.apache.spark.sql.types.DataType)1 StructField (org.apache.spark.sql.types.StructField)1 StructType (org.apache.spark.sql.types.StructType)1 Test (org.junit.Test)1 MutableList (scala.collection.mutable.MutableList)1 View (uk.gov.gchq.gaffer.data.elementdefinition.view.View)1 Graph (uk.gov.gchq.gaffer.graph.Graph)1 GetDataFrameOfElements (uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements)1 FreqMapConverter (uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.impl.FreqMapConverter)1 HyperLogLogPlusConverter (uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.impl.HyperLogLogPlusConverter)1 UnionConverter (uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.impl.datasketches.theta.UnionConverter)1