Example use of uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.Converter in the project Gaffer by gchq.
Taken from the class GetDataFrameOfElementsHandlerTest, method checkCanDealWithUserDefinedConversion.
/**
 * Runs {@code GetDataFrameOfElements} with a list containing a {@code MyPropertyConverter},
 * first with a view on the edge group and then with a view on the entity group, and
 * asserts that the rows collected from the resulting DataFrame equal the expected row.
 *
 * The Spark context is stopped in a {@code finally} block so that it is released even
 * when an assertion fails or the operation throws.
 *
 * @throws OperationException if executing the operation on the graph fails
 */
@Test
public void checkCanDealWithUserDefinedConversion() throws OperationException {
    final Graph graph = getGraph("/schema-DataFrame/dataSchemaUserDefinedConversion.json", getElementsForUserDefinedConversion());
    final SQLContext sqlContext = getSqlContext("checkCanDealWithUserDefinedConversion");
    try {
        // Edges group - check get correct edges
        final List<Converter> converters = new ArrayList<>();
        converters.add(new MyPropertyConverter());
        GetDataFrameOfElements dfOperation = new GetDataFrameOfElements.Builder()
                .sqlContext(sqlContext)
                .view(new View.Builder().edge(EDGE_GROUP).build())
                .converters(converters)
                .build();
        Dataset<Row> dataFrame = graph.execute(dfOperation, new User());
        Set<Row> results = new HashSet<>(dataFrame.collectAsList());
        final Set<Row> expectedRows = new HashSet<>();
        final MutableList<Object> fields1 = new MutableList<>();
        Map<String, Long> freqMap = Map$.MODULE$.empty();
        freqMap.put("Y", 1000L);
        freqMap.put("Z", 10000L);
        fields1.appendElem(EDGE_GROUP);
        fields1.appendElem("B");
        fields1.appendElem("C");
        fields1.appendElem(freqMap);
        final HyperLogLogPlus hllpp = new HyperLogLogPlus(5, 5);
        hllpp.offer("AAA");
        hllpp.offer("BBB");
        // The expected row holds the sketch's cardinality rather than the sketch itself
        fields1.appendElem(hllpp.cardinality());
        fields1.appendElem(50);
        expectedRows.add(Row$.MODULE$.fromSeq(fields1));
        assertEquals(expectedRows, results);

        // Entities group - check get correct entities
        dfOperation = new GetDataFrameOfElements.Builder()
                .sqlContext(sqlContext)
                .view(new View.Builder().entity(ENTITY_GROUP).build())
                .converters(converters)
                .build();
        dataFrame = graph.execute(dfOperation, new User());
        results.clear();
        results.addAll(dataFrame.collectAsList());
        // Reset the shared fixtures before building the expected entity row
        expectedRows.clear();
        fields1.clear();
        freqMap.clear();
        freqMap.put("W", 10L);
        freqMap.put("X", 100L);
        fields1.appendElem(ENTITY_GROUP);
        fields1.appendElem("A");
        fields1.appendElem(freqMap);
        final HyperLogLogPlus hllpp2 = new HyperLogLogPlus(5, 5);
        hllpp2.offer("AAA");
        fields1.appendElem(hllpp2.cardinality());
        fields1.appendElem(10);
        expectedRows.add(Row$.MODULE$.fromSeq(fields1));
        assertEquals(expectedRows, results);
    } finally {
        // Always release the Spark context, including when an assertion above fails
        sqlContext.sparkContext().stop();
    }
}
Example use of uk.gov.gchq.gaffer.spark.operation.dataframe.converter.property.Converter in the project Gaffer by gchq.
Taken from the class SchemaToStructTypeConverter, method buildSchema.
/**
 * Builds the Spark SQL schema for the configured groups.
 *
 * For every group a {@link StructType} is created containing the vertex column (entities)
 * or the source and destination columns (edges), followed by one column per property whose
 * type is either recognised by {@code getType} or handled by one of the supplied converters.
 * Properties that match neither are left out of the schema and a warning is logged.
 * The per-group schemas are then checked for consistency and merged, in group order and
 * prefixed by the group column, into {@code structType}.
 *
 * @throws RuntimeException         if a vertex, source or destination class is not a recognised type
 * @throws IllegalArgumentException if the same field name has differing definitions across groups
 */
private void buildSchema() {
    LOGGER.info("Building Spark SQL schema for groups {}", StringUtils.join(groups, ','));
    for (final String group : groups) {
        final SchemaElementDefinition elementDefn = schema.getElement(group);
        final List<StructField> structFieldList = new ArrayList<>();
        if (elementDefn instanceof SchemaEntityDefinition) {
            entityOrEdgeByGroup.put(group, EntityOrEdge.ENTITY);
            final SchemaEntityDefinition entityDefinition = (SchemaEntityDefinition) elementDefn;
            final String vertexClass = schema.getType(entityDefinition.getVertex()).getClassString();
            final DataType vertexType = getType(vertexClass);
            if (vertexType == null) {
                throw new RuntimeException("Vertex must be a recognised type: found " + vertexClass);
            }
            LOGGER.info("Group {} is an entity group - {} is of type {}", group, VERTEX_COL_NAME, vertexType);
            structFieldList.add(new StructField(VERTEX_COL_NAME, vertexType, true, Metadata.empty()));
        } else {
            // Anything that is not an entity definition is treated as an edge definition
            entityOrEdgeByGroup.put(group, EntityOrEdge.EDGE);
            final SchemaEdgeDefinition edgeDefinition = (SchemaEdgeDefinition) elementDefn;
            final String srcClass = schema.getType(edgeDefinition.getSource()).getClassString();
            final String dstClass = schema.getType(edgeDefinition.getDestination()).getClassString();
            final DataType srcType = getType(srcClass);
            final DataType dstType = getType(dstClass);
            if (srcType == null || dstType == null) {
                throw new RuntimeException("Both source and destination must be recognised types: source was " + srcClass + " destination was " + dstClass);
            }
            LOGGER.info("Group {} is an edge group - {} is of type {}, {} is of type {}", group, SRC_COL_NAME, srcType, DST_COL_NAME, dstType);
            structFieldList.add(new StructField(SRC_COL_NAME, srcType, true, Metadata.empty()));
            structFieldList.add(new StructField(DST_COL_NAME, dstType, true, Metadata.empty()));
        }
        for (final String property : elementDefn.getProperties()) {
            // Check if property is of a known type that can be handled by default
            final String propertyClass = elementDefn.getPropertyClass(property).getCanonicalName();
            DataType propertyType = getType(propertyClass);
            if (propertyType != null) {
                propertyNeedsConversion.put(property, needsConversion(propertyClass));
                structFieldList.add(new StructField(property, propertyType, true, Metadata.empty()));
                LOGGER.info("Property {} is of type {}", property, propertyType);
                continue;
            }
            // Check if any of the provided converters can handle it; the first match wins
            if (converters != null) {
                for (final Converter converter : converters) {
                    if (converter.canHandle(elementDefn.getPropertyClass(property))) {
                        propertyNeedsConversion.put(property, true);
                        propertyType = converter.convertedType();
                        converterByProperty.put(property, converter);
                        structFieldList.add(new StructField(property, propertyType, true, Metadata.empty()));
                        LOGGER.info("Property {} of type {} will be converted by {} to {}", property, propertyClass, converter.getClass().getName(), propertyType);
                        break;
                    }
                }
            }
            // Warn whenever the property is dropped, including when no converters were supplied at all
            if (propertyType == null) {
                LOGGER.warn("Ignoring property {} as it is not a recognised type and none of the provided converters can handle it", property);
            }
        }
        structTypeByGroup.put(group, new StructType(structFieldList.toArray(new StructField[structFieldList.size()])));
    }
    // Create reverse map of field name to StructField
    final Map<String, Set<StructField>> fieldToStructs = new HashMap<>();
    for (final String group : groups) {
        final StructType groupSchema = structTypeByGroup.get(group);
        for (final String field : groupSchema.fieldNames()) {
            Set<StructField> definitions = fieldToStructs.get(field);
            if (definitions == null) {
                definitions = new HashSet<>();
                fieldToStructs.put(field, definitions);
            }
            definitions.add(groupSchema.apply(field));
        }
    }
    // Check consistency, i.e. if the same field appears in multiple groups then the types are consistent
    for (final Entry<String, Set<StructField>> entry : fieldToStructs.entrySet()) {
        final Set<StructField> schemas = entry.getValue();
        if (schemas.size() > 1) {
            throw new IllegalArgumentException("Inconsistent fields: the field " + entry.getKey() + " has more than one definition: " + StringUtils.join(schemas, ','));
        }
    }
    // Merge schemas for groups together - fields should appear in the order the groups were provided
    final LinkedHashSet<StructField> fields = new LinkedHashSet<>();
    fields.add(new StructField(GROUP, DataTypes.StringType, false, Metadata.empty()));
    usedProperties.add(GROUP);
    for (final String group : groups) {
        final StructType groupSchema = structTypeByGroup.get(group);
        for (final String field : groupSchema.fieldNames()) {
            final StructField struct = groupSchema.apply(field);
            // Add struct to fields unless it has already been added
            if (!fields.contains(struct)) {
                fields.add(struct);
                usedProperties.add(field);
            }
        }
    }
    structType = new StructType(fields.toArray(new StructField[fields.size()]));
    LOGGER.info("Schema is {}", structType);
    LOGGER.debug("properties -> conversion: {}", StringUtils.join(propertyNeedsConversion.entrySet(), ','));
}
Aggregations