
Example 1 with GenericRowWithSchema

Use of org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema in project Gaffer by gchq.

The class AggregateGafferRowsFunctionTest, method mergeEntityRowsTest.

@Test
public void mergeEntityRowsTest() throws OperationException, IOException {
    final String group = "BasicEntity";
    final SchemaElementDefinition elementSchema = utils.getGafferSchema().getElement(group);
    final GafferGroupObjectConverter converter = utils.getConverter(group);
    final String[] gafferProperties = new String[elementSchema.getProperties().size()];
    elementSchema.getProperties().toArray(gafferProperties);
    final byte[] aggregatorJson = JSONSerialiser.serialise(elementSchema.getIngestAggregator());
    final AggregateGafferRowsFunction aggregator = new AggregateGafferRowsFunction(gafferProperties, true, elementSchema.getGroupBy(), utils.getColumnToPaths(group), aggregatorJson, converter);
    final GenericRowWithSchema row1 = DataGen.generateEntityRow(utils, group, "vertex", (byte) 'a', 0.2, 3f, TestUtils.getTreeSet1(), 5L, (short) 6, TestUtils.DATE, TestUtils.getFreqMap1(), null);
    final GenericRowWithSchema row2 = DataGen.generateEntityRow(utils, group, "vertex", (byte) 'c', 0.7, 4f, TestUtils.getTreeSet2(), 7L, (short) 4, TestUtils.DATE, TestUtils.getFreqMap2(), null);
    final Row merged = aggregator.call(row1, row2);
    final List<Object> actual = new ArrayList<>(11);
    for (int i = 0; i < merged.length(); i++) {
        actual.add(merged.apply(i));
    }
    final List<Object> expected = new ArrayList<>(11);
    expected.add("vertex");
    expected.add(new byte[] { (byte) 'c' });
    expected.add(0.8999999999999999);
    expected.add(7f);
    expected.add(new String[] { "A", "B", "C" });
    expected.add(12L);
    expected.add(10);
    expected.add(TestUtils.DATE.getTime());
    expected.add(JavaConversions$.MODULE$.mapAsScalaMap(TestUtils.MERGED_FREQMAP));
    expected.add(2);
    assertThat(expected).containsExactly(actual.toArray());
}
Also used : GenericRowWithSchema(org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema) ArrayList(java.util.ArrayList) Row(org.apache.spark.sql.Row) AggregateGafferRowsFunction(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.AggregateGafferRowsFunction) SchemaElementDefinition(uk.gov.gchq.gaffer.store.schema.SchemaElementDefinition) Test(org.junit.jupiter.api.Test)
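
As a point of reference for the test above, here is a minimal, self-contained sketch (not taken from Gaffer) of how a GenericRowWithSchema pairs an Object[] of values with a Spark StructType, and how the fields can be read back both positionally, as in the merged.apply(i) loop, and by name. The schema and values are hypothetical.

import java.util.Arrays;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class GenericRowWithSchemaSketch {
    public static void main(final String[] args) {
        // Hypothetical schema: a vertex identifier and a count property.
        final StructType schema = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("vertex", DataTypes.StringType, false),
                DataTypes.createStructField("count", DataTypes.IntegerType, true)));

        // A GenericRowWithSchema is simply an Object[] paired with the StructType.
        final Row row = new GenericRowWithSchema(new Object[] { "vertex1", 2 }, schema);

        // Fields can be read positionally, as in the test's merged.apply(i) loop...
        for (int i = 0; i < row.length(); i++) {
            System.out.println(row.schema().fieldNames()[i] + " = " + row.apply(i));
        }
        // ...or by column name.
        final String vertex = row.getAs("vertex");
        System.out.println(vertex);
    }
}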

Example 2 with GenericRowWithSchema

Use of org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema in project Gaffer by gchq.

The class DataGen, method generateEntityRow.

public static GenericRowWithSchema generateEntityRow(final SchemaUtils utils, final String group, final String vertex, final Byte aByte, final Double aDouble, final Float aFloat, final TreeSet<String> treeSet, final Long aLong, final Short aShort, final Date date, final FreqMap freqMap, final String visibility) throws SerialisationException {
    final GafferGroupObjectConverter entityConverter = new GafferGroupObjectConverter(group, utils.getCoreProperties(group), utils.getCorePropertiesForReversedEdges(), utils.getColumnToSerialiser(group), utils.getSerialisers(), utils.getColumnToPaths(group));
    final List<Object> list = new ArrayList<>();
    final scala.collection.mutable.Map<String, Long> map = new scala.collection.mutable.HashMap<>();
    for (final Map.Entry<String, Long> entry : freqMap.entrySet()) {
        map.put(entry.getKey(), entry.getValue());
    }
    list.addAll(Arrays.asList(entityConverter.gafferObjectToParquetObjects(ParquetStore.VERTEX, vertex)));
    list.addAll(Arrays.asList(entityConverter.gafferObjectToParquetObjects("byte", aByte)));
    list.addAll(Arrays.asList(entityConverter.gafferObjectToParquetObjects("double", aDouble)));
    list.addAll(Arrays.asList(entityConverter.gafferObjectToParquetObjects("float", aFloat)));
    list.add(WrappedArray$.MODULE$.make(entityConverter.gafferObjectToParquetObjects("treeSet", treeSet)[0]));
    list.addAll(Arrays.asList(entityConverter.gafferObjectToParquetObjects("long", aLong)));
    list.addAll(Arrays.asList(entityConverter.gafferObjectToParquetObjects("short", aShort)));
    list.addAll(Arrays.asList(entityConverter.gafferObjectToParquetObjects("date", date)));
    list.add(map);
    list.addAll(Arrays.asList(entityConverter.gafferObjectToParquetObjects("count", 1)));
    if (null != visibility) {
        list.addAll(Arrays.asList(entityConverter.gafferObjectToParquetObjects(TestTypes.VISIBILITY, visibility)));
    }
    final Object[] objects = new Object[list.size()];
    list.toArray(objects);
    return new GenericRowWithSchema(objects, utils.getSparkSchema(group));
}
Also used : ArrayList(java.util.ArrayList) GafferGroupObjectConverter(uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter) GenericRowWithSchema(org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema) FreqMap(uk.gov.gchq.gaffer.types.FreqMap) Map(java.util.Map)
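
generateEntityRow leans on two Scala-interop details that are easy to miss: the Java FreqMap entries are copied into a scala.collection.mutable.Map, and the treeSet column is wrapped via WrappedArray$.MODULE$.make so Spark sees a Seq-like value. Below is a small standalone sketch of just those two conversions; the map contents and array values are made up for illustration.

import java.util.HashMap;
import java.util.Map;

import scala.collection.JavaConversions;
import scala.collection.mutable.WrappedArray;
import scala.collection.mutable.WrappedArray$;

public class ScalaInteropSketch {
    public static void main(final String[] args) {
        // Copy a Java map into a mutable Scala map, as the entry-by-entry loop above does.
        final Map<String, Long> javaMap = new HashMap<>();
        javaMap.put("a", 1L);
        javaMap.put("b", 2L);
        final scala.collection.mutable.Map<String, Long> scalaMap = new scala.collection.mutable.HashMap<>();
        for (final Map.Entry<String, Long> entry : javaMap.entrySet()) {
            scalaMap.put(entry.getKey(), entry.getValue());
        }
        // JavaConversions offers the same conversion as a view rather than a copy.
        final scala.collection.mutable.Map<String, Long> view = JavaConversions.mapAsScalaMap(javaMap);

        // Wrap an Object[] so Spark sees a Scala array wrapper, as done for the treeSet column.
        final Object[] values = new Object[] { "A", "B", "C" };
        final WrappedArray<?> wrapped = WrappedArray$.MODULE$.make(values);
        System.out.println(wrapped.length() + " elements, map view size " + view.size());
    }
}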

Example 3 with GenericRowWithSchema

Use of org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema in project Gaffer by gchq.

The class GafferGroupObjectConverter, method sparkRowToGafferObject.

/**
 * Extracts an object corresponding to column {@code gafferColumn} from the provided {@link GenericRowWithSchema}.
 *
 * @param gafferColumn the column to extract
 * @param row          the row to extract from
 * @return the extracted {@link Object}
 * @throws SerialisationException if the conversion from Parquet objects to the original object throws a
 *                                {@link SerialisationException}
 */
public Object sparkRowToGafferObject(final String gafferColumn, final Row row) throws SerialisationException {
    final ArrayList<Object> objectsList = new ArrayList<>();
    final String[] paths = columnToPaths.get(gafferColumn);
    if (paths[0].contains(".")) {
        final Object nestedRow = row.getAs(gafferColumn);
        if (null != nestedRow) {
            if (nestedRow instanceof GenericRowWithSchema) {
                getObjectsFromNestedRow(objectsList, (GenericRowWithSchema) nestedRow);
            } else if (nestedRow instanceof WrappedArray) {
                objectsList.add(((WrappedArray) nestedRow).array());
            } else if (nestedRow instanceof scala.collection.Map) {
                objectsList.add(scala.collection.JavaConversions.mapAsJavaMap((scala.collection.Map) nestedRow));
            } else if (nestedRow instanceof Object[]) {
                objectsList.add(nestedRow);
            } else {
                throw new SerialisationException("sparkRowToGafferObject does not know how to deal with a " + nestedRow.getClass().getCanonicalName());
            }
        } else {
            objectsList.add(null);
        }
    } else {
        for (final String path : paths) {
            final Object obj = row.getAs(path);
            objectsList.add(obj);
        }
    }
    final Object[] objects;
    if (paths[0].endsWith("key_value.key")) {
        objects = new Object[1];
    } else {
        objects = new Object[paths.length];
    }
    objectsList.toArray(objects);
    final Object gafferObject = parquetObjectsToGafferObject(gafferColumn, objects);
    if (null == gafferObject) {
        LOGGER.debug("Failed to get the Gaffer Object from the Spark Row for the column: {}", gafferColumn);
    }
    return gafferObject;
}
Also used : SerialisationException(uk.gov.gchq.gaffer.exception.SerialisationException) WrappedArray(scala.collection.mutable.WrappedArray) ArrayList(java.util.ArrayList) GenericRowWithSchema(org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema) HashMap(java.util.HashMap) Map(java.util.Map)
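
The instanceof dispatch above relies on Spark handing back a GenericRowWithSchema for a struct-typed (nested) column. Here is a self-contained sketch, using a hypothetical nested schema that is not part of Gaffer, showing that row.getAs on such a column does return a nested row:

import java.util.Arrays;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class NestedRowSketch {
    public static void main(final String[] args) {
        // Hypothetical nested schema: a "position" column whose value is itself a row.
        final StructType inner = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("x", DataTypes.DoubleType, false),
                DataTypes.createStructField("y", DataTypes.DoubleType, false)));
        final StructType outer = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("vertex", DataTypes.StringType, false),
                DataTypes.createStructField("position", inner, false)));

        final Row innerRow = new GenericRowWithSchema(new Object[] { 1.0, 2.0 }, inner);
        final Row outerRow = new GenericRowWithSchema(new Object[] { "vertex1", innerRow }, outer);

        // getAs on a struct column returns the nested row, so an instanceof
        // GenericRowWithSchema check (as in sparkRowToGafferObject) succeeds here.
        final Object nested = outerRow.getAs("position");
        if (nested instanceof GenericRowWithSchema) {
            final GenericRowWithSchema nestedRow = (GenericRowWithSchema) nested;
            System.out.println(nestedRow.getAs("x") + ", " + nestedRow.getAs("y"));
        }
    }
}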

Example 4 with GenericRowWithSchema

Use of org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema in project Gaffer by gchq.

The class DataGen, method generateEdgeRow.

public static GenericRowWithSchema generateEdgeRow(final SchemaUtils utils, final String group, final String src, final String dst, final Boolean directed, final Byte aByte, final Double aDouble, final Float aFloat, final TreeSet<String> treeSet, final Long aLong, final Short aShort, final Date date, final FreqMap freqMap, final String visibility) throws SerialisationException {
    final GafferGroupObjectConverter edgeConverter = new GafferGroupObjectConverter(group, utils.getCoreProperties(group), utils.getCorePropertiesForReversedEdges(), utils.getColumnToSerialiser(group), utils.getSerialisers(), utils.getColumnToPaths(group));
    final List<Object> list = new ArrayList<>();
    final scala.collection.mutable.Map<String, Long> map = new scala.collection.mutable.HashMap<>();
    for (final Map.Entry<String, Long> entry : freqMap.entrySet()) {
        map.put(entry.getKey(), entry.getValue());
    }
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects(ParquetStore.SOURCE, src)));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects(ParquetStore.DESTINATION, dst)));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects(ParquetStore.DIRECTED, directed)));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("byte", aByte)));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("double", aDouble)));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("float", aFloat)));
    list.add(WrappedArray$.MODULE$.make(edgeConverter.gafferObjectToParquetObjects("treeSet", treeSet)[0]));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("long", aLong)));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("short", aShort)));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("date", date)));
    list.add(map);
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("count", 1)));
    if (null != visibility) {
        list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects(TestTypes.VISIBILITY, visibility)));
    }
    final Object[] objects = new Object[list.size()];
    list.toArray(objects);
    return new GenericRowWithSchema(objects, utils.getSparkSchema(group));
}
Also used : ArrayList(java.util.ArrayList) GafferGroupObjectConverter(uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter) GenericRowWithSchema(org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema) FreqMap(uk.gov.gchq.gaffer.types.FreqMap) Map(java.util.Map)

Example 5 with GenericRowWithSchema

Use of org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema in project Gaffer by gchq.

The class GafferGroupObjectConverter, method recusivelyGenerateSparkObjects.

private void recusivelyGenerateSparkObjects(final Iterator<Object> parquetObjects, final DataType fieldType, final ArrayList<Object> recordBuilder) throws SerialisationException {
    if (fieldType instanceof StructType) {
        final ArrayList<Object> nestedRecordBuilder = new ArrayList<>();
        for (final String field : ((StructType) fieldType).fieldNames()) {
            final DataType innerDataType = ((StructType) fieldType).apply(field).dataType();
            recusivelyGenerateSparkObjects(parquetObjects, innerDataType, nestedRecordBuilder);
        }
        final Object[] rowObjects = new Object[nestedRecordBuilder.size()];
        nestedRecordBuilder.toArray(rowObjects);
        recordBuilder.add(new GenericRowWithSchema(rowObjects, (StructType) fieldType));
    } else {
        // must be a primitive type
        final Object parquetObject = parquetObjects.next();
        if (parquetObject instanceof Map) {
            recordBuilder.add(scala.collection.JavaConversions.mapAsScalaMap((Map<Object, Object>) parquetObject));
        } else {
            recordBuilder.add(parquetObject);
        }
    }
}
Also used : StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) GenericRowWithSchema(org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema) DataType(org.apache.spark.sql.types.DataType) HashMap(java.util.HashMap) Map(java.util.Map)
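
The recursion above is driven purely by the Spark schema: for a StructType it iterates fieldNames() and descends via apply(field).dataType(), and only at leaf types does it consume a Parquet object. The following standalone sketch performs the same traversal over a hypothetical schema, printing each leaf field instead of building a row:

import java.util.Arrays;

import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class StructTypeWalkSketch {

    // Print each leaf field and its type, descending into nested StructTypes
    // in the same order recusivelyGenerateSparkObjects consumes its Parquet objects.
    private static void walk(final DataType fieldType, final String prefix) {
        if (fieldType instanceof StructType) {
            for (final String field : ((StructType) fieldType).fieldNames()) {
                final DataType innerDataType = ((StructType) fieldType).apply(field).dataType();
                walk(innerDataType, prefix.isEmpty() ? field : prefix + "." + field);
            }
        } else {
            // Leaf (primitive) field: this is where a Parquet object would be consumed.
            System.out.println(prefix + ": " + fieldType.simpleString());
        }
    }

    public static void main(final String[] args) {
        // Hypothetical schema with one nested struct column.
        final StructType inner = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("key", DataTypes.StringType, false),
                DataTypes.createStructField("value", DataTypes.LongType, false)));
        final StructType schema = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("vertex", DataTypes.StringType, false),
                DataTypes.createStructField("entry", inner, false)));
        walk(schema, "");
    }
}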

Aggregations

ArrayList (java.util.ArrayList) 7
GenericRowWithSchema (org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema) 7
Map (java.util.Map) 4
HashMap (java.util.HashMap) 2
Row (org.apache.spark.sql.Row) 2
Test (org.junit.jupiter.api.Test) 2
AggregateGafferRowsFunction (uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.AggregateGafferRowsFunction) 2
GafferGroupObjectConverter (uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter) 2
SchemaElementDefinition (uk.gov.gchq.gaffer.store.schema.SchemaElementDefinition) 2
FreqMap (uk.gov.gchq.gaffer.types.FreqMap) 2
DataType (org.apache.spark.sql.types.DataType) 1
StructType (org.apache.spark.sql.types.StructType) 1
WrappedArray (scala.collection.mutable.WrappedArray) 1
Properties (uk.gov.gchq.gaffer.data.element.Properties) 1
ElementAggregator (uk.gov.gchq.gaffer.data.element.function.ElementAggregator) 1
SerialisationException (uk.gov.gchq.gaffer.exception.SerialisationException) 1