Example 1 with AggregateDataForGroup

Use of uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.AggregateDataForGroup in project Gaffer by gchq.

Source: the aggregateDataForGroupTest method of the class AggregateDataForGroupTest.

@Test
public void aggregateDataForGroupTest(@TempDir java.nio.file.Path tempDir) throws Exception {
    // Given
    final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
    final String file1 = tempDir.resolve("inputdata1.parquet").toString();
    final String file2 = tempDir.resolve("inputdata2.parquet").toString();
    generateData(file1, schemaUtils);
    generateData(file2, schemaUtils);
    final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
    final List<String> inputFiles = new ArrayList<>(Sets.newHashSet(file1, file2));
    final String outputFolder = tempDir.resolve("aggregated").toString();
    final AggregateDataForGroup aggregator = new AggregateDataForGroup(FileSystem.get(new Configuration()), schemaUtils, TestGroups.ENTITY, inputFiles, outputFolder, sparkSession);
    // When
    aggregator.call();
    // Then
    final FileSystem fs = FileSystem.get(new Configuration());
    assertTrue(fs.exists(new Path(outputFolder)));
    final Row[] results = (Row[]) sparkSession.read().parquet(outputFolder).sort(ParquetStore.VERTEX).collect();
    for (int i = 0; i < 20; i++) {
        assertEquals((long) i, (long) results[i].getAs(ParquetStore.VERTEX));
        assertEquals('b', ((byte[]) results[i].getAs("byte"))[0]);
        assertEquals(14f, results[i].getAs("float"), 0.01f);
        assertEquals(11L * 2 * i, (long) results[i].getAs("long"));
        assertEquals(26, (int) results[i].getAs("short"));
        assertEquals(TestUtils.DATE.getTime(), (long) results[i].getAs("date"));
        assertEquals(4, (int) results[i].getAs("count"));
        assertArrayEquals(new String[] { "A", "B", "C" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
        final FreqMap mergedFreqMap = new FreqMap();
        mergedFreqMap.put("A", 4L);
        mergedFreqMap.put("B", 2L);
        mergedFreqMap.put("C", 2L);
        assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(mergedFreqMap), results[i].getAs("freqMap"));
    }
}
Also used: Path(org.apache.hadoop.fs.Path) SparkSession(org.apache.spark.sql.SparkSession) Configuration(org.apache.hadoop.conf.Configuration) FreqMap(uk.gov.gchq.gaffer.types.FreqMap) AggregateDataForGroup(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.AggregateDataForGroup) ArrayList(java.util.ArrayList) WrappedArray(scala.collection.mutable.WrappedArray) FileSystem(org.apache.hadoop.fs.FileSystem) Row(org.apache.spark.sql.Row) Test(org.junit.jupiter.api.Test)
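
The test above shows the full lifecycle: build a SchemaUtils from a Gaffer schema, write input Parquet files, construct an AggregateDataForGroup, call it, then read the aggregated output back. Below is a minimal standalone sketch of the same pattern. The schema resource path, group name and file locations are placeholder assumptions; the constructor signature (FileSystem, SchemaUtils, group, input files, output directory, SparkSession) is taken from the example itself.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.sql.SparkSession;

import uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.AggregateDataForGroup;
import uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils;
import uk.gov.gchq.gaffer.store.schema.Schema;

public class AggregateDataForGroupSketch {

    public static void main(final String[] args) throws Exception {
        // Load the Gaffer schema defining each group's properties and aggregators
        // (the resource path is a placeholder for this sketch)
        final Schema schema = Schema.fromJson(
                AggregateDataForGroupSketch.class.getResourceAsStream("/schema/schema.json"));
        final SchemaUtils schemaUtils = new SchemaUtils(schema);

        final FileSystem fs = FileSystem.get(new Configuration());
        final SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("AggregateDataForGroupSketch")
                .getOrCreate();

        // Parquet files holding unaggregated rows for the group (placeholder paths)
        final List<String> inputFiles = Arrays.asList(
                "/data/new/part-0.parquet",
                "/data/existing/part-0.parquet");

        // Aggregate matching rows from all input files into a single output folder
        new AggregateDataForGroup(fs, schemaUtils, "BasicEntity",
                inputFiles, "/data/aggregated", spark).call();
    }
}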

Example 2 with AggregateDataForGroup

Use of uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.AggregateDataForGroup in project Gaffer by gchq.

Source: the aggregateNewAndOldData method of the class AddElementsFromRDD.

/**
 * For each group that requires aggregation, this method aggregates the new data that has been written out to file
 * with the existing data for that group.
 *
 * @throws OperationException if an {@link IOException} or a {@link SerialisationException} is thrown
 */
private void aggregateNewAndOldData() throws OperationException {
    LOGGER.info("Creating AggregateDataForGroup tasks for groups that require aggregation");
    for (final String group : schema.getAggregatedGroups()) {
        LOGGER.info("Creating AggregateDataForGroup task for group {}", group);
        final List<String> inputFiles = new ArrayList<>();
        final String groupDirectoryNewData = getDirectory(group, false, false, false);
        final FileStatus[] newData;
        try {
            newData = fs.listStatus(new Path(groupDirectoryNewData), path -> path.getName().endsWith(".parquet"));
        } catch (final IOException e) {
            throw new OperationException("IOException finding Parquet files in " + groupDirectoryNewData, e);
        }
        Arrays.stream(newData).map(f -> f.getPath().toString()).forEach(inputFiles::add);
        final List<Path> existingData;
        try {
            existingData = store.getFilesForGroup(group);
        } catch (final IOException e) {
            throw new OperationException("IOException finding files for group " + group, e);
        }
        existingData.stream().map(Path::toString).forEach(inputFiles::add);
        final String outputDir = getDirectory(group, false, true, false);
        final AggregateDataForGroup aggregateDataForGroup;
        try {
            aggregateDataForGroup = new AggregateDataForGroup(fs, schemaUtils, group, inputFiles, outputDir, spark);
        } catch (final SerialisationException e) {
            throw new OperationException("SerialisationException creating AggregateDataForGroup task", e);
        }
        LOGGER.info("AggregateDataForGroup task for group {} is being called ({} files as input, outputting to {})", group, inputFiles.size(), outputDir);
        aggregateDataForGroup.call();
    }
}
Also used: Path(org.apache.hadoop.fs.Path) Arrays(java.util.Arrays) StoreException(uk.gov.gchq.gaffer.store.StoreException) FileSystem(org.apache.hadoop.fs.FileSystem) LoggerFactory(org.slf4j.LoggerFactory) SerialisationException(uk.gov.gchq.gaffer.exception.SerialisationException) FileStatus(org.apache.hadoop.fs.FileStatus) ParquetStore(uk.gov.gchq.gaffer.parquetstore.ParquetStore) Function(java.util.function.Function) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) CalculatePartitioner(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.CalculatePartitioner) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) SparkParquetUtils(uk.gov.gchq.gaffer.parquetstore.utils.SparkParquetUtils) SortFullGroup(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.SortFullGroup) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession) AggregateDataForGroup(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.AggregateDataForGroup) Logger(org.slf4j.Logger) WriteData(uk.gov.gchq.gaffer.parquetstore.operation.handler.spark.utilities.WriteData) SparkContextUtil(uk.gov.gchq.gaffer.spark.SparkContextUtil) SchemaUtils(uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils) IOException(java.io.IOException) List(java.util.List) GraphPartitionerSerialiser(uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser) Context(uk.gov.gchq.gaffer.store.Context) Schema(uk.gov.gchq.gaffer.store.schema.Schema) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) OperationException(uk.gov.gchq.gaffer.operation.OperationException) RDD(org.apache.spark.rdd.RDD)
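
One reusable detail in aggregateNewAndOldData is how the new files are discovered: FileSystem.listStatus accepts a PathFilter, and because PathFilter has a single accept(Path) method a lambda can supply the .parquet suffix check. A self-contained sketch of that pattern, using a placeholder directory, might look like this:

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListParquetFilesSketch {

    // Returns the paths of all .parquet files directly under the given directory
    public static List<String> listParquetFiles(final FileSystem fs, final String dir)
            throws IOException {
        // PathFilter has a single accept(Path) method, so a lambda works here
        final FileStatus[] statuses = fs.listStatus(new Path(dir),
                path -> path.getName().endsWith(".parquet"));
        final List<String> files = new ArrayList<>();
        Arrays.stream(statuses).map(f -> f.getPath().toString()).forEach(files::add);
        return files;
    }

    public static void main(final String[] args) throws IOException {
        final FileSystem fs = FileSystem.get(new Configuration());
        // Placeholder directory; in AddElementsFromRDD this comes from getDirectory(...)
        listParquetFiles(fs, "/data/graph/group=BasicEntity")
                .forEach(System.out::println);
    }
}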

Aggregations

ArrayList (java.util.ArrayList): 2
FileSystem (org.apache.hadoop.fs.FileSystem): 2
Path (org.apache.hadoop.fs.Path): 2
SparkSession (org.apache.spark.sql.SparkSession): 2
AggregateDataForGroup (uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.AggregateDataForGroup): 2
IOException (java.io.IOException): 1
Arrays (java.util.Arrays): 1
List (java.util.List): 1
Function (java.util.function.Function): 1
Configuration (org.apache.hadoop.conf.Configuration): 1
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 1
FileStatus (org.apache.hadoop.fs.FileStatus): 1
JavaRDD (org.apache.spark.api.java.JavaRDD): 1
RDD (org.apache.spark.rdd.RDD): 1
Row (org.apache.spark.sql.Row): 1
Test (org.junit.jupiter.api.Test): 1
Logger (org.slf4j.Logger): 1
LoggerFactory (org.slf4j.LoggerFactory): 1
WrappedArray (scala.collection.mutable.WrappedArray): 1
Element (uk.gov.gchq.gaffer.data.element.Element): 1