Use of uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.AggregateDataForGroup in project Gaffer by gchq.

From the class AggregateDataForGroupTest, the method aggregateDataForGroupTest:
@Test
public void aggregateDataForGroupTest(@TempDir java.nio.file.Path tempDir) throws Exception {
    // Given
    final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
    final String file1 = tempDir.resolve("inputdata1.parquet").toString();
    final String file2 = tempDir.resolve("inputdata2.parquet").toString();
    generateData(file1, schemaUtils);
    generateData(file2, schemaUtils);
    final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
    final List<String> inputFiles = new ArrayList<>(Sets.newHashSet(file1, file2));
    final String outputFolder = tempDir.resolve("aggregated").toString();
    final AggregateDataForGroup aggregator = new AggregateDataForGroup(
            FileSystem.get(new Configuration()), schemaUtils, TestGroups.ENTITY, inputFiles, outputFolder, sparkSession);

    // When
    aggregator.call();

    // Then
    final FileSystem fs = FileSystem.get(new Configuration());
    assertTrue(fs.exists(new Path(outputFolder)));
    final Row[] results = (Row[]) sparkSession.read().parquet(outputFolder).sort(ParquetStore.VERTEX).collect();
    for (int i = 0; i < 20; i++) {
        assertEquals((long) i, (long) results[i].getAs(ParquetStore.VERTEX));
        assertEquals('b', ((byte[]) results[i].getAs("byte"))[0]);
        assertEquals(14f, results[i].getAs("float"), 0.01f);
        assertEquals(11L * 2 * i, (long) results[i].getAs("long"));
        assertEquals(26, (int) results[i].getAs("short"));
        assertEquals(TestUtils.DATE.getTime(), (long) results[i].getAs("date"));
        assertEquals(4, (int) results[i].getAs("count"));
        assertArrayEquals(new String[] {"A", "B", "C"}, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
        final FreqMap mergedFreqMap = new FreqMap();
        mergedFreqMap.put("A", 4L);
        mergedFreqMap.put("B", 2L);
        mergedFreqMap.put("C", 2L);
        assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(mergedFreqMap), results[i].getAs("freqMap"));
    }
}
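Distilled from the test above, the core usage is: construct an AggregateDataForGroup with a Hadoop FileSystem, the store's SchemaUtils, a group name, the input Parquet files, an output folder and a SparkSession, then invoke call(). Below is a minimal sketch, assuming the SchemaUtils and SparkSession are built as in the test; the file paths and the "BasicEntity" group name are placeholders.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.sql.SparkSession;

import uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.AggregateDataForGroup;
import uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils;

public class AggregateDataForGroupSketch {
    // Minimal sketch: paths and the "BasicEntity" group name are placeholders;
    // the SchemaUtils and SparkSession are assumed to be set up as in the test above.
    public static void aggregate(final SchemaUtils schemaUtils, final SparkSession spark) throws Exception {
        final FileSystem fs = FileSystem.get(new Configuration());
        final List<String> inputFiles = Arrays.asList("/tmp/input1.parquet", "/tmp/input2.parquet");
        final AggregateDataForGroup aggregator = new AggregateDataForGroup(
                fs, schemaUtils, "BasicEntity", inputFiles, "/tmp/aggregated", spark);
        aggregator.call(); // writes the aggregated rows for the group to the output folder
    }
}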
From the class AddElementsFromRDD, the method aggregateNewAndOldData:
/**
 * For each group that requires aggregation, this method aggregates the new data that has been written out to file
 * with the existing data for that group.
 *
 * @throws OperationException if an {@link IOException} or a {@link SerialisationException} is thrown
 */
private void aggregateNewAndOldData() throws OperationException {
    LOGGER.info("Creating AggregateDataForGroup tasks for groups that require aggregation");
    for (final String group : schema.getAggregatedGroups()) {
        LOGGER.info("Creating AggregateDataForGroup task for group {}", group);
        final List<String> inputFiles = new ArrayList<>();

        // Collect the newly written Parquet files for this group...
        final String groupDirectoryNewData = getDirectory(group, false, false, false);
        final FileStatus[] newData;
        try {
            newData = fs.listStatus(new Path(groupDirectoryNewData), path -> path.getName().endsWith(".parquet"));
        } catch (final IOException e) {
            throw new OperationException("IOException finding Parquet files in " + groupDirectoryNewData, e);
        }
        Arrays.stream(newData).map(f -> f.getPath().toString()).forEach(inputFiles::add);

        // ...and add the existing files for the group.
        final List<Path> existingData;
        try {
            existingData = store.getFilesForGroup(group);
        } catch (final IOException e) {
            throw new OperationException("IOException finding files for group " + group, e);
        }
        existingData.stream().map(Path::toString).forEach(inputFiles::add);

        // Aggregate the combined file list into the group's output directory.
        final String outputDir = getDirectory(group, false, true, false);
        final AggregateDataForGroup aggregateDataForGroup;
        try {
            aggregateDataForGroup = new AggregateDataForGroup(fs, schemaUtils, group, inputFiles, outputDir, spark);
        } catch (final SerialisationException e) {
            throw new OperationException("SerialisationException creating AggregateDataForGroup task", e);
        }
        LOGGER.info("AggregateDataForGroup task for group {} is being called ({} files as input, outputting to {})",
                group, inputFiles.size(), outputDir);
        aggregateDataForGroup.call();
    }
}
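The file-selection step above uses the FileSystem.listStatus overload that takes a PathFilter; the lambda keeps only files whose names end in ".parquet". A standalone sketch of that step follows; the group directory path passed in is a placeholder.

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListParquetFilesSketch {
    // Sketch of the file-listing step used above: select only the ".parquet"
    // files in a group directory and return their full paths as strings.
    public static List<String> listParquetFiles(final String groupDirectory) throws IOException {
        final FileSystem fs = FileSystem.get(new Configuration());
        final FileStatus[] statuses = fs.listStatus(
                new Path(groupDirectory), path -> path.getName().endsWith(".parquet"));
        return Arrays.stream(statuses)
                .map(status -> status.getPath().toString())
                .collect(Collectors.toList());
    }
}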