Use of scala.collection.mutable.WrappedArray in project Gaffer by gchq.
From class AggregateDataForGroupTest, method aggregateDataForGroupTest:
@Test
public void aggregateDataForGroupTest(@TempDir java.nio.file.Path tempDir) throws Exception {
// Given
final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
final String file1 = tempDir.resolve("inputdata1.parquet").toString();
final String file2 = tempDir.resolve("inputdata2.parquet").toString();
generateData(file1, schemaUtils);
generateData(file2, schemaUtils);
final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
final List<String> inputFiles = new ArrayList<>(Sets.newHashSet(file1, file2));
final String outputFolder = tempDir.resolve("aggregated").toString();
final AggregateDataForGroup aggregator = new AggregateDataForGroup(FileSystem.get(new Configuration()), schemaUtils, TestGroups.ENTITY, inputFiles, outputFolder, sparkSession);
// When
aggregator.call();
// Then
final FileSystem fs = FileSystem.get(new Configuration());
assertTrue(fs.exists(new Path(outputFolder)));
final Row[] results = (Row[]) sparkSession.read().parquet(outputFolder).sort(ParquetStore.VERTEX).collect();
for (int i = 0; i < 20; i++) {
assertEquals((long) i, (long) results[i].getAs(ParquetStore.VERTEX));
assertEquals('b', ((byte[]) results[i].getAs("byte"))[0]);
assertEquals(14f, results[i].getAs("float"), 0.01f);
assertEquals(11L * 2 * i, (long) results[i].getAs("long"));
assertEquals(26, (int) results[i].getAs("short"));
assertEquals(TestUtils.DATE.getTime(), (long) results[i].getAs("date"));
assertEquals(4, (int) results[i].getAs("count"));
assertArrayEquals(new String[] { "A", "B", "C" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
final FreqMap mergedFreqMap = new FreqMap();
mergedFreqMap.put("A", 4L);
mergedFreqMap.put("B", 2L);
mergedFreqMap.put("C", 2L);
assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(mergedFreqMap), results[i].getAs("freqMap"));
}
}
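The assertions above depend on converting the Scala collection types that Spark's Row.getAs returns back into Java types before comparing. A minimal sketch of that conversion pattern, assuming a Row with an array-valued "treeSet" column and a map-valued "freqMap" column as in the schema used above (the helper class and method names are illustrative):
import java.util.Map;
import org.apache.spark.sql.Row;
import scala.collection.JavaConversions;
import scala.collection.mutable.WrappedArray;
public final class RowConversionSketch {
// Spark surfaces array columns as scala.collection.mutable.WrappedArray; unwrap to the backing Java array.
static Object[] arrayColumn(final Row row, final String column) {
final WrappedArray<Object> wrapped = row.getAs(column);
return (Object[]) wrapped.array();
}
// Spark surfaces map columns as scala.collection.Map; convert to java.util.Map for comparison.
static Map<Object, Object> mapColumn(final Row row, final String column) {
final scala.collection.Map<Object, Object> scalaMap = row.getAs(column);
return JavaConversions.mapAsJavaMap(scalaMap);
}
}
The test compares in the opposite direction, converting the expected java.util.Map to a Scala map with JavaConversions$.MODULE$.mapAsScalaMap; either direction works as long as both sides of the assertion use the same representation.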
Use of scala.collection.mutable.WrappedArray in project Gaffer by gchq.
From class SortGroupSplitTest, method sortTest:
@Test
public void sortTest(@TempDir java.nio.file.Path tempDir) throws IOException {
// Given
final FileSystem fs = FileSystem.get(new Configuration());
final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
final String inputDir = Files.createDirectories(tempDir.resolve("input")).toString();
final String outputDir = tempDir.resolve("output").toString();
generateDate(inputDir);
final List<String> sortColumns = new ArrayList<>();
sortColumns.add(ParquetStore.VERTEX);
sortColumns.add("date");
// When
new SortGroupSplit(fs, sparkSession, sortColumns, inputDir, outputDir, CompressionCodecName.GZIP).call();
// Then
// - Check output directory exists and contains one Parquet file
assertTrue(fs.exists(new Path(outputDir)));
final FileStatus[] outputFiles = fs.listStatus(new Path(outputDir), path1 -> path1.getName().endsWith(".parquet"));
assertThat(outputFiles).hasSize(1);
// - Read results and check in correct order
final Row[] results = (Row[]) sparkSession.read().parquet(outputFiles[0].getPath().toString()).collect();
for (int i = 0; i < 40; i++) {
assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
assertEquals('b', ((byte[]) results[i].getAs("byte"))[0]);
assertEquals(7f, results[i].getAs("float"), 0.01f);
assertEquals(11L * (i / 2), (long) results[i].getAs("long"));
assertEquals(13, (int) results[i].getAs("short"));
if (i % 2 == 0) {
assertEquals(new Date(100000L).getTime(), (long) results[i].getAs("date"));
} else {
assertEquals(new Date(200000L).getTime(), (long) results[i].getAs("date"));
}
assertEquals(2, (int) results[i].getAs("count"));
assertArrayEquals(new String[] { "A", "B", "C" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(TestUtils.MERGED_FREQMAP), results[i].getAs("freqMap"));
}
}
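For context on what this test exercises: SortGroupSplit reads the group's unsorted Parquet files, sorts by the supplied columns, and writes the result as compressed Parquet. A minimal sketch of that flow using Spark's Java API, reusing the sparkSession, inputDir, outputDir and sort columns from the test above (an illustrative approximation, not the actual SortGroupSplit implementation):
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
// Read the unsorted group data, sort by vertex then date, and write a single gzip-compressed Parquet file.
final Dataset<Row> unsorted = sparkSession.read().parquet(inputDir);
unsorted.sort(ParquetStore.VERTEX, "date")
.coalesce(1)
.write()
.option("compression", "gzip")
.parquet(outputDir);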
Use of scala.collection.mutable.WrappedArray in project Gaffer by gchq.
From class AddElementsHandlerTest, method testOnePartitionAllGroups:
@Test
public void testOnePartitionAllGroups(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException, StoreException {
// Given
final List<Element> elementsToAdd = new ArrayList<>();
// - Data for TestGroups.ENTITY
elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
// - Data for TestGroups.ENTITY_2
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L));
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L));
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L));
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L));
// - Data for TestGroups.EDGE
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L)));
// - Data for TestGroups.EDGE_2
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false));
// - Shuffle the list so that the order is random
Collections.shuffle(elementsToAdd);
final AddElements add = new AddElements.Builder().input(elementsToAdd).build();
final Context context = new Context();
final Schema schema = TestUtils.gafferSchema("schemaUsingLongVertexType");
final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
final String testDir = tempDir.toString();
storeProperties.setDataDir(testDir + "/data");
storeProperties.setTempFilesDir(testDir + "/tmpdata");
final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);
final FileSystem fs = FileSystem.get(new Configuration());
final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
// When
new AddElementsHandler().doOperation(add, context, store);
// Then
// - New snapshot directory should have been created.
final long snapshotId = store.getLatestSnapshot();
final Path snapshotPath = new Path(testDir + "/data", ParquetStore.getSnapshotPath(snapshotId));
assertTrue(fs.exists(snapshotPath));
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity"
// directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by vertex and date.
Row[] results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(40);
for (int i = 0; i < 40; i++) {
assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
assertEquals(i % 2 == 0 ? 'b' : 'a', ((byte[]) results[i].getAs("byte"))[0]);
assertEquals(i % 2 == 0 ? 8f : 6f, results[i].getAs("float"), 0.01f);
assertEquals(11L * 2 * (i / 2), (long) results[i].getAs("long"));
assertEquals(i % 2 == 0 ? 14 : 12, (int) results[i].getAs("short"));
assertEquals(i % 2 == 0 ? 100000L : 200000L, (long) results[i].getAs("date"));
assertEquals(2, (int) results[i].getAs("count"));
assertArrayEquals(i % 2 == 0 ? new String[] { "A", "C" } : new String[] { "A", "B" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
final FreqMap mergedFreqMap1 = new FreqMap();
mergedFreqMap1.put("A", 2L);
mergedFreqMap1.put("B", 2L);
final FreqMap mergedFreqMap2 = new FreqMap();
mergedFreqMap2.put("A", 2L);
mergedFreqMap2.put("C", 2L);
assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(i % 2 == 0 ? mergedFreqMap2 : mergedFreqMap1), results[i].getAs("freqMap"));
}
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity2"
// directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by vertex.
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, "graph/group=BasicEntity2/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(4);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[0]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[1]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[2]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[3]);
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge"
// directory and in the "reversed-group=BasicEdge" directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/." + ParquetStore.getFile(0) + ".crc")));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, true) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, true) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by source, destination, directed, date
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(6);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L)), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L)), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L)), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L)), results[3]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L)), results[4]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L)), results[5]);
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(6);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L)), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L)), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L)), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L)), results[3]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L)), results[4]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L)), results[5]);
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge2"
// directory and in the "reversed-group=BasicEdge2" directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by source, destination, directed
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(4);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[3]);
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(4);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[3]);
}
Use of scala.collection.mutable.WrappedArray in project Gaffer by gchq.
From class GafferGroupObjectConverter, method sparkRowToGafferObject:
/**
* Extracts an object corresponding to column {@code gafferColumn} from the provided {@link GenericRowWithSchema}.
*
* @param gafferColumn the column to extract
* @param row the row to extract from
* @return the extracted {@link Object}
* @throws SerialisationException if the conversion from Parquet objects to the original object throws a
* {@link SerialisationException}
*/
public Object sparkRowToGafferObject(final String gafferColumn, final Row row) throws SerialisationException {
final ArrayList<Object> objectsList = new ArrayList<>();
final String[] paths = columnToPaths.get(gafferColumn);
if (paths[0].contains(".")) {
final Object nestedRow = row.getAs(gafferColumn);
if (null != nestedRow) {
if (nestedRow instanceof GenericRowWithSchema) {
getObjectsFromNestedRow(objectsList, (GenericRowWithSchema) nestedRow);
} else if (nestedRow instanceof WrappedArray) {
objectsList.add(((WrappedArray) nestedRow).array());
} else if (nestedRow instanceof scala.collection.Map) {
objectsList.add(scala.collection.JavaConversions.mapAsJavaMap((scala.collection.Map) nestedRow));
} else if (nestedRow instanceof Object[]) {
objectsList.add(nestedRow);
} else {
throw new SerialisationException("sparkRowToGafferObject does not know how to deal with a " + nestedRow.getClass().getCanonicalName());
}
} else {
objectsList.add(null);
}
} else {
for (final String path : paths) {
final Object obj = row.getAs(path);
objectsList.add(obj);
}
}
final Object[] objects;
if (paths[0].endsWith("key_value.key")) {
objects = new Object[1];
} else {
objects = new Object[paths.length];
}
objectsList.toArray(objects);
final Object gafferObject = parquetObjectsToGafferObject(gafferColumn, objects);
if (null == gafferObject) {
LOGGER.debug("Failed to get the Gaffer Object from the Spark Row for the column: {}", gafferColumn);
}
return gafferObject;
}
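A minimal usage sketch for this method, assuming a converter obtained for the entity group (the schemaUtils.getConverter accessor and the row variable are illustrative; the exact way a GafferGroupObjectConverter is obtained may differ):
// Recover the original Gaffer property values from a Row read out of the store's Parquet files.
final GafferGroupObjectConverter converter = schemaUtils.getConverter(TestGroups.ENTITY);
final Object vertex = converter.sparkRowToGafferObject(ParquetStore.VERTEX, row);
final Object freqMap = converter.sparkRowToGafferObject("freqMap", row);
Note that a null return only means no Gaffer object could be recovered for that column; the method logs this at debug level rather than throwing.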
Use of scala.collection.mutable.WrappedArray in project Gaffer by gchq.
From class AggregateAndSortDataTest, method test:
@Test
public void test(@TempDir java.nio.file.Path tempDir) throws Exception {
// Given
final FileSystem fs = FileSystem.get(new Configuration());
final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
final String file1 = tempDir.resolve("inputdata1.parquet").toString();
final String file2 = tempDir.resolve("inputdata2.parquet").toString();
writeData(file1, schemaUtils);
writeData(file2, schemaUtils);
final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
final List<String> inputFiles = new ArrayList<>(Sets.newHashSet(file1, file2));
final String outputFolder = tempDir.resolve("aggregated").toString();
// When
new AggregateAndSortData(schemaUtils, fs, inputFiles, outputFolder, TestGroups.ENTITY, "test", false, CompressionCodecName.GZIP, sparkSession).call();
// Then
assertTrue(fs.exists(new Path(outputFolder)));
final Row[] results = (Row[]) sparkSession.read().parquet(outputFolder).collect();
// Should be sorted by vertex and date
for (int i = 0; i < 40; i++) {
assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
assertEquals(i % 2 == 0 ? 'b' : 'a', ((byte[]) results[i].getAs("byte"))[0]);
assertEquals(i % 2 == 0 ? 8f : 6f, results[i].getAs("float"), 0.01f);
assertEquals(11L * 2 * (i / 2), (long) results[i].getAs("long"));
assertEquals(i % 2 == 0 ? 14 : 12, (int) results[i].getAs("short"));
assertEquals(i % 2 == 0 ? 100000L : 200000L, (long) results[i].getAs("date"));
assertEquals(2, (int) results[i].getAs("count"));
assertArrayEquals(i % 2 == 0 ? new String[] { "A", "C" } : new String[] { "A", "B" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
final FreqMap mergedFreqMap1 = new FreqMap();
mergedFreqMap1.put("A", 2L);
mergedFreqMap1.put("B", 2L);
final FreqMap mergedFreqMap2 = new FreqMap();
mergedFreqMap2.put("A", 2L);
mergedFreqMap2.put("C", 2L);
assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(i % 2 == 0 ? mergedFreqMap2 : mergedFreqMap1), results[i].getAs("freqMap"));
}
}