Use of uk.gov.gchq.gaffer.types.FreqMap in project Gaffer by gchq.
From the class IterableToFreqMapTest, method shouldInitialiseTheValueOfTheKeyToOneIfNotSeenBefore.
@Test
public void shouldInitialiseTheValueOfTheKeyToOneIfNotSeenBefore() {
// Given
final Iterable<String> strings = Arrays.asList("one");
final IterableToFreqMap iterableToFreqMap = new IterableToFreqMap();
// When
FreqMap result = iterableToFreqMap.apply(strings);
// Then
FreqMap expected = new FreqMap("one");
assertEquals(expected, result);
}
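For context, a minimal follow-on sketch (not taken from the Gaffer test) showing how the same function behaves when a key is seen more than once, assuming IterableToFreqMap increments the count of a key that already exists in the map:
// Hypothetical example: repeated keys accumulate counts.
Iterable<String> repeated = Arrays.asList("one", "two", "one");
FreqMap counts = new IterableToFreqMap().apply(repeated);
// Expected under the stated assumption: {one=2, two=1}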
Use of uk.gov.gchq.gaffer.types.FreqMap in project Gaffer by gchq.
From the class DataGen, method generateEdgeRow.
public static GenericRowWithSchema generateEdgeRow(final SchemaUtils utils, final String group, final String src, final String dst, final Boolean directed, final Byte aByte, final Double aDouble, final Float aFloat, final TreeSet<String> treeSet, final Long aLong, final Short aShort, final Date date, final FreqMap freqMap, final String visibility) throws SerialisationException {
final GafferGroupObjectConverter edgeConverter = new GafferGroupObjectConverter(group, utils.getCoreProperties(group), utils.getCorePropertiesForReversedEdges(), utils.getColumnToSerialiser(group), utils.getSerialisers(), utils.getColumnToPaths(group));
final List<Object> list = new ArrayList<>();
final scala.collection.mutable.Map<String, Long> map = new scala.collection.mutable.HashMap<>();
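// Copy the FreqMap (a java.util.Map<String, Long>) into a mutable Scala map; the freqMap column of the Spark row built below is expected to hold a Scala map value.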
for (final Map.Entry<String, Long> entry : freqMap.entrySet()) {
map.put(entry.getKey(), entry.getValue());
}
list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects(ParquetStore.SOURCE, src)));
list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects(ParquetStore.DESTINATION, dst)));
list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects(ParquetStore.DIRECTED, directed)));
list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("byte", aByte)));
list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("double", aDouble)));
list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("float", aFloat)));
list.add(WrappedArray$.MODULE$.make(edgeConverter.gafferObjectToParquetObjects("treeSet", treeSet)[0]));
list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("long", aLong)));
list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("short", aShort)));
list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("date", date)));
list.add(map);
list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("count", 1)));
if (null != visibility) {
list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects(TestTypes.VISIBILITY, visibility)));
}
final Object[] objects = new Object[list.size()];
list.toArray(objects);
return new GenericRowWithSchema(objects, utils.getSparkSchema(group));
}
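A hedged usage sketch of the method above (hypothetical values; it assumes the schema helpers used elsewhere in these tests, such as TestUtils.gafferSchema and SchemaUtils, are available and that the group's schema defines the same property names):
// Illustrative call only; generateEdgeRow throws SerialisationException.
final SchemaUtils utils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
final FreqMap freqMap = new FreqMap();
freqMap.put("A", 1L);
final TreeSet<String> treeSet = new TreeSet<>(Arrays.asList("A", "B"));
final GenericRowWithSchema edgeRow = DataGen.generateEdgeRow(utils, TestGroups.EDGE, "1", "2", true, (byte) 1, 2.0, 3.0f, treeSet, 4L, (short) 5, new Date(100L), freqMap, null);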
Use of uk.gov.gchq.gaffer.types.FreqMap in project Gaffer by gchq.
From the class AggregateAndSortDataTest, method test.
@Test
public void test(@TempDir java.nio.file.Path tempDir) throws Exception {
// Given
final FileSystem fs = FileSystem.get(new Configuration());
final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
final String file1 = tempDir.resolve("inputdata1.parquet").toString();
final String file2 = tempDir.resolve("inputdata2.parquet").toString();
writeData(file1, schemaUtils);
writeData(file2, schemaUtils);
final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
final List<String> inputFiles = new ArrayList<>(Sets.newHashSet(file1, file2));
final String outputFolder = tempDir.resolve("aggregated").toString();
// When
new AggregateAndSortData(schemaUtils, fs, inputFiles, outputFolder, TestGroups.ENTITY, "test", false, CompressionCodecName.GZIP, sparkSession).call();
// Then
assertTrue(fs.exists(new Path(outputFolder)));
final Row[] results = (Row[]) sparkSession.read().parquet(outputFolder).collect();
// Should be sorted by vertex and date
for (int i = 0; i < 40; i++) {
assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
assertEquals(i % 2 == 0 ? 'b' : 'a', ((byte[]) results[i].getAs("byte"))[0]);
assertEquals(i % 2 == 0 ? 8f : 6f, results[i].getAs("float"), 0.01f);
assertEquals(11L * 2 * (i / 2), (long) results[i].getAs("long"));
assertEquals(i % 2 == 0 ? 14 : 12, (int) results[i].getAs("short"));
assertEquals(i % 2 == 0 ? 100000L : 200000L, (long) results[i].getAs("date"));
assertEquals(2, (int) results[i].getAs("count"));
assertArrayEquals(i % 2 == 0 ? new String[] { "A", "C" } : new String[] { "A", "B" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
final FreqMap mergedFreqMap1 = new FreqMap();
mergedFreqMap1.put("A", 2L);
mergedFreqMap1.put("B", 2L);
final FreqMap mergedFreqMap2 = new FreqMap();
mergedFreqMap2.put("A", 2L);
mergedFreqMap2.put("C", 2L);
assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(i % 2 == 0 ? mergedFreqMap2 : mergedFreqMap1), results[i].getAs("freqMap"));
}
}
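Why the expected values in the freqMap column are 2L: the same generated data is written to both input files, so after aggregation each key has been summed across two identical FreqMaps. A minimal sketch of that arithmetic, assuming FreqMap.upsert adds the supplied value to any existing count for the key:
// Hedged sketch; not part of the test itself.
FreqMap merged = new FreqMap();
merged.upsert("A", 1L); // contribution from inputdata1.parquet
merged.upsert("A", 1L); // contribution from inputdata2.parquet
// merged.get("A") == 2L, matching mergedFreqMap1 and mergedFreqMap2 above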
Use of uk.gov.gchq.gaffer.types.FreqMap in project Gaffer by gchq.
From the class AddElementsHandlerTest, method testRepeatedCallsOfAddElementsHandler.
@Test
public void testRepeatedCallsOfAddElementsHandler(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException, StoreException {
// Given
final List<Element> elementsToAdd = new ArrayList<>();
// - Data for TestGroups.ENTITY
elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
// - Data for TestGroups.ENTITY_2
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L));
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L));
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L));
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L));
// - Data for TestGroups.EDGE
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L)));
// - Data for TestGroups.EDGE_2
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false));
// - Shuffle the list so that the order is random
Collections.shuffle(elementsToAdd);
final AddElements add = new AddElements.Builder().input(elementsToAdd).build();
final Context context = new Context();
final Schema schema = TestUtils.gafferSchema("schemaUsingLongVertexType");
final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
final String testDir = tempDir.toString();
storeProperties.setDataDir(testDir + "/data");
storeProperties.setTempFilesDir(testDir + "/tmpdata");
final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);
final FileSystem fs = FileSystem.get(new Configuration());
final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
// When1 - Add elementsToAdd twice
new AddElementsHandler().doOperation(add, context, store);
new AddElementsHandler().doOperation(add, context, store);
// Then1
// - New snapshot directory should have been created.
long snapshotId = store.getLatestSnapshot();
Path snapshotPath = new Path(testDir + "/data", ParquetStore.getSnapshotPath(snapshotId));
assertTrue(fs.exists(snapshotPath));
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity"
// directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by vertex and date.
Row[] results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(40);
for (int i = 0; i < 40; i++) {
assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
assertEquals(i % 2 == 0 ? 'b' : 'a', ((byte[]) results[i].getAs("byte"))[0]);
assertEquals(i % 2 == 0 ? 16f : 12f, results[i].getAs("float"), 0.01f);
assertEquals(11L * 2 * 2 * (i / 2), (long) results[i].getAs("long"));
assertEquals(i % 2 == 0 ? 28 : 24, (int) results[i].getAs("short"));
assertEquals(i % 2 == 0 ? 100000L : 200000L, (long) results[i].getAs("date"));
assertEquals(4, (int) results[i].getAs("count"));
assertArrayEquals(i % 2 == 0 ? new String[] { "A", "C" } : new String[] { "A", "B" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
final FreqMap mergedFreqMap1 = new FreqMap();
mergedFreqMap1.put("A", 4L);
mergedFreqMap1.put("B", 4L);
final FreqMap mergedFreqMap2 = new FreqMap();
mergedFreqMap2.put("A", 4L);
mergedFreqMap2.put("C", 4L);
assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(i % 2 == 0 ? mergedFreqMap2 : mergedFreqMap1), results[i].getAs("freqMap"));
}
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity2"
// directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by vertex.
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(8);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[0]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[1]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[2]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[3]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[4]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[5]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[6]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[7]);
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge"
// directory and in the "reversed-group=BasicEdge" directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by source, destination, directed, date
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(6);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L), (short) 2), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L), (short) 2), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L), (short) 2), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L), (short) 2), results[3]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L), (short) 2), results[4]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L), (short) 2), results[5]);
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(6);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L), (short) 2), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L), (short) 2), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L), (short) 2), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L), (short) 2), results[3]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L), (short) 2), results[4]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L), (short) 2), results[5]);
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge2"
// directory and in the "reversed-group=BasicEdge2" directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by source, destination, directed
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(8);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[3]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[4]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[5]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[6]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[7]);
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(8);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[3]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[4]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[5]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[6]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[7]);
// When2 - Add some elements from only TestGroups.ENTITY_2 (this tests that groups that are unchanged after
// an AddElements operation are correctly copied through to the new snapshot).
elementsToAdd.clear();
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L));
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L));
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L));
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L));
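// Note (an assumption about the builder's semantics): the AddElements operation 'add' built earlier holds a reference to
// elementsToAdd, so clearing and refilling the list above changes the input iterated by the doOperation call below.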
new AddElementsHandler().doOperation(add, context, store);
// Then2
// - New snapshot directory should have been created.
snapshotId = store.getLatestSnapshot();
snapshotPath = new Path(testDir + "/data", ParquetStore.getSnapshotPath(snapshotId));
assertTrue(fs.exists(snapshotPath));
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity"
// directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by vertex and date.
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(40);
for (int i = 0; i < 40; i++) {
assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
assertEquals(i % 2 == 0 ? 'b' : 'a', ((byte[]) results[i].getAs("byte"))[0]);
assertEquals(i % 2 == 0 ? 16f : 12f, results[i].getAs("float"), 0.01f);
assertEquals(11L * 2 * 2 * (i / 2), (long) results[i].getAs("long"));
assertEquals(i % 2 == 0 ? 28 : 24, (int) results[i].getAs("short"));
assertEquals(i % 2 == 0 ? 100000L : 200000L, (long) results[i].getAs("date"));
assertEquals(4, (int) results[i].getAs("count"));
assertArrayEquals(i % 2 == 0 ? new String[] { "A", "C" } : new String[] { "A", "B" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
final FreqMap mergedFreqMap1 = new FreqMap();
mergedFreqMap1.put("A", 4L);
mergedFreqMap1.put("B", 4L);
final FreqMap mergedFreqMap2 = new FreqMap();
mergedFreqMap2.put("A", 4L);
mergedFreqMap2.put("C", 4L);
assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(i % 2 == 0 ? mergedFreqMap2 : mergedFreqMap1), results[i].getAs("freqMap"));
}
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity2"
// directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by vertex.
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(12);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[0]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[1]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[2]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[3]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[4]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[5]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[6]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[7]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[8]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[9]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[10]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[11]);
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge"
// directory and in the "reversed-group=BasicEdge" directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by source, destination, directed, date
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(6);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L), (short) 2), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L), (short) 2), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L), (short) 2), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L), (short) 2), results[3]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L), (short) 2), results[4]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L), (short) 2), results[5]);
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(6);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L), (short) 2), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L), (short) 2), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L), (short) 2), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L), (short) 2), results[3]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L), (short) 2), results[4]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L), (short) 2), results[5]);
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge2"
// directory and in the "reversed-group=BasicEdge2" directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by source, destination, directed
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(8);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[3]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[4]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[5]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[6]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[7]);
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(8);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[3]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[4]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[5]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[6]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[7]);
}
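A point worth noting from the assertions above: the reversed-group directories return the same rows in a different order to the forward directories, which is consistent with the reversed copy of an edge group being sorted by destination first. A comment-only illustration using the BasicEdge2 rows checked above:
// forward group=BasicEdge2 order (sorted by source): (1, 2000), (10, 50), (100, 200), (10000, 20)
// reversed-group=BasicEdge2 order (sorted by destination): (10000, 20), (10, 50), (100, 200), (1, 2000)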
Use of uk.gov.gchq.gaffer.types.FreqMap in project Gaffer by gchq.
From the class AddElementsHandlerTest, method testOnePartitionOneGroup.
@Test
public void testOnePartitionOneGroup(@TempDir java.nio.file.Path tempDir) throws OperationException, IOException, StoreException {
// Given
final List<Element> elementsToAdd = new ArrayList<>();
elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
final AddElements add = new AddElements.Builder().input(elementsToAdd).build();
final Context context = new Context();
final Schema schema = TestUtils.gafferSchema("schemaUsingLongVertexType");
final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
final String testDir = tempDir.toString();
storeProperties.setDataDir(testDir + "/data");
storeProperties.setTempFilesDir(testDir + "/tmpdata");
final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);
final FileSystem fs = FileSystem.get(new Configuration());
final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
// When
new AddElementsHandler().doOperation(add, context, store);
// Then
// - New snapshot directory should have been created.
final long snapshotId = store.getLatestSnapshot();
final Path snapshotPath = new Path(testDir + "/data", ParquetStore.getSnapshotPath(snapshotId));
assertTrue(fs.exists(snapshotPath));
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity"
// directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by vertex and date.
final Row[] results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(40);
for (int i = 0; i < 40; i++) {
assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
assertEquals(i % 2 == 0 ? 'b' : 'a', ((byte[]) results[i].getAs("byte"))[0]);
assertEquals(i % 2 == 0 ? 8f : 6f, results[i].getAs("float"), 0.01f);
assertEquals(11L * 2 * (i / 2), (long) results[i].getAs("long"));
assertEquals(i % 2 == 0 ? 14 : 12, (int) results[i].getAs("short"));
assertEquals(i % 2 == 0 ? 100000L : 200000L, (long) results[i].getAs("date"));
assertEquals(2, (int) results[i].getAs("count"));
assertArrayEquals(i % 2 == 0 ? new String[] { "A", "C" } : new String[] { "A", "B" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
final FreqMap mergedFreqMap1 = new FreqMap();
mergedFreqMap1.put("A", 2L);
mergedFreqMap1.put("B", 2L);
final FreqMap mergedFreqMap2 = new FreqMap();
mergedFreqMap2.put("A", 2L);
mergedFreqMap2.put("C", 2L);
assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(i % 2 == 0 ? mergedFreqMap2 : mergedFreqMap1), results[i].getAs("freqMap"));
}
}