Use of uk.gov.gchq.gaffer.parquetstore.ParquetStore in project Gaffer by gchq.
The class QueryGeneratorTest, method testQueryGeneratorForGetAllElements.
@Test
public void testQueryGeneratorForGetAllElements(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
// Given
// - Create snapshot folder
final String folder = String.format("file:///%s", tempDir.toString());
final String snapshotFolder = folder + "/" + ParquetStore.getSnapshotPath(1000L);
// - Write out Parquet files so we know the partitioning
CalculatePartitionerTest.writeData(snapshotFolder, new SchemaUtils(schema));
// - Initialise store
final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
storeProperties.setDataDir(folder);
storeProperties.setTempFilesDir(folder + "/tmpdata");
final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);
// When 1 - no view
GetAllElements getAllElements = new GetAllElements.Builder().build();
ParquetQuery query = new QueryGenerator(store).getParquetQuery(getAllElements);
// Then 1
final List<ParquetFileQuery> expected = new ArrayList<>();
for (final String group : Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2, TestGroups.EDGE, TestGroups.EDGE_2)) {
final Path groupFolderPath = store.getGroupPath(group);
for (int partition = 0; partition < 10; partition++) {
final Path pathForPartitionFile = new Path(groupFolderPath, ParquetStore.getFile(partition));
expected.add(new ParquetFileQuery(pathForPartitionFile, null, true));
}
}
assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray(new ParquetFileQuery[0]));
// When 2 - simple view that restricts to one group
getAllElements = new GetAllElements.Builder().view(new View.Builder().edge(TestGroups.EDGE).build()).build();
query = new QueryGenerator(store).getParquetQuery(getAllElements);
// Then 2
expected.clear();
Path groupFolderPath = store.getGroupPath(TestGroups.EDGE);
for (int partition = 0; partition < 10; partition++) {
final Path pathForPartitionFile = new Path(groupFolderPath, ParquetStore.getFile(partition));
expected.add(new ParquetFileQuery(pathForPartitionFile, null, true));
}
assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray(new ParquetFileQuery[0]));
// When 3 - view with filter that can be pushed down to Parquet
getAllElements = new GetAllElements.Builder()
        .view(new View.Builder()
                .edge(TestGroups.EDGE, new ViewElementDefinition.Builder()
                        .preAggregationFilter(new ElementFilter.Builder()
                                .select("count")
                                .execute(new IsMoreThan(10))
                                .build())
                        .build())
                .build())
        .build();
query = new QueryGenerator(store).getParquetQuery(getAllElements);
// Then 3
expected.clear();
for (int partition = 0; partition < 10; partition++) {
final Path pathForPartitionFile = new Path(groupFolderPath, ParquetStore.getFile(partition));
expected.add(new ParquetFileQuery(pathForPartitionFile, gt(FilterApi.intColumn("count"), 10), true));
}
assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray(new ParquetFileQuery[0]));
// When 4 - view with filter that can't be pushed down to Parquet
getAllElements = new GetAllElements.Builder()
        .view(new View.Builder()
                .edge(TestGroups.EDGE, new ViewElementDefinition.Builder()
                        .preAggregationFilter(new ElementFilter.Builder()
                                .select("count")
                                .execute(new IsEvenFilter())
                                .build())
                        .build())
                .build())
        .build();
query = new QueryGenerator(store).getParquetQuery(getAllElements);
// Then 4
expected.clear();
for (int partition = 0; partition < 10; partition++) {
final Path pathForPartitionFile = new Path(groupFolderPath, ParquetStore.getFile(partition));
expected.add(new ParquetFileQuery(pathForPartitionFile, null, false));
}
assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray(new ParquetFileQuery[0]));
// When 5 - view with one filter that can be pushed down and one that can't
getAllElements = new GetAllElements.Builder()
        .view(new View.Builder()
                .edge(TestGroups.EDGE, new ViewElementDefinition.Builder()
                        .preAggregationFilter(new ElementFilter.Builder()
                                .select("count")
                                .execute(new IsEvenFilter())
                                .select("count")
                                .execute(new IsMoreThan(10))
                                .build())
                        .build())
                .build())
        .build();
query = new QueryGenerator(store).getParquetQuery(getAllElements);
// Then 5
expected.clear();
for (int partition = 0; partition < 10; partition++) {
final Path pathForPartitionFile = new Path(groupFolderPath, ParquetStore.getFile(partition));
expected.add(new ParquetFileQuery(pathForPartitionFile, gt(FilterApi.intColumn("count"), 10), false));
}
assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray(new ParquetFileQuery[0]));
}
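Judging by the pattern in the four Then blocks above, the boolean in each expected ParquetFileQuery records whether the view's filters were fully expressed as a Parquet predicate (true in cases 1-3, false once an IsEvenFilter is involved), and the FilterPredicate argument carries whatever part of the view could be pushed down. Below is a minimal standalone sketch, not Gaffer code, of that push-down mapping using the same org.apache.parquet.filter2.predicate.FilterApi calls as the test; countGreaterThan is a hypothetical helper name, and only parquet-column is assumed on the classpath.

import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;

public class PushDownSketch {

    // IsMoreThan(10) on an int column has a direct Parquet equivalent.
    static FilterPredicate countGreaterThan(final int threshold) {
        return FilterApi.gt(FilterApi.intColumn("count"), threshold);
    }

    public static void main(final String[] args) {
        // Prints gt(count, 10) - the predicate in the expected queries of "Then 3".
        System.out.println(countGreaterThan(10));
        // A filter such as IsEvenFilter has no FilterApi equivalent, so the
        // generator falls back to a null predicate plus a 'false' flag,
        // leaving the filtering to be done after the rows are read.
    }
}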
Use of uk.gov.gchq.gaffer.parquetstore.ParquetStore in project Gaffer by gchq.
The class QueryGeneratorTest, method testQueryGeneratorForGetElementsWithEntitySeeds.
@Test
public void testQueryGeneratorForGetElementsWithEntitySeeds(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
// Given
// - Create snapshot folder
final String folder = String.format("file:///%s", tempDir.toString());
final String snapshotFolder = folder + "/" + ParquetStore.getSnapshotPath(1000L);
// - Write out Parquet files so we know the partitioning
CalculatePartitionerTest.writeData(snapshotFolder, new SchemaUtils(schema));
// - Initialise store
final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
storeProperties.setDataDir(folder);
storeProperties.setTempFilesDir(folder + "/tmpdata");
final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);
// When 1 - no view, query for vertex 0
GetElements getElements = new GetElements.Builder().input(new EntitySeed(0L)).seedMatching(SeedMatching.SeedMatchingType.RELATED).build();
ParquetQuery query = new QueryGenerator(store).getParquetQuery(getElements);
// Then 1
final List<ParquetFileQuery> expected = new ArrayList<>();
final FilterPredicate vertex0 = eq(FilterApi.longColumn(ParquetStore.VERTEX), 0L);
final FilterPredicate source0 = eq(FilterApi.longColumn(ParquetStore.SOURCE), 0L);
final FilterPredicate destination0 = eq(FilterApi.longColumn(ParquetStore.DESTINATION), 0L);
for (final String group : Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2)) {
final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, false));
final Path pathForPartitionFile = new Path(groupFolderPath, ParquetStore.getFile(0));
expected.add(new ParquetFileQuery(pathForPartitionFile, vertex0, true));
}
for (final String group : Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2)) {
final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, false));
final Path pathForPartitionFile = new Path(groupFolderPath, ParquetStore.getFile(0));
expected.add(new ParquetFileQuery(pathForPartitionFile, source0, true));
final Path reversedGroupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, true));
final Path pathForReversedPartitionFile = new Path(reversedGroupFolderPath, ParquetStore.getFile(0));
expected.add(new ParquetFileQuery(pathForReversedPartitionFile, destination0, true));
}
assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray(new ParquetFileQuery[0]));
// When 2 - no view, query for vertices 0 and 1000000
getElements = new GetElements.Builder().input(new EntitySeed(0L), new EntitySeed(1000000L)).seedMatching(SeedMatching.SeedMatchingType.RELATED).build();
query = new QueryGenerator(store).getParquetQuery(getElements);
// Then 2
expected.clear();
final FilterPredicate vertex1000000 = eq(FilterApi.longColumn(ParquetStore.VERTEX), 1000000L);
final FilterPredicate source1000000 = eq(FilterApi.longColumn(ParquetStore.SOURCE), 1000000L);
final FilterPredicate destination1000000 = eq(FilterApi.longColumn(ParquetStore.DESTINATION), 1000000L);
for (final String group : Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2)) {
final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, false));
final Path pathForPartitionFile1 = new Path(groupFolderPath, ParquetStore.getFile(0));
expected.add(new ParquetFileQuery(pathForPartitionFile1, vertex0, true));
final Path pathForPartitionFile2 = new Path(groupFolderPath, ParquetStore.getFile(9));
expected.add(new ParquetFileQuery(pathForPartitionFile2, vertex1000000, true));
}
for (final String group : Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2)) {
final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, false));
final Path reversedGroupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, true));
// Partition 0, vertex 0L
final Path pathForPartitionFile1 = new Path(groupFolderPath, ParquetStore.getFile(0));
expected.add(new ParquetFileQuery(pathForPartitionFile1, source0, true));
// Partition 9, vertex 1000000L
final Path pathForPartitionFile2 = new Path(groupFolderPath, ParquetStore.getFile(9));
expected.add(new ParquetFileQuery(pathForPartitionFile2, source1000000, true));
// Partition 0 of reversed, vertex 0L
final Path pathForPartitionFile3 = new Path(reversedGroupFolderPath, ParquetStore.getFile(0));
expected.add(new ParquetFileQuery(pathForPartitionFile3, destination0, true));
// Partition 9 of reversed, vertex 1000000L
final Path pathForPartitionFile4 = new Path(reversedGroupFolderPath, ParquetStore.getFile(9));
expected.add(new ParquetFileQuery(pathForPartitionFile4, destination1000000, true));
}
assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray(new ParquetFileQuery[0]));
// When 3 - view with filter that can be pushed down to Parquet, query for vertices 0 and 1000000
getElements = new GetElements.Builder()
        .input(new EntitySeed(0L), new EntitySeed(1000000L))
        .seedMatching(SeedMatching.SeedMatchingType.RELATED)
        .view(new View.Builder()
                .edge(TestGroups.EDGE, new ViewElementDefinition.Builder()
                        .preAggregationFilter(new ElementFilter.Builder()
                                .select("count")
                                .execute(new IsMoreThan(10))
                                .build())
                        .build())
                .build())
        .build();
query = new QueryGenerator(store).getParquetQuery(getElements);
// Then 3
expected.clear();
final FilterPredicate source0AndCount = and(gt(FilterApi.intColumn("count"), 10), eq(FilterApi.longColumn(ParquetStore.SOURCE), 0L));
final FilterPredicate source1000000AndCount = and(gt(FilterApi.intColumn("count"), 10), eq(FilterApi.longColumn(ParquetStore.SOURCE), 1000000L));
final FilterPredicate destination0AndCount = and(gt(FilterApi.intColumn("count"), 10), eq(FilterApi.longColumn(ParquetStore.DESTINATION), 0L));
final FilterPredicate destination1000000AndCount = and(gt(FilterApi.intColumn("count"), 10), eq(FilterApi.longColumn(ParquetStore.DESTINATION), 1000000L));
final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(TestGroups.EDGE, false));
final Path reversedGroupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(TestGroups.EDGE, true));
// Partition 0, vertex 0L
final Path pathForPartitionFile1 = new Path(groupFolderPath, ParquetStore.getFile(0));
expected.add(new ParquetFileQuery(pathForPartitionFile1, source0AndCount, true));
// Partition 9, vertex 1000000L
final Path pathForPartitionFile2 = new Path(groupFolderPath, ParquetStore.getFile(9));
expected.add(new ParquetFileQuery(pathForPartitionFile2, source1000000AndCount, true));
// Partition 0 of reversed, vertex 0L
final Path pathForPartitionFile3 = new Path(reversedGroupFolderPath, ParquetStore.getFile(0));
expected.add(new ParquetFileQuery(pathForPartitionFile3, destination0AndCount, true));
// Partition 9 of reversed, vertex 1000000L
final Path pathForPartitionFile4 = new Path(reversedGroupFolderPath, ParquetStore.getFile(9));
expected.add(new ParquetFileQuery(pathForPartitionFile4, destination1000000AndCount, true));
assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray(new ParquetFileQuery[0]));
// When 4 - view with filter that can't be pushed down to Parquet, query for vertices 0 and 1000000
getElements = new GetElements.Builder()
        .input(new EntitySeed(0L), new EntitySeed(1000000L))
        .seedMatching(SeedMatching.SeedMatchingType.RELATED)
        .view(new View.Builder()
                .edge(TestGroups.EDGE, new ViewElementDefinition.Builder()
                        .preAggregationFilter(new ElementFilter.Builder()
                                .select("count")
                                .execute(new IsEvenFilter())
                                .build())
                        .build())
                .build())
        .build();
query = new QueryGenerator(store).getParquetQuery(getElements);
// Then 4
expected.clear();
// Partition 0, vertex 0L
expected.add(new ParquetFileQuery(pathForPartitionFile1, source0, false));
// Partition 9, vertex 1000000L
expected.add(new ParquetFileQuery(pathForPartitionFile2, source1000000, false));
// Partition 0 of reversed, vertex 0L
expected.add(new ParquetFileQuery(pathForPartitionFile3, destination0, false));
// Partition 9 of reversed, vertex 1000000L
expected.add(new ParquetFileQuery(pathForPartitionFile4, destination1000000, false));
assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray(new ParquetFileQuery[0]));
}
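In Then 3 above, each seed predicate is ANDed with the push-down part of the view filter before being attached to a file. A minimal standalone sketch of that composition; the literal column names "src" and "count" are assumed stand-ins for ParquetStore.SOURCE and the schema's count property.

import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;

public class SeedAndViewPredicateSketch {
    public static void main(final String[] args) {
        // Push-down part of the view: count > 10, as in "When 3".
        final FilterPredicate viewFilter = FilterApi.gt(FilterApi.intColumn("count"), 10);
        // Seed predicate for vertex 0 against the source column
        // ("src" is an assumed name; the test uses ParquetStore.SOURCE).
        final FilterPredicate seedFilter = FilterApi.eq(FilterApi.longColumn("src"), 0L);
        // The generator conjoins them, giving the source0AndCount predicate above.
        System.out.println(FilterApi.and(viewFilter, seedFilter));
    }
}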
Use of uk.gov.gchq.gaffer.parquetstore.ParquetStore in project Gaffer by gchq.
The class QueryGeneratorTest, method testQueryGeneratorForGetElementsWithEdgeSeeds.
@Test
public void testQueryGeneratorForGetElementsWithEdgeSeeds(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
// Given
// - Create snapshot folder
final String folder = String.format("file:///%s", tempDir.toString());
final String snapshotFolder = folder + "/" + ParquetStore.getSnapshotPath(1000L);
// - Write out Parquet files so we know the partitioning
CalculatePartitionerTest.writeData(snapshotFolder, new SchemaUtils(schema));
// - Initialise store
final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
storeProperties.setDataDir(folder);
storeProperties.setTempFilesDir(folder + "/tmpdata");
final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);
// When 1 - no view, query for edges 0->1, 10--1000, 10000--10 with seed matching type set to EQUAL
GetElements getElements = new GetElements.Builder()
        .input(new EdgeSeed(0L, 1L, DirectedType.DIRECTED),
                new EdgeSeed(10L, 1000L, DirectedType.UNDIRECTED),
                new EdgeSeed(10000L, 10L, DirectedType.EITHER))
        .seedMatching(SeedMatching.SeedMatchingType.EQUAL)
        .build();
ParquetQuery query = new QueryGenerator(store).getParquetQuery(getElements);
// Then 1
final List<ParquetFileQuery> expected = new ArrayList<>();
final FilterPredicate source0 = eq(FilterApi.longColumn(ParquetStore.SOURCE), 0L);
final FilterPredicate source10 = eq(FilterApi.longColumn(ParquetStore.SOURCE), 10L);
final FilterPredicate source10000 = eq(FilterApi.longColumn(ParquetStore.SOURCE), 10000L);
final FilterPredicate destination1 = eq(FilterApi.longColumn(ParquetStore.DESTINATION), 1L);
final FilterPredicate destination10 = eq(FilterApi.longColumn(ParquetStore.DESTINATION), 10L);
final FilterPredicate destination1000 = eq(FilterApi.longColumn(ParquetStore.DESTINATION), 1000L);
final FilterPredicate directedTrue = eq(FilterApi.booleanColumn(ParquetStore.DIRECTED), true);
final FilterPredicate directedFalse = eq(FilterApi.booleanColumn(ParquetStore.DIRECTED), false);
final FilterPredicate source0Destination1DirectedTrue = and(and(source0, destination1), directedTrue);
final FilterPredicate source10Destination1000DirectedFalse = and(and(source10, destination1000), directedFalse);
final FilterPredicate source10000Destination10DirectedEither = and(source10000, destination10);
for (final String group : Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2)) {
final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, false));
// 0->1 partition 0 of forward
final Path pathForPartition0File = new Path(groupFolderPath, ParquetStore.getFile(0));
// NB: no need to look in the reversed directory, as the forward directory contains every edge
expected.add(new ParquetFileQuery(pathForPartition0File, source0Destination1DirectedTrue, true));
// 10--1000 partition 1 of forward
final Path pathForPartition1File = new Path(groupFolderPath, ParquetStore.getFile(1));
// NB: no need to look in the reversed directory, as the forward directory contains every edge
expected.add(new ParquetFileQuery(pathForPartition1File, source10Destination1000DirectedFalse, true));
// 10000--10 partition 9 of forward
final Path pathForPartition9File = new Path(groupFolderPath, ParquetStore.getFile(9));
// NB: no need to look in the reversed directory, as the forward directory contains every edge
expected.add(new ParquetFileQuery(pathForPartition9File, source10000Destination10DirectedEither, true));
}
assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray(new ParquetFileQuery[0]));
// When 2 - no view, query for edges 0->1, 10--1000, 10000--10 with seed matching type set to RELATED
getElements = new GetElements.Builder()
        .input(new EdgeSeed(0L, 1L, DirectedType.DIRECTED),
                new EdgeSeed(10L, 1000L, DirectedType.UNDIRECTED),
                new EdgeSeed(10000L, 10L, DirectedType.EITHER))
        .seedMatching(SeedMatching.SeedMatchingType.RELATED)
        .build();
query = new QueryGenerator(store).getParquetQuery(getElements);
// Then 2
expected.clear();
final FilterPredicate vertex0 = eq(FilterApi.longColumn(ParquetStore.VERTEX), 0L);
final FilterPredicate vertex1 = eq(FilterApi.longColumn(ParquetStore.VERTEX), 1L);
final FilterPredicate vertex10 = eq(FilterApi.longColumn(ParquetStore.VERTEX), 10L);
final FilterPredicate vertex1000 = eq(FilterApi.longColumn(ParquetStore.VERTEX), 1000L);
final FilterPredicate vertex10000 = eq(FilterApi.longColumn(ParquetStore.VERTEX), 10000L);
final FilterPredicate vertex0or1 = or(vertex0, vertex1);
final FilterPredicate vertex10or1000 = or(vertex10, vertex1000);
final FilterPredicate vertex10000or10 = or(vertex10000, vertex10);
for (final String group : Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2)) {
final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, false));
// 0 and 1 in partition 0
final Path pathForPartition0File = new Path(groupFolderPath, ParquetStore.getFile(0));
expected.add(new ParquetFileQuery(pathForPartition0File, vertex0or1, true));
// 10 or 1000 and 10000 or 10 in partition 1 (NB 1000 and 10000 don't appear in partition 1, but this doesn't cause any incorrect results, and will be fixed in later versions)
final Path pathForPartition1File = new Path(groupFolderPath, ParquetStore.getFile(1));
expected.add(new ParquetFileQuery(pathForPartition1File, or(vertex10or1000, vertex10000or10), true));
// 10 or 1000 and 10000 or 10 in partition 9
final Path pathForPartition9File = new Path(groupFolderPath, ParquetStore.getFile(9));
expected.add(new ParquetFileQuery(pathForPartition9File, or(vertex10or1000, vertex10000or10), true));
}
for (final String group : Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2)) {
final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, false));
// 0->1 partition 0 of forward
final Path pathForPartition0File = new Path(groupFolderPath, ParquetStore.getFile(0));
// NB: no need to look in the reversed directory, as the forward directory contains every edge
expected.add(new ParquetFileQuery(pathForPartition0File, source0Destination1DirectedTrue, true));
// 10--1000 partition 1 of forward
final Path pathForPartition1File = new Path(groupFolderPath, ParquetStore.getFile(1));
// NB: no need to look in the reversed directory, as the forward directory contains every edge
expected.add(new ParquetFileQuery(pathForPartition1File, source10Destination1000DirectedFalse, true));
// 10000--10 partition 9 of forward
final Path pathForPartition9File = new Path(groupFolderPath, ParquetStore.getFile(9));
// NB: no need to look in the reversed directory, as the forward directory contains every edge
expected.add(new ParquetFileQuery(pathForPartition9File, source10000Destination10DirectedEither, true));
}
assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray(new ParquetFileQuery[0]));
}
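The predicates built in Then 1 encode the three DirectedType cases of an EdgeSeed: DIRECTED and UNDIRECTED seeds constrain the directed column to true or false respectively, while EITHER leaves it unconstrained. A standalone sketch of that composition, with "src", "dst" and "directed" as assumed stand-ins for the ParquetStore.SOURCE, DESTINATION and DIRECTED column constants:

import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;

public class EdgeSeedPredicateSketch {
    public static void main(final String[] args) {
        final FilterPredicate source = FilterApi.eq(FilterApi.longColumn("src"), 0L);
        final FilterPredicate destination = FilterApi.eq(FilterApi.longColumn("dst"), 1L);
        final FilterPredicate directed = FilterApi.eq(FilterApi.booleanColumn("directed"), true);
        // DIRECTED seed 0->1: constrain source, destination and the directed flag.
        final FilterPredicate directedSeed = FilterApi.and(FilterApi.and(source, destination), directed);
        // EITHER seed: the directed column is unconstrained, so it is omitted.
        final FilterPredicate eitherSeed = FilterApi.and(source, destination);
        System.out.println(directedSeed);
        System.out.println(eitherSeed);
    }
}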
Use of uk.gov.gchq.gaffer.parquetstore.ParquetStore in project Gaffer by gchq.
The class AddElementsHandlerTest, method testOnePartitionAllGroups.
@Test
public void testOnePartitionAllGroups(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException, StoreException {
// Given
final List<Element> elementsToAdd = new ArrayList<>();
// - Data for TestGroups.ENTITY
elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
// - Data for TestGroups.ENTITY_2
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L));
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L));
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L));
elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L));
// - Data for TestGroups.EDGE
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L)));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L)));
// - Data for TestGroups.EDGE_2
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true));
elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false));
// - Shuffle the list so that the order is random
Collections.shuffle(elementsToAdd);
final AddElements add = new AddElements.Builder().input(elementsToAdd).build();
final Context context = new Context();
final Schema schema = TestUtils.gafferSchema("schemaUsingLongVertexType");
final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
final String testDir = tempDir.toString();
storeProperties.setDataDir(testDir + "/data");
storeProperties.setTempFilesDir(testDir + "/tmpdata");
final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);
final FileSystem fs = FileSystem.get(new Configuration());
final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
// When
new AddElementsHandler().doOperation(add, context, store);
// Then
// - New snapshot directory should have been created.
final long snapshotId = store.getLatestSnapshot();
final Path snapshotPath = new Path(testDir + "/data", ParquetStore.getSnapshotPath(snapshotId));
assertTrue(fs.exists(snapshotPath));
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity"
// directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by vertex and date.
Row[] results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(40);
for (int i = 0; i < 40; i++) {
assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
assertEquals(i % 2 == 0 ? 'b' : 'a', ((byte[]) results[i].getAs("byte"))[0]);
assertEquals(i % 2 == 0 ? 8f : 6f, results[i].getAs("float"), 0.01f);
assertEquals(11L * 2 * (i / 2), (long) results[i].getAs("long"));
assertEquals(i % 2 == 0 ? 14 : 12, (int) results[i].getAs("short"));
assertEquals(i % 2 == 0 ? 100000L : 200000L, (long) results[i].getAs("date"));
assertEquals(2, (int) results[i].getAs("count"));
assertArrayEquals(i % 2 == 0 ? new String[] { "A", "C" } : new String[] { "A", "B" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
final FreqMap mergedFreqMap1 = new FreqMap();
mergedFreqMap1.put("A", 2L);
mergedFreqMap1.put("B", 2L);
final FreqMap mergedFreqMap2 = new FreqMap();
mergedFreqMap2.put("A", 2L);
mergedFreqMap2.put("C", 2L);
assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(i % 2 == 0 ? mergedFreqMap2 : mergedFreqMap1), results[i].getAs("freqMap"));
}
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity2"
// directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by vertex.
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, "graph/group=BasicEntity2/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(4);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[0]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[1]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[2]);
checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[3]);
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge"
// directory and in the "reversed-group=BasicEdge" directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/." + ParquetStore.getFile(0) + ".crc")));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, true) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, true) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by source, destination, directed, date
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(6);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L)), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L)), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L)), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L)), results[3]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L)), results[4]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L)), results[5]);
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(6);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L)), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L)), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L)), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L)), results[3]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L)), results[4]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L)), results[5]);
// - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge2"
// directory and in the "reversed-group=BasicEdge2" directory.
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, true) + "/" + ParquetStore.getFile(0))));
assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, true) + "/." + ParquetStore.getFile(0) + ".crc")));
// - The files should contain the data sorted by source, destination, directed
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(4);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[3]);
results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
assertThat(results).hasSize(4);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[0]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[1]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[2]);
checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[3]);
}
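The order-sensitive assertions above depend on each partition file being written sorted (by vertex and date for entities, by source, destination, directed and date for edges). A minimal sketch of the kind of ordering check involved, reading one partition file back with Spark; the path and the "vertex" column name are illustrative stand-ins for values the test derives from ParquetStore.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SortedPartitionCheckSketch {
    public static void main(final String[] args) {
        final SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("sorted-partition-check")
                .getOrCreate();
        // Hypothetical partition file path; the test builds the real one from the
        // snapshot path, ParquetStore.getGroupSubDir and ParquetStore.getFile.
        final Dataset<Row> rows = spark.read().parquet("/tmp/data/snapshot=1000/graph/group=BasicEntity/partition-0.parquet");
        long previous = Long.MIN_VALUE;
        for (final Row row : rows.collectAsList()) {
            final long vertex = (long) row.getAs("vertex");
            if (vertex < previous) {
                throw new IllegalStateException("Rows are not sorted by vertex");
            }
            previous = vertex;
        }
        spark.stop();
    }
}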
Use of uk.gov.gchq.gaffer.parquetstore.ParquetStore in project Gaffer by gchq.
The class AddElementsHandler, method addElements.
private void addElements(final AddElements addElementsOperation, final Context context, final ParquetStore store) throws OperationException {
// Set up
final FileSystem fs = store.getFS();
final Schema schema = store.getSchema();
final SchemaUtils schemaUtils = store.getSchemaUtils();
final SparkSession spark = SparkContextUtil.getSparkSession(context, store.getProperties());
final ExecutorService threadPool = createThreadPool(spark, store.getProperties());
final GraphPartitioner currentGraphPartitioner = store.getGraphPartitioner();
SparkParquetUtils.configureSparkForAddElements(spark, store.getProperties());
// Write data from addElementsOperation split by group and partition (NB this uses the existing partitioner -
// adding elements using this operation does not affect the partitions).
final String tmpDirectory = store.getTempFilesDir();
final BiFunction<String, Integer, String> directoryForGroupAndPartitionId = (group, partitionId) ->
        tmpDirectory + "/unsorted_unaggregated_new/group=" + group + "/partition=" + partitionId;
final BiFunction<String, Integer, String> directoryForGroupAndPartitionIdForReversedEdges = (group, partitionId) ->
        tmpDirectory + "/unsorted_unaggregated_new/reversed-group=" + group + "/partition=" + partitionId;
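// For example (illustrative values, resolved against the store's temp dir):
//   directoryForGroupAndPartitionId.apply("BasicEdge", 3)
//       -> <tmpDirectory>/unsorted_unaggregated_new/group=BasicEdge/partition=3
//   directoryForGroupAndPartitionIdForReversedEdges.apply("BasicEdge", 3)
//       -> <tmpDirectory>/unsorted_unaggregated_new/reversed-group=BasicEdge/partition=3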
LOGGER.info("Calling WriteUnsortedData to add elements");
LOGGER.trace("currentGraphPartitioner is {}", currentGraphPartitioner);
new WriteUnsortedData(store, currentGraphPartitioner, directoryForGroupAndPartitionId, directoryForGroupAndPartitionIdForReversedEdges).writeElements(addElementsOperation.getInput());
// For every group and partition, aggregate the new data with the old data and then sort
final BiFunction<String, Integer, String> directoryForSortedResultsForGroupAndPartitionId = (group, partitionId) ->
        tmpDirectory + "/sorted_new_old_merged/group=" + group + "/partition=" + partitionId;
final BiFunction<String, Integer, String> directoryForSortedResultsForGroupAndPartitionIdForReversedEdges = (group, partitionId) ->
        tmpDirectory + "/sorted_new_old_merged/REVERSED-group=" + group + "/partition=" + partitionId;
final List<Callable<CallableResult>> tasks = new ArrayList<>();
for (final String group : schema.getGroups()) {
final List<Partition> partitions = currentGraphPartitioner.getGroupPartitioner(group).getPartitions();
for (final Partition partition : partitions) {
final List<String> inputFiles = new ArrayList<>();
// New data
inputFiles.add(directoryForGroupAndPartitionId.apply(group, partition.getPartitionId()));
// Old data
inputFiles.add(store.getFile(group, partition));
final String outputDir = directoryForSortedResultsForGroupAndPartitionId.apply(group, partition.getPartitionId());
final AggregateAndSortData task = new AggregateAndSortData(schemaUtils, fs, inputFiles, outputDir, group, group + "-" + partition.getPartitionId(), false, store.getProperties().getCompressionCodecName(), spark);
tasks.add(task);
LOGGER.info("Created AggregateAndSortData task for group {}, partition {}", group, partition.getPartitionId());
}
}
for (final String group : schema.getEdgeGroups()) {
final List<Partition> partitions = currentGraphPartitioner.getGroupPartitionerForReversedEdges(group).getPartitions();
for (final Partition partition : partitions) {
final List<String> inputFiles = new ArrayList<>();
// New data
inputFiles.add(directoryForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId()));
// Old data
inputFiles.add(store.getFileForReversedEdges(group, partition));
final String outputDir = directoryForSortedResultsForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId());
final AggregateAndSortData task = new AggregateAndSortData(schemaUtils, fs, inputFiles, outputDir, group, "reversed-" + group + "-" + partition.getPartitionId(), true, store.getProperties().getCompressionCodecName(), spark);
tasks.add(task);
LOGGER.info("Created AggregateAndSortData task for reversed edge group {}, partition {}", group, partition.getPartitionId());
}
}
try {
LOGGER.info("Invoking {} AggregateAndSortData tasks", tasks.size());
final List<Future<CallableResult>> futures = threadPool.invokeAll(tasks);
for (final Future<CallableResult> future : futures) {
final CallableResult result = future.get();
LOGGER.info("Result {} from task", result);
}
} catch (final InterruptedException e) {
throw new OperationException("InterruptedException running AggregateAndSortData tasks", e);
} catch (final ExecutionException e) {
throw new OperationException("ExecutionException running AggregateAndSortData tasks", e);
}
try {
// Move results to a new snapshot directory (the -tmp at the end allows us to add data to the directory,
// and then when this is all finished we rename the directory to remove the -tmp; this allows us to make
// the replacement of the old data with the new data an atomic operation and ensures that a get operation
// against the store will not read the directory when only some of the data has been moved there).
final long snapshot = System.currentTimeMillis();
final String newDataDir = store.getDataDir() + "/" + ParquetStore.getSnapshotPath(snapshot) + "-tmp";
LOGGER.info("Moving aggregated and sorted data to new snapshot directory {}", newDataDir);
fs.mkdirs(new Path(newDataDir));
for (final String group : schema.getGroups()) {
final Path groupDir = new Path(newDataDir, ParquetStore.getGroupSubDir(group, false));
fs.mkdirs(groupDir);
LOGGER.info("Created directory {}", groupDir);
}
for (final String group : schema.getEdgeGroups()) {
final Path groupDir = new Path(newDataDir, ParquetStore.getGroupSubDir(group, true));
fs.mkdirs(groupDir);
LOGGER.info("Created directory {}", groupDir);
}
for (final String group : schema.getGroups()) {
final String groupDir = newDataDir + "/" + ParquetStore.getGroupSubDir(group, false);
final List<Partition> partitions = currentGraphPartitioner.getGroupPartitioner(group).getPartitions();
for (final Partition partition : partitions) {
final Path outputDir = new Path(directoryForSortedResultsForGroupAndPartitionId.apply(group, partition.getPartitionId()));
if (!fs.exists(outputDir)) {
LOGGER.info("Not moving data for group {}, partition id {} as the outputDir {} does not exist", group, partition.getPartitionId(), outputDir);
} else {
// One .parquet file and one .parquet.crc file
final FileStatus[] status = fs.listStatus(outputDir, path -> path.getName().endsWith(".parquet"));
if (1 != status.length) {
LOGGER.error("Didn't find one Parquet file in path {} (found {} files)", outputDir, status.length);
throw new OperationException("Expected to find one Parquet file in path " + outputDir + " (found " + status.length + " files)");
} else {
final Path destination = new Path(groupDir, ParquetStore.getFile(partition.getPartitionId()));
LOGGER.info("Renaming {} to {}", status[0].getPath(), destination);
fs.rename(status[0].getPath(), destination);
}
}
}
}
for (final String group : schema.getEdgeGroups()) {
final String groupDir = newDataDir + "/" + ParquetStore.getGroupSubDir(group, true);
final List<Partition> partitions = currentGraphPartitioner.getGroupPartitionerForReversedEdges(group).getPartitions();
for (final Partition partition : partitions) {
final Path outputDir = new Path(directoryForSortedResultsForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId()));
if (!fs.exists(outputDir)) {
LOGGER.info("Not moving data for reversed edge group {}, partition id {} as the outputDir {} does not exist", group, partition.getPartitionId(), outputDir);
} else {
// One .parquet file and one .parquet.crc file
final FileStatus[] status = fs.listStatus(outputDir, path -> path.getName().endsWith(".parquet"));
if (1 != status.length) {
LOGGER.error("Didn't find one Parquet file in path {} (found {} files)", outputDir, status.length);
throw new OperationException("Expected to find one Parquet file in path " + outputDir + " (found " + status.length + " files)");
} else {
final Path destination = new Path(groupDir, ParquetStore.getFile(partition.getPartitionId()));
LOGGER.info("Renaming {} to {}", status[0].getPath(), destination);
fs.rename(status[0].getPath(), destination);
}
}
}
}
// Delete temporary data directory
LOGGER.info("Deleting temporary directory {}", tmpDirectory);
fs.delete(new Path(tmpDirectory), true);
// Write out graph partitioner (unchanged from previous one)
final Path newGraphPartitionerPath = new Path(newDataDir + "/graphPartitioner");
final FSDataOutputStream stream = fs.create(newGraphPartitionerPath);
LOGGER.info("Writing graph partitioner to {}", newGraphPartitionerPath);
new GraphPartitionerSerialiser().write(currentGraphPartitioner, stream);
stream.close();
// Move snapshot-tmp directory to snapshot
final String directoryWithoutTmp = newDataDir.substring(0, newDataDir.lastIndexOf("-tmp"));
LOGGER.info("Renaming {} to {}", newDataDir, directoryWithoutTmp);
fs.rename(new Path(newDataDir), new Path(directoryWithoutTmp));
// Set snapshot on store to new value
LOGGER.info("Updating latest snapshot on store to {}", snapshot);
store.setLatestSnapshot(snapshot);
} catch (final IOException | StoreException e) {
throw new OperationException("IOException moving results files into new snapshot directory", e);
}
}
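The write-to-tmp-then-rename sequence in addElements is what makes the snapshot swap safe for concurrent readers: every file lands in a directory ending in -tmp that readers never look at, and a single rename publishes the complete snapshot. A minimal sketch of the pattern in isolation using the Hadoop FileSystem API; the "snapshot=" layout is an assumption standing in for ParquetStore.getSnapshotPath.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SnapshotSwapSketch {
    public static void main(final String[] args) throws Exception {
        final FileSystem fs = FileSystem.get(new Configuration());
        final long snapshot = System.currentTimeMillis();
        // Hypothetical data directory; the store derives the real one from its properties.
        final Path tmpDir = new Path("/tmp/data/snapshot=" + snapshot + "-tmp");
        fs.mkdirs(tmpDir);
        // ... write every file of the new snapshot under tmpDir ...
        // The rename below is the only step a reader can observe, so a get
        // operation never sees a partially written snapshot.
        final Path finalDir = new Path("/tmp/data/snapshot=" + snapshot);
        if (!fs.rename(tmpDir, finalDir)) {
            throw new IllegalStateException("Failed to rename " + tmpDir + " to " + finalDir);
        }
        fs.close();
    }
}

Note that this relies on rename being atomic on the underlying filesystem, which holds for HDFS and local filesystems but not for every object store.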