Use of uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner in project Gaffer by gchq.
From the class CalculatePartitionerTest, method calculatePartitionerTest.
/**
 * Verifies that the {@link GraphPartitioner} computed by {@link CalculatePartitioner} from the
 * files produced by {@code writeData} equals a partitioner assembled by hand.
 *
 * @param tempDir temporary directory that receives the test data
 * @throws IOException declared because the file system and partitioner calls may throw it
 */
@Test
public void calculatePartitionerTest(@TempDir java.nio.file.Path tempDir) throws IOException {
    // Given
    final FileSystem fs = FileSystem.get(new Configuration());
    final Schema schema = getSchema();
    final SchemaUtils schemaUtils = new SchemaUtils(schema);
    final String topLevelFolder = tempDir.toString();
    writeData(topLevelFolder, schemaUtils);

    // When
    // - Calculate partitioner from files
    final CalculatePartitioner calculator = new CalculatePartitioner(new Path(topLevelFolder), schema, fs);
    final GraphPartitioner actual = calculator.call();

    // - Manually create the correct partitioner: nine split points (10, 20, ..., 90) per group
    final List<PartitionKey> entitySplits = new ArrayList<>();
    final List<PartitionKey> edgeSplits = new ArrayList<>();
    final List<PartitionKey> reversedEdgeSplits = new ArrayList<>();
    for (int step = 1; step <= 9; step++) {
        final long lower = 10L * step;
        final long upper = lower + 1;
        entitySplits.add(new PartitionKey(new Object[] {lower}));
        edgeSplits.add(new PartitionKey(new Object[] {lower, upper, true}));
        // Reversed edges swap the first two components of the key
        reversedEdgeSplits.add(new PartitionKey(new Object[] {upper, lower, true}));
    }

    final String[] entityGroups = {TestGroups.ENTITY, TestGroups.ENTITY_2};
    final String[] edgeGroups = {TestGroups.EDGE, TestGroups.EDGE_2};
    final GraphPartitioner expected = new GraphPartitioner();
    for (final String group : entityGroups) {
        expected.addGroupPartitioner(group, new GroupPartitioner(group, entitySplits));
    }
    for (final String group : edgeGroups) {
        expected.addGroupPartitioner(group, new GroupPartitioner(group, edgeSplits));
    }
    for (final String group : edgeGroups) {
        expected.addGroupPartitionerForReversedEdges(group, new GroupPartitioner(group, reversedEdgeSplits));
    }

    // Then
    assertEquals(expected, actual);
}
Use of uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner in project Gaffer by gchq.
From the class WriteUnsortedDataTest, method testNoSplitPointsCase.
/**
 * Writes elements through {@link WriteUnsortedData} using a {@link GraphPartitioner} whose group
 * partitioners all have empty split-point lists, then checks the {@code split-0} output of every
 * group and reversed-edge group.
 *
 * @param tempDir temporary directory that receives the written files
 * @throws IOException declared by the write and verification helpers
 * @throws OperationException declared by the write and verification helpers
 */
@Test
public void testNoSplitPointsCase(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
    // Given
    final String tempFilesDir = tempDir.toAbsolutePath().toString();
    final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
    final String[] allGroups = {TestGroups.ENTITY, TestGroups.ENTITY_2, TestGroups.EDGE, TestGroups.EDGE_2};
    final String[] edgeGroups = {TestGroups.EDGE, TestGroups.EDGE_2};

    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    for (final String group : allGroups) {
        graphPartitioner.addGroupPartitioner(group, new GroupPartitioner(group, new ArrayList<>()));
    }
    for (final String group : edgeGroups) {
        graphPartitioner.addGroupPartitionerForReversedEdges(group, new GroupPartitioner(group, new ArrayList<>()));
    }

    final List<Element> elements = getData(3L);
    final BiFunction<String, Integer, String> forwardPath =
            (group, partitionId) -> tempFilesDir + "/GROUP=" + group + "/split-" + partitionId;
    final BiFunction<String, Integer, String> reversedPath =
            (group, partitionId) -> tempFilesDir + "/REVERSED-GROUP=" + group + "/split-" + partitionId;
    final WriteUnsortedData writeUnsortedData = new WriteUnsortedData(
            tempFilesDir, CompressionCodecName.GZIP, schemaUtils, graphPartitioner, forwardPath, reversedPath);

    // When
    writeUnsortedData.writeElements(elements);

    // Then
    // - Each directory should exist and contain one file
    for (final String group : allGroups) {
        testExistsAndContainsNFiles(forwardPath.apply(group, 0), 1);
    }
    for (final String group : edgeGroups) {
        testExistsAndContainsNFiles(reversedPath.apply(group, 0), 1);
    }

    // - Each file should contain the data that was written to it, in the order it was in the iterable
    for (final String group : allGroups) {
        testContainsCorrectDataNoSplitPoints(group, forwardPath.apply(group, 0), elements, schemaUtils);
    }
    testContainsCorrectDataNoSplitPoints(
            TestGroups.EDGE, reversedPath.apply(TestGroups.EDGE, 0), elements, schemaUtils);

    // For the second edge group, the reversed check only expects edges whose source differs from their destination
    final List<Element> elementsWithSameSrcDstRemoved = new ArrayList<>();
    for (final Element element : elements) {
        if (element.getGroup().equals(TestGroups.EDGE_2)) {
            final Edge edge = (Edge) element;
            if (!edge.getSource().equals(edge.getDestination())) {
                elementsWithSameSrcDstRemoved.add(edge);
            }
        }
    }
    testContainsCorrectDataNoSplitPoints(
            TestGroups.EDGE_2, reversedPath.apply(TestGroups.EDGE_2, 0), elementsWithSameSrcDstRemoved, schemaUtils);
}
Use of uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner in project Gaffer by gchq.
From the class GroupPartitionerSerialiserTest, method shouldSerialiseKeysToFileAndReadCorrectly.
/**
 * Round-trips a {@link GroupPartitioner} holding two multi-typed partition keys through
 * {@link GroupPartitionerSerialiser} via a file and checks the result equals the original.
 *
 * @param tempDir temporary directory in which the serialised file is created
 * @throws IOException if writing to or reading from the file fails
 */
@Test
public void shouldSerialiseKeysToFileAndReadCorrectly(@TempDir Path tempDir) throws IOException {
    // Given
    final Object[] key1 = new Object[] { 1L, 5, "ABC", 10F, (short) 1, (byte) 64, new byte[] { (byte) 1, (byte) 2, (byte) 3 } };
    final PartitionKey partitionKey1 = new PartitionKey(key1);
    final Object[] key2 = new Object[] { 100L, 500, "XYZ", 1000F, (short) 3, (byte) 55, new byte[] { (byte) 10, (byte) 9, (byte) 8, (byte) 7 } };
    final PartitionKey partitionKey2 = new PartitionKey(key2);
    final List<PartitionKey> splitPoints = new ArrayList<>();
    splitPoints.add(partitionKey1);
    splitPoints.add(partitionKey2);
    final GroupPartitioner groupPartitioner = new GroupPartitioner("GROUP", splitPoints);
    final GroupPartitionerSerialiser serialiser = new GroupPartitionerSerialiser();

    // When
    final String filename = tempDir.resolve("test").toString();
    // try-with-resources closes each stream even when write/read throws
    try (DataOutputStream dos = new DataOutputStream(new FileOutputStream(filename))) {
        serialiser.write(groupPartitioner, dos);
    }
    final GroupPartitioner readGroupPartitioner;
    try (DataInputStream dis = new DataInputStream(new FileInputStream(filename))) {
        readGroupPartitioner = serialiser.read(dis);
    }

    // Then
    assertEquals(groupPartitioner, readGroupPartitioner);
}
Use of uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner in project Gaffer by gchq.
From the class ParquetStore, method initialise.
/**
 * Prepares the data directory for use by this store.
 *
 * <p>If the data directory does not exist or is empty, a new snapshot directory is created
 * (its id taken from the current time in milliseconds), a directory is created for every group
 * and for every reversed edge group, and a {@link GraphPartitioner} with no split points is
 * written to the snapshot. Otherwise the latest snapshot is selected and checked to contain a
 * directory for every group and reversed edge group in the schema.
 *
 * @throws IOException if a file system operation fails
 * @throws StoreException if a non-empty data directory contains no snapshot directory, or the
 *         latest snapshot is missing a group directory
 */
private void initialise() throws IOException, StoreException {
    // If data directory is empty or does not exist then this is the first time the store has been created.
    final Path dataDirPath = new Path(getDataDir());
    if (!fs.exists(dataDirPath) || 0 == fs.listStatus(dataDirPath).length) {
        LOGGER.info("Data directory {} doesn't exist or is empty so initialising directory structure", dataDirPath);
        currentSnapshot = System.currentTimeMillis();
        LOGGER.info("Initialising snapshot id to {}", currentSnapshot);
        final Path snapshotPath = new Path(dataDirPath, getSnapshotPath(currentSnapshot));
        LOGGER.info("Creating snapshot directory {}", snapshotPath);
        fs.mkdirs(snapshotPath);
        LOGGER.info("Creating group directories under {}", snapshotPath);
        for (final String group : getSchema().getGroups()) {
            final Path groupDir = getGroupPath(group);
            fs.mkdirs(groupDir);
            LOGGER.info("Created directory {}", groupDir);
        }
        LOGGER.info("Creating group directories for reversed edges under {}", snapshotPath);
        for (final String group : getSchema().getEdgeGroups()) {
            final Path groupDir = getGroupPathForReversedEdges(group);
            fs.mkdirs(groupDir);
            LOGGER.info("Created directory {}", groupDir);
        }
        LOGGER.info("Creating GraphPartitioner with 0 split points for each group");
        graphPartitioner = new GraphPartitioner();
        for (final String group : getSchema().getGroups()) {
            graphPartitioner.addGroupPartitioner(group, new GroupPartitioner(group, new ArrayList<>()));
        }
        for (final String group : getSchema().getEdgeGroups()) {
            graphPartitioner.addGroupPartitionerForReversedEdges(group, new GroupPartitioner(group, new ArrayList<>()));
        }
        LOGGER.info("Writing GraphPartitioner to snapshot directory");
        // try-with-resources ensures the stream is closed even if serialisation fails
        try (FSDataOutputStream dataOutputStream = fs.create(getGraphPartitionerPath())) {
            new GraphPartitionerSerialiser().write(graphPartitioner, dataOutputStream);
        }
        LOGGER.info("Wrote GraphPartitioner to file {}", getGraphPartitionerPath().toString());
    } else {
        LOGGER.info("Data directory {} exists and is non-empty, validating a snapshot directory exists", dataDirPath);
        final FileStatus[] fileStatuses = fs.listStatus(dataDirPath, f -> f.getName().startsWith(SNAPSHOT + "="));
        final List<FileStatus> directories = Arrays.stream(fileStatuses).filter(FileStatus::isDirectory).collect(Collectors.toList());
        if (directories.isEmpty()) {
            LOGGER.error("Data directory {} should contain a snapshot directory", dataDirPath);
            throw new StoreException("Data directory should contain a snapshot directory");
        }
        this.currentSnapshot = getLatestSnapshot();
        LOGGER.info("Latest snapshot directory in data directory {} is {}", dataDirPath, this.currentSnapshot);
        LOGGER.info("Verifying snapshot directory contains the correct directories");
        for (final String group : getSchema().getGroups()) {
            final Path groupDir = getGroupPath(group);
            if (!fs.exists(groupDir)) {
                LOGGER.error("Directory {} should exist", groupDir);
                throw new StoreException("Group directory " + groupDir + " should exist in snapshot directory " + getSnapshotPath(this.currentSnapshot));
            }
        }
        for (final String group : getSchema().getEdgeGroups()) {
            final Path groupDir = getGroupPathForReversedEdges(group);
            if (!fs.exists(groupDir)) {
                LOGGER.error("Directory {} should exist", groupDir);
                throw new StoreException("Group directory " + groupDir + " should exist in snapshot directory " + getSnapshotPath(this.currentSnapshot));
            }
        }
    }
}
Use of uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner in project Gaffer by gchq.
From the class GroupPartitionerSerialiserTest, method testWithInfinitePartitionKeys.
/**
 * Round-trips a {@link GroupPartitioner} that has an empty list of split points through
 * {@link GroupPartitionerSerialiser} via a file and checks the result equals the original.
 *
 * @param tempDir temporary directory in which the serialised file is created
 * @throws IOException if writing to or reading from the file fails
 */
@Test
public void testWithInfinitePartitionKeys(@TempDir Path tempDir) throws IOException {
    // Given
    final GroupPartitioner groupPartitioner = new GroupPartitioner("GROUP", new ArrayList<>());
    final GroupPartitionerSerialiser serialiser = new GroupPartitionerSerialiser();

    // When
    final String filename = tempDir.resolve("test").toString();
    // try-with-resources closes each stream even when write/read throws
    try (DataOutputStream dos = new DataOutputStream(new FileOutputStream(filename))) {
        serialiser.write(groupPartitioner, dos);
    }
    final GroupPartitioner readGroupPartitioner;
    try (DataInputStream dis = new DataInputStream(new FileInputStream(filename))) {
        readGroupPartitioner = serialiser.read(dis);
    }

    // Then
    // Expected value first, so a failure message labels the two sides correctly
    assertEquals(groupPartitioner, readGroupPartitioner);
}
Aggregations