Search in sources :

Example 6 with GraphPartitioner

use of uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner in project Gaffer by gchq.

In the class QueryGenerator, the method getPathsForSeed:

/**
 * Determines the set of Parquet file paths that may contain data matching the given seed
 * within the given group.
 * <p>
 * For an entity group queried with an edge seed, both endpoints of the edge are used as
 * lookup keys. For an edge group queried with an edge seed, the partition key is the
 * concatenation of the source and destination key parts. For an edge group queried with an
 * entity seed, the reversed-edge files are searched as well, because the seed may match the
 * destination of an edge.
 *
 * @param parquetElementSeed the seed to resolve to file paths
 * @param group the group to search within
 * @return the set of paths of files that may contain matching data
 */
private Set<PathInfo> getPathsForSeed(final ParquetElementSeed parquetElementSeed, final String group) {
    final GraphPartitioner graphPartitioner = store.getGraphPartitioner();
    final boolean isEntityGroup = store.getSchema().getEntityGroups().contains(group);
    // Build the list of partition-key seeds to look up.
    final List<Object[]> seeds = new ArrayList<>();
    if (parquetElementSeed instanceof ParquetEntitySeed) {
        seeds.add(((ParquetEntitySeed) parquetElementSeed).getSeed());
    } else {
        final ParquetEdgeSeed edgeSeed = (ParquetEdgeSeed) parquetElementSeed;
        if (!isEntityGroup) {
            // Edge group: the partition key is the source key parts followed by the
            // destination key parts.
            final Object[] source = edgeSeed.getSource();
            final Object[] destination = edgeSeed.getDestination();
            final Object[] seed = new Object[source.length + destination.length];
            System.arraycopy(source, 0, seed, 0, source.length);
            System.arraycopy(destination, 0, seed, source.length, destination.length);
            seeds.add(seed);
        } else {
            // Entity group queried with an edge seed: look up both endpoints.
            seeds.add(edgeSeed.getSource());
            seeds.add(edgeSeed.getDestination());
        }
    }
    final List<PathInfo> paths = new ArrayList<>();
    // File type is determined by the group, not the seed, so compute it once.
    final PathInfo.FILETYPE fileType = isEntityGroup ? PathInfo.FILETYPE.ENTITY : PathInfo.FILETYPE.EDGE;
    for (final Object[] seed : seeds) {
        final List<Integer> partitionIds = graphPartitioner.getGroupPartitioner(group).getPartitionIds(seed);
        LOGGER.debug("Partition ids for seed {} in group {}: {}", seed, group, partitionIds);
        partitionIds.forEach(id -> paths.add(new PathInfo(new Path(store.getFile(group, id)), group, fileType)));
        // An entity seed against an edge group may match an edge by its destination, so the
        // files sorted by reversed edges must also be searched.
        if (!isEntityGroup && parquetElementSeed instanceof ParquetEntitySeed) {
            final List<Integer> partitionIdsFromReversed = graphPartitioner.getGroupPartitionerForReversedEdges(group).getPartitionIds(seed);
            partitionIdsFromReversed.forEach(id -> paths.add(new PathInfo(new Path(store.getFileForReversedEdges(group, id)), group, PathInfo.FILETYPE.REVERSED_EDGE)));
        }
    }
    LOGGER.debug("Returning {} paths for seed {} and group {} (paths are {})", paths.size(), parquetElementSeed, group, paths);
    return paths.stream().collect(Collectors.toSet());
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner)

Example 7 with GraphPartitioner

use of uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner in project Gaffer by gchq.

In the class WriteUnsortedDataTest, the method testMultipleSplitPointsCase:

/**
 * Tests that {@link WriteUnsortedData} writes elements to the correct split files when each
 * group's partitioner has two split points, i.e. three partitions per group.
 */
@Test
public void testMultipleSplitPointsCase(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
    // Given
    final String tempFilesDir = tempDir.toAbsolutePath().toString();
    final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    final List<Element> elements = new ArrayList<>();
    // TestGroups.ENTITY, split points are 10L and 100L. Create data with
    // VERTEX
    // 5L
    // 10L
    // 10L
    // 11L
    // 12L
    // 100L
    // 100L
    // 200L
    final List<PartitionKey> splitPointsEntity = new ArrayList<>();
    splitPointsEntity.add(new PartitionKey(new Object[] { 10L }));
    splitPointsEntity.add(new PartitionKey(new Object[] { 100L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY, new GroupPartitioner(TestGroups.ENTITY, splitPointsEntity));
    elements.add(createEntityForEntityGroup(5L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(11L));
    elements.add(createEntityForEntityGroup(12L));
    elements.add(createEntityForEntityGroup(100L));
    elements.add(createEntityForEntityGroup(100L));
    elements.add(createEntityForEntityGroup(200L));
    // TestGroups.ENTITY_2, split points are 100L and 1000L. Create data with
    // VERTEX
    // 5L
    // 100L
    // 200L
    // 1000L
    // 5000L
    final List<PartitionKey> splitPointsEntity_2 = new ArrayList<>();
    splitPointsEntity_2.add(new PartitionKey(new Object[] { 100L }));
    splitPointsEntity_2.add(new PartitionKey(new Object[] { 1000L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY_2, new GroupPartitioner(TestGroups.ENTITY_2, splitPointsEntity_2));
    elements.add(createEntityForEntityGroup_2(5L));
    elements.add(createEntityForEntityGroup_2(100L));
    elements.add(createEntityForEntityGroup_2(200L));
    elements.add(createEntityForEntityGroup_2(1000L));
    elements.add(createEntityForEntityGroup_2(5000L));
    // TestGroups.EDGE, split points are [1000L, 200L, true] and [1000L, 30000L, false]. Create data with
    // SOURCE   DESTINATION    DIRECTED
    // 5L        5000L         true
    // 5L         200L         false
    // 1000L          90L         true
    // 1000L       10000L         false
    // 1000L       30000L         false
    // 1000L      300000L         true
    // 10000L         400L         false
    final List<PartitionKey> splitPointsEdge = new ArrayList<>();
    splitPointsEdge.add(new PartitionKey(new Object[] { 1000L, 200L, true }));
    splitPointsEdge.add(new PartitionKey(new Object[] { 1000L, 30000L, false }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsEdge));
    final List<PartitionKey> splitPointsReversedEdge = new ArrayList<>();
    splitPointsReversedEdge.add(new PartitionKey(new Object[] { 100L, 1000L, true }));
    splitPointsReversedEdge.add(new PartitionKey(new Object[] { 300L, 2000L, false }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsReversedEdge));
    elements.add(createEdgeForEdgeGroup(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup(5L, 200L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 90L, true));
    elements.add(createEdgeForEdgeGroup(1000L, 10000L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 30000L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 300000L, true));
    elements.add(createEdgeForEdgeGroup(10000L, 400L, false));
    // TestGroups.EDGE_2, split points are [10L, 2000L, true] and [100L, 1000L, false]. Create data with
    // SOURCE   DESTINATION    DIRECTED
    // 5L         5000L        true
    // 10L         2000L        false
    // 10L         2000L        true
    // 10L         3000L        false
    // 100L         1000L        false
    // 100L         3000L        false
    // 100L         3000L        true
    final List<PartitionKey> splitPointsEdge_2 = new ArrayList<>();
    splitPointsEdge_2.add(new PartitionKey(new Object[] { 10L, 2000L, true }));
    splitPointsEdge_2.add(new PartitionKey(new Object[] { 100L, 1000L, false }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsEdge_2));
    final List<PartitionKey> splitPointsReversedEdge_2 = new ArrayList<>();
    splitPointsReversedEdge_2.add(new PartitionKey(new Object[] { 1000L, 1500L, true }));
    splitPointsReversedEdge_2.add(new PartitionKey(new Object[] { 2000L, 2500L, false }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsReversedEdge_2));
    elements.add(createEdgeForEdgeGroup_2(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup_2(10L, 2000L, false));
    elements.add(createEdgeForEdgeGroup_2(10L, 2000L, true));
    elements.add(createEdgeForEdgeGroup_2(10L, 3000L, false));
    elements.add(createEdgeForEdgeGroup_2(100L, 1000L, false));
    elements.add(createEdgeForEdgeGroup_2(100L, 3000L, false));
    elements.add(createEdgeForEdgeGroup_2(100L, 3000L, true));
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionId = (group, partitionId) -> tempFilesDir + "/GROUP=" + group + "/split-" + partitionId;
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionIdForReversedEdge = (group, partitionId) -> tempFilesDir + "/REVERSED-GROUP=" + group + "/split-" + partitionId;
    final WriteUnsortedData writeUnsortedData = new WriteUnsortedData(tempFilesDir, CompressionCodecName.GZIP, schemaUtils, graphPartitioner, fileNameForGroupAndPartitionId, fileNameForGroupAndPartitionIdForReversedEdge);
    // When
    writeUnsortedData.writeElements(elements);
    // Then
    // - For each group, directories split-0, split-1 and split-2 should exist and each contain one file
    // NOTE(review): unlike testOneSplitPointCase, the REVERSED-GROUP directories are not
    // checked with testExistsAndContainsNFiles here — confirm whether that is intentional.
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-2", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-2", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-2", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-2", 1);
    // - Each split file should contain the data for that split in the order it was written
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, true, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, true, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-2", group, true, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), null, elements, schemaUtils);
    }
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, false, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-0", group, false, true, null, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, false, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-1", group, false, true, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(1), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-2", group, false, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), null, elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-2", group, false, true, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(1), null, elements, schemaUtils);
    }
}
Also used : GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) Assertions.fail(org.junit.jupiter.api.Assertions.fail) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) Date(java.util.Date) BiFunction(java.util.function.BiFunction) SerialisationException(uk.gov.gchq.gaffer.exception.SerialisationException) SimpleDateFormat(java.text.SimpleDateFormat) GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) FileStatus(org.apache.hadoop.fs.FileStatus) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) FreqMap(uk.gov.gchq.gaffer.types.FreqMap) BeforeAll(org.junit.jupiter.api.BeforeAll) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) Edge(uk.gov.gchq.gaffer.data.element.Edge) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) ParseException(java.text.ParseException) TestGroups(uk.gov.gchq.gaffer.commonutil.TestGroups) ParquetElementReader(uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader) TimeZone(java.util.TimeZone) ParquetReader(org.apache.parquet.hadoop.ParquetReader) IOException(java.io.IOException) Entity(uk.gov.gchq.gaffer.data.element.Entity) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) DataGen(uk.gov.gchq.gaffer.parquetstore.testutils.DataGen) List(java.util.List) WriteUnsortedData(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.WriteUnsortedData) TempDir(org.junit.jupiter.api.io.TempDir) TestUtils(uk.gov.gchq.gaffer.parquetstore.testutils.TestUtils) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) OperationException(uk.gov.gchq.gaffer.operation.OperationException) 
WriteUnsortedData(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.WriteUnsortedData) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) HashSet(java.util.HashSet) Test(org.junit.jupiter.api.Test)

Example 8 with GraphPartitioner

use of uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner in project Gaffer by gchq.

In the class WriteUnsortedDataTest, the method testOneSplitPointCase:

/**
 * Tests that {@link WriteUnsortedData} writes elements to the correct split files when each
 * group's partitioner has a single split point, i.e. two partitions per group.
 */
@Test
public void testOneSplitPointCase(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
    // Given
    final String tempFilesDir = tempDir.toAbsolutePath().toString();
    final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    final List<Element> elements = new ArrayList<>();
    // TestGroups.ENTITY, split point is 10L. Create data with
    // VERTEX
    // 5L
    // 10L
    // 10L
    // 10L
    // 20L
    final List<PartitionKey> splitPointsEntity = new ArrayList<>();
    splitPointsEntity.add(new PartitionKey(new Object[] { 10L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY, new GroupPartitioner(TestGroups.ENTITY, splitPointsEntity));
    elements.add(createEntityForEntityGroup(5L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(20L));
    // TestGroups.ENTITY_2, split point is 100L. Create data with
    // VERTEX
    // 5L
    // 100L
    // 1000L
    final List<PartitionKey> splitPointsEntity_2 = new ArrayList<>();
    splitPointsEntity_2.add(new PartitionKey(new Object[] { 100L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY_2, new GroupPartitioner(TestGroups.ENTITY_2, splitPointsEntity_2));
    elements.add(createEntityForEntityGroup_2(5L));
    elements.add(createEntityForEntityGroup_2(100L));
    elements.add(createEntityForEntityGroup_2(1000L));
    // TestGroups.EDGE, split point is [1000L, 200L, true]. Create data with
    // SOURCE   DESTINATION    DIRECTED
    // 5L         5000L        true
    // 5L         200L         false
    // 1000L         100L         true
    // 1000L         200L         false
    // 1000L         200L         true
    // 1000L         300L         true
    // 10000L         400L         false
    // 10000L         400L         true
    final List<PartitionKey> splitPointsEdge = new ArrayList<>();
    splitPointsEdge.add(new PartitionKey(new Object[] { 1000L, 200L, true }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsEdge));
    final List<PartitionKey> splitPointsReversedEdge = new ArrayList<>();
    splitPointsReversedEdge.add(new PartitionKey(new Object[] { 1000L, 300L, true }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsReversedEdge));
    elements.add(createEdgeForEdgeGroup(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup(5L, 200L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 100L, true));
    elements.add(createEdgeForEdgeGroup(1000L, 200L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 200L, true));
    elements.add(createEdgeForEdgeGroup(1000L, 300L, true));
    elements.add(createEdgeForEdgeGroup(10000L, 400L, false));
    elements.add(createEdgeForEdgeGroup(10000L, 400L, true));
    // TestGroups.EDGE_2, split point is [10L, 2000L, true]. Create data with
    // SOURCE   DESTINATION    DIRECTED
    // 5L         5000L        true
    // 5L          200L        false
    // 1000L         100L        true
    // 1000L         200L        false
    // 1000L         200L        true
    // 1000L         300L        true
    // 10000L         400L        false
    // 10000L         400L        true
    final List<PartitionKey> splitPointsEdge_2 = new ArrayList<>();
    splitPointsEdge_2.add(new PartitionKey(new Object[] { 10L, 2000L, true }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsEdge_2));
    final List<PartitionKey> splitPointsReversedEdge_2 = new ArrayList<>();
    splitPointsReversedEdge_2.add(new PartitionKey(new Object[] { 3000L, 20L, true }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsReversedEdge_2));
    elements.add(createEdgeForEdgeGroup_2(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup_2(5L, 200L, false));
    elements.add(createEdgeForEdgeGroup_2(1000L, 100L, true));
    elements.add(createEdgeForEdgeGroup_2(1000L, 200L, false));
    elements.add(createEdgeForEdgeGroup_2(1000L, 200L, true));
    elements.add(createEdgeForEdgeGroup_2(1000L, 300L, true));
    elements.add(createEdgeForEdgeGroup_2(10000L, 400L, false));
    elements.add(createEdgeForEdgeGroup_2(10000L, 400L, true));
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionId = (group, partitionId) -> tempFilesDir + "/GROUP=" + group + "/split-" + partitionId;
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionIdForReversedEdge = (group, partitionId) -> tempFilesDir + "/REVERSED-GROUP=" + group + "/split-" + partitionId;
    final WriteUnsortedData writeUnsortedData = new WriteUnsortedData(tempFilesDir, CompressionCodecName.GZIP, schemaUtils, graphPartitioner, fileNameForGroupAndPartitionId, fileNameForGroupAndPartitionIdForReversedEdge);
    // When
    writeUnsortedData.writeElements(elements);
    // Then
    // - For each group, directories split-0 and split-1 should exist and each contain one file
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-1", 1);
    // - Each split file should contain the data for that split in the order it was written
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, true, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, true, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), null, elements, schemaUtils);
    }
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, false, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-0", group, false, true, null, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, false, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), null, elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-1", group, false, true, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), null, elements, schemaUtils);
    }
}
Also used : GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) Assertions.fail(org.junit.jupiter.api.Assertions.fail) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) Date(java.util.Date) BiFunction(java.util.function.BiFunction) SerialisationException(uk.gov.gchq.gaffer.exception.SerialisationException) SimpleDateFormat(java.text.SimpleDateFormat) GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) FileStatus(org.apache.hadoop.fs.FileStatus) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) FreqMap(uk.gov.gchq.gaffer.types.FreqMap) BeforeAll(org.junit.jupiter.api.BeforeAll) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) Edge(uk.gov.gchq.gaffer.data.element.Edge) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) ParseException(java.text.ParseException) TestGroups(uk.gov.gchq.gaffer.commonutil.TestGroups) ParquetElementReader(uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader) TimeZone(java.util.TimeZone) ParquetReader(org.apache.parquet.hadoop.ParquetReader) IOException(java.io.IOException) Entity(uk.gov.gchq.gaffer.data.element.Entity) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) DataGen(uk.gov.gchq.gaffer.parquetstore.testutils.DataGen) List(java.util.List) WriteUnsortedData(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.WriteUnsortedData) TempDir(org.junit.jupiter.api.io.TempDir) TestUtils(uk.gov.gchq.gaffer.parquetstore.testutils.TestUtils) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) OperationException(uk.gov.gchq.gaffer.operation.OperationException) 
WriteUnsortedData(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.WriteUnsortedData) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) HashSet(java.util.HashSet) Test(org.junit.jupiter.api.Test)

Example 9 with GraphPartitioner

use of uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner in project Gaffer by gchq.

In the class GraphPartitionerSerialiserTest, the method shouldGroupMultiplePartitionKeysAndSerialiseCorrectly:

/**
 * Tests that a {@link GraphPartitioner} containing multiple group partitioners (including a
 * reversed-edges partitioner) survives a round trip through {@link GraphPartitionerSerialiser}.
 */
@Test
public void shouldGroupMultiplePartitionKeysAndSerialiseCorrectly(@TempDir Path tempDir) throws IOException {
    // Given
    final Object[] key1 = new Object[] { 1L, 5, "ABC", 10F, (short) 1, (byte) 64, new byte[] { (byte) 1, (byte) 2, (byte) 3 } };
    final PartitionKey partitionKey1 = new PartitionKey(key1);
    final Object[] key2 = new Object[] { 100L, 500, "XYZ", 1000F, (short) 3, (byte) 55, new byte[] { (byte) 10, (byte) 9, (byte) 8, (byte) 7 } };
    final PartitionKey partitionKey2 = new PartitionKey(key2);
    final List<PartitionKey> splitPoints1 = new ArrayList<>();
    splitPoints1.add(partitionKey1);
    splitPoints1.add(partitionKey2);
    // Group name matches the key it is registered under below (was previously "GROUP").
    final GroupPartitioner groupPartitioner1 = new GroupPartitioner("GROUP1", splitPoints1);
    final Object[] key3 = new Object[] { 1000L, 5000, "ABCDEF", 10000F, (short) 19, (byte) 20, new byte[] { (byte) 4, (byte) 5, (byte) 6 } };
    final PartitionKey partitionKey3 = new PartitionKey(key3);
    final Object[] key4 = new Object[] { 100000L, 500000, "XYZZZZ", 100000F, (short) 32, (byte) 58, new byte[] { (byte) 20, (byte) 29, (byte) 28, (byte) 27 } };
    final PartitionKey partitionKey4 = new PartitionKey(key4);
    final List<PartitionKey> splitPoints2 = new ArrayList<>();
    splitPoints2.add(partitionKey3);
    splitPoints2.add(partitionKey4);
    final GroupPartitioner groupPartitioner2 = new GroupPartitioner("GROUP2", splitPoints2);
    final Object[] key5 = new Object[] { 10000000L, 5000000, "ABCDEFGHI", 100000F, (short) 21, (byte) 30, new byte[] { (byte) 10, (byte) 11, (byte) 12 } };
    final PartitionKey partitionKey5 = new PartitionKey(key5);
    final Object[] key6 = new Object[] { 100000000L, 5000, "ABCDEF", 10000F, (short) 19, (byte) 33, new byte[] { (byte) 13, (byte) 14, (byte) 15 } };
    final PartitionKey partitionKey6 = new PartitionKey(key6);
    final List<PartitionKey> splitPoints3 = new ArrayList<>();
    splitPoints3.add(partitionKey5);
    splitPoints3.add(partitionKey6);
    final GroupPartitioner groupPartitioner3 = new GroupPartitioner("GROUP1", splitPoints3);
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    graphPartitioner.addGroupPartitioner("GROUP1", groupPartitioner1);
    graphPartitioner.addGroupPartitioner("GROUP2", groupPartitioner2);
    graphPartitioner.addGroupPartitionerForReversedEdges("GROUP1", groupPartitioner3);
    final GraphPartitionerSerialiser serialiser = new GraphPartitionerSerialiser();
    // When
    final String filename = tempDir.resolve("test").toString();
    // try-with-resources ensures the streams are closed even if (de)serialisation throws.
    try (DataOutputStream dos = new DataOutputStream(new FileOutputStream(filename))) {
        serialiser.write(graphPartitioner, dos);
    }
    final GraphPartitioner readGraphPartitioner;
    try (DataInputStream dis = new DataInputStream(new FileInputStream(filename))) {
        readGraphPartitioner = serialiser.read(dis);
    }
    // Then
    assertEquals(graphPartitioner, readGraphPartitioner);
}
Also used : GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) DataOutputStream(java.io.DataOutputStream) ArrayList(java.util.ArrayList) DataInputStream(java.io.DataInputStream) FileInputStream(java.io.FileInputStream) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) FileOutputStream(java.io.FileOutputStream) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) Test(org.junit.jupiter.api.Test)

Example 10 with GraphPartitioner

use of uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner in project Gaffer by gchq.

In the class CalculatePartitioner, the method call:

/**
 * Calculates a {@link GraphPartitioner} for the store by inspecting the sorted Parquet
 * files of every group: the first element of each file (after the first) becomes a
 * split point. Reversed-edge partitioners are calculated for every edge group.
 *
 * @return the calculated {@link GraphPartitioner}
 * @throws IOException if listing a group directory or reading a file fails, or if an
 *                     empty file is encountered (empty files should have been removed)
 */
public GraphPartitioner call() throws IOException {
    final SchemaUtils schemaUtils = new SchemaUtils(schema);
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    for (final String group : schema.getGroups()) {
        LOGGER.info("Calculating GroupPartitioner for group {}", group);
        final GroupPartitioner groupPartitioner =
                new GroupPartitioner(group, readPartitionKeys(schemaUtils, group, false));
        graphPartitioner.addGroupPartitioner(group, groupPartitioner);
        LOGGER.info("GroupPartitioner for group {} is {}", group, groupPartitioner);
    }
    for (final String group : schema.getEdgeGroups()) {
        LOGGER.info("Calculating GroupPartitioner for reversed edge group {}", group);
        final GroupPartitioner groupPartitioner =
                new GroupPartitioner(group, readPartitionKeys(schemaUtils, group, true));
        graphPartitioner.addGroupPartitionerForReversedEdges(group, groupPartitioner);
    }
    return graphPartitioner;
}

/**
 * Reads the first element of each sorted Parquet file for the given group (skipping the
 * first file - the first split point is the first key of the second file) and converts
 * its core properties into a {@link PartitionKey}.
 *
 * @param schemaUtils utils providing the group's object converter
 * @param group       the group to read
 * @param reversed    true to read the reversed-edge directory and use reversed-edge
 *                    core properties (source/destination swapped)
 * @return the split-point partition keys, in file order
 * @throws IOException if listing or reading fails, or a file is unexpectedly empty
 */
private List<PartitionKey> readPartitionKeys(final SchemaUtils schemaUtils,
                                             final String group,
                                             final boolean reversed) throws IOException {
    final GafferGroupObjectConverter converter = schemaUtils.getConverter(group);
    final List<PartitionKey> partitionKeys = new ArrayList<>();
    final Path groupPath = new Path(path, ParquetStore.getGroupSubDir(group, reversed));
    final FileStatus[] files = fs.listStatus(groupPath, p -> p.getName().endsWith(".parquet"));
    // listStatus makes no ordering guarantee - sort so split points follow the data order.
    final SortedSet<Path> sortedFiles = new TreeSet<>();
    Arrays.stream(files).map(FileStatus::getPath).forEach(sortedFiles::add);
    final Path[] sortedPaths = sortedFiles.toArray(new Path[0]);
    LOGGER.debug("Found {} files in {}", files.length, groupPath);
    for (int i = 1; i < sortedPaths.length; i++) {
        // NB Skip first file
        LOGGER.debug("Reading first line of {}", sortedPaths[i]);
        final boolean isEntity = !reversed && schema.getEntityGroups().contains(group);
        // try-with-resources so the reader is closed even if read() throws or returns null
        try (ParquetReader<Element> reader = new ParquetElementReader.Builder<Element>(sortedPaths[i])
                .isEntity(isEntity)
                .usingConverter(converter)
                .build()) {
            // NB Should never be null as empty files are removed before this is called
            final Element element = reader.read();
            if (null == element) {
                throw new IOException("No first element in file " + sortedPaths[i]
                        + " - empty files are supposed to be removed");
            }
            final Object[] parquetObjects = reversed
                    ? converter.corePropertiesToParquetObjectsForReversedEdge((Edge) element)
                    : converter.corePropertiesToParquetObjects(element);
            partitionKeys.add(new PartitionKey(parquetObjects));
        }
    }
    return partitionKeys;
}
Also used : Arrays(java.util.Arrays) Logger(org.slf4j.Logger) ParquetElementReader(uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader) SortedSet(java.util.SortedSet) FileSystem(org.apache.hadoop.fs.FileSystem) ParquetReader(org.apache.parquet.hadoop.ParquetReader) LoggerFactory(org.slf4j.LoggerFactory) SchemaUtils(uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils) GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) IOException(java.io.IOException) FileStatus(org.apache.hadoop.fs.FileStatus) ParquetStore(uk.gov.gchq.gaffer.parquetstore.ParquetStore) Element(uk.gov.gchq.gaffer.data.element.Element) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) List(java.util.List) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) Schema(uk.gov.gchq.gaffer.store.schema.Schema) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) Path(org.apache.hadoop.fs.Path) Edge(uk.gov.gchq.gaffer.data.element.Edge) GafferGroupObjectConverter(uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter) FileStatus(org.apache.hadoop.fs.FileStatus) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) SchemaUtils(uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) TreeSet(java.util.TreeSet) Path(org.apache.hadoop.fs.Path) GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) ParquetElementReader(uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader) IOException(java.io.IOException) GafferGroupObjectConverter(uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) Edge(uk.gov.gchq.gaffer.data.element.Edge)

Aggregations

GraphPartitioner (uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner)12 Path (org.apache.hadoop.fs.Path)10 ArrayList (java.util.ArrayList)9 IOException (java.io.IOException)8 FileSystem (org.apache.hadoop.fs.FileSystem)7 GroupPartitioner (uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner)7 List (java.util.List)6 FileStatus (org.apache.hadoop.fs.FileStatus)6 PartitionKey (uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey)6 Arrays (java.util.Arrays)5 Configuration (org.apache.hadoop.conf.Configuration)5 Test (org.junit.jupiter.api.Test)5 Element (uk.gov.gchq.gaffer.data.element.Element)5 OperationException (uk.gov.gchq.gaffer.operation.OperationException)5 BiFunction (java.util.function.BiFunction)4 Collectors (java.util.stream.Collectors)4 FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)4 WriteUnsortedData (uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.WriteUnsortedData)4 GraphPartitionerSerialiser (uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser)4 ParseException (java.text.ParseException)3