Search in sources :

Example 6 with PartitionKey

use of uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey in project Gaffer by gchq.

The following example is taken from the class WriteUnsortedData, method writeElement.

private void writeElement(final Element element) throws IOException {
    // Route the element to the Parquet file for its group's partition; elements
    // whose group is absent from the schema have no writer and are skipped.
    final String group = element.getGroup();
    final GafferGroupObjectConverter converter = schemaUtils.getConverter(group);
    // Build the partition key from the element's core properties and resolve its partition id.
    final PartitionKey key = new PartitionKey(converter.corePropertiesToParquetObjects(element));
    final int partitionId = graphPartitioner.getGroupPartitioner(group).getPartitionId(key);
    final ParquetWriter<Element> writer = getWriter(partitionId, group, false);
    if (writer == null) {
        LOGGER.warn("Skipped the addition of an Element of group {} as that group does not exist in the schema.", group);
        return;
    }
    writer.write(element);
}
Also used : Element(uk.gov.gchq.gaffer.data.element.Element) GafferGroupObjectConverter(uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey)

Example 7 with PartitionKey

use of uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey in project Gaffer by gchq.

The following example is taken from the class WriteUnsortedData, method writeEdgeReversed.

private void writeEdgeReversed(final Edge edge) throws IOException {
    // Edges are additionally written out partitioned for the directory that is
    // sorted by destination, using the reversed-edges partitioner for the group.
    final String group = edge.getGroup();
    final GafferGroupObjectConverter converter = schemaUtils.getConverter(group);
    // Partition key for the reversed form of the edge, then its partition id.
    final PartitionKey key = new PartitionKey(converter.corePropertiesToParquetObjectsForReversedEdge(edge));
    final int partitionId = graphPartitioner.getGroupPartitionerForReversedEdges(group).getPartitionId(key);
    final ParquetWriter<Element> writer = getWriter(partitionId, group, true);
    if (writer == null) {
        LOGGER.warn("Skipped the addition of an Element of group {} as that group does not exist in the schema.", group);
        return;
    }
    writer.write(edge);
}
Also used : Element(uk.gov.gchq.gaffer.data.element.Element) GafferGroupObjectConverter(uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey)

Example 8 with PartitionKey

use of uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey in project Gaffer by gchq.

The following example is taken from the class WriteUnsortedDataTest, method testMultipleSplitPointsCase.

@Test
public void testMultipleSplitPointsCase(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
    // Given
    final String tempFilesDir = tempDir.toAbsolutePath().toString();
    final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    final List<Element> elements = new ArrayList<>();
    // TestGroups.ENTITY, split points are 10L and 100L. Create data with
    // VERTEX
    // 5L
    // 10L
    // 10L
    // 11L
    // 12L
    // 100L
    // 100L
    // 200L
    final List<PartitionKey> splitPointsEntity = new ArrayList<>();
    splitPointsEntity.add(new PartitionKey(new Object[] { 10L }));
    splitPointsEntity.add(new PartitionKey(new Object[] { 100L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY, new GroupPartitioner(TestGroups.ENTITY, splitPointsEntity));
    elements.add(createEntityForEntityGroup(5L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(11L));
    elements.add(createEntityForEntityGroup(12L));
    elements.add(createEntityForEntityGroup(100L));
    elements.add(createEntityForEntityGroup(100L));
    elements.add(createEntityForEntityGroup(200L));
    // TestGroups.ENTITY_2, split points are 100L and 1000L. Create data with
    // VERTEX
    // 5L
    // 100L
    // 200L
    // 1000L
    // 5000L
    final List<PartitionKey> splitPointsEntity_2 = new ArrayList<>();
    splitPointsEntity_2.add(new PartitionKey(new Object[] { 100L }));
    splitPointsEntity_2.add(new PartitionKey(new Object[] { 1000L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY_2, new GroupPartitioner(TestGroups.ENTITY_2, splitPointsEntity_2));
    elements.add(createEntityForEntityGroup_2(5L));
    elements.add(createEntityForEntityGroup_2(100L));
    elements.add(createEntityForEntityGroup_2(200L));
    elements.add(createEntityForEntityGroup_2(1000L));
    elements.add(createEntityForEntityGroup_2(5000L));
    // TestGroups.EDGE, split points are [1000L, 200L, true] and [1000L, 30000L, false]. Create data with
    // SOURCE   DESTINATION    DIRECTED
    // 5L          5000L       true
    // 5L           200L       false
    // 1000L         90L       true
    // 1000L      10000L       false
    // 1000L      30000L       false
    // 1000L     300000L       true
    // 10000L       400L       false
    final List<PartitionKey> splitPointsEdge = new ArrayList<>();
    splitPointsEdge.add(new PartitionKey(new Object[] { 1000L, 200L, true }));
    splitPointsEdge.add(new PartitionKey(new Object[] { 1000L, 30000L, false }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsEdge));
    final List<PartitionKey> splitPointsReversedEdge = new ArrayList<>();
    splitPointsReversedEdge.add(new PartitionKey(new Object[] { 100L, 1000L, true }));
    splitPointsReversedEdge.add(new PartitionKey(new Object[] { 300L, 2000L, false }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsReversedEdge));
    elements.add(createEdgeForEdgeGroup(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup(5L, 200L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 90L, true));
    elements.add(createEdgeForEdgeGroup(1000L, 10000L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 30000L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 300000L, true));
    elements.add(createEdgeForEdgeGroup(10000L, 400L, false));
    // TestGroups.EDGE_2, split points are [10L, 2000L, true] and [100L, 1000L, false]. Create data with
    // SOURCE   DESTINATION    DIRECTED
    // 5L          5000L       true
    // 10L         2000L       false
    // 10L         2000L       true
    // 10L         3000L       false
    // 100L        1000L       false
    // 100L        3000L       false
    // 100L        3000L       true
    final List<PartitionKey> splitPointsEdge_2 = new ArrayList<>();
    splitPointsEdge_2.add(new PartitionKey(new Object[] { 10L, 2000L, true }));
    splitPointsEdge_2.add(new PartitionKey(new Object[] { 100L, 1000L, false }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsEdge_2));
    final List<PartitionKey> splitPointsReversedEdge_2 = new ArrayList<>();
    splitPointsReversedEdge_2.add(new PartitionKey(new Object[] { 1000L, 1500L, true }));
    splitPointsReversedEdge_2.add(new PartitionKey(new Object[] { 2000L, 2500L, false }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsReversedEdge_2));
    elements.add(createEdgeForEdgeGroup_2(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup_2(10L, 2000L, false));
    elements.add(createEdgeForEdgeGroup_2(10L, 2000L, true));
    elements.add(createEdgeForEdgeGroup_2(10L, 3000L, false));
    elements.add(createEdgeForEdgeGroup_2(100L, 1000L, false));
    elements.add(createEdgeForEdgeGroup_2(100L, 3000L, false));
    elements.add(createEdgeForEdgeGroup_2(100L, 3000L, true));
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionId = (group, partitionId) -> tempFilesDir + "/GROUP=" + group + "/split-" + partitionId;
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionIdForReversedEdge = (group, partitionId) -> tempFilesDir + "/REVERSED-GROUP=" + group + "/split-" + partitionId;
    final WriteUnsortedData writeUnsortedData = new WriteUnsortedData(tempFilesDir, CompressionCodecName.GZIP, schemaUtils, graphPartitioner, fileNameForGroupAndPartitionId, fileNameForGroupAndPartitionIdForReversedEdge);
    // When
    writeUnsortedData.writeElements(elements);
    // Then
    // - For each group, directories split-0, split-1 and split-2 should exist and each contain one file
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-2", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-2", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-2", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-2", 1);
    // - The reversed-edge directories should also exist with one file each
    //   (consistent with testOneSplitPointCase; every reversed partition above
    //   receives at least one edge).
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-2", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-2", 1);
    // - Each split file should contain the data for that split in the order it was written
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, true, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, true, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-2", group, true, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), null, elements, schemaUtils);
    }
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, false, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-0", group, false, true, null, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, false, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-1", group, false, true, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(1), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-2", group, false, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), null, elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-2", group, false, true, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(1), null, elements, schemaUtils);
    }
}
Also used : GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) Assertions.fail(org.junit.jupiter.api.Assertions.fail) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) Date(java.util.Date) BiFunction(java.util.function.BiFunction) SerialisationException(uk.gov.gchq.gaffer.exception.SerialisationException) SimpleDateFormat(java.text.SimpleDateFormat) GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) FileStatus(org.apache.hadoop.fs.FileStatus) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) FreqMap(uk.gov.gchq.gaffer.types.FreqMap) BeforeAll(org.junit.jupiter.api.BeforeAll) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) Edge(uk.gov.gchq.gaffer.data.element.Edge) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) ParseException(java.text.ParseException) TestGroups(uk.gov.gchq.gaffer.commonutil.TestGroups) ParquetElementReader(uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader) TimeZone(java.util.TimeZone) ParquetReader(org.apache.parquet.hadoop.ParquetReader) IOException(java.io.IOException) Entity(uk.gov.gchq.gaffer.data.element.Entity) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) DataGen(uk.gov.gchq.gaffer.parquetstore.testutils.DataGen) List(java.util.List) WriteUnsortedData(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.WriteUnsortedData) TempDir(org.junit.jupiter.api.io.TempDir) TestUtils(uk.gov.gchq.gaffer.parquetstore.testutils.TestUtils) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) OperationException(uk.gov.gchq.gaffer.operation.OperationException) 
WriteUnsortedData(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.WriteUnsortedData) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) HashSet(java.util.HashSet) Test(org.junit.jupiter.api.Test)

Example 9 with PartitionKey

use of uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey in project Gaffer by gchq.

The following example is taken from the class WriteUnsortedDataTest, method testOneSplitPointCase.

@Test
public void testOneSplitPointCase(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
    // Given
    final String tempFilesDir = tempDir.toAbsolutePath().toString();
    final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    final List<Element> elements = new ArrayList<>();
    // TestGroups.ENTITY, split point is 10L. Create data with
    // VERTEX
    // 5L
    // 10L
    // 10L
    // 10L
    // 20L
    final List<PartitionKey> splitPointsEntity = new ArrayList<>();
    splitPointsEntity.add(new PartitionKey(new Object[] { 10L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY, new GroupPartitioner(TestGroups.ENTITY, splitPointsEntity));
    elements.add(createEntityForEntityGroup(5L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(20L));
    // TestGroups.ENTITY_2, split point is 100L. Create data with
    // VERTEX
    // 5L
    // 100L
    // 1000L
    final List<PartitionKey> splitPointsEntity_2 = new ArrayList<>();
    splitPointsEntity_2.add(new PartitionKey(new Object[] { 100L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY_2, new GroupPartitioner(TestGroups.ENTITY_2, splitPointsEntity_2));
    elements.add(createEntityForEntityGroup_2(5L));
    elements.add(createEntityForEntityGroup_2(100L));
    elements.add(createEntityForEntityGroup_2(1000L));
    // TestGroups.EDGE, split point is [1000L, 200L, true]. Create data with
    // SOURCE   DESTINATION    DIRECTED
    // 5L         5000L        true
    // 5L         200L         false
    // 1000L         100L         true
    // 1000L         200L         false
    // 1000L         200L         true
    // 1000L         300L         true
    // 10000L         400L         false
    // 10000L         400L         true
    final List<PartitionKey> splitPointsEdge = new ArrayList<>();
    splitPointsEdge.add(new PartitionKey(new Object[] { 1000L, 200L, true }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsEdge));
    final List<PartitionKey> splitPointsReversedEdge = new ArrayList<>();
    splitPointsReversedEdge.add(new PartitionKey(new Object[] { 1000L, 300L, true }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsReversedEdge));
    elements.add(createEdgeForEdgeGroup(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup(5L, 200L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 100L, true));
    elements.add(createEdgeForEdgeGroup(1000L, 200L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 200L, true));
    elements.add(createEdgeForEdgeGroup(1000L, 300L, true));
    elements.add(createEdgeForEdgeGroup(10000L, 400L, false));
    elements.add(createEdgeForEdgeGroup(10000L, 400L, true));
    // TestGroups.EDGE_2, split point is [10L, 2000L, true]. Create data
    // mirroring the TestGroups.EDGE data above:
    // SOURCE   DESTINATION    DIRECTED
    // 5L         5000L        true
    // 5L          200L        false
    // 1000L       100L        true
    // 1000L       200L        false
    // 1000L       200L        true
    // 1000L       300L        true
    // 10000L      400L        false
    // 10000L      400L        true
    final List<PartitionKey> splitPointsEdge_2 = new ArrayList<>();
    splitPointsEdge_2.add(new PartitionKey(new Object[] { 10L, 2000L, true }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsEdge_2));
    final List<PartitionKey> splitPointsReversedEdge_2 = new ArrayList<>();
    splitPointsReversedEdge_2.add(new PartitionKey(new Object[] { 3000L, 20L, true }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsReversedEdge_2));
    elements.add(createEdgeForEdgeGroup_2(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup_2(5L, 200L, false));
    elements.add(createEdgeForEdgeGroup_2(1000L, 100L, true));
    elements.add(createEdgeForEdgeGroup_2(1000L, 200L, false));
    elements.add(createEdgeForEdgeGroup_2(1000L, 200L, true));
    elements.add(createEdgeForEdgeGroup_2(1000L, 300L, true));
    elements.add(createEdgeForEdgeGroup_2(10000L, 400L, false));
    elements.add(createEdgeForEdgeGroup_2(10000L, 400L, true));
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionId = (group, partitionId) -> tempFilesDir + "/GROUP=" + group + "/split-" + partitionId;
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionIdForReversedEdge = (group, partitionId) -> tempFilesDir + "/REVERSED-GROUP=" + group + "/split-" + partitionId;
    final WriteUnsortedData writeUnsortedData = new WriteUnsortedData(tempFilesDir, CompressionCodecName.GZIP, schemaUtils, graphPartitioner, fileNameForGroupAndPartitionId, fileNameForGroupAndPartitionIdForReversedEdge);
    // When
    writeUnsortedData.writeElements(elements);
    // Then
    // - For each group, directories split0 and split1 should exist and each contain one file
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-1", 1);
    // - Each split file should contain the data for that split in the order it was written
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, true, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, true, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), null, elements, schemaUtils);
    }
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, false, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-0", group, false, true, null, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, false, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), null, elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-1", group, false, true, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), null, elements, schemaUtils);
    }
}
Also used : GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) Assertions.fail(org.junit.jupiter.api.Assertions.fail) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) Date(java.util.Date) BiFunction(java.util.function.BiFunction) SerialisationException(uk.gov.gchq.gaffer.exception.SerialisationException) SimpleDateFormat(java.text.SimpleDateFormat) GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) FileStatus(org.apache.hadoop.fs.FileStatus) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) FreqMap(uk.gov.gchq.gaffer.types.FreqMap) BeforeAll(org.junit.jupiter.api.BeforeAll) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) Edge(uk.gov.gchq.gaffer.data.element.Edge) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) ParseException(java.text.ParseException) TestGroups(uk.gov.gchq.gaffer.commonutil.TestGroups) ParquetElementReader(uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader) TimeZone(java.util.TimeZone) ParquetReader(org.apache.parquet.hadoop.ParquetReader) IOException(java.io.IOException) Entity(uk.gov.gchq.gaffer.data.element.Entity) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) DataGen(uk.gov.gchq.gaffer.parquetstore.testutils.DataGen) List(java.util.List) WriteUnsortedData(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.WriteUnsortedData) TempDir(org.junit.jupiter.api.io.TempDir) TestUtils(uk.gov.gchq.gaffer.parquetstore.testutils.TestUtils) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) OperationException(uk.gov.gchq.gaffer.operation.OperationException) 
WriteUnsortedData(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.WriteUnsortedData) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) HashSet(java.util.HashSet) Test(org.junit.jupiter.api.Test)

Example 10 with PartitionKey

use of uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey in project Gaffer by gchq.

The following example is taken from the class GraphPartitionerSerialiserTest, method shouldGroupMultiplePartitionKeysAndSerialiseCorrectly.

@Test
public void shouldGroupMultiplePartitionKeysAndSerialiseCorrectly(@TempDir Path tempDir) throws IOException {
    // Given: three group partitioners, each built from two multi-typed split
    // points (Long, Integer, String, Float, Short, Byte, byte[]).
    final Object[] key1 = new Object[] { 1L, 5, "ABC", 10F, (short) 1, (byte) 64, new byte[] { (byte) 1, (byte) 2, (byte) 3 } };
    final PartitionKey partitionKey1 = new PartitionKey(key1);
    final Object[] key2 = new Object[] { 100L, 500, "XYZ", 1000F, (short) 3, (byte) 55, new byte[] { (byte) 10, (byte) 9, (byte) 8, (byte) 7 } };
    final PartitionKey partitionKey2 = new PartitionKey(key2);
    final List<PartitionKey> splitPoints1 = new ArrayList<>();
    splitPoints1.add(partitionKey1);
    splitPoints1.add(partitionKey2);
    final GroupPartitioner groupPartitioner1 = new GroupPartitioner("GROUP", splitPoints1);
    final Object[] key3 = new Object[] { 1000L, 5000, "ABCDEF", 10000F, (short) 19, (byte) 20, new byte[] { (byte) 4, (byte) 5, (byte) 6 } };
    final PartitionKey partitionKey3 = new PartitionKey(key3);
    final Object[] key4 = new Object[] { 100000L, 500000, "XYZZZZ", 100000F, (short) 32, (byte) 58, new byte[] { (byte) 20, (byte) 29, (byte) 28, (byte) 27 } };
    final PartitionKey partitionKey4 = new PartitionKey(key4);
    final List<PartitionKey> splitPoints2 = new ArrayList<>();
    splitPoints2.add(partitionKey3);
    splitPoints2.add(partitionKey4);
    final GroupPartitioner groupPartitioner2 = new GroupPartitioner("GROUP2", splitPoints2);
    final Object[] key5 = new Object[] { 10000000L, 5000000, "ABCDEFGHI", 100000F, (short) 21, (byte) 30, new byte[] { (byte) 10, (byte) 11, (byte) 12 } };
    final PartitionKey partitionKey5 = new PartitionKey(key5);
    final Object[] key6 = new Object[] { 100000000L, 5000, "ABCDEF", 10000F, (short) 19, (byte) 33, new byte[] { (byte) 13, (byte) 14, (byte) 15 } };
    final PartitionKey partitionKey6 = new PartitionKey(key6);
    final List<PartitionKey> splitPoints3 = new ArrayList<>();
    splitPoints3.add(partitionKey5);
    splitPoints3.add(partitionKey6);
    final GroupPartitioner groupPartitioner3 = new GroupPartitioner("GROUP1", splitPoints3);
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    // NOTE(review): groupPartitioner1 was constructed with group name "GROUP"
    // but is registered here under "GROUP1" — looks like a stale name; confirm
    // whether the serialiser round-trips the partitioner's own group name.
    graphPartitioner.addGroupPartitioner("GROUP1", groupPartitioner1);
    graphPartitioner.addGroupPartitioner("GROUP2", groupPartitioner2);
    graphPartitioner.addGroupPartitionerForReversedEdges("GROUP1", groupPartitioner3);
    final GraphPartitionerSerialiser serialiser = new GraphPartitionerSerialiser();
    // When: serialise to a temp file, then deserialise it. try-with-resources
    // guarantees the streams are closed even if write/read throws (the original
    // leaked them on failure).
    final String filename = tempDir.resolve("test").toString();
    try (DataOutputStream dos = new DataOutputStream(new FileOutputStream(filename))) {
        serialiser.write(graphPartitioner, dos);
    }
    final GraphPartitioner readGraphPartitioner;
    try (DataInputStream dis = new DataInputStream(new FileInputStream(filename))) {
        readGraphPartitioner = serialiser.read(dis);
    }
    // Then: the round-tripped partitioner equals the original.
    assertEquals(graphPartitioner, readGraphPartitioner);
}
Also used : GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) DataOutputStream(java.io.DataOutputStream) ArrayList(java.util.ArrayList) DataInputStream(java.io.DataInputStream) FileInputStream(java.io.FileInputStream) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) FileOutputStream(java.io.FileOutputStream) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) Test(org.junit.jupiter.api.Test)

Aggregations

PartitionKey (uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey)12 Test (org.junit.jupiter.api.Test)8 ArrayList (java.util.ArrayList)6 GroupPartitioner (uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner)6 DataInputStream (java.io.DataInputStream)5 DataOutputStream (java.io.DataOutputStream)5 FileInputStream (java.io.FileInputStream)5 FileOutputStream (java.io.FileOutputStream)5 Element (uk.gov.gchq.gaffer.data.element.Element)5 GraphPartitioner (uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner)5 FileSystem (org.apache.hadoop.fs.FileSystem)4 Path (org.apache.hadoop.fs.Path)4 IOException (java.io.IOException)3 Arrays (java.util.Arrays)3 List (java.util.List)3 Configuration (org.apache.hadoop.conf.Configuration)3 FileStatus (org.apache.hadoop.fs.FileStatus)3 ParquetReader (org.apache.parquet.hadoop.ParquetReader)3 Edge (uk.gov.gchq.gaffer.data.element.Edge)3 ParquetElementReader (uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader)3