
Example 1 with SchemaUtils

Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in project Gaffer by gchq.

In the class CalculatePartitionerTest, the method calculatePartitionerTest:

@Test
public void calculatePartitionerTest(@TempDir java.nio.file.Path tempDir) throws IOException {
    // Given
    final FileSystem fs = FileSystem.get(new Configuration());
    final Schema schema = getSchema();
    final SchemaUtils schemaUtils = new SchemaUtils(schema);
    final String topLevelFolder = tempDir.toString();
    writeData(topLevelFolder, schemaUtils);
    // When
    // - Calculate partitioner from files
    final GraphPartitioner actual = new CalculatePartitioner(new Path(topLevelFolder), schema, fs).call();
    // - Manually create the correct partitioner
    final GraphPartitioner expected = new GraphPartitioner();
    final List<PartitionKey> splitPointsEntity = new ArrayList<>();
    for (int i = 1; i < 10; i++) {
        splitPointsEntity.add(new PartitionKey(new Object[] { 10L * i }));
    }
    final GroupPartitioner groupPartitionerEntity = new GroupPartitioner(TestGroups.ENTITY, splitPointsEntity);
    expected.addGroupPartitioner(TestGroups.ENTITY, groupPartitionerEntity);
    final GroupPartitioner groupPartitionerEntity2 = new GroupPartitioner(TestGroups.ENTITY_2, splitPointsEntity);
    expected.addGroupPartitioner(TestGroups.ENTITY_2, groupPartitionerEntity2);
    final List<PartitionKey> splitPointsEdge = new ArrayList<>();
    for (int i = 1; i < 10; i++) {
        splitPointsEdge.add(new PartitionKey(new Object[] { 10L * i, 10L * i + 1, true }));
    }
    final GroupPartitioner groupPartitionerEdge = new GroupPartitioner(TestGroups.EDGE, splitPointsEdge);
    expected.addGroupPartitioner(TestGroups.EDGE, groupPartitionerEdge);
    final GroupPartitioner groupPartitionerEdge2 = new GroupPartitioner(TestGroups.EDGE_2, splitPointsEdge);
    expected.addGroupPartitioner(TestGroups.EDGE_2, groupPartitionerEdge2);
    final List<PartitionKey> splitPointsReversedEdge = new ArrayList<>();
    for (int i = 1; i < 10; i++) {
        splitPointsReversedEdge.add(new PartitionKey(new Object[] { 10L * i + 1, 10L * i, true }));
    }
    final GroupPartitioner reversedGroupPartitionerEdge = new GroupPartitioner(TestGroups.EDGE, splitPointsReversedEdge);
    expected.addGroupPartitionerForReversedEdges(TestGroups.EDGE, reversedGroupPartitionerEdge);
    final GroupPartitioner reversedGroupPartitionerEdge2 = new GroupPartitioner(TestGroups.EDGE_2, splitPointsReversedEdge);
    expected.addGroupPartitionerForReversedEdges(TestGroups.EDGE_2, reversedGroupPartitionerEdge2);
    // Then
    assertEquals(expected, actual);
}
Also used : Path(org.apache.hadoop.fs.Path) GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) Configuration(org.apache.hadoop.conf.Configuration) Schema(uk.gov.gchq.gaffer.store.schema.Schema) ArrayList(java.util.ArrayList) SchemaUtils(uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) FileSystem(org.apache.hadoop.fs.FileSystem) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) Test(org.junit.jupiter.api.Test)
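
The split points above act as partition boundaries: each partition holds the keys that sort between one split point and the next, so the nine split points per group yield ten partitions. A minimal, self-contained sketch of that lookup for single-long entity keys (the rule for keys equal to a split point is an assumption here, not taken from Gaffer's PartitionKey code):

import java.util.Arrays;
import java.util.List;

public class SplitPointLookupSketch {

    // Returns the partition id for a key given ascending split points,
    // mirroring how split points 10, 20, ..., 90 divide the key space
    // into 10 partitions in the test above.
    static int partitionFor(final long key, final List<Long> splitPoints) {
        int partition = 0;
        for (final long splitPoint : splitPoints) {
            if (key < splitPoint) {
                return partition;
            }
            partition++;
        }
        // Key is >= the last split point, so it falls in the final partition
        return partition;
    }

    public static void main(final String[] args) {
        final List<Long> splitPoints = Arrays.asList(10L, 20L, 30L, 40L, 50L, 60L, 70L, 80L, 90L);
        System.out.println(partitionFor(5L, splitPoints));  // 0
        System.out.println(partitionFor(10L, splitPoints)); // 1 (assumed boundary rule)
        System.out.println(partitionFor(95L, splitPoints)); // 9
    }
}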

Example 2 with SchemaUtils

Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in project Gaffer by gchq.

In the class QueryGeneratorTest, the method testQueryGeneratorForGetAllElements:

@Test
public void testQueryGeneratorForGetAllElements(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
    // Given
    // - Create snapshot folder
    final String folder = String.format("file:///%s", tempDir.toString());
    final String snapshotFolder = folder + "/" + ParquetStore.getSnapshotPath(1000L);
    // - Write out Parquet files so we know the partitioning
    CalculatePartitionerTest.writeData(snapshotFolder, new SchemaUtils(schema));
    // - Initialise store
    final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
    storeProperties.setDataDir(folder);
    storeProperties.setTempFilesDir(folder + "/tmpdata");
    final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);
    // When 1 - no view
    GetAllElements getAllElements = new GetAllElements.Builder().build();
    ParquetQuery query = new QueryGenerator(store).getParquetQuery(getAllElements);
    // Then 1
    final List expected = new ArrayList<>();
    for (final String group : Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2, TestGroups.EDGE, TestGroups.EDGE_2)) {
        final Path groupFolderPath = store.getGroupPath(group);
        for (int partition = 0; partition < 10; partition++) {
            final Path pathForPartitionFile = new Path(groupFolderPath, ParquetStore.getFile(partition));
            expected.add(new ParquetFileQuery(pathForPartitionFile, null, true));
        }
    }
    assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray());
    // When 2 - simple view that restricts to one group
    getAllElements = new GetAllElements.Builder().view(new View.Builder().edge(TestGroups.EDGE).build()).build();
    query = new QueryGenerator(store).getParquetQuery(getAllElements);
    // Then 2
    expected.clear();
    Path groupFolderPath = store.getGroupPath(TestGroups.EDGE);
    for (int partition = 0; partition < 10; partition++) {
        final Path pathForPartitionFile = new Path(groupFolderPath, ParquetStore.getFile(partition));
        expected.add(new ParquetFileQuery(pathForPartitionFile, null, true));
    }
    assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray());
    // When 3 - view with filter that can be pushed down to Parquet
    getAllElements = new GetAllElements.Builder().view(new View.Builder().edge(TestGroups.EDGE, new ViewElementDefinition.Builder().preAggregationFilter(new ElementFilter.Builder().select("count").execute(new IsMoreThan(10)).build()).build()).build()).build();
    query = new QueryGenerator(store).getParquetQuery(getAllElements);
    // Then 3
    expected.clear();
    for (int partition = 0; partition < 10; partition++) {
        final Path pathForPartitionFile = new Path(groupFolderPath, ParquetStore.getFile(partition));
        expected.add(new ParquetFileQuery(pathForPartitionFile, gt(FilterApi.intColumn("count"), 10), true));
    }
    assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray());
    // When 4 - view with filter that can't be pushed down to Parquet
    getAllElements = new GetAllElements.Builder().view(new View.Builder().edge(TestGroups.EDGE, new ViewElementDefinition.Builder().preAggregationFilter(new ElementFilter.Builder().select("count").execute(new IsEvenFilter()).build()).build()).build()).build();
    query = new QueryGenerator(store).getParquetQuery(getAllElements);
    // Then 4
    expected.clear();
    for (int partition = 0; partition < 10; partition++) {
        final Path pathForPartitionFile = new Path(groupFolderPath, ParquetStore.getFile(partition));
        expected.add(new ParquetFileQuery(pathForPartitionFile, null, false));
    }
    assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray());
    // When 5 - view with one filter that can be pushed down and one that can't
    getAllElements = new GetAllElements.Builder().view(new View.Builder().edge(TestGroups.EDGE, new ViewElementDefinition.Builder().preAggregationFilter(new ElementFilter.Builder().select("count").execute(new IsEvenFilter()).select("count").execute(new IsMoreThan(10)).build()).build()).build()).build();
    query = new QueryGenerator(store).getParquetQuery(getAllElements);
    // Then 5
    expected.clear();
    for (int partition = 0; partition < 10; partition++) {
        final Path pathForPartitionFile = new Path(groupFolderPath, ParquetStore.getFile(partition));
        expected.add(new ParquetFileQuery(pathForPartitionFile, gt(FilterApi.intColumn("count"), 10), false));
    }
    assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray());
}
Also used : ParquetStore(uk.gov.gchq.gaffer.parquetstore.ParquetStore) Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) ViewElementDefinition(uk.gov.gchq.gaffer.data.elementdefinition.view.ViewElementDefinition) View(uk.gov.gchq.gaffer.data.elementdefinition.view.View) SchemaUtils(uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils) ParquetStoreProperties(uk.gov.gchq.gaffer.parquetstore.ParquetStoreProperties) GetAllElements(uk.gov.gchq.gaffer.operation.impl.get.GetAllElements) List(java.util.List) IsMoreThan(uk.gov.gchq.koryphe.impl.predicate.IsMoreThan) CalculatePartitionerTest(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.CalculatePartitionerTest) LongVertexOperationsTest(uk.gov.gchq.gaffer.parquetstore.operation.handler.LongVertexOperationsTest) Test(org.junit.jupiter.api.Test)
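
The predicates that get pushed down, such as gt(FilterApi.intColumn("count"), 10), are ordinary Parquet FilterPredicate objects built with org.apache.parquet.filter2.predicate.FilterApi. A minimal sketch of composing one outside Gaffer (the column names "count" and "src" are illustrative assumptions, not taken from the test):

import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;

import static org.apache.parquet.filter2.predicate.FilterApi.and;
import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.gt;

public class FilterPredicateSketch {
    public static void main(final String[] args) {
        // count > 10 AND src == 0L: the same shape of predicate that the
        // tests expect to be attached to a ParquetFileQuery.
        final FilterPredicate predicate = and(
                gt(FilterApi.intColumn("count"), 10),
                eq(FilterApi.longColumn("src"), 0L));
        System.out.println(predicate);
    }
}

A predicate like this can be applied by a Parquet reader at row-group and page level, which is why a filter that can be pushed down (When 3 above) is cheaper than one that must run after deserialisation (When 4).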

Example 3 with SchemaUtils

Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in project Gaffer by gchq.

In the class QueryGeneratorTest, the method testQueryGeneratorForGetElementsWithEntitySeeds:

@Test
public void testQueryGeneratorForGetElementsWithEntitySeeds(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
    // Given
    // - Create snapshot folder
    final String folder = String.format("file:///%s", tempDir.toString());
    final String snapshotFolder = folder + "/" + ParquetStore.getSnapshotPath(1000L);
    // - Write out Parquet files so we know the partitioning
    CalculatePartitionerTest.writeData(snapshotFolder, new SchemaUtils(schema));
    // - Initialise store
    final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
    storeProperties.setDataDir(folder);
    storeProperties.setTempFilesDir(folder + "/tmpdata");
    final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);
    // When 1 - no view, query for vertex 0
    GetElements getElements = new GetElements.Builder().input(new EntitySeed(0L)).seedMatching(SeedMatching.SeedMatchingType.RELATED).build();
    ParquetQuery query = new QueryGenerator(store).getParquetQuery(getElements);
    // Then 1
    final List expected = new ArrayList<>();
    final FilterPredicate vertex0 = eq(FilterApi.longColumn(ParquetStore.VERTEX), 0L);
    final FilterPredicate source0 = eq(FilterApi.longColumn(ParquetStore.SOURCE), 0L);
    final FilterPredicate destination0 = eq(FilterApi.longColumn(ParquetStore.DESTINATION), 0L);
    for (final String group : Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2)) {
        final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, false));
        final Path pathForPartitionFile = new Path(groupFolderPath, ParquetStore.getFile(0));
        expected.add(new ParquetFileQuery(pathForPartitionFile, vertex0, true));
    }
    for (final String group : Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2)) {
        final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, false));
        final Path pathForPartitionFile = new Path(groupFolderPath, ParquetStore.getFile(0));
        expected.add(new ParquetFileQuery(pathForPartitionFile, source0, true));
        final Path reversedGroupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, true));
        final Path pathForReversedPartitionFile = new Path(reversedGroupFolderPath, ParquetStore.getFile(0));
        expected.add(new ParquetFileQuery(pathForReversedPartitionFile, destination0, true));
    }
    assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray());
    // When 2 - no view, query for vertices 0 and 1000000
    getElements = new GetElements.Builder().input(new EntitySeed(0L), new EntitySeed(1000000L)).seedMatching(SeedMatching.SeedMatchingType.RELATED).build();
    query = new QueryGenerator(store).getParquetQuery(getElements);
    // Then 2
    expected.clear();
    final FilterPredicate vertex1000000 = eq(FilterApi.longColumn(ParquetStore.VERTEX), 1000000L);
    final FilterPredicate source1000000 = eq(FilterApi.longColumn(ParquetStore.SOURCE), 1000000L);
    final FilterPredicate destination1000000 = eq(FilterApi.longColumn(ParquetStore.DESTINATION), 1000000L);
    for (final String group : Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2)) {
        final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, false));
        final Path pathForPartitionFile1 = new Path(groupFolderPath, ParquetStore.getFile(0));
        expected.add(new ParquetFileQuery(pathForPartitionFile1, vertex0, true));
        final Path pathForPartitionFile2 = new Path(groupFolderPath, ParquetStore.getFile(9));
        expected.add(new ParquetFileQuery(pathForPartitionFile2, vertex1000000, true));
    }
    for (final String group : Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2)) {
        final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, false));
        final Path reversedGroupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, true));
        // Partition 0, vertex 0L
        final Path pathForPartitionFile1 = new Path(groupFolderPath, ParquetStore.getFile(0));
        expected.add(new ParquetFileQuery(pathForPartitionFile1, source0, true));
        // Partition 9, vertex 1000000L
        final Path pathForPartitionFile2 = new Path(groupFolderPath, ParquetStore.getFile(9));
        expected.add(new ParquetFileQuery(pathForPartitionFile2, source1000000, true));
        // Partition 0 of reversed, vertex 0L
        final Path pathForPartitionFile3 = new Path(reversedGroupFolderPath, ParquetStore.getFile(0));
        expected.add(new ParquetFileQuery(pathForPartitionFile3, destination0, true));
        // Partition 9 of reversed, vertex 1000000L
        final Path pathForPartitionFile4 = new Path(reversedGroupFolderPath, ParquetStore.getFile(9));
        expected.add(new ParquetFileQuery(pathForPartitionFile4, destination1000000, true));
    }
    assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray());
    // When 3 - view with filter that can be pushed down to Parquet, query for vertices 0 and 1000000
    getElements = new GetElements.Builder().input(new EntitySeed(0L), new EntitySeed(1000000L)).seedMatching(SeedMatching.SeedMatchingType.RELATED).view(new View.Builder().edge(TestGroups.EDGE, new ViewElementDefinition.Builder().preAggregationFilter(new ElementFilter.Builder().select("count").execute(new IsMoreThan(10)).build()).build()).build()).build();
    query = new QueryGenerator(store).getParquetQuery(getElements);
    // Then 3
    expected.clear();
    final FilterPredicate source0AndCount = and(gt(FilterApi.intColumn("count"), 10), eq(FilterApi.longColumn(ParquetStore.SOURCE), 0L));
    final FilterPredicate source1000000AndCount = and(gt(FilterApi.intColumn("count"), 10), eq(FilterApi.longColumn(ParquetStore.SOURCE), 1000000L));
    final FilterPredicate destination0AndCount = and(gt(FilterApi.intColumn("count"), 10), eq(FilterApi.longColumn(ParquetStore.DESTINATION), 0L));
    final FilterPredicate destination1000000AndCount = and(gt(FilterApi.intColumn("count"), 10), eq(FilterApi.longColumn(ParquetStore.DESTINATION), 1000000L));
    final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(TestGroups.EDGE, false));
    final Path reversedGroupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(TestGroups.EDGE, true));
    // Partition 0, vertex 0L
    final Path pathForPartitionFile1 = new Path(groupFolderPath, ParquetStore.getFile(0));
    expected.add(new ParquetFileQuery(pathForPartitionFile1, source0AndCount, true));
    // Partition 9, vertex 1000000L
    final Path pathForPartitionFile2 = new Path(groupFolderPath, ParquetStore.getFile(9));
    expected.add(new ParquetFileQuery(pathForPartitionFile2, source1000000AndCount, true));
    // Partition 0 of reversed, vertex 0L
    final Path pathForPartitionFile3 = new Path(reversedGroupFolderPath, ParquetStore.getFile(0));
    expected.add(new ParquetFileQuery(pathForPartitionFile3, destination0AndCount, true));
    // Partition 9 of reversed, vertex 1000000L
    final Path pathForPartitionFile4 = new Path(reversedGroupFolderPath, ParquetStore.getFile(9));
    expected.add(new ParquetFileQuery(pathForPartitionFile4, destination1000000AndCount, true));
    assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray());
    // When 4 - view with filter that can't be pushed down to Parquet, query for vertices 0 and 1000000
    getElements = new GetElements.Builder().input(new EntitySeed(0L), new EntitySeed(1000000L)).seedMatching(SeedMatching.SeedMatchingType.RELATED).view(new View.Builder().edge(TestGroups.EDGE, new ViewElementDefinition.Builder().preAggregationFilter(new ElementFilter.Builder().select("count").execute(new IsEvenFilter()).build()).build()).build()).build();
    query = new QueryGenerator(store).getParquetQuery(getElements);
    // Then 4
    expected.clear();
    // Partition 0, vertex 0L
    expected.add(new ParquetFileQuery(pathForPartitionFile1, source0, false));
    // Partition 9, vertex 1000000L
    expected.add(new ParquetFileQuery(pathForPartitionFile2, source1000000, false));
    // Partition 0 of reversed, vertex 0L
    expected.add(new ParquetFileQuery(pathForPartitionFile3, destination0, false));
    // Partition 9 of reversed, vertex 1000000L
    expected.add(new ParquetFileQuery(pathForPartitionFile4, destination1000000, false));
    assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray());
}
Also used : ParquetStore(uk.gov.gchq.gaffer.parquetstore.ParquetStore) Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) GetElements(uk.gov.gchq.gaffer.operation.impl.get.GetElements) ViewElementDefinition(uk.gov.gchq.gaffer.data.elementdefinition.view.ViewElementDefinition) View(uk.gov.gchq.gaffer.data.elementdefinition.view.View) SchemaUtils(uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils) ParquetStoreProperties(uk.gov.gchq.gaffer.parquetstore.ParquetStoreProperties) EntitySeed(uk.gov.gchq.gaffer.operation.data.EntitySeed) List(java.util.List) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) IsMoreThan(uk.gov.gchq.koryphe.impl.predicate.IsMoreThan) CalculatePartitionerTest(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.CalculatePartitionerTest) LongVertexOperationsTest(uk.gov.gchq.gaffer.parquetstore.operation.handler.LongVertexOperationsTest) Test(org.junit.jupiter.api.Test)

Example 4 with SchemaUtils

Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in project Gaffer by gchq.

In the class QueryGeneratorTest, the method testQueryGeneratorForGetElementsWithEdgeSeeds:

@Test
public void testQueryGeneratorForGetElementsWithEdgeSeeds(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
    // Given
    // - Create snapshot folder
    final String folder = String.format("file:///%s", tempDir.toString());
    final String snapshotFolder = folder + "/" + ParquetStore.getSnapshotPath(1000L);
    // - Write out Parquet files so we know the partitioning
    CalculatePartitionerTest.writeData(snapshotFolder, new SchemaUtils(schema));
    // - Initialise store
    final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
    storeProperties.setDataDir(folder);
    storeProperties.setTempFilesDir(folder + "/tmpdata");
    final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);
    // When 1 - no view, query for edges 0->1, 10--1000, 10000--10 with seed matching type set to EQUAL
    GetElements getElements = new GetElements.Builder().input(new EdgeSeed(0L, 1L, DirectedType.DIRECTED), new EdgeSeed(10L, 1000L, DirectedType.UNDIRECTED), new EdgeSeed(10000L, 10L, DirectedType.EITHER)).seedMatching(SeedMatching.SeedMatchingType.EQUAL).build();
    ParquetQuery query = new QueryGenerator(store).getParquetQuery(getElements);
    // Then 1
    final List expected = new ArrayList<>();
    final FilterPredicate source0 = eq(FilterApi.longColumn(ParquetStore.SOURCE), 0L);
    final FilterPredicate source10 = eq(FilterApi.longColumn(ParquetStore.SOURCE), 10L);
    final FilterPredicate source10000 = eq(FilterApi.longColumn(ParquetStore.SOURCE), 10000L);
    final FilterPredicate destination1 = eq(FilterApi.longColumn(ParquetStore.DESTINATION), 1L);
    final FilterPredicate destination10 = eq(FilterApi.longColumn(ParquetStore.DESTINATION), 10L);
    final FilterPredicate destination1000 = eq(FilterApi.longColumn(ParquetStore.DESTINATION), 1000L);
    final FilterPredicate directedTrue = eq(FilterApi.booleanColumn(ParquetStore.DIRECTED), true);
    final FilterPredicate directedFalse = eq(FilterApi.booleanColumn(ParquetStore.DIRECTED), false);
    final FilterPredicate source0Destination1DirectedTrue = and(and(source0, destination1), directedTrue);
    final FilterPredicate source10Destination1000DirectedFalse = and(and(source10, destination1000), directedFalse);
    final FilterPredicate source10000Destination10DirectedEither = and(source10000, destination10);
    for (final String group : Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2)) {
        final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, false));
        // 0->1 partition 0 of forward
        final Path pathForPartition0File = new Path(groupFolderPath, ParquetStore.getFile(0));
        // NB: no need to look in the reversed directory
        expected.add(new ParquetFileQuery(pathForPartition0File, source0Destination1DirectedTrue, true));
        // 10--1000 partition 1 of forward
        final Path pathForPartition1File = new Path(groupFolderPath, ParquetStore.getFile(1));
        // NB: no need to look in the reversed directory
        expected.add(new ParquetFileQuery(pathForPartition1File, source10Destination1000DirectedFalse, true));
        // 10000--10 partition 9 of forward
        final Path pathForPartition9File = new Path(groupFolderPath, ParquetStore.getFile(9));
        // NB: no need to look in the reversed directory
        expected.add(new ParquetFileQuery(pathForPartition9File, source10000Destination10DirectedEither, true));
    }
    assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray());
    // When 2 - no view, query for edges 0->1, 10--1000, 10000--10 with seed matching type set to RELATED
    getElements = new GetElements.Builder().input(new EdgeSeed(0L, 1L, DirectedType.DIRECTED), new EdgeSeed(10L, 1000L, DirectedType.UNDIRECTED), new EdgeSeed(10000L, 10L, DirectedType.EITHER)).seedMatching(SeedMatching.SeedMatchingType.RELATED).build();
    query = new QueryGenerator(store).getParquetQuery(getElements);
    // Then 2
    expected.clear();
    final FilterPredicate vertex0 = eq(FilterApi.longColumn(ParquetStore.VERTEX), 0L);
    final FilterPredicate vertex1 = eq(FilterApi.longColumn(ParquetStore.VERTEX), 1L);
    final FilterPredicate vertex10 = eq(FilterApi.longColumn(ParquetStore.VERTEX), 10L);
    final FilterPredicate vertex1000 = eq(FilterApi.longColumn(ParquetStore.VERTEX), 1000L);
    final FilterPredicate vertex10000 = eq(FilterApi.longColumn(ParquetStore.VERTEX), 10000L);
    final FilterPredicate vertex0or1 = or(vertex0, vertex1);
    final FilterPredicate vertex10or1000 = or(vertex10, vertex1000);
    final FilterPredicate vertex10000or10 = or(vertex10000, vertex10);
    for (final String group : Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2)) {
        final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, false));
        // 0 and 1 in partition 0
        final Path pathForPartition0File = new Path(groupFolderPath, ParquetStore.getFile(0));
        expected.add(new ParquetFileQuery(pathForPartition0File, vertex0or1, true));
        // 10 or 1000 and 10000 or 10 in partition 1 (NB 1000 and 10000 don't appear in partition 1 but this doesn't cause any incorrect results, and will be fixed in later versions)
        final Path pathForPartition1File = new Path(groupFolderPath, ParquetStore.getFile(1));
        expected.add(new ParquetFileQuery(pathForPartition1File, or(vertex10or1000, vertex10000or10), true));
        // 10 or 1000 and 10000 or 10 in partition 9
        final Path pathForPartition9File = new Path(groupFolderPath, ParquetStore.getFile(9));
        expected.add(new ParquetFileQuery(pathForPartition9File, or(vertex10or1000, vertex10000or10), true));
    }
    for (final String group : Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2)) {
        final Path groupFolderPath = new Path(snapshotFolder, ParquetStore.getGroupSubDir(group, false));
        // 0->1 partition 0 of forward
        final Path pathForPartition0File = new Path(groupFolderPath, ParquetStore.getFile(0));
        // NB: no need to look in the reversed directory
        expected.add(new ParquetFileQuery(pathForPartition0File, source0Destination1DirectedTrue, true));
        // 10--1000 partition 1 of forward
        final Path pathForPartition1File = new Path(groupFolderPath, ParquetStore.getFile(1));
        // NB: no need to look in the reversed directory
        expected.add(new ParquetFileQuery(pathForPartition1File, source10Destination1000DirectedFalse, true));
        // 10000--10 partition 9 of forward
        final Path pathForPartition9File = new Path(groupFolderPath, ParquetStore.getFile(9));
        // NB: no need to look in the reversed directory
        expected.add(new ParquetFileQuery(pathForPartition9File, source10000Destination10DirectedEither, true));
    }
    assertThat(expected).containsOnly(query.getAllParquetFileQueries().toArray());
}
Also used : ParquetStore(uk.gov.gchq.gaffer.parquetstore.ParquetStore) Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) GetElements(uk.gov.gchq.gaffer.operation.impl.get.GetElements) SchemaUtils(uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils) ParquetStoreProperties(uk.gov.gchq.gaffer.parquetstore.ParquetStoreProperties) EdgeSeed(uk.gov.gchq.gaffer.operation.data.EdgeSeed) List(java.util.List) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) CalculatePartitionerTest(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.CalculatePartitionerTest) LongVertexOperationsTest(uk.gov.gchq.gaffer.parquetstore.operation.handler.LongVertexOperationsTest) Test(org.junit.jupiter.api.Test)
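
The EQUAL-matching branch above builds one predicate per edge seed: source and destination equality, plus a directed clause that is dropped when the seed's DirectedType is EITHER. A hedged sketch of that construction (my reading of the expected predicates in "Then 1", not a copy of QueryGenerator's internals; the column names "src", "dst" and "directed" are illustrative assumptions):

import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;

import uk.gov.gchq.gaffer.data.element.id.DirectedType;

import static org.apache.parquet.filter2.predicate.FilterApi.and;
import static org.apache.parquet.filter2.predicate.FilterApi.eq;

public class EdgeSeedPredicateSketch {

    static FilterPredicate edgeSeedPredicate(final long src, final long dst, final DirectedType directedType) {
        final FilterPredicate sourceAndDestination = and(
                eq(FilterApi.longColumn("src"), src),
                eq(FilterApi.longColumn("dst"), dst));
        if (DirectedType.EITHER == directedType) {
            // No constraint on the directed column, matching the shape of
            // source10000Destination10DirectedEither above.
            return sourceAndDestination;
        }
        return and(sourceAndDestination,
                eq(FilterApi.booleanColumn("directed"), DirectedType.DIRECTED == directedType));
    }

    public static void main(final String[] args) {
        System.out.println(edgeSeedPredicate(0L, 1L, DirectedType.DIRECTED));
        System.out.println(edgeSeedPredicate(10000L, 10L, DirectedType.EITHER));
    }
}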

Example 5 with SchemaUtils

Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in project Gaffer by gchq.

In the class AddElementsHandler, the method addElements:

private void addElements(final AddElements addElementsOperation, final Context context, final ParquetStore store) throws OperationException {
    // Set up
    final FileSystem fs = store.getFS();
    final Schema schema = store.getSchema();
    final SchemaUtils schemaUtils = store.getSchemaUtils();
    final SparkSession spark = SparkContextUtil.getSparkSession(context, store.getProperties());
    final ExecutorService threadPool = createThreadPool(spark, store.getProperties());
    final GraphPartitioner currentGraphPartitioner = store.getGraphPartitioner();
    SparkParquetUtils.configureSparkForAddElements(spark, store.getProperties());
    // Write data from addElementsOperation split by group and partition (NB this uses the existing partitioner -
    // adding elements using this operation does not affect the partitions).
    final String tmpDirectory = store.getTempFilesDir();
    final BiFunction<String, Integer, String> directoryForGroupAndPartitionId = (group, partitionId) -> tmpDirectory + "/unsorted_unaggregated_new" + "/group=" + group + "/partition=" + partitionId;
    final BiFunction<String, Integer, String> directoryForGroupAndPartitionIdForReversedEdges = (group, partitionId) -> tmpDirectory + "/unsorted_unaggregated_new" + "/reversed-group=" + group + "/partition=" + partitionId;
    LOGGER.info("Calling WriteUnsortedData to add elements");
    LOGGER.trace("currentGraphPartitioner is {}", currentGraphPartitioner);
    new WriteUnsortedData(store, currentGraphPartitioner, directoryForGroupAndPartitionId, directoryForGroupAndPartitionIdForReversedEdges).writeElements(addElementsOperation.getInput());
    // For every group and partition, aggregate the new data with the old data and then sort
    final BiFunction<String, Integer, String> directoryForSortedResultsForGroupAndPartitionId = (group, partitionId) -> tmpDirectory + "/sorted_new_old_merged" + "/group=" + group + "/partition=" + partitionId;
    final BiFunction<String, Integer, String> directoryForSortedResultsForGroupAndPartitionIdForReversedEdges = (group, partitionId) -> tmpDirectory + "/sorted_new_old_merged" + "/REVERSED-group=" + group + "/partition=" + partitionId;
    final List<Callable<CallableResult>> tasks = new ArrayList<>();
    for (final String group : schema.getGroups()) {
        final List<Partition> partitions = currentGraphPartitioner.getGroupPartitioner(group).getPartitions();
        for (final Partition partition : partitions) {
            final List<String> inputFiles = new ArrayList<>();
            // New data
            inputFiles.add(directoryForGroupAndPartitionId.apply(group, partition.getPartitionId()));
            // Old data
            inputFiles.add(store.getFile(group, partition));
            final String outputDir = directoryForSortedResultsForGroupAndPartitionId.apply(group, partition.getPartitionId());
            final AggregateAndSortData task = new AggregateAndSortData(schemaUtils, fs, inputFiles, outputDir, group, group + "-" + partition.getPartitionId(), false, store.getProperties().getCompressionCodecName(), spark);
            tasks.add(task);
            LOGGER.info("Created AggregateAndSortData task for group {}, partition {}", group, partition.getPartitionId());
        }
    }
    for (final String group : schema.getEdgeGroups()) {
        final List<Partition> partitions = currentGraphPartitioner.getGroupPartitionerForReversedEdges(group).getPartitions();
        for (final Partition partition : partitions) {
            final List<String> inputFiles = new ArrayList<>();
            // New data
            inputFiles.add(directoryForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId()));
            // Old data
            inputFiles.add(store.getFileForReversedEdges(group, partition));
            final String outputDir = directoryForSortedResultsForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId());
            final AggregateAndSortData task = new AggregateAndSortData(schemaUtils, fs, inputFiles, outputDir, group, "reversed-" + group + "-" + partition.getPartitionId(), true, store.getProperties().getCompressionCodecName(), spark);
            tasks.add(task);
            LOGGER.info("Created AggregateAndSortData task for reversed edge group {}, partition {}", group, partition.getPartitionId());
        }
    }
    try {
        LOGGER.info("Invoking {} AggregateAndSortData tasks", tasks.size());
        final List<Future<CallableResult>> futures = threadPool.invokeAll(tasks);
        for (final Future<CallableResult> future : futures) {
            final CallableResult result = future.get();
            LOGGER.info("Result {} from task", result);
        }
    } catch (final InterruptedException e) {
        throw new OperationException("InterruptedException running AggregateAndSortData tasks", e);
    } catch (final ExecutionException e) {
        throw new OperationException("ExecutionException running AggregateAndSortData tasks", e);
    }
    try {
        // Move results to a new snapshot directory (the -tmp at the end allows us to add data to the directory,
        // and then when this is all finished we rename the directory to remove the -tmp; this allows us to make
        // the replacement of the old data with the new data an atomic operation and ensures that a get operation
        // against the store will not read the directory when only some of the data has been moved there).
        final long snapshot = System.currentTimeMillis();
        final String newDataDir = store.getDataDir() + "/" + ParquetStore.getSnapshotPath(snapshot) + "-tmp";
        LOGGER.info("Moving aggregated and sorted data to new snapshot directory {}", newDataDir);
        fs.mkdirs(new Path(newDataDir));
        for (final String group : schema.getGroups()) {
            final Path groupDir = new Path(newDataDir, ParquetStore.getGroupSubDir(group, false));
            fs.mkdirs(groupDir);
            LOGGER.info("Created directory {}", groupDir);
        }
        for (final String group : schema.getEdgeGroups()) {
            final Path groupDir = new Path(newDataDir, ParquetStore.getGroupSubDir(group, true));
            fs.mkdirs(groupDir);
            LOGGER.info("Created directory {}", groupDir);
        }
        for (final String group : schema.getGroups()) {
            final String groupDir = newDataDir + "/" + ParquetStore.getGroupSubDir(group, false);
            final List<Partition> partitions = currentGraphPartitioner.getGroupPartitioner(group).getPartitions();
            for (final Partition partition : partitions) {
                final Path outputDir = new Path(directoryForSortedResultsForGroupAndPartitionId.apply(group, partition.getPartitionId()));
                if (!fs.exists(outputDir)) {
                    LOGGER.info("Not moving data for group {}, partition id {} as the outputDir {} does not exist", group, partition.getPartitionId(), outputDir);
                } else {
                    // One .parquet file and one .parquet.crc file
                    final FileStatus[] status = fs.listStatus(outputDir, path -> path.getName().endsWith(".parquet"));
                    if (1 != status.length) {
                        LOGGER.error("Didn't find one Parquet file in path {} (found {} files)", outputDir, status.length);
                        throw new OperationException("Expected to find one Parquet file in path " + outputDir + " (found " + status.length + " files)");
                    } else {
                        final Path destination = new Path(groupDir, ParquetStore.getFile(partition.getPartitionId()));
                        LOGGER.info("Renaming {} to {}", status[0].getPath(), destination);
                        fs.rename(status[0].getPath(), destination);
                    }
                }
            }
        }
        for (final String group : schema.getEdgeGroups()) {
            final String groupDir = newDataDir + "/" + ParquetStore.getGroupSubDir(group, true);
            final List<Partition> partitions = currentGraphPartitioner.getGroupPartitionerForReversedEdges(group).getPartitions();
            for (final Partition partition : partitions) {
                final Path outputDir = new Path(directoryForSortedResultsForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId()));
                if (!fs.exists(outputDir)) {
                    LOGGER.info("Not moving data for reversed edge group {}, partition id {} as the outputDir {} does not exist", group, partition.getPartitionId(), outputDir);
                } else {
                    // One .parquet file and one .parquet.crc file
                    final FileStatus[] status = fs.listStatus(outputDir, path -> path.getName().endsWith(".parquet"));
                    if (1 != status.length) {
                        LOGGER.error("Didn't find one Parquet file in path {} (found {} files)", outputDir, status.length);
                        throw new OperationException("Expected to find one Parquet file in path " + outputDir + " (found " + status.length + " files)");
                    } else {
                        final Path destination = new Path(groupDir, ParquetStore.getFile(partition.getPartitionId()));
                        LOGGER.info("Renaming {} to {}", status[0].getPath(), destination);
                        fs.rename(status[0].getPath(), destination);
                    }
                }
            }
        }
        // Delete temporary data directory
        LOGGER.info("Deleting temporary directory {}", tmpDirectory);
        fs.delete(new Path(tmpDirectory), true);
        // Write out graph partitioner (unchanged from previous one)
        final Path newGraphPartitionerPath = new Path(newDataDir + "/graphPartitioner");
        final FSDataOutputStream stream = fs.create(newGraphPartitionerPath);
        LOGGER.info("Writing graph partitioner to {}", newGraphPartitionerPath);
        new GraphPartitionerSerialiser().write(currentGraphPartitioner, stream);
        stream.close();
        // Move snapshot-tmp directory to snapshot
        final String directoryWithoutTmp = newDataDir.substring(0, newDataDir.lastIndexOf("-tmp"));
        LOGGER.info("Renaming {} to {}", newDataDir, directoryWithoutTmp);
        fs.rename(new Path(newDataDir), new Path(directoryWithoutTmp));
        // Set snapshot on store to new value
        LOGGER.info("Updating latest snapshot on store to {}", snapshot);
        store.setLatestSnapshot(snapshot);
    } catch (final IOException | StoreException e) {
        throw new OperationException("IOException moving results files into new snapshot directory", e);
    }
}
Also used : ParquetStoreProperties(uk.gov.gchq.gaffer.parquetstore.ParquetStoreProperties) StoreException(uk.gov.gchq.gaffer.store.StoreException) FileSystem(org.apache.hadoop.fs.FileSystem) AggregateAndSortData(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.AggregateAndSortData) BiFunction(java.util.function.BiFunction) LoggerFactory(org.slf4j.LoggerFactory) Callable(java.util.concurrent.Callable) FileStatus(org.apache.hadoop.fs.FileStatus) ParquetStore(uk.gov.gchq.gaffer.parquetstore.ParquetStore) ArrayList(java.util.ArrayList) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Future(java.util.concurrent.Future) SparkParquetUtils(uk.gov.gchq.gaffer.parquetstore.utils.SparkParquetUtils) Path(org.apache.hadoop.fs.Path) ExecutorService(java.util.concurrent.ExecutorService) SparkSession(org.apache.spark.sql.SparkSession) Logger(org.slf4j.Logger) Partition(uk.gov.gchq.gaffer.parquetstore.partitioner.Partition) SparkContextUtil(uk.gov.gchq.gaffer.spark.SparkContextUtil) SchemaUtils(uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils) IOException(java.io.IOException) Option(scala.Option) Executors(java.util.concurrent.Executors) ExecutionException(java.util.concurrent.ExecutionException) Store(uk.gov.gchq.gaffer.store.Store) List(java.util.List) WriteUnsortedData(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.WriteUnsortedData) GraphPartitionerSerialiser(uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser) Context(uk.gov.gchq.gaffer.store.Context) Schema(uk.gov.gchq.gaffer.store.schema.Schema) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) AddElements(uk.gov.gchq.gaffer.operation.impl.add.AddElements) OperationException(uk.gov.gchq.gaffer.operation.OperationException) OperationHandler(uk.gov.gchq.gaffer.store.operation.handler.OperationHandler) CallableResult(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.CallableResult)
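
The long comment inside the try block describes the pattern that makes the snapshot swap safe: write everything under a directory with a -tmp suffix that readers ignore, then expose it with a single rename. A stripped-down sketch of just that pattern against the Hadoop FileSystem API (the paths and the snapshot directory format are illustrative assumptions):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class SnapshotSwapSketch {
    public static void main(final String[] args) throws IOException {
        final FileSystem fs = FileSystem.get(new Configuration());
        final long snapshot = System.currentTimeMillis();
        // Stage the new data in a -tmp directory; get operations never read it.
        final Path staging = new Path("/data/snapshot=" + snapshot + "-tmp");
        fs.mkdirs(staging);
        // ... write all group/partition files and the graph partitioner here ...
        // One rename publishes the complete snapshot; readers see either the
        // old snapshot or the new one, never a half-written directory.
        fs.rename(staging, new Path("/data/snapshot=" + snapshot));
    }
}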

Aggregations

SchemaUtils (uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils): 11 usages
Path (org.apache.hadoop.fs.Path): 8 usages
ArrayList (java.util.ArrayList): 7 usages
List (java.util.List): 6 usages
ParquetStore (uk.gov.gchq.gaffer.parquetstore.ParquetStore): 6 usages
IOException (java.io.IOException): 5 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 4 usages
Test (org.junit.jupiter.api.Test): 4 usages
ParquetStoreProperties (uk.gov.gchq.gaffer.parquetstore.ParquetStoreProperties): 4 usages
GraphPartitioner (uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner): 4 usages
Schema (uk.gov.gchq.gaffer.store.schema.Schema): 4 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 3 usages
Logger (org.slf4j.Logger): 3 usages
LoggerFactory (org.slf4j.LoggerFactory): 3 usages
View (uk.gov.gchq.gaffer.data.elementdefinition.view.View): 3 usages
OperationException (uk.gov.gchq.gaffer.operation.OperationException): 3 usages
LongVertexOperationsTest (uk.gov.gchq.gaffer.parquetstore.operation.handler.LongVertexOperationsTest): 3 usages
CalculatePartitionerTest (uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.CalculatePartitionerTest): 3 usages
StoreException (uk.gov.gchq.gaffer.store.StoreException): 3 usages
Arrays (java.util.Arrays): 2 usages