Example 1 with WriteUnsortedData

Use of uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.WriteUnsortedData in project Gaffer by gchq.

From the class WriteUnsortedDataTest, the method testNoSplitPointsCase:

@Test
public void testNoSplitPointsCase(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
    // Given
    final String tempFilesDir = tempDir.toAbsolutePath().toString();
    final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY, new GroupPartitioner(TestGroups.ENTITY, new ArrayList<>()));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY_2, new GroupPartitioner(TestGroups.ENTITY_2, new ArrayList<>()));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, new ArrayList<>()));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, new ArrayList<>()));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, new ArrayList<>()));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, new ArrayList<>()));
    final List<Element> elements = getData(3L);
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionId = (group, partitionId) -> tempFilesDir + "/GROUP=" + group + "/split-" + partitionId;
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionIdForReversedEdge = (group, partitionId) -> tempFilesDir + "/REVERSED-GROUP=" + group + "/split-" + partitionId;
    final WriteUnsortedData writeUnsortedData = new WriteUnsortedData(tempFilesDir, CompressionCodecName.GZIP, schemaUtils, graphPartitioner, fileNameForGroupAndPartitionId, fileNameForGroupAndPartitionIdForReversedEdge);
    // When
    writeUnsortedData.writeElements(elements);
    // Then
    // - Each directory should exist and contain one file
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    // - Each file should contain the data that was written to it, in the order it appeared in the input iterable
    testContainsCorrectDataNoSplitPoints(TestGroups.ENTITY, tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-0", elements, schemaUtils);
    testContainsCorrectDataNoSplitPoints(TestGroups.ENTITY_2, tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-0", elements, schemaUtils);
    testContainsCorrectDataNoSplitPoints(TestGroups.EDGE, tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-0", elements, schemaUtils);
    testContainsCorrectDataNoSplitPoints(TestGroups.EDGE_2, tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-0", elements, schemaUtils);
    testContainsCorrectDataNoSplitPoints(TestGroups.EDGE, tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-0", elements, schemaUtils);
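    // Remove edges whose source equals their destination before checking the reversed-edges file for
    // EDGE_2, as such edges are not duplicated into the reversed output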
    final List<Element> elementsWithSameSrcDstRemoved = elements.stream().filter(e -> e.getGroup().equals(TestGroups.EDGE_2)).map(e -> (Edge) e).filter(e -> !e.getSource().equals(e.getDestination())).collect(Collectors.toList());
    testContainsCorrectDataNoSplitPoints(TestGroups.EDGE_2, tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-0", elementsWithSameSrcDstRemoved, schemaUtils);
}
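
The assertion helpers used above (testExistsAndContainsNFiles and testContainsCorrectDataNoSplitPoints) are defined elsewhere in WriteUnsortedDataTest. A minimal sketch of the first follows, assuming it checks a directory on the local Hadoop FileSystem and counts only the .parquet files it contains (an assumption; the real helper may handle checksum files differently):

private static void testExistsAndContainsNFiles(final String dir, final int expectedNumberOfFiles) throws IOException {
    // Local file system from a default Hadoop configuration
    final FileSystem fs = FileSystem.get(new Configuration());
    final Path path = new Path(dir);
    assertTrue(fs.exists(path));
    // Count only the Parquet files, ignoring any .crc checksum files
    final FileStatus[] parquetFiles = fs.listStatus(path, p -> p.getName().endsWith(".parquet"));
    assertEquals(expectedNumberOfFiles, parquetFiles.length);
}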

Example 2 with WriteUnsortedData

Use of uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.WriteUnsortedData in project Gaffer by gchq.

From the class AddElementsHandler, the method addElements:

private void addElements(final AddElements addElementsOperation, final Context context, final ParquetStore store) throws OperationException {
    // Set up
    final FileSystem fs = store.getFS();
    final Schema schema = store.getSchema();
    final SchemaUtils schemaUtils = store.getSchemaUtils();
    final SparkSession spark = SparkContextUtil.getSparkSession(context, store.getProperties());
    final ExecutorService threadPool = createThreadPool(spark, store.getProperties());
    final GraphPartitioner currentGraphPartitioner = store.getGraphPartitioner();
    SparkParquetUtils.configureSparkForAddElements(spark, store.getProperties());
    // Write data from addElementsOperation split by group and partition (NB this uses the existing partitioner -
    // adding elements using this operation does not affect the partitions).
    final String tmpDirectory = store.getTempFilesDir();
    final BiFunction<String, Integer, String> directoryForGroupAndPartitionId = (group, partitionId) -> tmpDirectory + "/unsorted_unaggregated_new" + "/group=" + group + "/partition=" + partitionId;
    final BiFunction<String, Integer, String> directoryForGroupAndPartitionIdForReversedEdges = (group, partitionId) -> tmpDirectory + "/unsorted_unaggregated_new" + "/reversed-group=" + group + "/partition=" + partitionId;
    LOGGER.info("Calling WriteUnsortedData to add elements");
    LOGGER.trace("currentGraphPartitioner is {}", currentGraphPartitioner);
    new WriteUnsortedData(store, currentGraphPartitioner, directoryForGroupAndPartitionId, directoryForGroupAndPartitionIdForReversedEdges).writeElements(addElementsOperation.getInput());
    // For every group and partition, aggregate the new data with the old data and then sort
    final BiFunction<String, Integer, String> directoryForSortedResultsForGroupAndPartitionId = (group, partitionId) -> tmpDirectory + "/sorted_new_old_merged" + "/group=" + group + "/partition=" + partitionId;
    final BiFunction<String, Integer, String> directoryForSortedResultsForGroupAndPartitionIdForReversedEdges = (group, partitionId) -> tmpDirectory + "/sorted_new_old_merged" + "/REVERSED-group=" + group + "/partition=" + partitionId;
    final List<Callable<CallableResult>> tasks = new ArrayList<>();
    for (final String group : schema.getGroups()) {
        final List<Partition> partitions = currentGraphPartitioner.getGroupPartitioner(group).getPartitions();
        for (final Partition partition : partitions) {
            final List<String> inputFiles = new ArrayList<>();
            // New data
            inputFiles.add(directoryForGroupAndPartitionId.apply(group, partition.getPartitionId()));
            // Old data
            inputFiles.add(store.getFile(group, partition));
            final String outputDir = directoryForSortedResultsForGroupAndPartitionId.apply(group, partition.getPartitionId());
            final AggregateAndSortData task = new AggregateAndSortData(schemaUtils, fs, inputFiles, outputDir, group, group + "-" + partition.getPartitionId(), false, store.getProperties().getCompressionCodecName(), spark);
            tasks.add(task);
            LOGGER.info("Created AggregateAndSortData task for group {}, partition {}", group, partition.getPartitionId());
        }
    }
    for (final String group : schema.getEdgeGroups()) {
        final List<Partition> partitions = currentGraphPartitioner.getGroupPartitionerForReversedEdges(group).getPartitions();
        for (final Partition partition : partitions) {
            final List<String> inputFiles = new ArrayList<>();
            // New data
            inputFiles.add(directoryForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId()));
            // Old data
            inputFiles.add(store.getFileForReversedEdges(group, partition));
            final String outputDir = directoryForSortedResultsForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId());
            final AggregateAndSortData task = new AggregateAndSortData(schemaUtils, fs, inputFiles, outputDir, group, "reversed-" + group + "-" + partition.getPartitionId(), true, store.getProperties().getCompressionCodecName(), spark);
            tasks.add(task);
            LOGGER.info("Created AggregateAndSortData task for reversed edge group {}, partition {}", group, partition.getPartitionId());
        }
    }
    try {
        LOGGER.info("Invoking {} AggregateAndSortData tasks", tasks.size());
        final List<Future<CallableResult>> futures = threadPool.invokeAll(tasks);
        for (final Future<CallableResult> future : futures) {
            final CallableResult result = future.get();
            LOGGER.info("Result {} from task", result);
        }
    } catch (final InterruptedException e) {
        throw new OperationException("InterruptedException running AggregateAndSortData tasks", e);
    } catch (final ExecutionException e) {
        throw new OperationException("ExecutionException running AggregateAndSortData tasks", e);
    }
    try {
        // Move results to a new snapshot directory (the -tmp at the end allows us to add data to the directory,
        // and then when this is all finished we rename the directory to remove the -tmp; this allows us to make
        // the replacement of the old data with the new data an atomic operation and ensures that a get operation
        // against the store will not read the directory when only some of the data has been moved there).
        final long snapshot = System.currentTimeMillis();
        final String newDataDir = store.getDataDir() + "/" + ParquetStore.getSnapshotPath(snapshot) + "-tmp";
        LOGGER.info("Moving aggregated and sorted data to new snapshot directory {}", newDataDir);
        fs.mkdirs(new Path(newDataDir));
        for (final String group : schema.getGroups()) {
            final Path groupDir = new Path(newDataDir, ParquetStore.getGroupSubDir(group, false));
            fs.mkdirs(groupDir);
            LOGGER.info("Created directory {}", groupDir);
        }
        for (final String group : schema.getEdgeGroups()) {
            final Path groupDir = new Path(newDataDir, ParquetStore.getGroupSubDir(group, true));
            fs.mkdirs(groupDir);
            LOGGER.info("Created directory {}", groupDir);
        }
        for (final String group : schema.getGroups()) {
            final String groupDir = newDataDir + "/" + ParquetStore.getGroupSubDir(group, false);
            final List<Partition> partitions = currentGraphPartitioner.getGroupPartitioner(group).getPartitions();
            for (final Partition partition : partitions) {
                final Path outputDir = new Path(directoryForSortedResultsForGroupAndPartitionId.apply(group, partition.getPartitionId()));
                if (!fs.exists(outputDir)) {
                    LOGGER.info("Not moving data for group {}, partition id {} as the outputDir {} does not exist", group, partition.getPartitionId(), outputDir);
                } else {
                    // One .parquet file and one .parquet.crc file
                    final FileStatus[] status = fs.listStatus(outputDir, path -> path.getName().endsWith(".parquet"));
                    if (1 != status.length) {
                        LOGGER.error("Didn't find one Parquet file in path {} (found {} files)", outputDir, status.length);
                        throw new OperationException("Expected to find one Parquet file in path " + outputDir + " (found " + status.length + " files)");
                    } else {
                        final Path destination = new Path(groupDir, ParquetStore.getFile(partition.getPartitionId()));
                        LOGGER.info("Renaming {} to {}", status[0].getPath(), destination);
                        fs.rename(status[0].getPath(), destination);
                    }
                }
            }
        }
        for (final String group : schema.getEdgeGroups()) {
            final String groupDir = newDataDir + "/" + ParquetStore.getGroupSubDir(group, true);
            final List<Partition> partitions = currentGraphPartitioner.getGroupPartitionerForReversedEdges(group).getPartitions();
            for (final Partition partition : partitions) {
                final Path outputDir = new Path(directoryForSortedResultsForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId()));
                if (!fs.exists(outputDir)) {
                    LOGGER.info("Not moving data for reversed edge group {}, partition id {} as the outputDir {} does not exist", group, partition.getPartitionId(), outputDir);
                } else {
                    // One .parquet file and one .parquet.crc file
                    final FileStatus[] status = fs.listStatus(outputDir, path -> path.getName().endsWith(".parquet"));
                    if (1 != status.length) {
                        LOGGER.error("Didn't find one Parquet file in path {} (found {} files)", outputDir, status.length);
                        throw new OperationException("Expected to find one Parquet file in path " + outputDir + " (found " + status.length + " files)");
                    } else {
                        final Path destination = new Path(groupDir, ParquetStore.getFile(partition.getPartitionId()));
                        LOGGER.info("Renaming {} to {}", status[0].getPath(), destination);
                        fs.rename(status[0].getPath(), destination);
                    }
                }
            }
        }
        // Delete temporary data directory
        LOGGER.info("Deleting temporary directory {}", tmpDirectory);
        fs.delete(new Path(tmpDirectory), true);
        // Write out graph partitioner (unchanged from previous one)
        final Path newGraphPartitionerPath = new Path(newDataDir + "/graphPartitioner");
        final FSDataOutputStream stream = fs.create(newGraphPartitionerPath);
        LOGGER.info("Writing graph partitioner to {}", newGraphPartitionerPath);
        new GraphPartitionerSerialiser().write(currentGraphPartitioner, stream);
        stream.close();
        // Move snapshot-tmp directory to snapshot
        final String directoryWithoutTmp = newDataDir.substring(0, newDataDir.lastIndexOf("-tmp"));
        LOGGER.info("Renaming {} to {}", newDataDir, directoryWithoutTmp);
        fs.rename(new Path(newDataDir), new Path(directoryWithoutTmp));
        // Set snapshot on store to new value
        LOGGER.info("Updating latest snapshot on store to {}", snapshot);
        store.setLatestSnapshot(snapshot);
    } catch (final IOException | StoreException e) {
        throw new OperationException("IOException moving results files into new snapshot directory", e);
    }
}
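
The createThreadPool helper called at the start of this method is not shown in this excerpt. A minimal sketch follows, assuming the pool size is taken from the Spark configuration when present (the scala.Option import in this class suggests a lookup of this shape; the exact configuration key and the fallback are assumptions):

private static ExecutorService createThreadPool(final SparkSession spark, final ParquetStoreProperties storeProperties) {
    // Prefer the number of cores configured for the Spark driver, if set
    final Option<String> driverCores = spark.conf().getOption("spark.driver.cores");
    final int numberOfThreads = driverCores.isDefined()
            ? Integer.parseInt(driverCores.get())
            // Hypothetical fallback when the option is absent
            : Runtime.getRuntime().availableProcessors();
    return Executors.newFixedThreadPool(numberOfThreads);
}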

Example 3 with WriteUnsortedData

Use of uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.WriteUnsortedData in project Gaffer by gchq.

From the class WriteUnsortedDataTest, the method testMultipleSplitPointsCase:

@Test
public void testMultipleSplitPointsCase(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
    // Given
    final String tempFilesDir = tempDir.toAbsolutePath().toString();
    final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    final List<Element> elements = new ArrayList<>();
    // TestGroups.ENTITY, split points are 10L and 100L. Create data with
    // VERTEX
    // 5L
    // 10L
    // 10L
    // 11L
    // 12L
    // 100L
    // 100L
    // 200L
    final List<PartitionKey> splitPointsEntity = new ArrayList<>();
    splitPointsEntity.add(new PartitionKey(new Object[] { 10L }));
    splitPointsEntity.add(new PartitionKey(new Object[] { 100L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY, new GroupPartitioner(TestGroups.ENTITY, splitPointsEntity));
    elements.add(createEntityForEntityGroup(5L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(11L));
    elements.add(createEntityForEntityGroup(12L));
    elements.add(createEntityForEntityGroup(100L));
    elements.add(createEntityForEntityGroup(100L));
    elements.add(createEntityForEntityGroup(200L));
    // TestGroups.ENTITY_2, split points are 100L and 1000L. Create data with
    // VERTEX
    // 5L
    // 100L
    // 200L
    // 1000L
    // 5000L
    final List<PartitionKey> splitPointsEntity_2 = new ArrayList<>();
    splitPointsEntity_2.add(new PartitionKey(new Object[] { 100L }));
    splitPointsEntity_2.add(new PartitionKey(new Object[] { 1000L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY_2, new GroupPartitioner(TestGroups.ENTITY_2, splitPointsEntity_2));
    elements.add(createEntityForEntityGroup_2(5L));
    elements.add(createEntityForEntityGroup_2(100L));
    elements.add(createEntityForEntityGroup_2(200L));
    elements.add(createEntityForEntityGroup_2(1000L));
    elements.add(createEntityForEntityGroup_2(5000L));
    // TestGroups.EDGE, split points are [1000L, 200L, true] and [1000L, 30000L, false]. Create data with
    // SOURCE   DESTINATION   DIRECTED
    //     5L       5000L     true
    //     5L        200L     false
    //  1000L         90L     true
    //  1000L      10000L     false
    //  1000L      30000L     false
    //  1000L     300000L     true
    // 10000L        400L     false
    final List<PartitionKey> splitPointsEdge = new ArrayList<>();
    splitPointsEdge.add(new PartitionKey(new Object[] { 1000L, 200L, true }));
    splitPointsEdge.add(new PartitionKey(new Object[] { 1000L, 30000L, false }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsEdge));
    final List<PartitionKey> splitPointsReversedEdge = new ArrayList<>();
    splitPointsReversedEdge.add(new PartitionKey(new Object[] { 100L, 1000L, true }));
    splitPointsReversedEdge.add(new PartitionKey(new Object[] { 300L, 2000L, false }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsReversedEdge));
    elements.add(createEdgeForEdgeGroup(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup(5L, 200L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 90L, true));
    elements.add(createEdgeForEdgeGroup(1000L, 10000L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 30000L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 300000L, true));
    elements.add(createEdgeForEdgeGroup(10000L, 400L, false));
    // TestGroups.EDGE_2, split points are [10L, 2000L, true] and [100L, 1000L, false]. Create data with
    // SOURCE   DESTINATION   DIRECTED
    //     5L       5000L     true
    //    10L       2000L     false
    //    10L       2000L     true
    //    10L       3000L     false
    //   100L       1000L     false
    //   100L       3000L     false
    //   100L       3000L     true
    final List<PartitionKey> splitPointsEdge_2 = new ArrayList<>();
    splitPointsEdge_2.add(new PartitionKey(new Object[] { 10L, 2000L, true }));
    splitPointsEdge_2.add(new PartitionKey(new Object[] { 100L, 1000L, false }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsEdge_2));
    final List<PartitionKey> splitPointsReversedEdge_2 = new ArrayList<>();
    splitPointsReversedEdge_2.add(new PartitionKey(new Object[] { 1000L, 1500L, true }));
    splitPointsReversedEdge_2.add(new PartitionKey(new Object[] { 2000L, 2500L, false }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsReversedEdge_2));
    elements.add(createEdgeForEdgeGroup_2(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup_2(10L, 2000L, false));
    elements.add(createEdgeForEdgeGroup_2(10L, 2000L, true));
    elements.add(createEdgeForEdgeGroup_2(10L, 3000L, false));
    elements.add(createEdgeForEdgeGroup_2(100L, 1000L, false));
    elements.add(createEdgeForEdgeGroup_2(100L, 3000L, false));
    elements.add(createEdgeForEdgeGroup_2(100L, 3000L, true));
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionId = (group, partitionId) -> tempFilesDir + "/GROUP=" + group + "/split-" + partitionId;
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionIdForReversedEdge = (group, partitionId) -> tempFilesDir + "/REVERSED-GROUP=" + group + "/split-" + partitionId;
    final WriteUnsortedData writeUnsortedData = new WriteUnsortedData(tempFilesDir, CompressionCodecName.GZIP, schemaUtils, graphPartitioner, fileNameForGroupAndPartitionId, fileNameForGroupAndPartitionIdForReversedEdge);
    // When
    writeUnsortedData.writeElements(elements);
    // Then
    // - For each group, directories split-0, split-1 and split-2 should exist and each contain one file
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-2", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-2", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-2", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-2", 1);
    // - Each split file should contain the data for that split in the order it was written
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, true, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, true, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-2", group, true, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), null, elements, schemaUtils);
    }
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, false, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-0", group, false, true, null, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, false, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-1", group, false, true, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(1), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-2", group, false, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), null, elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-2", group, false, true, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(1), null, elements, schemaUtils);
    }
}
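
The createEntityForEntityGroup and createEdgeForEdgeGroup helpers (and their _2 variants) used above build the test elements. The real test class populates full property sets via DataGen, so the minimal builders below are an illustrative assumption showing only the fields the partitioning depends on:

private static Entity createEntityForEntityGroup(final long vertex) {
    // Only the group and vertex influence which partition an entity lands in
    return new Entity.Builder()
            .group(TestGroups.ENTITY)
            .vertex(vertex)
            .build();
}

private static Edge createEdgeForEdgeGroup(final long source, final long destination, final boolean directed) {
    // Edge partition keys are formed from (source, destination, directed)
    return new Edge.Builder()
            .group(TestGroups.EDGE)
            .source(source)
            .dest(destination)
            .directed(directed)
            .build();
}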

Example 4 with WriteUnsortedData

Use of uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.WriteUnsortedData in project Gaffer by gchq.

From the class WriteUnsortedDataTest, the method testOneSplitPointCase:

@Test
public void testOneSplitPointCase(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
    // Given
    final String tempFilesDir = tempDir.toAbsolutePath().toString();
    final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    final List<Element> elements = new ArrayList<>();
    // TestGroups.ENTITY, split point is 10L. Create data with
    // VERTEX
    // 5L
    // 10L
    // 10L
    // 10L
    // 20L
    final List<PartitionKey> splitPointsEntity = new ArrayList<>();
    splitPointsEntity.add(new PartitionKey(new Object[] { 10L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY, new GroupPartitioner(TestGroups.ENTITY, splitPointsEntity));
    elements.add(createEntityForEntityGroup(5L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(20L));
    // TestGroups.ENTITY_2, split point is 100L. Create data with
    // VERTEX
    // 5L
    // 100L
    // 1000L
    final List<PartitionKey> splitPointsEntity_2 = new ArrayList<>();
    splitPointsEntity_2.add(new PartitionKey(new Object[] { 100L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY_2, new GroupPartitioner(TestGroups.ENTITY_2, splitPointsEntity_2));
    elements.add(createEntityForEntityGroup_2(5L));
    elements.add(createEntityForEntityGroup_2(100L));
    elements.add(createEntityForEntityGroup_2(1000L));
    // TestGroups.EDGE, split point is [1000L, 200L, true]. Create data with
    // SOURCE   DESTINATION   DIRECTED
    //     5L       5000L     true
    //     5L        200L     false
    //  1000L        100L     true
    //  1000L        200L     false
    //  1000L        200L     true
    //  1000L        300L     true
    // 10000L        400L     false
    // 10000L        400L     true
    final List<PartitionKey> splitPointsEdge = new ArrayList<>();
    splitPointsEdge.add(new PartitionKey(new Object[] { 1000L, 200L, true }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsEdge));
    final List<PartitionKey> splitPointsReversedEdge = new ArrayList<>();
    splitPointsReversedEdge.add(new PartitionKey(new Object[] { 1000L, 300L, true }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsReversedEdge));
    elements.add(createEdgeForEdgeGroup(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup(5L, 200L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 100L, true));
    elements.add(createEdgeForEdgeGroup(1000L, 200L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 200L, true));
    elements.add(createEdgeForEdgeGroup(1000L, 300L, true));
    elements.add(createEdgeForEdgeGroup(10000L, 400L, false));
    elements.add(createEdgeForEdgeGroup(10000L, 400L, true));
    // TestGroups.EDGE_2, split point is [10L, 2000L, true]. Create data with
    // SOURCE   DESTINATION   DIRECTED
    //     5L       5000L     true
    //    10L       2000L     false
    //    10L       2000L     true
    //    10L       3000L     false
    //   100L       1000L     true
    //   100L       3000L     false
    //   100L       3000L     true
    final List<PartitionKey> splitPointsEdge_2 = new ArrayList<>();
    splitPointsEdge_2.add(new PartitionKey(new Object[] { 10L, 2000L, true }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsEdge_2));
    final List<PartitionKey> splitPointsReversedEdge_2 = new ArrayList<>();
    splitPointsReversedEdge_2.add(new PartitionKey(new Object[] { 3000L, 20L, true }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsReversedEdge_2));
    elements.add(createEdgeForEdgeGroup_2(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup_2(10L, 2000L, false));
    elements.add(createEdgeForEdgeGroup_2(10L, 2000L, true));
    elements.add(createEdgeForEdgeGroup_2(10L, 3000L, false));
    elements.add(createEdgeForEdgeGroup_2(100L, 1000L, true));
    elements.add(createEdgeForEdgeGroup_2(100L, 3000L, false));
    elements.add(createEdgeForEdgeGroup_2(100L, 3000L, true));
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionId = (group, partitionId) -> tempFilesDir + "/GROUP=" + group + "/split-" + partitionId;
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionIdForReversedEdge = (group, partitionId) -> tempFilesDir + "/REVERSED-GROUP=" + group + "/split-" + partitionId;
    final WriteUnsortedData writeUnsortedData = new WriteUnsortedData(tempFilesDir, CompressionCodecName.GZIP, schemaUtils, graphPartitioner, fileNameForGroupAndPartitionId, fileNameForGroupAndPartitionIdForReversedEdge);
    // When
    writeUnsortedData.writeElements(elements);
    // Then
    // - For each group, directories split-0 and split-1 should exist and each contain one file
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-1", 1);
    // - Each split file should contain the data for that split in the order it was written
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, true, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, true, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), null, elements, schemaUtils);
    }
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, false, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-0", group, false, true, null, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, false, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), null, elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-1", group, false, true, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), null, elements, schemaUtils);
    }
}
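
To make the expected split layout concrete: an element belongs to partition i when its partition key sorts at or after split point i-1 and strictly before split point i (the first partition has no lower bound and the last has no upper bound). The standalone sketch below illustrates this routing rule for the single-long entity keys used in these tests; it is not Gaffer's actual implementation:

static int partitionIdForVertex(final long vertex, final long[] sortedSplitPoints) {
    int partitionId = 0;
    // Advance past every split point that the vertex is greater than or equal to
    while (partitionId < sortedSplitPoints.length && vertex >= sortedSplitPoints[partitionId]) {
        partitionId++;
    }
    return partitionId;
}

With the ENTITY split point of 10L above, vertex 5L routes to split-0, while 10L, 10L, 10L and 20L route to split-1, matching the assertions in the test.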
