Search in sources: usage examples for org.junit.jupiter.api.io.TempDir.
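
A minimal, self-contained sketch of the @TempDir mechanics first (a hypothetical test, not taken from any of the projects below): JUnit 5 creates a fresh temporary directory before the test, injects it into the annotated java.nio.file.Path or java.io.File parameter or field, and deletes it recursively afterwards.

import java.nio.file.Files;
import java.nio.file.Path;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

class TempDirSketchTest {

    @Test
    void writesIntoAnInjectedTempDir(@TempDir Path tempDir) throws Exception {
        // The directory already exists when the test body runs; no manual cleanup is needed.
        final Path file = Files.writeString(tempDir.resolve("data.txt"), "hello");
        Assertions.assertTrue(Files.exists(file));
    }
}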

Example 1 with TempDir

Use of org.junit.jupiter.api.io.TempDir in the Gaffer project by gchq.

From the class WriteUnsortedDataTest, method testNoSplitPointsCase.

@Test
public void testNoSplitPointsCase(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
    // Given
    final String tempFilesDir = tempDir.toAbsolutePath().toString();
    final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY, new GroupPartitioner(TestGroups.ENTITY, new ArrayList<>()));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY_2, new GroupPartitioner(TestGroups.ENTITY_2, new ArrayList<>()));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, new ArrayList<>()));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, new ArrayList<>()));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, new ArrayList<>()));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, new ArrayList<>()));
    final List<Element> elements = getData(3L);
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionId = (group, partitionId) -> tempFilesDir + "/GROUP=" + group + "/split-" + partitionId;
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionIdForReversedEdge = (group, partitionId) -> tempFilesDir + "/REVERSED-GROUP=" + group + "/split-" + partitionId;
    final WriteUnsortedData writeUnsortedData = new WriteUnsortedData(tempFilesDir, CompressionCodecName.GZIP, schemaUtils, graphPartitioner, fileNameForGroupAndPartitionId, fileNameForGroupAndPartitionIdForReversedEdge);
    // When
    writeUnsortedData.writeElements(elements);
    // Then
    // - Each directory should exist and contain one file
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    // - Each file should contain the data that was written to it, in the order it was in the iterable
    testContainsCorrectDataNoSplitPoints(TestGroups.ENTITY, tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-0", elements, schemaUtils);
    testContainsCorrectDataNoSplitPoints(TestGroups.ENTITY_2, tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-0", elements, schemaUtils);
    testContainsCorrectDataNoSplitPoints(TestGroups.EDGE, tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-0", elements, schemaUtils);
    testContainsCorrectDataNoSplitPoints(TestGroups.EDGE_2, tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-0", elements, schemaUtils);
    testContainsCorrectDataNoSplitPoints(TestGroups.EDGE, tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-0", elements, schemaUtils);
    final List<Element> elementsWithSameSrcDstRemoved = elements.stream()
            .filter(e -> e.getGroup().equals(TestGroups.EDGE_2))
            // Cast inside the predicate so the stream stays a Stream<Element>;
            // mapping to Edge would make the collected list a List<Edge>, which
            // is not assignable to List<Element>.
            .filter(e -> {
                final Edge edge = (Edge) e;
                return !edge.getSource().equals(edge.getDestination());
            })
            .collect(Collectors.toList());
    testContainsCorrectDataNoSplitPoints(TestGroups.EDGE_2, tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-0", elementsWithSameSrcDstRemoved, schemaUtils);
}
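
The assertions rely on two helpers, testExistsAndContainsNFiles and testContainsCorrectDataNoSplitPoints, which this excerpt does not show. A hypothetical reconstruction of the first, using only the Hadoop FileSystem API the test already imports (the name filtering is an assumption, not Gaffer's actual code):

private static void testExistsAndContainsNFiles(final String dir, final int expectedNumFiles) throws IOException {
    final FileSystem fs = FileSystem.get(new Configuration());
    final Path dirPath = new Path(dir);
    assertTrue(fs.exists(dirPath));
    // Count only data files, skipping hidden entries such as ".crc" checksums
    // and "_SUCCESS"-style markers that Hadoop writers can leave behind.
    final long numFiles = Arrays.stream(fs.listStatus(dirPath))
            .filter(f -> !f.getPath().getName().startsWith(".") && !f.getPath().getName().startsWith("_"))
            .count();
    assertEquals(expectedNumFiles, numFiles);
}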

Example 2 with TempDir

Use of org.junit.jupiter.api.io.TempDir in the Gaffer project by gchq.

From the class WriteDataTest, method testTwoWritesToSamePartitionDoesntThrowException.

@Test
public void testTwoWritesToSamePartitionDoesntThrowException(@TempDir java.nio.file.Path tempDir) throws Exception {
    // Given
    final Schema schema = new Schema.Builder()
            .type("int", new TypeDefinition.Builder()
                    .clazz(Integer.class)
                    .serialiser(new IntegerParquetSerialiser())
                    .build())
            .type("string", new TypeDefinition.Builder()
                    .clazz(String.class)
                    .serialiser(new StringParquetSerialiser())
                    .build())
            .entity("entity", new SchemaEntityDefinition.Builder()
                    .vertex("string")
                    .property("property1", "int")
                    .aggregate(false)
                    .build())
            .edge("edge", new SchemaEdgeDefinition.Builder()
                    .source("string")
                    .destination("string")
                    .property("property2", "int")
                    .aggregate(false)
                    .build())
            .vertexSerialiser(new StringParquetSerialiser())
            .build();
    final Function<String, String> groupToDirectory = group -> tempDir.toAbsolutePath().toString() + "/" + group;
    final List<Element> elements = new ArrayList<>();
    elements.add(new Entity.Builder().group("entity").vertex("A").property("property1", 1).build());
    elements.add(new Edge.Builder().group("edge").source("B").dest("C").property("property2", 100).build());
    final WriteData writeData = new WriteData(groupToDirectory, schema, CompressionCodecName.GZIP);
    final FileSystem fileSystem = FileSystem.get(new Configuration());
    // When
    final ExecutorService executorService = Executors.newFixedThreadPool(3);
    final List<Callable<Void>> tasks = new ArrayList<>();
    LongStream.range(1000L, 1003L).forEach(l -> {
        tasks.add(() -> {
            writeData.call(elements.iterator(), 1, l);
            return null;
        });
    });
    executorService.invokeAll(tasks);
    // Then
    // - Check that a file named with the partition id has been created
    assertTrue(fileSystem.exists(new Path(groupToDirectory.apply("entity") + "/" + "input-1.parquet")));
    assertTrue(fileSystem.exists(new Path(groupToDirectory.apply("edge") + "/" + "input-1.parquet")));
}
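
One caveat worth noting: invokeAll waits for all tasks to complete but does not propagate exceptions thrown inside them; those only surface when Future.get() is called. To make the "doesn't throw" claim airtight, the test could capture and check the returned futures (a sketch, assuming an extra java.util.concurrent.Future import):

final List<Future<Void>> futures = executorService.invokeAll(tasks);
for (final Future<Void> future : futures) {
    // get() rethrows any task exception wrapped in an ExecutionException, failing the test
    future.get();
}
executorService.shutdown();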

Example 3 with TempDir

Use of org.junit.jupiter.api.io.TempDir in the Flink project by apache.

From the class TaskExecutorRecoveryTest, method testRecoveredTaskExecutorWillRestoreAllocationState.

@Test
public void testRecoveredTaskExecutorWillRestoreAllocationState(@TempDir File tempDir) throws Exception {
    final ResourceID resourceId = ResourceID.generate();
    final Configuration configuration = new Configuration();
    configuration.set(TaskManagerOptions.NUM_TASK_SLOTS, 2);
    configuration.set(CheckpointingOptions.LOCAL_RECOVERY, true);
    final TestingResourceManagerGateway testingResourceManagerGateway = new TestingResourceManagerGateway();
    final ArrayBlockingQueue<TaskExecutorSlotReport> queue = new ArrayBlockingQueue<>(2);
    testingResourceManagerGateway.setSendSlotReportFunction(slotReportInformation -> {
        queue.offer(TaskExecutorSlotReport.create(slotReportInformation.f0, slotReportInformation.f2));
        return CompletableFuture.completedFuture(Acknowledge.get());
    });
    final TestingRpcService rpcService = rpcServiceExtension.getTestingRpcService();
    rpcService.registerGateway(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);
    final JobID jobId = new JobID();
    final TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
    highAvailabilityServices.setResourceManagerLeaderRetriever(new SettableLeaderRetrievalService(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway.getFencingToken().toUUID()));
    final SettableLeaderRetrievalService jobMasterLeaderRetriever = new SettableLeaderRetrievalService();
    highAvailabilityServices.setJobMasterLeaderRetriever(jobId, jobMasterLeaderRetriever);
    final WorkingDirectory workingDirectory = WorkingDirectory.create(tempDir);
    final TaskExecutor taskExecutor = TaskExecutorBuilder.newBuilder(rpcService, highAvailabilityServices, workingDirectory).setConfiguration(configuration).setResourceId(resourceId).build();
    taskExecutor.start();
    final TaskExecutorGateway taskExecutorGateway = taskExecutor.getSelfGateway(TaskExecutorGateway.class);
    final TaskExecutorSlotReport taskExecutorSlotReport = queue.take();
    final SlotReport slotReport = taskExecutorSlotReport.getSlotReport();
    assertThat(slotReport.getNumSlotStatus(), is(2));
    final SlotStatus slotStatus = slotReport.iterator().next();
    final SlotID allocatedSlotID = slotStatus.getSlotID();
    final AllocationID allocationId = new AllocationID();
    taskExecutorGateway.requestSlot(allocatedSlotID, jobId, allocationId, slotStatus.getResourceProfile(), "localhost", testingResourceManagerGateway.getFencingToken(), Time.seconds(10L)).join();
    taskExecutor.close();
    final BlockingQueue<Collection<SlotOffer>> offeredSlots = new ArrayBlockingQueue<>(1);
    final TestingJobMasterGateway jobMasterGateway = new TestingJobMasterGatewayBuilder().setOfferSlotsFunction((resourceID, slotOffers) -> {
        offeredSlots.offer(new HashSet<>(slotOffers));
        return CompletableFuture.completedFuture(slotOffers);
    }).build();
    rpcService.registerGateway(jobMasterGateway.getAddress(), jobMasterGateway);
    jobMasterLeaderRetriever.notifyListener(jobMasterGateway.getAddress(), jobMasterGateway.getFencingToken().toUUID());
    // recover the TaskExecutor
    final TaskExecutor recoveredTaskExecutor = TaskExecutorBuilder.newBuilder(rpcService, highAvailabilityServices, workingDirectory).setConfiguration(configuration).setResourceId(resourceId).build();
    recoveredTaskExecutor.start();
    final TaskExecutorSlotReport recoveredSlotReport = queue.take();
    for (SlotStatus status : recoveredSlotReport.getSlotReport()) {
        if (status.getSlotID().equals(allocatedSlotID)) {
            assertThat(status.getJobID(), is(jobId));
            assertThat(status.getAllocationID(), is(allocationId));
        } else {
            assertThat(status.getJobID(), is(nullValue()));
        }
    }
    final Collection<SlotOffer> take = offeredSlots.take();
    assertThat(take, hasSize(1));
    final SlotOffer offeredSlot = take.iterator().next();
    assertThat(offeredSlot.getAllocationId(), is(allocationId));
}
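
Design note: the recovered TaskExecutor is deliberately built with the same resourceId and the same workingDirectory as the instance that was closed. With CheckpointingOptions.LOCAL_RECOVERY enabled, that shared working directory is what allows the restarted instance to report the old allocation (hence the jobId and allocationId assertions) and to re-offer the previously allocated slot to the job master.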

Example 4 with TempDir

Use of org.junit.jupiter.api.io.TempDir in the Gaffer project by gchq.

From the class WriteUnsortedDataTest, method testMultipleSplitPointsCase.

@Test
public void testMultipleSplitPointsCase(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
    // Given
    final String tempFilesDir = tempDir.toAbsolutePath().toString();
    final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    final List<Element> elements = new ArrayList<>();
    // TestGroups.ENTITY, split points are 10L and 100L. Create data with
    // VERTEX
    // 5L
    // 10L
    // 10L
    // 11L
    // 12L
    // 100L
    // 100L
    // 200L
    final List<PartitionKey> splitPointsEntity = new ArrayList<>();
    splitPointsEntity.add(new PartitionKey(new Object[] { 10L }));
    splitPointsEntity.add(new PartitionKey(new Object[] { 100L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY, new GroupPartitioner(TestGroups.ENTITY, splitPointsEntity));
    elements.add(createEntityForEntityGroup(5L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(11L));
    elements.add(createEntityForEntityGroup(12L));
    elements.add(createEntityForEntityGroup(100L));
    elements.add(createEntityForEntityGroup(100L));
    elements.add(createEntityForEntityGroup(200L));
    // TestGroups.ENTITY_2, split points are 100L and 1000L. Create data with
    // VERTEX
    // 5L
    // 100L
    // 200L
    // 1000L
    // 5000L
    final List<PartitionKey> splitPointsEntity_2 = new ArrayList<>();
    splitPointsEntity_2.add(new PartitionKey(new Object[] { 100L }));
    splitPointsEntity_2.add(new PartitionKey(new Object[] { 1000L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY_2, new GroupPartitioner(TestGroups.ENTITY_2, splitPointsEntity_2));
    elements.add(createEntityForEntityGroup_2(5L));
    elements.add(createEntityForEntityGroup_2(100L));
    elements.add(createEntityForEntityGroup_2(200L));
    elements.add(createEntityForEntityGroup_2(1000L));
    elements.add(createEntityForEntityGroup_2(5000L));
    // TestGroups.EDGE, split points are [1000L, 200L, true] and [1000L, 30000L, false]. Create data with
    // SOURCE    DESTINATION    DIRECTED
    // 5L        5000L          true
    // 5L        200L           false
    // 1000L     90L            true
    // 1000L     10000L         false
    // 1000L     30000L         false
    // 1000L     300000L        true
    // 10000L    400L           false
    final List<PartitionKey> splitPointsEdge = new ArrayList<>();
    splitPointsEdge.add(new PartitionKey(new Object[] { 1000L, 200L, true }));
    splitPointsEdge.add(new PartitionKey(new Object[] { 1000L, 30000L, false }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsEdge));
    final List<PartitionKey> splitPointsReversedEdge = new ArrayList<>();
    splitPointsReversedEdge.add(new PartitionKey(new Object[] { 100L, 1000L, true }));
    splitPointsReversedEdge.add(new PartitionKey(new Object[] { 300L, 2000L, false }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsReversedEdge));
    elements.add(createEdgeForEdgeGroup(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup(5L, 200L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 90L, true));
    elements.add(createEdgeForEdgeGroup(1000L, 10000L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 30000L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 300000L, true));
    elements.add(createEdgeForEdgeGroup(10000L, 400L, false));
    // TestGroups.EDGE_2, split points are [10L, 2000L, true] and [100L, 1000L, false]. Create data with
    // SOURCE    DESTINATION    DIRECTED
    // 5L        5000L          true
    // 10L       2000L          false
    // 10L       2000L          true
    // 10L       3000L          false
    // 100L      1000L          false
    // 100L      3000L          false
    // 100L      3000L          true
    final List<PartitionKey> splitPointsEdge_2 = new ArrayList<>();
    splitPointsEdge_2.add(new PartitionKey(new Object[] { 10L, 2000L, true }));
    splitPointsEdge_2.add(new PartitionKey(new Object[] { 100L, 1000L, false }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsEdge_2));
    final List<PartitionKey> splitPointsReversedEdge_2 = new ArrayList<>();
    splitPointsReversedEdge_2.add(new PartitionKey(new Object[] { 1000L, 1500L, true }));
    splitPointsReversedEdge_2.add(new PartitionKey(new Object[] { 2000L, 2500L, false }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsReversedEdge_2));
    elements.add(createEdgeForEdgeGroup_2(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup_2(10L, 2000L, false));
    elements.add(createEdgeForEdgeGroup_2(10L, 2000L, true));
    elements.add(createEdgeForEdgeGroup_2(10L, 3000L, false));
    elements.add(createEdgeForEdgeGroup_2(100L, 1000L, false));
    elements.add(createEdgeForEdgeGroup_2(100L, 3000L, false));
    elements.add(createEdgeForEdgeGroup_2(100L, 3000L, true));
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionId = (group, partitionId) -> tempFilesDir + "/GROUP=" + group + "/split-" + partitionId;
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionIdForReversedEdge = (group, partitionId) -> tempFilesDir + "/REVERSED-GROUP=" + group + "/split-" + partitionId;
    final WriteUnsortedData writeUnsortedData = new WriteUnsortedData(tempFilesDir, CompressionCodecName.GZIP, schemaUtils, graphPartitioner, fileNameForGroupAndPartitionId, fileNameForGroupAndPartitionIdForReversedEdge);
    // When
    writeUnsortedData.writeElements(elements);
    // Then
    // - For each group, directories split-0, split-1 and split-2 should exist and each contain one file
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-2", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-2", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-2", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-2", 1);
    // - Each split file should contain the data for that split in the order it was written
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, true, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, true, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-2", group, true, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), null, elements, schemaUtils);
    }
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, false, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-0", group, false, true, null, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, false, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-1", group, false, true, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(1), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-2", group, false, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(1), null, elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-2", group, false, true, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(1), null, elements, schemaUtils);
    }
}
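
The split-point arithmetic being asserted follows the usual sorted-partition convention: n split points define n + 1 partitions, and a key equal to a split point belongs to the partition that starts at it (which is why 10L and 11L land in split-1 and 100L in split-2 for TestGroups.ENTITY above). A hypothetical single-column sketch of the lookup, not Gaffer's actual PartitionKey comparison, which works over full object arrays:

static int partitionFor(final long key, final long[] sortedSplitPoints) {
    // Arrays.binarySearch returns the index of an exact match,
    // or (-(insertionPoint) - 1) when the key falls between split points.
    final int idx = Arrays.binarySearch(sortedSplitPoints, key);
    return idx >= 0 ? idx + 1 : -(idx + 1);
}

// partitionFor(5L,   new long[] { 10L, 100L })  -> 0  (split-0)
// partitionFor(10L,  new long[] { 10L, 100L })  -> 1  (split-1)
// partitionFor(200L, new long[] { 10L, 100L })  -> 2  (split-2)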

Example 5 with TempDir

Use of org.junit.jupiter.api.io.TempDir in the Gaffer project by gchq.

From the class WriteUnsortedDataTest, method testOneSplitPointCase.

@Test
public void testOneSplitPointCase(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException {
    // Given
    final String tempFilesDir = tempDir.toAbsolutePath().toString();
    final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    final List<Element> elements = new ArrayList<>();
    // TestGroups.ENTITY, split point is 10L. Create data with
    // VERTEX
    // 5L
    // 10L
    // 10L
    // 10L
    // 20L
    final List<PartitionKey> splitPointsEntity = new ArrayList<>();
    splitPointsEntity.add(new PartitionKey(new Object[] { 10L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY, new GroupPartitioner(TestGroups.ENTITY, splitPointsEntity));
    elements.add(createEntityForEntityGroup(5L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(10L));
    elements.add(createEntityForEntityGroup(20L));
    // TestGroups.ENTITY_2, split point is 100L. Create data with
    // VERTEX
    // 5L
    // 100L
    // 1000L
    final List<PartitionKey> splitPointsEntity_2 = new ArrayList<>();
    splitPointsEntity_2.add(new PartitionKey(new Object[] { 100L }));
    graphPartitioner.addGroupPartitioner(TestGroups.ENTITY_2, new GroupPartitioner(TestGroups.ENTITY_2, splitPointsEntity_2));
    elements.add(createEntityForEntityGroup_2(5L));
    elements.add(createEntityForEntityGroup_2(100L));
    elements.add(createEntityForEntityGroup_2(1000L));
    // TestGroups.EDGE, split point is [1000L, 200L, true]. Create data with
    // SOURCE    DESTINATION    DIRECTED
    // 5L        5000L          true
    // 5L        200L           false
    // 1000L     100L           true
    // 1000L     200L           false
    // 1000L     200L           true
    // 1000L     300L           true
    // 10000L    400L           false
    // 10000L    400L           true
    final List<PartitionKey> splitPointsEdge = new ArrayList<>();
    splitPointsEdge.add(new PartitionKey(new Object[] { 1000L, 200L, true }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsEdge));
    final List<PartitionKey> splitPointsReversedEdge = new ArrayList<>();
    splitPointsReversedEdge.add(new PartitionKey(new Object[] { 1000L, 300L, true }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE, new GroupPartitioner(TestGroups.EDGE, splitPointsReversedEdge));
    elements.add(createEdgeForEdgeGroup(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup(5L, 200L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 100L, true));
    elements.add(createEdgeForEdgeGroup(1000L, 200L, false));
    elements.add(createEdgeForEdgeGroup(1000L, 200L, true));
    elements.add(createEdgeForEdgeGroup(1000L, 300L, true));
    elements.add(createEdgeForEdgeGroup(10000L, 400L, false));
    elements.add(createEdgeForEdgeGroup(10000L, 400L, true));
    // TestGroups.EDGE_2, split point is [10L, 2000L, true]. Create data with
    // (listing matches the createEdgeForEdgeGroup_2 calls below)
    // SOURCE    DESTINATION    DIRECTED
    // 5L        5000L          true
    // 5L        200L           false
    // 1000L     100L           true
    // 1000L     200L           false
    // 1000L     200L           true
    // 1000L     300L           true
    // 10000L    400L           false
    // 10000L    400L           true
    final List<PartitionKey> splitPointsEdge_2 = new ArrayList<>();
    splitPointsEdge_2.add(new PartitionKey(new Object[] { 10L, 2000L, true }));
    graphPartitioner.addGroupPartitioner(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsEdge_2));
    final List<PartitionKey> splitPointsReversedEdge_2 = new ArrayList<>();
    splitPointsReversedEdge_2.add(new PartitionKey(new Object[] { 3000L, 20L, true }));
    graphPartitioner.addGroupPartitionerForReversedEdges(TestGroups.EDGE_2, new GroupPartitioner(TestGroups.EDGE_2, splitPointsReversedEdge_2));
    elements.add(createEdgeForEdgeGroup_2(5L, 5000L, true));
    elements.add(createEdgeForEdgeGroup_2(5L, 200L, false));
    elements.add(createEdgeForEdgeGroup_2(1000L, 100L, true));
    elements.add(createEdgeForEdgeGroup_2(1000L, 200L, false));
    elements.add(createEdgeForEdgeGroup_2(1000L, 200L, true));
    elements.add(createEdgeForEdgeGroup_2(1000L, 300L, true));
    elements.add(createEdgeForEdgeGroup_2(10000L, 400L, false));
    elements.add(createEdgeForEdgeGroup_2(10000L, 400L, true));
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionId = (group, partitionId) -> tempFilesDir + "/GROUP=" + group + "/split-" + partitionId;
    final BiFunction<String, Integer, String> fileNameForGroupAndPartitionIdForReversedEdge = (group, partitionId) -> tempFilesDir + "/REVERSED-GROUP=" + group + "/split-" + partitionId;
    final WriteUnsortedData writeUnsortedData = new WriteUnsortedData(tempFilesDir, CompressionCodecName.GZIP, schemaUtils, graphPartitioner, fileNameForGroupAndPartitionId, fileNameForGroupAndPartitionIdForReversedEdge);
    // When
    writeUnsortedData.writeElements(elements);
    // Then
    // - For each group, directories split-0 and split-1 should exist and each contain one file
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.ENTITY_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/GROUP=" + TestGroups.EDGE_2 + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE + "/split-1", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-0", 1);
    testExistsAndContainsNFiles(tempFilesDir + "/REVERSED-GROUP=" + TestGroups.EDGE_2 + "/split-1", 1);
    // - Each split file should contain the data for that split in the order it was written
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.ENTITY, TestGroups.ENTITY_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, true, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, true, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), null, elements, schemaUtils);
    }
    for (final String group : new HashSet<>(Arrays.asList(TestGroups.EDGE, TestGroups.EDGE_2))) {
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-0", group, false, false, null, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-0", group, false, true, null, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/GROUP=" + group + "/split-1", group, false, false, graphPartitioner.getGroupPartitioner(group).getIthPartitionKey(0), null, elements, schemaUtils);
        testSplitFileContainsCorrectData(tempFilesDir + "/REVERSED-GROUP=" + group + "/split-1", group, false, true, graphPartitioner.getGroupPartitionerForReversedEdges(group).getIthPartitionKey(0), null, elements, schemaUtils);
    }
}
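
The separate REVERSED-GROUP directories exist because each edge is indexed twice, once per direction, with independent split points. A hypothetical sketch of the two key orderings (an assumption about the layout, not Gaffer's actual code): the forward key sorts by (source, destination, directed) while the reversed key sorts by (destination, source, directed), so the same edge can land in different splits of the two directories.

static Object[] forwardKey(final long source, final long destination, final boolean directed) {
    return new Object[] { source, destination, directed };
}

static Object[] reversedKey(final long source, final long destination, final boolean directed) {
    // destination first: the reversed index partitions edges by destination
    return new Object[] { destination, source, directed };
}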

Aggregations

Types shared across the examples above, with usage counts:

Test (org.junit.jupiter.api.Test): 6
TempDir (org.junit.jupiter.api.io.TempDir): 6
IOException (java.io.IOException): 4
ArrayList (java.util.ArrayList): 4
HashSet (java.util.HashSet): 4
List (java.util.List): 4
Configuration (org.apache.hadoop.conf.Configuration): 4
FileSystem (org.apache.hadoop.fs.FileSystem): 4
Path (org.apache.hadoop.fs.Path): 4
CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName): 4
Assertions.assertTrue (org.junit.jupiter.api.Assertions.assertTrue): 4
Edge (uk.gov.gchq.gaffer.data.element.Edge): 4
Element (uk.gov.gchq.gaffer.data.element.Element): 4
Entity (uk.gov.gchq.gaffer.data.element.Entity): 4
ParseException (java.text.ParseException): 3
SimpleDateFormat (java.text.SimpleDateFormat): 3
Arrays (java.util.Arrays): 3
Date (java.util.Date): 3
Map (java.util.Map): 3
TimeZone (java.util.TimeZone): 3