
Example 6 with ParquetStore

Use of uk.gov.gchq.gaffer.parquetstore.ParquetStore in the Gaffer project by gchq.

From the class AddElementsHandlerTest, the method testRepeatedCallsOfAddElementsHandler.

@Test
public void testRepeatedCallsOfAddElementsHandler(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException, StoreException {
    // Given
    final List<Element> elementsToAdd = new ArrayList<>();
    // - Data for TestGroups.ENTITY
    elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
    elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
    // - Data for TestGroups.ENTITY_2
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L));
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L));
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L));
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L));
    // - Data for TestGroups.EDGE
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L)));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L)));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L)));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L)));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L)));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L)));
    // - Data for TestGroups.EDGE_2
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false));
    // - Shuffle the list so that the order is random
    Collections.shuffle(elementsToAdd);
    final AddElements add = new AddElements.Builder().input(elementsToAdd).build();
    final Context context = new Context();
    final Schema schema = TestUtils.gafferSchema("schemaUsingLongVertexType");
    final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
    final String testDir = tempDir.toString();
    storeProperties.setDataDir(testDir + "/data");
    storeProperties.setTempFilesDir(testDir + "/tmpdata");
    final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);
    final FileSystem fs = FileSystem.get(new Configuration());
    final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
    // When1 - Add elementsToAdd twice
    new AddElementsHandler().doOperation(add, context, store);
    new AddElementsHandler().doOperation(add, context, store);
    // Then1
    // - New snapshot directory should have been created.
    long snapshotId = store.getLatestSnapshot();
    Path snapshotPath = new Path(testDir + "/data", ParquetStore.getSnapshotPath(snapshotId));
    assertTrue(fs.exists(snapshotPath));
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity"
    // directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by vertex and date.
    Row[] results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(40);
    for (int i = 0; i < 40; i++) {
        assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
        assertEquals(i % 2 == 0 ? 'b' : 'a', ((byte[]) results[i].getAs("byte"))[0]);
        assertEquals(i % 2 == 0 ? 16f : 12f, results[i].getAs("float"), 0.01f);
        assertEquals(11L * 2 * 2 * (i / 2), (long) results[i].getAs("long"));
        assertEquals(i % 2 == 0 ? 28 : 24, (int) results[i].getAs("short"));
        assertEquals(i % 2 == 0 ? 100000L : 200000L, (long) results[i].getAs("date"));
        assertEquals(4, (int) results[i].getAs("count"));
        assertArrayEquals(i % 2 == 0 ? new String[] { "A", "C" } : new String[] { "A", "B" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
        final FreqMap mergedFreqMap1 = new FreqMap();
        mergedFreqMap1.put("A", 4L);
        mergedFreqMap1.put("B", 4L);
        final FreqMap mergedFreqMap2 = new FreqMap();
        mergedFreqMap2.put("A", 4L);
        mergedFreqMap2.put("C", 4L);
        assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(i % 2 == 0 ? mergedFreqMap2 : mergedFreqMap1), results[i].getAs("freqMap"));
    }
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity2"
    // directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by vertex.
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(8);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[0]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[1]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[2]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[3]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[4]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[5]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[6]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[7]);
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge"
    // directory and in the "reversed-group=BasicEdge" directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by source, destination, directed, date
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(6);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L), (short) 2), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L), (short) 2), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L), (short) 2), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L), (short) 2), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L), (short) 2), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L), (short) 2), results[5]);
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(6);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L), (short) 2), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L), (short) 2), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L), (short) 2), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L), (short) 2), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L), (short) 2), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L), (short) 2), results[5]);
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge2"
    // directory and in the "reversed-group=BasicEdge2" directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by source, destination, directed
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(8);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[5]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[6]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[7]);
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(8);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[5]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[6]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[7]);
    // When2 - Add some elements from only TestGroups.ENTITY_2 (this tests that groups that are unchanged after
    // an AddElements operation are correctly copied through to the new snapshot).
    elementsToAdd.clear();
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L));
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L));
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L));
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L));
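    // Note: the AddElements operation built above holds a reference to elementsToAdd, so this
    // cleared-and-repopulated list (four BasicEntity2 entities) is what the next doOperation call ingests.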
    new AddElementsHandler().doOperation(add, context, store);
    // Then2
    // - New snapshot directory should have been created.
    snapshotId = store.getLatestSnapshot();
    snapshotPath = new Path(testDir + "/data", ParquetStore.getSnapshotPath(snapshotId));
    assertTrue(fs.exists(snapshotPath));
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity"
    // directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by vertex and date.
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(40);
    for (int i = 0; i < 40; i++) {
        assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
        assertEquals(i % 2 == 0 ? 'b' : 'a', ((byte[]) results[i].getAs("byte"))[0]);
        assertEquals(i % 2 == 0 ? 16f : 12f, results[i].getAs("float"), 0.01f);
        assertEquals(11L * 2 * 2 * (i / 2), (long) results[i].getAs("long"));
        assertEquals(i % 2 == 0 ? 28 : 24, (int) results[i].getAs("short"));
        assertEquals(i % 2 == 0 ? 100000L : 200000L, (long) results[i].getAs("date"));
        assertEquals(4, (int) results[i].getAs("count"));
        assertArrayEquals(i % 2 == 0 ? new String[] { "A", "C" } : new String[] { "A", "B" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
        final FreqMap mergedFreqMap1 = new FreqMap();
        mergedFreqMap1.put("A", 4L);
        mergedFreqMap1.put("B", 4L);
        final FreqMap mergedFreqMap2 = new FreqMap();
        mergedFreqMap2.put("A", 4L);
        mergedFreqMap2.put("C", 4L);
        assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(i % 2 == 0 ? mergedFreqMap2 : mergedFreqMap1), results[i].getAs("freqMap"));
    }
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity2"
    // directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by vertex.
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(12);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[0]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[1]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[2]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[3]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[4]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[5]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[6]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[7]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[8]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[9]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[10]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[11]);
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge"
    // directory and in the "reversed-group=BasicEdge" directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by source, destination, directed, date
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(6);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L), (short) 2), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L), (short) 2), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L), (short) 2), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L), (short) 2), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L), (short) 2), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L), (short) 2), results[5]);
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(6);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L), (short) 2), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L), (short) 2), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L), (short) 2), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L), (short) 2), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L), (short) 2), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L), (short) 2), results[5]);
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge2"
    // directory and in the "reversed-group=BasicEdge2" directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by source, destination, directed
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(8);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[5]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[6]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[7]);
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(8);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[5]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[6]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[7]);
}
Also used : AddElements(uk.gov.gchq.gaffer.operation.impl.add.AddElements) Context(uk.gov.gchq.gaffer.store.Context) ParquetStore(uk.gov.gchq.gaffer.parquetstore.ParquetStore) Path(org.apache.hadoop.fs.Path) SparkSession(org.apache.spark.sql.SparkSession) Configuration(org.apache.hadoop.conf.Configuration) FreqMap(uk.gov.gchq.gaffer.types.FreqMap) Element(uk.gov.gchq.gaffer.data.element.Element) Schema(uk.gov.gchq.gaffer.store.schema.Schema) ArrayList(java.util.ArrayList) Date(java.util.Date) WrappedArray(scala.collection.mutable.WrappedArray) ParquetStoreProperties(uk.gov.gchq.gaffer.parquetstore.ParquetStoreProperties) FileSystem(org.apache.hadoop.fs.FileSystem) Row(org.apache.spark.sql.Row) WriteUnsortedDataTest(uk.gov.gchq.gaffer.parquetstore.utils.WriteUnsortedDataTest) Test(org.junit.jupiter.api.Test) AggregateAndSortDataTest(uk.gov.gchq.gaffer.parquetstore.utils.AggregateAndSortDataTest)
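
For orientation, the sketch below shows how the same ParquetStore configuration used in these tests would typically be driven through Gaffer's public Graph API rather than by invoking AddElementsHandler directly. It is a minimal sketch, not part of the test above: it assumes the standard Gaffer classes Graph (uk.gov.gchq.gaffer.graph.Graph), GraphConfig and User, and the data directory paths and element source are illustrative placeholders.

// Minimal sketch (assumed setup, not taken from the test): the same store properties wired into a Graph.
final Schema schema = TestUtils.gafferSchema("schemaUsingLongVertexType");
final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
// Illustrative paths - in the tests these come from the JUnit @TempDir.
storeProperties.setDataDir("/tmp/parquet-example/data");
storeProperties.setTempFilesDir("/tmp/parquet-example/tmpdata");
// May already be the default for ParquetStoreProperties; set explicitly here to be safe.
storeProperties.setStoreClass(ParquetStore.class.getName());

final Graph graph = new Graph.Builder()
        .config(new GraphConfig.Builder().graphId("graphId").build())
        .addSchema(schema)
        .storeProperties(storeProperties)
        .build();

// Adding elements through the Graph routes the operation to the store's AddElements handler,
// which is the code path the tests above exercise directly.
graph.execute(new AddElements.Builder()
        .input(AggregateAndSortDataTest.generateData())
        .build(), new User("user"));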

Example 7 with ParquetStore

Use of uk.gov.gchq.gaffer.parquetstore.ParquetStore in the Gaffer project by gchq.

From the class AddElementsHandlerTest, the method testOnePartitionOneGroup.

@Test
public void testOnePartitionOneGroup(@TempDir java.nio.file.Path tempDir) throws OperationException, IOException, StoreException {
    // Given
    final List<Element> elementsToAdd = new ArrayList<>();
    elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
    elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
    final AddElements add = new AddElements.Builder().input(elementsToAdd).build();
    final Context context = new Context();
    final Schema schema = TestUtils.gafferSchema("schemaUsingLongVertexType");
    final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
    final String testDir = tempDir.toString();
    storeProperties.setDataDir(testDir + "/data");
    storeProperties.setTempFilesDir(testDir + "/tmpdata");
    final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);
    final FileSystem fs = FileSystem.get(new Configuration());
    final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
    // When
    new AddElementsHandler().doOperation(add, context, store);
    // Then
    // - New snapshot directory should have been created.
    final long snapshotId = store.getLatestSnapshot();
    final Path snapshotPath = new Path(testDir + "/data", ParquetStore.getSnapshotPath(snapshotId));
    assertTrue(fs.exists(snapshotPath));
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity"
    // directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by vertex and date.
    final Row[] results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(40);
    for (int i = 0; i < 40; i++) {
        assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
        assertEquals(i % 2 == 0 ? 'b' : 'a', ((byte[]) results[i].getAs("byte"))[0]);
        assertEquals(i % 2 == 0 ? 8f : 6f, results[i].getAs("float"), 0.01f);
        assertEquals(11L * 2 * (i / 2), (long) results[i].getAs("long"));
        assertEquals(i % 2 == 0 ? 14 : 12, (int) results[i].getAs("short"));
        assertEquals(i % 2 == 0 ? 100000L : 200000L, (long) results[i].getAs("date"));
        assertEquals(2, (int) results[i].getAs("count"));
        assertArrayEquals(i % 2 == 0 ? new String[] { "A", "C" } : new String[] { "A", "B" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
        final FreqMap mergedFreqMap1 = new FreqMap();
        mergedFreqMap1.put("A", 2L);
        mergedFreqMap1.put("B", 2L);
        final FreqMap mergedFreqMap2 = new FreqMap();
        mergedFreqMap2.put("A", 2L);
        mergedFreqMap2.put("C", 2L);
        assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(i % 2 == 0 ? mergedFreqMap2 : mergedFreqMap1), results[i].getAs("freqMap"));
    }
}
Also used : AddElements(uk.gov.gchq.gaffer.operation.impl.add.AddElements) Context(uk.gov.gchq.gaffer.store.Context) ParquetStore(uk.gov.gchq.gaffer.parquetstore.ParquetStore) Path(org.apache.hadoop.fs.Path) SparkSession(org.apache.spark.sql.SparkSession) Configuration(org.apache.hadoop.conf.Configuration) FreqMap(uk.gov.gchq.gaffer.types.FreqMap) Element(uk.gov.gchq.gaffer.data.element.Element) Schema(uk.gov.gchq.gaffer.store.schema.Schema) ArrayList(java.util.ArrayList) WrappedArray(scala.collection.mutable.WrappedArray) ParquetStoreProperties(uk.gov.gchq.gaffer.parquetstore.ParquetStoreProperties) FileSystem(org.apache.hadoop.fs.FileSystem) Row(org.apache.spark.sql.Row) WriteUnsortedDataTest(uk.gov.gchq.gaffer.parquetstore.utils.WriteUnsortedDataTest) Test(org.junit.jupiter.api.Test) AggregateAndSortDataTest(uk.gov.gchq.gaffer.parquetstore.utils.AggregateAndSortDataTest)
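
A note on the asserted values: the two examples differ only in how many copies of each generated element are aggregated before the sorted files are written. Example 7 ingests the doubled output of generateData() with a single handler call (two copies per distinct element), while example 6 ingests the same doubled input with two handler calls (four copies). The short sketch below is not part of the original tests; it simply restates that arithmetic, with per-copy values inferred from the assertions.

// Sketch only: how the sum-aggregated property values asserted above scale with the number of copies.
final int copiesExample7 = 2;      // generateData() appended twice, doOperation called once
final int copiesExample6 = 2 * 2;  // generateData() appended twice, doOperation called twice
// count:       1 per copy  -> 2 vs 4
// freqMap "A": 1L per copy -> 2L vs 4L
// short:       7 or 6 per copy -> 14/12 vs 28/24
// float:       8f/6f vs 16f/12f; "long": 11L * 2 * (i / 2) vs 11L * 2 * 2 * (i / 2)
System.out.println(copiesExample7 + " copies in example 7 vs " + copiesExample6 + " in example 6");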
