
Example 46 with FreqMap

Use of uk.gov.gchq.gaffer.types.FreqMap in project Gaffer by gchq.

The class IterableToFreqMapTest, method shouldInitialiseTheValueOfTheKeyToOneIfNotSeenBefore.

@Test
public void shouldInitialiseTheValueOfTheKeyToOneIfNotSeenBefore() {
    // Given
    Iterable<String> strings = (Iterable<String>) Arrays.asList("one");
    final IterableToFreqMap iterableToFreqMap = new IterableToFreqMap();
    // When
    FreqMap result = iterableToFreqMap.apply(strings);
    // Then
    FreqMap expected = new FreqMap("one");
    assertEquals(expected, result);
}
Also used: FreqMap (uk.gov.gchq.gaffer.types.FreqMap), Test (org.junit.jupiter.api.Test), FunctionTest (uk.gov.gchq.koryphe.function.FunctionTest)
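
Below is a minimal sketch of what a conversion like the one under test has to do. It relies only on FreqMap behaving as a Map<String, Long>, which the examples on this page already depend on (put, entrySet); the class name IterableToFreqMapSketch and its helper method are hypothetical and are not the Gaffer IterableToFreqMap source.

import java.util.Arrays;

import uk.gov.gchq.gaffer.types.FreqMap;

// Illustrative sketch: every element of the Iterable bumps its count by one, so a
// key seen for the first time ends up with a value of 1, matching the expected
// new FreqMap("one") in the test above.
public final class IterableToFreqMapSketch {

    public static FreqMap toFreqMap(final Iterable<String> strings) {
        final FreqMap freqMap = new FreqMap();
        for (final String s : strings) {
            freqMap.merge(s, 1L, Long::sum);
        }
        return freqMap;
    }

    public static void main(final String[] args) {
        // Prints {one=2, two=1}
        System.out.println(toFreqMap(Arrays.asList("one", "two", "one")));
    }
}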

Example 47 with FreqMap

Use of uk.gov.gchq.gaffer.types.FreqMap in project Gaffer by gchq.

The class DataGen, method generateEdgeRow.

public static GenericRowWithSchema generateEdgeRow(final SchemaUtils utils, final String group, final String src, final String dst, final Boolean directed, final Byte aByte, final Double aDouble, final Float aFloat, final TreeSet<String> treeSet, final Long aLong, final Short aShort, final Date date, final FreqMap freqMap, final String visibility) throws SerialisationException {
    final GafferGroupObjectConverter edgeConverter = new GafferGroupObjectConverter(group, utils.getCoreProperties(group), utils.getCorePropertiesForReversedEdges(), utils.getColumnToSerialiser(group), utils.getSerialisers(), utils.getColumnToPaths(group));
    final List<Object> list = new ArrayList<>();
    final scala.collection.mutable.Map<String, Long> map = new scala.collection.mutable.HashMap<>();
    for (final Map.Entry<String, Long> entry : freqMap.entrySet()) {
        map.put(entry.getKey(), entry.getValue());
    }
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects(ParquetStore.SOURCE, src)));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects(ParquetStore.DESTINATION, dst)));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects(ParquetStore.DIRECTED, directed)));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("byte", aByte)));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("double", aDouble)));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("float", aFloat)));
    list.add(WrappedArray$.MODULE$.make(edgeConverter.gafferObjectToParquetObjects("treeSet", treeSet)[0]));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("long", aLong)));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("short", aShort)));
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("date", date)));
    list.add(map);
    list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects("count", 1)));
    if (null != visibility) {
        list.addAll(Arrays.asList(edgeConverter.gafferObjectToParquetObjects(TestTypes.VISIBILITY, visibility)));
    }
    final Object[] objects = new Object[list.size()];
    list.toArray(objects);
    return new GenericRowWithSchema(objects, utils.getSparkSchema(group));
}
Also used: ArrayList (java.util.ArrayList), GafferGroupObjectConverter (uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter), GenericRowWithSchema (org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema), FreqMap (uk.gov.gchq.gaffer.types.FreqMap), Map (java.util.Map)
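
The loop above copies the FreqMap into a scala.collection.mutable.HashMap so the frequencies can be stored in the GenericRowWithSchema. As a rough sketch, the JavaConversions bridge used in the assertions of the later examples (JavaConversions$.MODULE$.mapAsScalaMap in Example 48) produces the same shape of map; note it returns a live view over the Java map rather than the fresh copy the loop builds, and JavaConversions is deprecated in newer Scala versions, so treat this as illustrative rather than a drop-in change.

import scala.collection.JavaConversions$;

import uk.gov.gchq.gaffer.types.FreqMap;

// Sketch only: wrap the FreqMap as a scala.collection.mutable.Map, the same shape
// the hand-written loop in generateEdgeRow produces (but as a view, not a copy).
public final class FreqMapToScalaSketch {

    public static void main(final String[] args) {
        final FreqMap freqMap = new FreqMap();
        freqMap.put("A", 2L);
        freqMap.put("B", 1L);

        final scala.collection.mutable.Map<String, Long> scalaMap =
                JavaConversions$.MODULE$.mapAsScalaMap(freqMap);

        // Prints Map(A -> 2, B -> 1)
        System.out.println(scalaMap);
    }
}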

Example 48 with FreqMap

Use of uk.gov.gchq.gaffer.types.FreqMap in project Gaffer by gchq.

The class AggregateAndSortDataTest, method test.

@Test
public void test(@TempDir java.nio.file.Path tempDir) throws Exception {
    // Given
    final FileSystem fs = FileSystem.get(new Configuration());
    final SchemaUtils schemaUtils = new SchemaUtils(TestUtils.gafferSchema("schemaUsingLongVertexType"));
    final String file1 = tempDir.resolve("inputdata1.parquet").toString();
    final String file2 = tempDir.resolve("inputdata2.parquet").toString();
    writeData(file1, schemaUtils);
    writeData(file2, schemaUtils);
    final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
    final List<String> inputFiles = new ArrayList<>(Sets.newHashSet(file1, file2));
    final String outputFolder = tempDir.resolve("aggregated").toString();
    // When
    new AggregateAndSortData(schemaUtils, fs, inputFiles, outputFolder, TestGroups.ENTITY, "test", false, CompressionCodecName.GZIP, sparkSession).call();
    // Then
    assertTrue(fs.exists(new Path(outputFolder)));
    final Row[] results = (Row[]) sparkSession.read().parquet(outputFolder).collect();
    // Should be sorted by vertex and date
    for (int i = 0; i < 40; i++) {
        assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
        assertEquals(i % 2 == 0 ? 'b' : 'a', ((byte[]) results[i].getAs("byte"))[0]);
        assertEquals(i % 2 == 0 ? 8f : 6f, results[i].getAs("float"), 0.01f);
        assertEquals(11L * 2 * (i / 2), (long) results[i].getAs("long"));
        assertEquals(i % 2 == 0 ? 14 : 12, (int) results[i].getAs("short"));
        assertEquals(i % 2 == 0 ? 100000L : 200000L, (long) results[i].getAs("date"));
        assertEquals(2, (int) results[i].getAs("count"));
        assertArrayEquals(i % 2 == 0 ? new String[] { "A", "C" } : new String[] { "A", "B" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
        final FreqMap mergedFreqMap1 = new FreqMap();
        mergedFreqMap1.put("A", 2L);
        mergedFreqMap1.put("B", 2L);
        final FreqMap mergedFreqMap2 = new FreqMap();
        mergedFreqMap2.put("A", 2L);
        mergedFreqMap2.put("C", 2L);
        assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(i % 2 == 0 ? mergedFreqMap2 : mergedFreqMap1), results[i].getAs("freqMap"));
    }
}
Also used: Path (org.apache.hadoop.fs.Path), SparkSession (org.apache.spark.sql.SparkSession), Configuration (org.apache.hadoop.conf.Configuration), FreqMap (uk.gov.gchq.gaffer.types.FreqMap), ArrayList (java.util.ArrayList), AggregateAndSortData (uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.AggregateAndSortData), WrappedArray (scala.collection.mutable.WrappedArray), FileSystem (org.apache.hadoop.fs.FileSystem), Row (org.apache.spark.sql.Row), Test (org.junit.jupiter.api.Test)
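
For context on the expected values: the same generated data is written to both input files, so after AggregateAndSortData runs every aggregated property is the sum over the two files, which is why count is 2 and every FreqMap entry is 2. The sketch below shows that key-wise sum using only Map<String, Long> operations; the real merge is driven by the aggregators configured in the Gaffer schema, not by this hypothetical helper.

import java.util.Map;

import uk.gov.gchq.gaffer.types.FreqMap;

// Illustrative sketch of the key-wise sum that aggregation applies to the "freqMap"
// column when the same entries appear in both input files.
public final class FreqMapSumSketch {

    public static FreqMap sum(final FreqMap first, final FreqMap second) {
        final FreqMap merged = new FreqMap();
        merged.putAll(first);
        for (final Map.Entry<String, Long> entry : second.entrySet()) {
            merged.merge(entry.getKey(), entry.getValue(), Long::sum);
        }
        return merged;
    }

    public static void main(final String[] args) {
        final FreqMap fromFile1 = new FreqMap();
        fromFile1.put("A", 1L);
        fromFile1.put("B", 1L);
        final FreqMap fromFile2 = new FreqMap();
        fromFile2.put("A", 1L);
        fromFile2.put("B", 1L);
        // Prints {A=2, B=2}, matching mergedFreqMap1 in the test above
        System.out.println(sum(fromFile1, fromFile2));
    }
}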

Example 49 with FreqMap

Use of uk.gov.gchq.gaffer.types.FreqMap in project Gaffer by gchq.

The class AddElementsHandlerTest, method testRepeatedCallsOfAddElementsHandler.

@Test
public void testRepeatedCallsOfAddElementsHandler(@TempDir java.nio.file.Path tempDir) throws IOException, OperationException, StoreException {
    // Given
    final List<Element> elementsToAdd = new ArrayList<>();
    // - Data for TestGroups.ENTITY
    elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
    elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
    // - Data for TestGroups.ENTITY_2
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L));
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L));
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L));
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L));
    // - Data for TestGroups.EDGE
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L)));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L)));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L)));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L)));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L)));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L)));
    // - Data for TestGroups.EDGE_2
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true));
    elementsToAdd.add(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false));
    // - Shuffle the list so that the order is random
    Collections.shuffle(elementsToAdd);
    final AddElements add = new AddElements.Builder().input(elementsToAdd).build();
    final Context context = new Context();
    final Schema schema = TestUtils.gafferSchema("schemaUsingLongVertexType");
    final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
    final String testDir = tempDir.toString();
    storeProperties.setDataDir(testDir + "/data");
    storeProperties.setTempFilesDir(testDir + "/tmpdata");
    final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);
    final FileSystem fs = FileSystem.get(new Configuration());
    final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
    // When1 - Add elementsToAdd twice
    new AddElementsHandler().doOperation(add, context, store);
    new AddElementsHandler().doOperation(add, context, store);
    // Then1
    // - New snapshot directory should have been created.
    long snapshotId = store.getLatestSnapshot();
    Path snapshotPath = new Path(testDir + "/data", ParquetStore.getSnapshotPath(snapshotId));
    assertTrue(fs.exists(snapshotPath));
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity"
    // directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by vertex and date.
    Row[] results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(40);
    for (int i = 0; i < 40; i++) {
        assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
        assertEquals(i % 2 == 0 ? 'b' : 'a', ((byte[]) results[i].getAs("byte"))[0]);
        assertEquals(i % 2 == 0 ? 16f : 12f, results[i].getAs("float"), 0.01f);
        assertEquals(11L * 2 * 2 * (i / 2), (long) results[i].getAs("long"));
        assertEquals(i % 2 == 0 ? 28 : 24, (int) results[i].getAs("short"));
        assertEquals(i % 2 == 0 ? 100000L : 200000L, (long) results[i].getAs("date"));
        assertEquals(4, (int) results[i].getAs("count"));
        assertArrayEquals(i % 2 == 0 ? new String[] { "A", "C" } : new String[] { "A", "B" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
        final FreqMap mergedFreqMap1 = new FreqMap();
        mergedFreqMap1.put("A", 4L);
        mergedFreqMap1.put("B", 4L);
        final FreqMap mergedFreqMap2 = new FreqMap();
        mergedFreqMap2.put("A", 4L);
        mergedFreqMap2.put("C", 4L);
        assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(i % 2 == 0 ? mergedFreqMap2 : mergedFreqMap1), results[i].getAs("freqMap"));
    }
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity2"
    // directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by vertex.
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(8);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[0]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[1]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[2]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[3]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[4]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[5]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[6]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[7]);
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge"
    // directory and in the "reversed-group=BasicEdge" directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by source, destination, directed, date
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(6);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L), (short) 2), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L), (short) 2), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L), (short) 2), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L), (short) 2), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L), (short) 2), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L), (short) 2), results[5]);
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(6);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L), (short) 2), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L), (short) 2), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L), (short) 2), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L), (short) 2), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L), (short) 2), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L), (short) 2), results[5]);
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge2"
    // directory and in the "reversed-group=BasicEdge2" directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by source, destination, directed
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(8);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[5]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[6]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[7]);
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(8);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[5]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[6]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[7]);
    // When2 - Add some elements from only TestGroups.ENTITY_2 (this tests that groups that are unchanged after
    // an AddElements operation are correctly copied through to the new snapshot).
    elementsToAdd.clear();
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L));
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L));
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L));
    elementsToAdd.add(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L));
    new AddElementsHandler().doOperation(add, context, store);
    // Then2
    // - New snapshot directory should have been created.
    snapshotId = store.getLatestSnapshot();
    snapshotPath = new Path(testDir + "/data", ParquetStore.getSnapshotPath(snapshotId));
    assertTrue(fs.exists(snapshotPath));
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity"
    // directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by vertex and date.
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(40);
    for (int i = 0; i < 40; i++) {
        assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
        assertEquals(i % 2 == 0 ? 'b' : 'a', ((byte[]) results[i].getAs("byte"))[0]);
        assertEquals(i % 2 == 0 ? 16f : 12f, results[i].getAs("float"), 0.01f);
        assertEquals(11L * 2 * 2 * (i / 2), (long) results[i].getAs("long"));
        assertEquals(i % 2 == 0 ? 28 : 24, (int) results[i].getAs("short"));
        assertEquals(i % 2 == 0 ? 100000L : 200000L, (long) results[i].getAs("date"));
        assertEquals(4, (int) results[i].getAs("count"));
        assertArrayEquals(i % 2 == 0 ? new String[] { "A", "C" } : new String[] { "A", "B" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
        final FreqMap mergedFreqMap1 = new FreqMap();
        mergedFreqMap1.put("A", 4L);
        mergedFreqMap1.put("B", 4L);
        final FreqMap mergedFreqMap2 = new FreqMap();
        mergedFreqMap2.put("A", 4L);
        mergedFreqMap2.put("C", 4L);
        assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(i % 2 == 0 ? mergedFreqMap2 : mergedFreqMap1), results[i].getAs("freqMap"));
    }
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity2"
    // directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by vertex.
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(12);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[0]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[1]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(1L), results[2]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[3]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[4]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10L), results[5]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[6]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[7]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(100L), results[8]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[9]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[10]);
    checkEntityGroup2(WriteUnsortedDataTest.createEntityForEntityGroup_2(10000L), results[11]);
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge"
    // directory and in the "reversed-group=BasicEdge" directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by source, destination, directed, date
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(6);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L), (short) 2), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L), (short) 2), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L), (short) 2), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L), (short) 2), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L), (short) 2), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L), (short) 2), results[5]);
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(6);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 2L, false, new Date(400L), (short) 2), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, false, new Date(400L), (short) 2), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(300L), (short) 2), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(1L, 10L, true, new Date(400L), (short) 2), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(10000L, 1000L, true, new Date(100L), (short) 2), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup(100L, 100000L, false, new Date(200L), (short) 2), results[5]);
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge2"
    // directory and in the "reversed-group=BasicEdge2" directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by source, destination, directed
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(8);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[5]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[6]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[7]);
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(8);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[5]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[6]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[7]);
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEdge2"
    // directory and in the "reversed-group=BasicEdge2" directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by source, destination, directed
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(8);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[5]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[6]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[7]);
    results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.EDGE_2, true) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(8);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[0]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10000L, 20L, true), results[1]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[2]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(10L, 50L, true), results[3]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[4]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(100L, 200L, false), results[5]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[6]);
    checkEdge(WriteUnsortedDataTest.createEdgeForEdgeGroup_2(1L, 2000L, false), results[7]);
}
Also used: AddElements (uk.gov.gchq.gaffer.operation.impl.add.AddElements), Context (uk.gov.gchq.gaffer.store.Context), ParquetStore (uk.gov.gchq.gaffer.parquetstore.ParquetStore), Path (org.apache.hadoop.fs.Path), SparkSession (org.apache.spark.sql.SparkSession), Configuration (org.apache.hadoop.conf.Configuration), FreqMap (uk.gov.gchq.gaffer.types.FreqMap), Element (uk.gov.gchq.gaffer.data.element.Element), Schema (uk.gov.gchq.gaffer.store.schema.Schema), ArrayList (java.util.ArrayList), Date (java.util.Date), WrappedArray (scala.collection.mutable.WrappedArray), ParquetStoreProperties (uk.gov.gchq.gaffer.parquetstore.ParquetStoreProperties), FileSystem (org.apache.hadoop.fs.FileSystem), Row (org.apache.spark.sql.Row), WriteUnsortedDataTest (uk.gov.gchq.gaffer.parquetstore.utils.WriteUnsortedDataTest), Test (org.junit.jupiter.api.Test), AggregateAndSortDataTest (uk.gov.gchq.gaffer.parquetstore.utils.AggregateAndSortDataTest)
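
The test above builds the expected partition-0.parquet location with the same new Path(snapshotPath, ParquetStore.getGroupSubDir(group, reversed) + "/" + ParquetStore.getFile(0)) expression throughout. A small hypothetical helper (not part of the test) could keep those assertions shorter; it assumes only the ParquetStore.getGroupSubDir and ParquetStore.getFile methods the test already calls.

import org.apache.hadoop.fs.Path;

import uk.gov.gchq.gaffer.parquetstore.ParquetStore;

// Hypothetical refactoring sketch: centralise the expected partition-0.parquet
// (and associated .crc) locations used repeatedly in the assertions above.
public final class PartitionPaths {

    private PartitionPaths() {
    }

    public static Path partition0(final Path snapshotPath, final String group, final boolean reversed) {
        return new Path(snapshotPath, ParquetStore.getGroupSubDir(group, reversed) + "/" + ParquetStore.getFile(0));
    }

    public static Path partition0Crc(final Path snapshotPath, final String group, final boolean reversed) {
        return new Path(snapshotPath, ParquetStore.getGroupSubDir(group, reversed) + "/." + ParquetStore.getFile(0) + ".crc");
    }
}

With such a helper the existence checks would read, for example, assertTrue(fs.exists(PartitionPaths.partition0(snapshotPath, TestGroups.ENTITY, false))).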

Example 50 with FreqMap

Use of uk.gov.gchq.gaffer.types.FreqMap in project Gaffer by gchq.

The class AddElementsHandlerTest, method testOnePartitionOneGroup.

@Test
public void testOnePartitionOneGroup(@TempDir java.nio.file.Path tempDir) throws OperationException, IOException, StoreException {
    // Given
    final List<Element> elementsToAdd = new ArrayList<>();
    elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
    elementsToAdd.addAll(AggregateAndSortDataTest.generateData());
    final AddElements add = new AddElements.Builder().input(elementsToAdd).build();
    final Context context = new Context();
    final Schema schema = TestUtils.gafferSchema("schemaUsingLongVertexType");
    final ParquetStoreProperties storeProperties = new ParquetStoreProperties();
    final String testDir = tempDir.toString();
    storeProperties.setDataDir(testDir + "/data");
    storeProperties.setTempFilesDir(testDir + "/tmpdata");
    final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);
    final FileSystem fs = FileSystem.get(new Configuration());
    final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
    // When
    new AddElementsHandler().doOperation(add, context, store);
    // Then
    // - New snapshot directory should have been created.
    final long snapshotId = store.getLatestSnapshot();
    final Path snapshotPath = new Path(testDir + "/data", ParquetStore.getSnapshotPath(snapshotId));
    assertTrue(fs.exists(snapshotPath));
    // - There should be 1 file named partition-0.parquet (and an associated .crc file) in the "group=BasicEntity"
    // directory.
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0))));
    assertTrue(fs.exists(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/." + ParquetStore.getFile(0) + ".crc")));
    // - The files should contain the data sorted by vertex and date.
    final Row[] results = (Row[]) sparkSession.read().parquet(new Path(snapshotPath, ParquetStore.getGroupSubDir(TestGroups.ENTITY, false) + "/" + ParquetStore.getFile(0)).toString()).collect();
    assertThat(results).hasSize(40);
    for (int i = 0; i < 40; i++) {
        assertEquals((long) i / 2, (long) results[i].getAs(ParquetStore.VERTEX));
        assertEquals(i % 2 == 0 ? 'b' : 'a', ((byte[]) results[i].getAs("byte"))[0]);
        assertEquals(i % 2 == 0 ? 8f : 6f, results[i].getAs("float"), 0.01f);
        assertEquals(11L * 2 * (i / 2), (long) results[i].getAs("long"));
        assertEquals(i % 2 == 0 ? 14 : 12, (int) results[i].getAs("short"));
        assertEquals(i % 2 == 0 ? 100000L : 200000L, (long) results[i].getAs("date"));
        assertEquals(2, (int) results[i].getAs("count"));
        assertArrayEquals(i % 2 == 0 ? new String[] { "A", "C" } : new String[] { "A", "B" }, (String[]) ((WrappedArray<String>) results[i].getAs("treeSet")).array());
        final FreqMap mergedFreqMap1 = new FreqMap();
        mergedFreqMap1.put("A", 2L);
        mergedFreqMap1.put("B", 2L);
        final FreqMap mergedFreqMap2 = new FreqMap();
        mergedFreqMap2.put("A", 2L);
        mergedFreqMap2.put("C", 2L);
        assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(i % 2 == 0 ? mergedFreqMap2 : mergedFreqMap1), results[i].getAs("freqMap"));
    }
}
Also used: AddElements (uk.gov.gchq.gaffer.operation.impl.add.AddElements), Context (uk.gov.gchq.gaffer.store.Context), ParquetStore (uk.gov.gchq.gaffer.parquetstore.ParquetStore), Path (org.apache.hadoop.fs.Path), SparkSession (org.apache.spark.sql.SparkSession), Configuration (org.apache.hadoop.conf.Configuration), FreqMap (uk.gov.gchq.gaffer.types.FreqMap), Element (uk.gov.gchq.gaffer.data.element.Element), Schema (uk.gov.gchq.gaffer.store.schema.Schema), ArrayList (java.util.ArrayList), WrappedArray (scala.collection.mutable.WrappedArray), ParquetStoreProperties (uk.gov.gchq.gaffer.parquetstore.ParquetStoreProperties), FileSystem (org.apache.hadoop.fs.FileSystem), Row (org.apache.spark.sql.Row), WriteUnsortedDataTest (uk.gov.gchq.gaffer.parquetstore.utils.WriteUnsortedDataTest), Test (org.junit.jupiter.api.Test), AggregateAndSortDataTest (uk.gov.gchq.gaffer.parquetstore.utils.AggregateAndSortDataTest)
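
Note that the expected values here are half those asserted in Example 49: the same generated batch appears twice in the input, but the AddElements operation runs only once, so after aggregation the count is 2 and each FreqMap entry is 2, in line with Example 48.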

Aggregations

FreqMap (uk.gov.gchq.gaffer.types.FreqMap): 62
Test (org.junit.jupiter.api.Test): 29
ArrayList (java.util.ArrayList): 9
Edge (uk.gov.gchq.gaffer.data.element.Edge): 9
Entity (uk.gov.gchq.gaffer.data.element.Entity): 9
Date (java.util.Date): 6
Schema (uk.gov.gchq.gaffer.store.schema.Schema): 6
FunctionTest (uk.gov.gchq.koryphe.function.FunctionTest): 6
Configuration (org.apache.hadoop.conf.Configuration): 5
FileSystem (org.apache.hadoop.fs.FileSystem): 5
Path (org.apache.hadoop.fs.Path): 5
Row (org.apache.spark.sql.Row): 5
SparkSession (org.apache.spark.sql.SparkSession): 5
WrappedArray (scala.collection.mutable.WrappedArray): 5
Element (uk.gov.gchq.gaffer.data.element.Element): 5
HashMap (java.util.HashMap): 4
Key (org.apache.accumulo.core.data.Key): 4
Test (org.junit.Test): 4
EdgeId (uk.gov.gchq.gaffer.data.element.id.EdgeId): 4
ElementId (uk.gov.gchq.gaffer.data.element.id.ElementId): 4