Search in sources:

Example 11 with PartitionKey

use of uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey in project Gaffer by gchq.

In the class PartitionKeySerialiserTest, the method shouldCreatePartitionKey.

@Test
public void shouldCreatePartitionKey(@TempDir Path tempDir) throws IOException {
    // Given: a key containing one value of each type the serialiser supports
    // (boolean, long, int, String, float, short, byte, byte[]).
    final Object[] key = new Object[] { true, 1L, 5, "ABC", 10F, (short) 1, (byte) 64, new byte[] { (byte) 1, (byte) 2, (byte) 3 } };
    final PartitionKey partitionKey = new PartitionKey(key);
    final PartitionKeySerialiser serialiser = new PartitionKeySerialiser();

    // When: round-trip the key through a file.
    // try-with-resources guarantees the streams are closed even if
    // write/read throws — the previous explicit close() calls leaked
    // the stream on an exception.
    final String filename = tempDir.resolve("test").toString();
    try (DataOutputStream dos = new DataOutputStream(new FileOutputStream(filename))) {
        serialiser.write(partitionKey, dos);
    }
    final PartitionKey readPartitionKey;
    try (DataInputStream dis = new DataInputStream(new FileInputStream(filename))) {
        readPartitionKey = serialiser.read(dis);
    }

    // Then: the deserialised key is element-wise equal to the original
    // (assertArrayEquals performs a deep comparison, including the byte[]).
    assertArrayEquals(key, readPartitionKey.getPartitionKey());
}
Also used : DataOutputStream(java.io.DataOutputStream) FileOutputStream(java.io.FileOutputStream) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) PositiveInfinityPartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PositiveInfinityPartitionKey) NegativeInfinityPartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.NegativeInfinityPartitionKey) DataInputStream(java.io.DataInputStream) FileInputStream(java.io.FileInputStream) Test(org.junit.jupiter.api.Test)

Example 12 with PartitionKey

use of uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey in project Gaffer by gchq.

In the class CalculatePartitioner, the method call.

/**
 * Calculates a {@link GraphPartitioner} for the graph. For every group (and
 * additionally for the reversed-edge copies of each edge group) this reads the
 * first element of each sorted Parquet file, skipping the first file, and uses
 * those elements as the partition keys.
 *
 * @return the calculated {@link GraphPartitioner}
 * @throws IOException if a file listing fails, a file cannot be read, or a
 *         file unexpectedly contains no elements
 */
public GraphPartitioner call() throws IOException {
    final SchemaUtils schemaUtils = new SchemaUtils(schema);
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    for (final String group : schema.getGroups()) {
        LOGGER.info("Calculating GroupPartitioner for group {}", group);
        final GroupPartitioner groupPartitioner =
                new GroupPartitioner(group, readPartitionKeys(schemaUtils, group, false));
        graphPartitioner.addGroupPartitioner(group, groupPartitioner);
        LOGGER.info("GroupPartitioner for group {} is {}", group, groupPartitioner);
    }
    for (final String group : schema.getEdgeGroups()) {
        LOGGER.info("Calculating GroupPartitioner for reversed edge group {}", group);
        final GroupPartitioner groupPartitioner =
                new GroupPartitioner(group, readPartitionKeys(schemaUtils, group, true));
        graphPartitioner.addGroupPartitionerForReversedEdges(group, groupPartitioner);
    }
    return graphPartitioner;
}

/**
 * Reads the first element of each sorted file in the group's directory
 * (skipping the first file — partition keys mark the boundaries *between*
 * files) and converts it into a {@link PartitionKey}.
 *
 * @param schemaUtils utils used to obtain the group's converter
 * @param group the group to read
 * @param reversed true to read the reversed-edge directory and convert via
 *        the reversed-edge core properties
 * @return the partition keys, in sorted-file order
 * @throws IOException if listing or reading fails, or a file is empty
 */
private List<PartitionKey> readPartitionKeys(final SchemaUtils schemaUtils,
                                             final String group,
                                             final boolean reversed) throws IOException {
    final GafferGroupObjectConverter converter = schemaUtils.getConverter(group);
    final List<PartitionKey> partitionKeys = new ArrayList<>();
    final Path groupPath = new Path(path, ParquetStore.getGroupSubDir(group, reversed));
    final FileStatus[] files = fs.listStatus(groupPath, p -> p.getName().endsWith(".parquet"));
    final SortedSet<Path> sortedFiles = new TreeSet<>();
    Arrays.stream(files).map(FileStatus::getPath).forEach(sortedFiles::add);
    final Path[] sortedPaths = sortedFiles.toArray(new Path[0]);
    LOGGER.debug("Found {} files in {}", files.length, groupPath);
    // NB Skip the first file
    for (int i = 1; i < sortedPaths.length; i++) {
        LOGGER.debug("Reading first line of {}", sortedPaths[i]);
        // try-with-resources: the original leaked the reader when the file was
        // unexpectedly empty (the IOException was thrown before close()).
        try (ParquetReader<Element> reader = new ParquetElementReader.Builder<Element>(sortedPaths[i])
                .isEntity(!reversed && schema.getEntityGroups().contains(group))
                .usingConverter(converter)
                .build()) {
            // NB Should never be null as empty files are removed before this is called
            final Element element = reader.read();
            if (null == element) {
                // Bug fix: previously this reported files[i].getPath(), which
                // indexes the UNSORTED listing and could name the wrong file.
                throw new IOException("No first " + (reversed ? "edge" : "element") + " in file "
                        + sortedPaths[i] + " - empty files are supposed to be removed");
            }
            final Object[] parquetObjects = reversed
                    ? converter.corePropertiesToParquetObjectsForReversedEdge((Edge) element)
                    : converter.corePropertiesToParquetObjects(element);
            partitionKeys.add(new PartitionKey(parquetObjects));
        }
    }
    return partitionKeys;
}
Also used : Arrays(java.util.Arrays) Logger(org.slf4j.Logger) ParquetElementReader(uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader) SortedSet(java.util.SortedSet) FileSystem(org.apache.hadoop.fs.FileSystem) ParquetReader(org.apache.parquet.hadoop.ParquetReader) LoggerFactory(org.slf4j.LoggerFactory) SchemaUtils(uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils) GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) IOException(java.io.IOException) FileStatus(org.apache.hadoop.fs.FileStatus) ParquetStore(uk.gov.gchq.gaffer.parquetstore.ParquetStore) Element(uk.gov.gchq.gaffer.data.element.Element) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) List(java.util.List) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) Schema(uk.gov.gchq.gaffer.store.schema.Schema) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) Path(org.apache.hadoop.fs.Path) Edge(uk.gov.gchq.gaffer.data.element.Edge) GafferGroupObjectConverter(uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter) FileStatus(org.apache.hadoop.fs.FileStatus) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) SchemaUtils(uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) TreeSet(java.util.TreeSet) Path(org.apache.hadoop.fs.Path) GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) ParquetElementReader(uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader) IOException(java.io.IOException) GafferGroupObjectConverter(uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) Edge(uk.gov.gchq.gaffer.data.element.Edge)

Aggregations

PartitionKey (uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey)12 Test (org.junit.jupiter.api.Test)8 ArrayList (java.util.ArrayList)6 GroupPartitioner (uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner)6 DataInputStream (java.io.DataInputStream)5 DataOutputStream (java.io.DataOutputStream)5 FileInputStream (java.io.FileInputStream)5 FileOutputStream (java.io.FileOutputStream)5 Element (uk.gov.gchq.gaffer.data.element.Element)5 GraphPartitioner (uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner)5 FileSystem (org.apache.hadoop.fs.FileSystem)4 Path (org.apache.hadoop.fs.Path)4 IOException (java.io.IOException)3 Arrays (java.util.Arrays)3 List (java.util.List)3 Configuration (org.apache.hadoop.conf.Configuration)3 FileStatus (org.apache.hadoop.fs.FileStatus)3 ParquetReader (org.apache.parquet.hadoop.ParquetReader)3 Edge (uk.gov.gchq.gaffer.data.element.Edge)3 ParquetElementReader (uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader)3