Search in sources :

Example 6 with GafferGroupObjectConverter

use of uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter in project Gaffer by gchq.

The following is the class CalculatePartitioner's method call().

/**
 * Calculates a {@link GraphPartitioner} for the store by reading the first element of
 * every Parquet file (except the first) in each group's directory: those first elements
 * form the partition keys that mark the boundaries between sorted files.
 *
 * @return the calculated {@link GraphPartitioner} covering every group, plus the
 *         reversed-edge partitioners for every edge group
 * @throws IOException if a file listing fails, a file cannot be read, or a file is
 *         unexpectedly empty (empty files are supposed to be removed before this runs)
 */
public GraphPartitioner call() throws IOException {
    final SchemaUtils schemaUtils = new SchemaUtils(schema);
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    for (final String group : schema.getGroups()) {
        LOGGER.info("Calculating GroupPartitioner for group {}", group);
        final GroupPartitioner groupPartitioner =
                new GroupPartitioner(group, calculatePartitionKeys(schemaUtils, group, false));
        graphPartitioner.addGroupPartitioner(group, groupPartitioner);
        LOGGER.info("GroupPartitioner for group {} is {}", group, groupPartitioner);
    }
    for (final String group : schema.getEdgeGroups()) {
        LOGGER.info("Calculating GroupPartitioner for reversed edge group {}", group);
        final GroupPartitioner groupPartitioner =
                new GroupPartitioner(group, calculatePartitionKeys(schemaUtils, group, true));
        graphPartitioner.addGroupPartitionerForReversedEdges(group, groupPartitioner);
    }
    return graphPartitioner;
}

/**
 * Reads the partition keys for one group by taking the first element of every Parquet
 * file in the group's directory except the first file (the first file's lower bound is
 * implicitly unbounded, so only n-1 keys are needed for n files).
 *
 * @param schemaUtils utilities for looking up the group's object converter
 * @param group the group to read
 * @param reversed true to read the reversed-edge directory and extract the key from the
 *        reversed edge's core properties; false for the normal directory
 * @return the partition keys, in sorted-file order
 * @throws IOException if listing or reading fails, or a file is unexpectedly empty
 */
private List<PartitionKey> calculatePartitionKeys(final SchemaUtils schemaUtils,
                                                  final String group,
                                                  final boolean reversed) throws IOException {
    final GafferGroupObjectConverter converter = schemaUtils.getConverter(group);
    final List<PartitionKey> partitionKeys = new ArrayList<>();
    final Path groupPath = new Path(path, ParquetStore.getGroupSubDir(group, reversed));
    final FileStatus[] files = fs.listStatus(groupPath, p -> p.getName().endsWith(".parquet"));
    // listStatus gives no ordering guarantee, so sort the paths explicitly.
    final SortedSet<Path> sortedFiles = new TreeSet<>();
    Arrays.stream(files).map(FileStatus::getPath).forEach(sortedFiles::add);
    final Path[] sortedPaths = sortedFiles.toArray(new Path[0]);
    LOGGER.debug("Found {} files in {}", files.length, groupPath);
    // Reversed-edge directories only contain edges, so isEntity is always false there.
    final boolean isEntity = !reversed && schema.getEntityGroups().contains(group);
    for (int i = 1; i < sortedPaths.length; i++) {
        // NB Skip first file
        LOGGER.debug("Reading first line of {}", sortedPaths[i]);
        // try-with-resources: the original leaked the reader when the file was empty and
        // the IOException below was thrown before reader.close() was reached.
        try (ParquetReader<Element> reader = new ParquetElementReader.Builder<Element>(sortedPaths[i])
                .isEntity(isEntity)
                .usingConverter(converter)
                .build()) {
            // NB Should never be null as empty files are removed before this is called
            final Element element = reader.read();
            if (null == element) {
                // Bug fix: report the sorted path that was actually read; the original
                // indexed the unsorted listStatus array and could name the wrong file.
                throw new IOException((reversed ? "No first edge in file " : "No first element in file ")
                        + sortedPaths[i] + " - empty files are supposed to be removed");
            }
            final Object[] parquetObjects = reversed
                    ? converter.corePropertiesToParquetObjectsForReversedEdge((Edge) element)
                    : converter.corePropertiesToParquetObjects(element);
            partitionKeys.add(new PartitionKey(parquetObjects));
        }
    }
    return partitionKeys;
}
Also used : Arrays(java.util.Arrays) Logger(org.slf4j.Logger) ParquetElementReader(uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader) SortedSet(java.util.SortedSet) FileSystem(org.apache.hadoop.fs.FileSystem) ParquetReader(org.apache.parquet.hadoop.ParquetReader) LoggerFactory(org.slf4j.LoggerFactory) SchemaUtils(uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils) GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) IOException(java.io.IOException) FileStatus(org.apache.hadoop.fs.FileStatus) ParquetStore(uk.gov.gchq.gaffer.parquetstore.ParquetStore) Element(uk.gov.gchq.gaffer.data.element.Element) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) List(java.util.List) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) Schema(uk.gov.gchq.gaffer.store.schema.Schema) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) Path(org.apache.hadoop.fs.Path) Edge(uk.gov.gchq.gaffer.data.element.Edge) GafferGroupObjectConverter(uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter) FileStatus(org.apache.hadoop.fs.FileStatus) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) SchemaUtils(uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils) GraphPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner) TreeSet(java.util.TreeSet) Path(org.apache.hadoop.fs.Path) GroupPartitioner(uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner) ParquetElementReader(uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader) IOException(java.io.IOException) GafferGroupObjectConverter(uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter) PartitionKey(uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey) Edge(uk.gov.gchq.gaffer.data.element.Edge)

Example 7 with GafferGroupObjectConverter

use of uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter in project Gaffer by gchq.

The following is the class RetrieveElementsFromFile's method openParquetReader().

/**
 * Opens a new Parquet reader for {@code filePath}, configured for this group's
 * entity/edge kind and converter, applying the push-down filter when one is set.
 *
 * @return a newly-opened {@link ParquetReader} over the file's elements
 * @throws IOException if the reader cannot be opened
 */
private ParquetReader<Element> openParquetReader() throws IOException {
    final GafferGroupObjectConverter converter = schemaUtils.getConverter(group);
    final boolean isEntity = schemaUtils.getEntityGroups().contains(group);
    LOGGER.debug("Opening a new Parquet reader for file {}", filePath);
    final ParquetElementReader.Builder<Element> builder =
            new ParquetElementReader.Builder<Element>(filePath)
                    .isEntity(isEntity)
                    .usingConverter(converter);
    // Only push the filter down when one was supplied.
    return null != filter
            ? builder.withFilter(FilterCompat.get(filter)).build()
            : builder.build();
}
Also used : ParquetElementReader(uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader) Element(uk.gov.gchq.gaffer.data.element.Element) GafferGroupObjectConverter(uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter)

Aggregations

GafferGroupObjectConverter (uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter)7 Element (uk.gov.gchq.gaffer.data.element.Element)4 ArrayList (java.util.ArrayList)3 PartitionKey (uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey)3 Map (java.util.Map)2 GenericRowWithSchema (org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema)2 ParquetElementReader (uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader)2 FreqMap (uk.gov.gchq.gaffer.types.FreqMap)2 IOException (java.io.IOException)1 Arrays (java.util.Arrays)1 List (java.util.List)1 SortedSet (java.util.SortedSet)1 TreeSet (java.util.TreeSet)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1 ParquetReader (org.apache.parquet.hadoop.ParquetReader)1 Logger (org.slf4j.Logger)1 LoggerFactory (org.slf4j.LoggerFactory)1 Edge (uk.gov.gchq.gaffer.data.element.Edge)1