Example 6 with SchemaUtils

Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in the Gaffer project by gchq.

From the class AddElementsFromRDD, the method aggregateNewAndOldData:

/**
 * For each group that requires aggregation, this method aggregates the new data that has been written out to file
 * with the existing data for that group.
 *
 * @throws OperationException if an {@link IOException} or a {@link SerialisationException} is thrown
 */
private void aggregateNewAndOldData() throws OperationException {
    LOGGER.info("Creating AggregateDataForGroup tasks for groups that require aggregation");
    for (final String group : schema.getAggregatedGroups()) {
        LOGGER.info("Creating AggregateDataForGroup task for group {}", group);
        final List<String> inputFiles = new ArrayList<>();
        final String groupDirectoryNewData = getDirectory(group, false, false, false);
        final FileStatus[] newData;
        try {
            newData = fs.listStatus(new Path(groupDirectoryNewData), path -> path.getName().endsWith(".parquet"));
        } catch (final IOException e) {
            throw new OperationException("IOException finding Parquet files in " + groupDirectoryNewData, e);
        }
        Arrays.stream(newData).map(f -> f.getPath().toString()).forEach(inputFiles::add);
        final List<Path> existingData;
        try {
            existingData = store.getFilesForGroup(group);
        } catch (final IOException e) {
            throw new OperationException("IOException finding files for group " + group, e);
        }
        existingData.stream().map(Path::toString).forEach(inputFiles::add);
        final String outputDir = getDirectory(group, false, true, false);
        final AggregateDataForGroup aggregateDataForGroup;
        try {
            aggregateDataForGroup = new AggregateDataForGroup(fs, schemaUtils, group, inputFiles, outputDir, spark);
        } catch (final SerialisationException e) {
            throw new OperationException("SerialisationException creating AggregateDataForGroup task", e);
        }
        LOGGER.info("AggregateDataForGroup task for group {} is being called ({} files as input, outputting to {})", group, inputFiles.size(), outputDir);
        aggregateDataForGroup.call();
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Arrays (java.util.Arrays), StoreException (uk.gov.gchq.gaffer.store.StoreException), FileSystem (org.apache.hadoop.fs.FileSystem), LoggerFactory (org.slf4j.LoggerFactory), SerialisationException (uk.gov.gchq.gaffer.exception.SerialisationException), FileStatus (org.apache.hadoop.fs.FileStatus), ParquetStore (uk.gov.gchq.gaffer.parquetstore.ParquetStore), Function (java.util.function.Function), Element (uk.gov.gchq.gaffer.data.element.Element), ArrayList (java.util.ArrayList), CalculatePartitioner (uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.CalculatePartitioner), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream), SparkParquetUtils (uk.gov.gchq.gaffer.parquetstore.utils.SparkParquetUtils), SortFullGroup (uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.SortFullGroup), JavaRDD (org.apache.spark.api.java.JavaRDD), SparkSession (org.apache.spark.sql.SparkSession), AggregateDataForGroup (uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.AggregateDataForGroup), Logger (org.slf4j.Logger), WriteData (uk.gov.gchq.gaffer.parquetstore.operation.handler.spark.utilities.WriteData), SparkContextUtil (uk.gov.gchq.gaffer.spark.SparkContextUtil), SchemaUtils (uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils), IOException (java.io.IOException), List (java.util.List), GraphPartitionerSerialiser (uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser), Context (uk.gov.gchq.gaffer.store.Context), Schema (uk.gov.gchq.gaffer.store.schema.Schema), GraphPartitioner (uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner), OperationException (uk.gov.gchq.gaffer.operation.OperationException), RDD (org.apache.spark.rdd.RDD)
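
The file-discovery pattern above, fs.listStatus with a PathFilter lambda that keeps only .parquet files, is reusable on its own. Below is a minimal, self-contained sketch of just that pattern, assuming only the Hadoop FileSystem API; the class name and directory path are placeholders:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ParquetFileLister {

    // Lists the .parquet files directly under the given directory, mirroring the
    // fs.listStatus(path, filter) pattern used in aggregateNewAndOldData above.
    public static List<String> listParquetFiles(final FileSystem fs, final String directory) throws IOException {
        final List<String> files = new ArrayList<>();
        // The PathFilter lambda keeps only files whose name ends with ".parquet"
        final FileStatus[] statuses = fs.listStatus(new Path(directory), path -> path.getName().endsWith(".parquet"));
        for (final FileStatus status : statuses) {
            files.add(status.getPath().toString());
        }
        return files;
    }

    public static void main(final String[] args) throws IOException {
        // "hypothetical-dir" is a placeholder directory, used here only for illustration
        final FileSystem fs = FileSystem.get(new Configuration());
        listParquetFiles(fs, "hypothetical-dir").forEach(System.out::println);
    }
}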

Example 7 with SchemaUtils

Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in the Gaffer project by gchq.

From the class CalculatePartitioner, the method call:

public GraphPartitioner call() throws IOException {
    final SchemaUtils schemaUtils = new SchemaUtils(schema);
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    for (final String group : schema.getGroups()) {
        LOGGER.info("Calculating GroupPartitioner for group {}", group);
        final GafferGroupObjectConverter converter = schemaUtils.getConverter(group);
        final List<PartitionKey> partitionKeys = new ArrayList<>();
        final Path groupPath = new Path(path, ParquetStore.getGroupSubDir(group, false));
        final FileStatus[] files = fs.listStatus(groupPath, p -> p.getName().endsWith(".parquet"));
        final SortedSet<Path> sortedFiles = new TreeSet<>();
        Arrays.stream(files).map(f -> f.getPath()).forEach(sortedFiles::add);
        final Path[] sortedPaths = sortedFiles.toArray(new Path[] {});
        LOGGER.debug("Found {} files in {}", files.length, groupPath);
        for (int i = 1; i < sortedPaths.length; i++) {
            // NB Start at 1: the first file is skipped because only the first elements of subsequent files act as split points
            LOGGER.debug("Reading first line of {}", sortedPaths[i]);
            final ParquetReader<Element> reader = new ParquetElementReader.Builder<Element>(sortedPaths[i]).isEntity(schema.getEntityGroups().contains(group)).usingConverter(converter).build();
            // NB Should never be null as empty files are removed before this is called
            final Element element = reader.read();
            if (null == element) {
                throw new IOException("No first element in file " + sortedPaths[i] + " - empty files are supposed to be removed");
            }
            reader.close();
            final Object[] parquetObjects = converter.corePropertiesToParquetObjects(element);
            final PartitionKey key = new PartitionKey(parquetObjects);
            partitionKeys.add(key);
        }
        final GroupPartitioner groupPartitioner = new GroupPartitioner(group, partitionKeys);
        graphPartitioner.addGroupPartitioner(group, groupPartitioner);
        LOGGER.info("GroupPartitioner for group {} is {}", group, groupPartitioner);
    }
    for (final String group : schema.getEdgeGroups()) {
        LOGGER.info("Calculating GroupPartitioner for reversed edge group {}", group);
        final GafferGroupObjectConverter converter = schemaUtils.getConverter(group);
        final List<PartitionKey> partitionKeys = new ArrayList<>();
        final Path groupPath = new Path(path, ParquetStore.getGroupSubDir(group, true));
        final FileStatus[] files = fs.listStatus(groupPath, p -> p.getName().endsWith(".parquet"));
        final SortedSet<Path> sortedFiles = new TreeSet<>();
        Arrays.stream(files).map(f -> f.getPath()).forEach(sortedFiles::add);
        final Path[] sortedPaths = sortedFiles.toArray(new Path[] {});
        LOGGER.debug("Found {} files in {}", files.length, groupPath);
        for (int i = 1; i < sortedPaths.length; i++) {
            // NB Start at 1: the first file is skipped because only the first elements of subsequent files act as split points
            LOGGER.debug("Reading first line of {}", sortedPaths[i]);
            final ParquetReader<Element> reader = new ParquetElementReader.Builder<Element>(sortedPaths[i]).isEntity(false).usingConverter(converter).build();
            final Edge edge = (Edge) reader.read();
            if (null == edge) {
                throw new IOException("No first edge in file " + sortedPaths[i] + " - empty files are supposed to be removed");
            }
            reader.close();
            final Object[] parquetObjects = converter.corePropertiesToParquetObjectsForReversedEdge(edge);
            final PartitionKey key = new PartitionKey(parquetObjects);
            partitionKeys.add(key);
        }
        final GroupPartitioner groupPartitioner = new GroupPartitioner(group, partitionKeys);
        graphPartitioner.addGroupPartitionerForReversedEdges(group, groupPartitioner);
    }
    return graphPartitioner;
}
Also used: Arrays (java.util.Arrays), Logger (org.slf4j.Logger), ParquetElementReader (uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader), SortedSet (java.util.SortedSet), FileSystem (org.apache.hadoop.fs.FileSystem), ParquetReader (org.apache.parquet.hadoop.ParquetReader), LoggerFactory (org.slf4j.LoggerFactory), SchemaUtils (uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils), GroupPartitioner (uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner), IOException (java.io.IOException), FileStatus (org.apache.hadoop.fs.FileStatus), ParquetStore (uk.gov.gchq.gaffer.parquetstore.ParquetStore), Element (uk.gov.gchq.gaffer.data.element.Element), TreeSet (java.util.TreeSet), ArrayList (java.util.ArrayList), List (java.util.List), PartitionKey (uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey), Schema (uk.gov.gchq.gaffer.store.schema.Schema), GraphPartitioner (uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner), Path (org.apache.hadoop.fs.Path), Edge (uk.gov.gchq.gaffer.data.element.Edge), GafferGroupObjectConverter (uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter)
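
The partition keys gathered above are the first element of every file except the first, i.e. the split points of a range partitioner over the sorted files: N split points define N + 1 partitions, which is why the first file contributes no key. GroupPartitioner's lookup logic is not shown in this example; the sketch below illustrates how a generic range partitioner resolves a key to a partition via binary search over sorted split points. It is illustrative only, not Gaffer's implementation:

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

// Illustrative range partitioner: N sorted split keys define N + 1 partitions.
// This is a generic sketch, not Gaffer's GroupPartitioner.
public class RangePartitionLookup<K> {
    private final List<K> splitPoints; // sorted first-keys of files 1..N
    private final Comparator<K> comparator;

    public RangePartitionLookup(final List<K> splitPoints, final Comparator<K> comparator) {
        this.splitPoints = splitPoints;
        this.comparator = comparator;
    }

    // Returns the index of the partition (file) that should contain the key,
    // i.e. the number of split points less than or equal to the key.
    public int partitionFor(final K key) {
        int low = 0;
        int high = splitPoints.size();
        while (low < high) {
            final int mid = (low + high) >>> 1;
            if (comparator.compare(splitPoints.get(mid), key) <= 0) {
                low = mid + 1; // key is at or past this split point
            } else {
                high = mid;
            }
        }
        return low;
    }

    public static void main(final String[] args) {
        final RangePartitionLookup<Integer> lookup =
                new RangePartitionLookup<>(Arrays.asList(10, 20, 30), Comparator.naturalOrder());
        System.out.println(lookup.partitionFor(5));   // 0: before the first split point
        System.out.println(lookup.partitionFor(10));  // 1: a split key starts its own partition
        System.out.println(lookup.partitionFor(25));  // 2
        System.out.println(lookup.partitionFor(99));  // 3: past the last split point
    }
}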

Example 8 with SchemaUtils

Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in the Gaffer project by gchq.

From the class RetrieveElementsFromFile, the method call:

@Override
public OperationException call() throws Exception {
    if (null == elementFilter) {
        elementFilter = new ViewElementDefinition.Builder().json(elementDefinitionJson).build().getPreAggregationFilter();
    }
    if (null == schemaUtils) {
        schemaUtils = new SchemaUtils(Schema.fromJson(jsonGafferSchema));
    }
    try {
        final ParquetReader<Element> fileReader = openParquetReader();
        Element e = fileReader.read();
        while (null != e) {
            // Visibility is checked first (and skipped entirely when no visibility property is set),
            // then schema validation and the view's pre-aggregation filter are applied if required.
            if (visibility.isEmpty() || isVisible(e)) {
                if (needsValidatorsAndFiltersApplying) {
                    final ElementFilter validatorFilter = gafferSchema.getElement(e.getGroup()).getValidator(false);
                    if ((skipValidation || null == validatorFilter || validatorFilter.test(e))
                            && (null == elementFilter || elementFilter.test(e))) {
                        ViewUtil.removeProperties(view, e);
                        queue.add(e);
                    }
                } else {
                    ViewUtil.removeProperties(view, e);
                    queue.add(e);
                }
            }
            e = fileReader.read();
        }
        fileReader.close();
    } catch (final IOException ioe) {
        // Ignore: the file may legitimately not exist, so log the failure and continue rather than failing the operation
        LOGGER.error("IOException reading file", ioe);
    }
    return null;
}
Also used: SchemaUtils (uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils), Element (uk.gov.gchq.gaffer.data.element.Element), ElementFilter (uk.gov.gchq.gaffer.data.element.function.ElementFilter), IOException (java.io.IOException)
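
The checks above are a conjunction of optional predicates in which a null filter means "no constraint" (the repeated validatorFilter == null || validatorFilter.test(e) pattern). A minimal sketch of that null-tolerant predicate idiom in isolation, using plain java.util.function.Predicate; the helper class and its names are hypothetical, not Gaffer classes:

import java.util.function.Predicate;

public final class NullablePredicates {

    // A null predicate means "no constraint", mirroring the
    // "validatorFilter == null || validatorFilter.test(e)" checks above.
    public static <T> boolean passes(final Predicate<T> predicate, final T value) {
        return predicate == null || predicate.test(value);
    }

    public static void main(final String[] args) {
        final Predicate<String> nonEmpty = s -> !s.isEmpty();
        System.out.println(passes(nonEmpty, "edge")); // true
        System.out.println(passes(null, ""));         // true: an absent filter accepts everything
        System.out.println(passes(nonEmpty, ""));     // false
    }
}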

Example 9 with SchemaUtils

Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in the Gaffer project by gchq.

From the class GetDataFrameOfElementsHandler, the method doOperation:

private Dataset<Row> doOperation(final GetDataFrameOfElements operation, final ParquetStore store, final SparkSession spark) throws OperationException {
    if (!operation.getView().equals(new View.Builder().entities(store.getSchema().getEntityGroups()).edges(store.getSchema().getEdgeGroups()).build())) {
        throw new OperationException("This operation does not currently support views");
    }
    LOGGER.debug("Creating a Dataset<Row> from path {}", store.getGraphPath());
    // Build the merged Spark schema across all groups, then read all of the graph's Parquet files with it
    final StructType schema = new SchemaUtils(store.getSchema()).getMergedSparkSchema(store.getSchema().getGroups());
    return spark.read().schema(schema).parquet(store.getGraphPath());
}
Also used: SchemaUtils (uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils), StructType (org.apache.spark.sql.types.StructType), Row (org.apache.spark.sql.Row), View (uk.gov.gchq.gaffer.data.elementdefinition.view.View), OperationException (uk.gov.gchq.gaffer.operation.OperationException)
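
Once returned, the Dataset<Row> behaves like any other Spark DataFrame. A minimal usage sketch, assuming a local SparkSession; "graph-path" is a placeholder for whatever store.getGraphPath() would return:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class DataFrameUsageSketch {

    public static void main(final String[] args) {
        final SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("gaffer-dataframe-sketch")
                .getOrCreate();

        // "graph-path" is a placeholder; the handler above derives the real path
        // from store.getGraphPath() and the schema from SchemaUtils.
        final Dataset<Row> dataframe = spark.read().parquet("graph-path");

        dataframe.printSchema();               // columns come from the merged Spark schema
        System.out.println(dataframe.count()); // total number of element rows
        spark.stop();
    }
}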

Example 10 with SchemaUtils

Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in the Gaffer project by gchq.

From the class WriteData, the method call:

// Package-private (rather than private) so that tests can call it
void call(final Iterator<Element> elements, final int partitionId, final long taskAttemptId) throws Exception {
    final SchemaUtils schemaUtils = new SchemaUtils(Schema.fromJson(schemaAsJson));
    final Map<String, ParquetWriter<Element>> groupToWriter = new HashMap<>();
    final Map<String, Path> groupToWriterPath = new HashMap<>();
    for (final String group : schemaUtils.getGroups()) {
        groupToWriterPath.put(group, new Path(groupToDirectory.get(group) + "/input-" + partitionId + "-" + taskAttemptId + ".parquet"));
        groupToWriter.put(group, buildWriter(group, groupToWriterPath.get(group), schemaUtils));
    }
    writeData(elements, partitionId, taskAttemptId, groupToWriter);
    renameFiles(partitionId, taskAttemptId, schemaUtils.getGroups(), groupToWriterPath);
}
Also used: Path (org.apache.hadoop.fs.Path), SchemaUtils (uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils), ParquetWriter (org.apache.parquet.hadoop.ParquetWriter), HashMap (java.util.HashMap)
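
Embedding partitionId and taskAttemptId in the file name ("input-" + partitionId + "-" + taskAttemptId + ".parquet") keeps each task attempt's output unique, so retried or speculatively executed Spark tasks cannot clobber each other's files. The sketch below shows how such a per-partition writer is typically driven from Spark; writeElements is a hypothetical stand-in for WriteData.call, not Gaffer's actual wiring:

import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class PerPartitionWriteSketch {

    // Hypothetical stand-in for WriteData.call(elements, partitionId, taskAttemptId)
    static void writeElements(final Iterator<String> elements, final int partitionId, final long taskAttemptId) {
        // e.g. write to "input-" + partitionId + "-" + taskAttemptId + ".parquet"
        elements.forEachRemaining(e ->
                System.out.println("partition " + partitionId + ", attempt " + taskAttemptId + ": " + e));
    }

    public static void main(final String[] args) {
        final JavaSparkContext sc = new JavaSparkContext("local[*]", "write-sketch");
        final JavaRDD<String> rdd = sc.parallelize(Arrays.asList("a", "b", "c"), 2);
        rdd.foreachPartition(iter -> {
            // TaskContext supplies the ids used to build a unique file name per task attempt
            final TaskContext ctx = TaskContext.get();
            writeElements(iter, ctx.partitionId(), ctx.taskAttemptId());
        });
        sc.stop();
    }
}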

Aggregations

SchemaUtils (uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils): 11
Path (org.apache.hadoop.fs.Path): 8
ArrayList (java.util.ArrayList): 7
List (java.util.List): 6
ParquetStore (uk.gov.gchq.gaffer.parquetstore.ParquetStore): 6
IOException (java.io.IOException): 5
FileSystem (org.apache.hadoop.fs.FileSystem): 4
Test (org.junit.jupiter.api.Test): 4
ParquetStoreProperties (uk.gov.gchq.gaffer.parquetstore.ParquetStoreProperties): 4
GraphPartitioner (uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner): 4
Schema (uk.gov.gchq.gaffer.store.schema.Schema): 4
FileStatus (org.apache.hadoop.fs.FileStatus): 3
Logger (org.slf4j.Logger): 3
LoggerFactory (org.slf4j.LoggerFactory): 3
View (uk.gov.gchq.gaffer.data.elementdefinition.view.View): 3
OperationException (uk.gov.gchq.gaffer.operation.OperationException): 3
LongVertexOperationsTest (uk.gov.gchq.gaffer.parquetstore.operation.handler.LongVertexOperationsTest): 3
CalculatePartitionerTest (uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.CalculatePartitionerTest): 3
StoreException (uk.gov.gchq.gaffer.store.StoreException): 3
Arrays (java.util.Arrays): 2