
Example 1 with GraphPartitionerSerialiser

Use of uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser in project Gaffer by gchq.

From the class ParquetStore, the method initialise:

private void initialise() throws IOException, StoreException {
    // If data directory is empty or does not exist then this is the first time the store has been created.
    final Path dataDirPath = new Path(getDataDir());
    if (!fs.exists(dataDirPath) || 0 == fs.listStatus(dataDirPath).length) {
        LOGGER.info("Data directory {} doesn't exist or is empty so initialising directory structure", dataDirPath);
        currentSnapshot = System.currentTimeMillis();
        LOGGER.info("Initialising snapshot id to {}", currentSnapshot);
        final Path snapshotPath = new Path(dataDirPath, getSnapshotPath(currentSnapshot));
        LOGGER.info("Creating snapshot directory {}", snapshotPath);
        fs.mkdirs(snapshotPath);
        LOGGER.info("Creating group directories under {}", snapshotPath);
        for (final String group : getSchema().getGroups()) {
            final Path groupDir = getGroupPath(group);
            fs.mkdirs(groupDir);
            LOGGER.info("Created directory {}", groupDir);
        }
        LOGGER.info("Creating group directories for reversed edges under {}", snapshotPath);
        for (final String group : getSchema().getEdgeGroups()) {
            final Path groupDir = getGroupPathForReversedEdges(group);
            fs.mkdirs(groupDir);
            LOGGER.info("Created directory {}", groupDir);
        }
        LOGGER.info("Creating GraphPartitioner with 0 split points for each group");
        graphPartitioner = new GraphPartitioner();
        for (final String group : getSchema().getGroups()) {
            graphPartitioner.addGroupPartitioner(group, new GroupPartitioner(group, new ArrayList<>()));
        }
        for (final String group : getSchema().getEdgeGroups()) {
            graphPartitioner.addGroupPartitionerForReversedEdges(group, new GroupPartitioner(group, new ArrayList<>()));
        }
        LOGGER.info("Writing GraphPartitioner to snapshot directory");
        final FSDataOutputStream dataOutputStream = fs.create(getGraphPartitionerPath());
        new GraphPartitionerSerialiser().write(graphPartitioner, dataOutputStream);
        dataOutputStream.close();
        LOGGER.info("Wrote GraphPartitioner to file {}", getGraphPartitionerPath().toString());
    } else {
        LOGGER.info("Data directory {} exists and is non-empty, validating a snapshot directory exists", dataDirPath);
        final FileStatus[] fileStatuses = fs.listStatus(dataDirPath, f -> f.getName().startsWith(SNAPSHOT + "="));
        final List<FileStatus> directories = Arrays.stream(fileStatuses).filter(f -> f.isDirectory()).collect(Collectors.toList());
        if (0 == directories.size()) {
            LOGGER.error("Data directory {} should contain a snapshot directory", dataDirPath);
            throw new StoreException("Data directory should contain a snapshot directory");
        }
        this.currentSnapshot = getLatestSnapshot();
        LOGGER.info("Latest snapshot directory in data directory {} is {}", dataDirPath, this.currentSnapshot);
        LOGGER.info("Verifying snapshot directory contains the correct directories");
        for (final String group : getSchema().getGroups()) {
            final Path groupDir = getGroupPath(group);
            if (!fs.exists(groupDir)) {
                LOGGER.error("Directory {} should exist", groupDir);
                throw new StoreException("Group directory " + groupDir + " should exist in snapshot directory " + getSnapshotPath(this.currentSnapshot));
            }
        }
        for (final String group : getSchema().getEdgeGroups()) {
            final Path groupDir = getGroupPathForReversedEdges(group);
            if (!fs.exists(groupDir)) {
                LOGGER.error("Directory {} should exist", groupDir);
                throw new StoreException("Group directory " + groupDir + " should exist in snapshot directory " + getSnapshotPath(this.currentSnapshot));
            }
        }
    }
}
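To exercise the serialiser in isolation, here is a minimal round-trip sketch. It assumes that write and read accept plain java.io.DataOutputStream/DataInputStream arguments (FSDataOutputStream and FSDataInputStream, used in these examples, extend those types); the group name "BasicEntity" is an illustrative placeholder, not part of the Gaffer API.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;

import uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner;
import uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner;
import uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser;

public class GraphPartitionerRoundTrip {
    public static void main(final String[] args) throws IOException {
        // Build a partitioner with zero split points, as initialise() does above.
        final GraphPartitioner partitioner = new GraphPartitioner();
        partitioner.addGroupPartitioner("BasicEntity", new GroupPartitioner("BasicEntity", new ArrayList<>()));
        // Serialise to an in-memory stream instead of an HDFS file.
        final ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (final DataOutputStream out = new DataOutputStream(bytes)) {
            new GraphPartitionerSerialiser().write(partitioner, out);
        }
        // Read it back and confirm the group survived the round trip.
        try (final DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
            final GraphPartitioner readBack = new GraphPartitionerSerialiser().read(in);
            System.out.println("Groups after round trip: " + readBack.getGroups());
        }
    }
}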

Example 2 with GraphPartitionerSerialiser

Use of uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser in project Gaffer by gchq.

From the class AddElementsHandler, the method addElements:

private void addElements(final AddElements addElementsOperation, final Context context, final ParquetStore store) throws OperationException {
    // Set up
    final FileSystem fs = store.getFS();
    final Schema schema = store.getSchema();
    final SchemaUtils schemaUtils = store.getSchemaUtils();
    final SparkSession spark = SparkContextUtil.getSparkSession(context, store.getProperties());
    final ExecutorService threadPool = createThreadPool(spark, store.getProperties());
    final GraphPartitioner currentGraphPartitioner = store.getGraphPartitioner();
    SparkParquetUtils.configureSparkForAddElements(spark, store.getProperties());
    // Write data from addElementsOperation split by group and partition (NB this uses the existing partitioner -
    // adding elements using this operation does not affect the partitions).
    final String tmpDirectory = store.getTempFilesDir();
    final BiFunction<String, Integer, String> directoryForGroupAndPartitionId = (group, partitionId) -> tmpDirectory + "/unsorted_unaggregated_new" + "/group=" + group + "/partition=" + partitionId;
    final BiFunction<String, Integer, String> directoryForGroupAndPartitionIdForReversedEdges = (group, partitionId) -> tmpDirectory + "/unsorted_unaggregated_new" + "/reversed-group=" + group + "/partition=" + partitionId;
    LOGGER.info("Calling WriteUnsortedData to add elements");
    LOGGER.trace("currentGraphPartitioner is {}", currentGraphPartitioner);
    new WriteUnsortedData(store, currentGraphPartitioner, directoryForGroupAndPartitionId, directoryForGroupAndPartitionIdForReversedEdges).writeElements(addElementsOperation.getInput());
    // For every group and partition, aggregate the new data with the old data and then sort
    final BiFunction<String, Integer, String> directoryForSortedResultsForGroupAndPartitionId = (group, partitionId) -> tmpDirectory + "/sorted_new_old_merged" + "/group=" + group + "/partition=" + partitionId;
    final BiFunction<String, Integer, String> directoryForSortedResultsForGroupAndPartitionIdForReversedEdges = (group, partitionId) -> tmpDirectory + "/sorted_new_old_merged" + "/reversed-group=" + group + "/partition=" + partitionId;
    final List<Callable<CallableResult>> tasks = new ArrayList<>();
    for (final String group : schema.getGroups()) {
        final List<Partition> partitions = currentGraphPartitioner.getGroupPartitioner(group).getPartitions();
        for (final Partition partition : partitions) {
            final List<String> inputFiles = new ArrayList<>();
            // New data
            inputFiles.add(directoryForGroupAndPartitionId.apply(group, partition.getPartitionId()));
            // Old data
            inputFiles.add(store.getFile(group, partition));
            final String outputDir = directoryForSortedResultsForGroupAndPartitionId.apply(group, partition.getPartitionId());
            final AggregateAndSortData task = new AggregateAndSortData(schemaUtils, fs, inputFiles, outputDir, group, group + "-" + partition.getPartitionId(), false, store.getProperties().getCompressionCodecName(), spark);
            tasks.add(task);
            LOGGER.info("Created AggregateAndSortData task for group {}, partition {}", group, partition.getPartitionId());
        }
    }
    for (final String group : schema.getEdgeGroups()) {
        final List<Partition> partitions = currentGraphPartitioner.getGroupPartitionerForReversedEdges(group).getPartitions();
        for (final Partition partition : partitions) {
            final List<String> inputFiles = new ArrayList<>();
            // New data
            inputFiles.add(directoryForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId()));
            // Old data
            inputFiles.add(store.getFileForReversedEdges(group, partition));
            final String outputDir = directoryForSortedResultsForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId());
            final AggregateAndSortData task = new AggregateAndSortData(schemaUtils, fs, inputFiles, outputDir, group, "reversed-" + group + "-" + partition.getPartitionId(), true, store.getProperties().getCompressionCodecName(), spark);
            tasks.add(task);
            LOGGER.info("Created AggregateAndSortData task for reversed edge group {}, partition {}", group, partition.getPartitionId());
        }
    }
    try {
        LOGGER.info("Invoking {} AggregateAndSortData tasks", tasks.size());
        final List<Future<CallableResult>> futures = threadPool.invokeAll(tasks);
        for (final Future<CallableResult> future : futures) {
            final CallableResult result = future.get();
            LOGGER.info("Result {} from task", result);
        }
    } catch (final InterruptedException e) {
        throw new OperationException("InterruptedException running AggregateAndSortData tasks", e);
    } catch (final ExecutionException e) {
        throw new OperationException("ExecutionException running AggregateAndSortData tasks", e);
    }
    try {
        // Move results to a new snapshot directory (the -tmp at the end allows us to add data to the directory,
        // and then when this is all finished we rename the directory to remove the -tmp; this allows us to make
        // the replacement of the old data with the new data an atomic operation and ensures that a get operation
        // against the store will not read the directory when only some of the data has been moved there).
        final long snapshot = System.currentTimeMillis();
        final String newDataDir = store.getDataDir() + "/" + ParquetStore.getSnapshotPath(snapshot) + "-tmp";
        LOGGER.info("Moving aggregated and sorted data to new snapshot directory {}", newDataDir);
        fs.mkdirs(new Path(newDataDir));
        for (final String group : schema.getGroups()) {
            final Path groupDir = new Path(newDataDir, ParquetStore.getGroupSubDir(group, false));
            fs.mkdirs(groupDir);
            LOGGER.info("Created directory {}", groupDir);
        }
        for (final String group : schema.getEdgeGroups()) {
            final Path groupDir = new Path(newDataDir, ParquetStore.getGroupSubDir(group, true));
            fs.mkdirs(groupDir);
            LOGGER.info("Created directory {}", groupDir);
        }
        for (final String group : schema.getGroups()) {
            final String groupDir = newDataDir + "/" + ParquetStore.getGroupSubDir(group, false);
            final List<Partition> partitions = currentGraphPartitioner.getGroupPartitioner(group).getPartitions();
            for (final Partition partition : partitions) {
                final Path outputDir = new Path(directoryForSortedResultsForGroupAndPartitionId.apply(group, partition.getPartitionId()));
                if (!fs.exists(outputDir)) {
                    LOGGER.info("Not moving data for group {}, partition id {} as the outputDir {} does not exist", group, partition.getPartitionId(), outputDir);
                } else {
                    // One .parquet file and one .parquet.crc file
                    final FileStatus[] status = fs.listStatus(outputDir, path -> path.getName().endsWith(".parquet"));
                    if (1 != status.length) {
                        LOGGER.error("Didn't find one Parquet file in path {} (found {} files)", outputDir, status.length);
                        throw new OperationException("Expected to find one Parquet file in path " + outputDir + " (found " + status.length + " files)");
                    } else {
                        final Path destination = new Path(groupDir, ParquetStore.getFile(partition.getPartitionId()));
                        LOGGER.info("Renaming {} to {}", status[0].getPath(), destination);
                        fs.rename(status[0].getPath(), destination);
                    }
                }
            }
        }
        for (final String group : schema.getEdgeGroups()) {
            final String groupDir = newDataDir + "/" + ParquetStore.getGroupSubDir(group, true);
            final List<Partition> partitions = currentGraphPartitioner.getGroupPartitionerForReversedEdges(group).getPartitions();
            for (final Partition partition : partitions) {
                final Path outputDir = new Path(directoryForSortedResultsForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId()));
                if (!fs.exists(outputDir)) {
                    LOGGER.info("Not moving data for reversed edge group {}, partition id {} as the outputDir {} does not exist", group, partition.getPartitionId(), outputDir);
                } else {
                    // One .parquet file and one .parquet.crc file
                    final FileStatus[] status = fs.listStatus(outputDir, path -> path.getName().endsWith(".parquet"));
                    if (1 != status.length) {
                        LOGGER.error("Didn't find one Parquet file in path {} (found {} files)", outputDir, status.length);
                        throw new OperationException("Expected to find one Parquet file in path " + outputDir + " (found " + status.length + " files)");
                    } else {
                        final Path destination = new Path(groupDir, ParquetStore.getFile(partition.getPartitionId()));
                        LOGGER.info("Renaming {} to {}", status[0].getPath(), destination);
                        fs.rename(status[0].getPath(), destination);
                    }
                }
            }
        }
        // Delete temporary data directory
        LOGGER.info("Deleting temporary directory {}", tmpDirectory);
        fs.delete(new Path(tmpDirectory), true);
        // Write out graph partitioner (unchanged from previous one)
        final Path newGraphPartitionerPath = new Path(newDataDir + "/graphPartitioner");
        final FSDataOutputStream stream = fs.create(newGraphPartitionerPath);
        LOGGER.info("Writing graph partitioner to {}", newGraphPartitionerPath);
        new GraphPartitionerSerialiser().write(currentGraphPartitioner, stream);
        stream.close();
        // Move snapshot-tmp directory to snapshot
        final String directoryWithoutTmp = newDataDir.substring(0, newDataDir.lastIndexOf("-tmp"));
        LOGGER.info("Renaming {} to {}", newDataDir, directoryWithoutTmp);
        fs.rename(new Path(newDataDir), new Path(directoryWithoutTmp));
        // Set snapshot on store to new value
        LOGGER.info("Updating latest snapshot on store to {}", snapshot);
        store.setLatestSnapshot(snapshot);
    } catch (final IOException | StoreException e) {
        throw new OperationException("Exception moving results files into new snapshot directory", e);
    }
}
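The long comment in the middle of this method is the heart of the example: results are staged in a "snapshot=<timestamp>-tmp" directory and published with a single rename. Stripped of the Gaffer specifics, the pattern is sketched below; fs (a Hadoop FileSystem) and dataDir are assumed to be in scope, stageResults is a hypothetical stand-in for the aggregate/sort/move work above, and the "snapshot=" prefix mirrors the SNAPSHOT + "=" filter seen in Example 1.

// Sketch of the atomic snapshot-swap pattern used by addElements.
final long snapshot = System.currentTimeMillis();
final Path tmpDir = new Path(dataDir, "snapshot=" + snapshot + "-tmp");
fs.mkdirs(tmpDir);
// Hypothetical helper: write every output file under the -tmp directory first.
stageResults(tmpDir);
// A single rename publishes the complete snapshot: a concurrent read sees either
// the old snapshot or the new one, never a half-populated directory.
fs.rename(tmpDir, new Path(dataDir, "snapshot=" + snapshot));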

Example 3 with GraphPartitionerSerialiser

Use of uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser in project Gaffer by gchq.

From the class AddElementsFromRDD, the method calculateAndWritePartitioner:

/**
 * Calculates the new graph partitioner and writes it to file.
 *
 * @throws OperationException if an {@link IOException} is thrown
 */
private void calculateAndWritePartitioner() throws OperationException {
    // Create new graph partitioner
    LOGGER.info("Calculating new GraphPartitioner");
    final GraphPartitioner newPartitioner;
    try {
        newPartitioner = new CalculatePartitioner(new Path(getSortedAggregatedDirectory(true, true)), store.getSchema(), fs).call();
    } catch (final IOException e) {
        throw new OperationException("IOException calculating new graph partitioner", e);
    }
    LOGGER.info("New GraphPartitioner has partitions for {} groups, {} reversed edge groups", newPartitioner.getGroups().size(), newPartitioner.getGroupsForReversedEdges().size());
    // Write out graph partitioner
    Path newGraphPartitionerPath = null;
    try {
        newGraphPartitionerPath = new Path(getSortedAggregatedDirectory(true, true) + "graphPartitioner");
        final FSDataOutputStream stream = fs.create(newGraphPartitionerPath);
        LOGGER.info("Writing graph partitioner to {}", newGraphPartitionerPath);
        new GraphPartitionerSerialiser().write(newPartitioner, stream);
        stream.close();
    } catch (final IOException e) {
        throw new OperationException("IOException writing out graph partitioner to " + newGraphPartitionerPath, e);
    }
}
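One hardening note that applies to all of these examples: the FSDataOutputStream is closed manually, so the handle leaks if write throws. Since Hadoop's output streams are Closeable, the same write can be expressed with try-with-resources; a sketch using exactly the calls from the method above:

try (final FSDataOutputStream stream = fs.create(newGraphPartitionerPath)) {
    new GraphPartitionerSerialiser().write(newPartitioner, stream);
}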

Example 4 with GraphPartitionerSerialiser

Use of uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser in project Gaffer by gchq.

From the class ParquetStore, the method loadGraphPartitioner:

private void loadGraphPartitioner() throws StoreException {
    final String dataDir = getDataDir();
    try {
        if (fs.exists(new Path(dataDir))) {
            this.currentSnapshot = getLatestSnapshot(dataDir);
            LOGGER.info("Setting currentSnapshot to {}", this.currentSnapshot);
            final Path path = getGraphPartitionerPath();
            if (!fs.exists(path)) {
                LOGGER.info("Graph partitioner does not exist in {} so creating it", path);
                final GraphPartitioner partitioner = new CalculatePartitioner(new Path(dataDir + "/" + getSnapshotPath(this.currentSnapshot)), getSchema(), fs).call();
                LOGGER.info("Writing graph partitioner to {}", path);
                final FSDataOutputStream stream = fs.create(path);
                new GraphPartitionerSerialiser().write(partitioner, stream);
                stream.close();
            }
            LOGGER.info("Loading graph partitioner from path {}", path);
            loadGraphPartitioner(path);
        } else {
            throw new StoreException("Data directory " + dataDir + " does not exist - store is in an inconsistent state");
        }
    } catch (final IOException e) {
        throw new StoreException(e.getMessage(), e);
    }
}

Example 5 with GraphPartitionerSerialiser

Use of uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser in project Gaffer by gchq.

From the class ParquetStore, the private overload of loadGraphPartitioner that reads from a given path:

private void loadGraphPartitioner(final Path graphPartitionerPath) throws IOException {
    final FSDataInputStream stream = fs.open(graphPartitionerPath);
    this.graphPartitioner = new GraphPartitionerSerialiser().read(stream);
    stream.close();
}
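The read side benefits from the same try-with-resources treatment as the write side; a sketch equivalent to the method above:

private void loadGraphPartitioner(final Path graphPartitionerPath) throws IOException {
    try (final FSDataInputStream stream = fs.open(graphPartitionerPath)) {
        this.graphPartitioner = new GraphPartitionerSerialiser().read(stream);
    }
}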

Aggregations

GraphPartitionerSerialiser (uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser): 5 usages
IOException (java.io.IOException): 4 usages
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 4 usages
Path (org.apache.hadoop.fs.Path): 4 usages
GraphPartitioner (uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner): 4 usages
StoreException (uk.gov.gchq.gaffer.store.StoreException): 3 usages
ArrayList (java.util.ArrayList): 2 usages
List (java.util.List): 2 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 2 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 2 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 2 usages
Logger (org.slf4j.Logger): 2 usages
LoggerFactory (org.slf4j.LoggerFactory): 2 usages
OperationException (uk.gov.gchq.gaffer.operation.OperationException): 2 usages
AddElements (uk.gov.gchq.gaffer.operation.impl.add.AddElements): 2 usages
CalculatePartitioner (uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.CalculatePartitioner): 2 usages
Partition (uk.gov.gchq.gaffer.parquetstore.partitioner.Partition): 2 usages
SchemaUtils (uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils): 2 usages
Store (uk.gov.gchq.gaffer.store.Store): 2 usages
OperationHandler (uk.gov.gchq.gaffer.store.operation.handler.OperationHandler): 2 usages