Use of uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser in project Gaffer by gchq.
The class ParquetStore, method initialise.
private void initialise() throws IOException, StoreException {
    // If data directory is empty or does not exist then this is the first time the store has been created.
    final Path dataDirPath = new Path(getDataDir());
    if (!fs.exists(dataDirPath) || 0 == fs.listStatus(dataDirPath).length) {
        LOGGER.info("Data directory {} doesn't exist or is empty so initialising directory structure", dataDirPath);
        currentSnapshot = System.currentTimeMillis();
        LOGGER.info("Initialising snapshot id to {}", currentSnapshot);
        final Path snapshotPath = new Path(dataDirPath, getSnapshotPath(currentSnapshot));
        LOGGER.info("Creating snapshot directory {}", snapshotPath);
        fs.mkdirs(snapshotPath);
        LOGGER.info("Creating group directories under {}", snapshotPath);
        for (final String group : getSchema().getGroups()) {
            final Path groupDir = getGroupPath(group);
            fs.mkdirs(groupDir);
            LOGGER.info("Created directory {}", groupDir);
        }
        LOGGER.info("Creating group directories for reversed edges under {}", snapshotPath);
        for (final String group : getSchema().getEdgeGroups()) {
            final Path groupDir = getGroupPathForReversedEdges(group);
            fs.mkdirs(groupDir);
            LOGGER.info("Created directory {}", groupDir);
        }
        LOGGER.info("Creating GraphPartitioner with 0 split points for each group");
        graphPartitioner = new GraphPartitioner();
        for (final String group : getSchema().getGroups()) {
            graphPartitioner.addGroupPartitioner(group, new GroupPartitioner(group, new ArrayList<>()));
        }
        for (final String group : getSchema().getEdgeGroups()) {
            graphPartitioner.addGroupPartitionerForReversedEdges(group, new GroupPartitioner(group, new ArrayList<>()));
        }
        LOGGER.info("Writing GraphPartitioner to snapshot directory");
        final FSDataOutputStream dataOutputStream = fs.create(getGraphPartitionerPath());
        new GraphPartitionerSerialiser().write(graphPartitioner, dataOutputStream);
        dataOutputStream.close();
        LOGGER.info("Wrote GraphPartitioner to file {}", getGraphPartitionerPath().toString());
    } else {
        LOGGER.info("Data directory {} exists and is non-empty, validating a snapshot directory exists", dataDirPath);
        final FileStatus[] fileStatuses = fs.listStatus(dataDirPath, f -> f.getName().startsWith(SNAPSHOT + "="));
        final List<FileStatus> directories = Arrays.stream(fileStatuses).filter(f -> f.isDirectory()).collect(Collectors.toList());
        if (0 == directories.size()) {
            LOGGER.error("Data directory {} should contain a snapshot directory", dataDirPath);
            throw new StoreException("Data directory should contain a snapshot directory");
        }
        this.currentSnapshot = getLatestSnapshot();
        LOGGER.info("Latest snapshot directory in data directory {} is {}", dataDirPath, this.currentSnapshot);
        LOGGER.info("Verifying snapshot directory contains the correct directories");
        for (final String group : getSchema().getGroups()) {
            final Path groupDir = getGroupPath(group);
            if (!fs.exists(groupDir)) {
                LOGGER.error("Directory {} should exist", groupDir);
                throw new StoreException("Group directory " + groupDir + " should exist in snapshot directory " + getSnapshotPath(this.currentSnapshot));
            }
        }
        for (final String group : getSchema().getEdgeGroups()) {
            final Path groupDir = getGroupPathForReversedEdges(group);
            if (!fs.exists(groupDir)) {
                LOGGER.error("Directory {} should exist", groupDir);
                throw new StoreException("Group directory " + groupDir + " should exist in snapshot directory " + getSnapshotPath(this.currentSnapshot));
            }
        }
    }
}
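The first-run branch above builds a GraphPartitioner with zero split points per group and persists it with GraphPartitionerSerialiser.write; the loadGraphPartitioner snippets later on this page read it back with read. Below is a minimal, self-contained sketch (not Gaffer's own code) of that round trip against a local Hadoop FileSystem. The output path and the group name "BasicEntity" are illustrative, and the package location uk.gov.gchq.gaffer.parquetstore.partitioner for GraphPartitioner and GroupPartitioner is assumed.

// Sketch of the GraphPartitioner write/read round trip; path, group name and
// partitioner package locations are assumptions, not taken from Gaffer itself.
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner;
import uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner;
import uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser;

public class GraphPartitionerRoundTrip {
    public static void main(final String[] args) throws Exception {
        final FileSystem fs = FileSystem.getLocal(new Configuration());
        final Path path = new Path("/tmp/graphPartitioner");
        // Build a partitioner with 0 split points, as the first-run branch above does
        final GraphPartitioner partitioner = new GraphPartitioner();
        partitioner.addGroupPartitioner("BasicEntity", new GroupPartitioner("BasicEntity", new ArrayList<>()));
        // Write it out
        final FSDataOutputStream out = fs.create(path);
        new GraphPartitionerSerialiser().write(partitioner, out);
        out.close();
        // Read it back
        final FSDataInputStream in = fs.open(path);
        final GraphPartitioner readBack = new GraphPartitionerSerialiser().read(in);
        in.close();
        System.out.println("Read back partitioner for groups: " + readBack.getGroups());
    }
}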
Use of uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser in project Gaffer by gchq.
The class AddElementsHandler, method addElements.
private void addElements(final AddElements addElementsOperation, final Context context, final ParquetStore store) throws OperationException {
    // Set up
    final FileSystem fs = store.getFS();
    final Schema schema = store.getSchema();
    final SchemaUtils schemaUtils = store.getSchemaUtils();
    final SparkSession spark = SparkContextUtil.getSparkSession(context, store.getProperties());
    final ExecutorService threadPool = createThreadPool(spark, store.getProperties());
    final GraphPartitioner currentGraphPartitioner = store.getGraphPartitioner();
    SparkParquetUtils.configureSparkForAddElements(spark, store.getProperties());
    // Write data from addElementsOperation split by group and partition (NB this uses the existing partitioner -
    // adding elements using this operation does not affect the partitions).
    final String tmpDirectory = store.getTempFilesDir();
    final BiFunction<String, Integer, String> directoryForGroupAndPartitionId = (group, partitionId) -> tmpDirectory + "/unsorted_unaggregated_new" + "/group=" + group + "/partition=" + partitionId;
    final BiFunction<String, Integer, String> directoryForGroupAndPartitionIdForReversedEdges = (group, partitionId) -> tmpDirectory + "/unsorted_unaggregated_new" + "/reversed-group=" + group + "/partition=" + partitionId;
    LOGGER.info("Calling WriteUnsortedData to add elements");
    LOGGER.trace("currentGraphPartitioner is {}", currentGraphPartitioner);
    new WriteUnsortedData(store, currentGraphPartitioner, directoryForGroupAndPartitionId, directoryForGroupAndPartitionIdForReversedEdges).writeElements(addElementsOperation.getInput());
    // For every group and partition, aggregate the new data with the old data and then sort
    final BiFunction<String, Integer, String> directoryForSortedResultsForGroupAndPartitionId = (group, partitionId) -> tmpDirectory + "/sorted_new_old_merged" + "/group=" + group + "/partition=" + partitionId;
    final BiFunction<String, Integer, String> directoryForSortedResultsForGroupAndPartitionIdForReversedEdges = (group, partitionId) -> tmpDirectory + "/sorted_new_old_merged" + "/REVERSED-group=" + group + "/partition=" + partitionId;
    final List<Callable<CallableResult>> tasks = new ArrayList<>();
    for (final String group : schema.getGroups()) {
        final List<Partition> partitions = currentGraphPartitioner.getGroupPartitioner(group).getPartitions();
        for (final Partition partition : partitions) {
            final List<String> inputFiles = new ArrayList<>();
            // New data
            inputFiles.add(directoryForGroupAndPartitionId.apply(group, partition.getPartitionId()));
            // Old data
            inputFiles.add(store.getFile(group, partition));
            final String outputDir = directoryForSortedResultsForGroupAndPartitionId.apply(group, partition.getPartitionId());
            final AggregateAndSortData task = new AggregateAndSortData(schemaUtils, fs, inputFiles, outputDir, group, group + "-" + partition.getPartitionId(), false, store.getProperties().getCompressionCodecName(), spark);
            tasks.add(task);
            LOGGER.info("Created AggregateAndSortData task for group {}, partition {}", group, partition.getPartitionId());
        }
    }
    for (final String group : schema.getEdgeGroups()) {
        final List<Partition> partitions = currentGraphPartitioner.getGroupPartitionerForReversedEdges(group).getPartitions();
        for (final Partition partition : partitions) {
            final List<String> inputFiles = new ArrayList<>();
            // New data
            inputFiles.add(directoryForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId()));
            // Old data
            inputFiles.add(store.getFileForReversedEdges(group, partition));
            final String outputDir = directoryForSortedResultsForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId());
            final AggregateAndSortData task = new AggregateAndSortData(schemaUtils, fs, inputFiles, outputDir, group, "reversed-" + group + "-" + partition.getPartitionId(), true, store.getProperties().getCompressionCodecName(), spark);
            tasks.add(task);
            LOGGER.info("Created AggregateAndSortData task for reversed edge group {}, partition {}", group, partition.getPartitionId());
        }
    }
    try {
        LOGGER.info("Invoking {} AggregateAndSortData tasks", tasks.size());
        final List<Future<CallableResult>> futures = threadPool.invokeAll(tasks);
        for (final Future<CallableResult> future : futures) {
            final CallableResult result = future.get();
            LOGGER.info("Result {} from task", result);
        }
    } catch (final InterruptedException e) {
        throw new OperationException("InterruptedException running AggregateAndSortData tasks", e);
    } catch (final ExecutionException e) {
        throw new OperationException("ExecutionException running AggregateAndSortData tasks", e);
    }
    try {
        // Move results to a new snapshot directory (the -tmp at the end allows us to add data to the directory,
        // and then when this is all finished we rename the directory to remove the -tmp; this allows us to make
        // the replacement of the old data with the new data an atomic operation and ensures that a get operation
        // against the store will not read the directory when only some of the data has been moved there).
        final long snapshot = System.currentTimeMillis();
        final String newDataDir = store.getDataDir() + "/" + ParquetStore.getSnapshotPath(snapshot) + "-tmp";
        LOGGER.info("Moving aggregated and sorted data to new snapshot directory {}", newDataDir);
        fs.mkdirs(new Path(newDataDir));
        for (final String group : schema.getGroups()) {
            final Path groupDir = new Path(newDataDir, ParquetStore.getGroupSubDir(group, false));
            fs.mkdirs(groupDir);
            LOGGER.info("Created directory {}", groupDir);
        }
        for (final String group : schema.getEdgeGroups()) {
            final Path groupDir = new Path(newDataDir, ParquetStore.getGroupSubDir(group, true));
            fs.mkdirs(groupDir);
            LOGGER.info("Created directory {}", groupDir);
        }
        for (final String group : schema.getGroups()) {
            final String groupDir = newDataDir + "/" + ParquetStore.getGroupSubDir(group, false);
            final List<Partition> partitions = currentGraphPartitioner.getGroupPartitioner(group).getPartitions();
            for (final Partition partition : partitions) {
                final Path outputDir = new Path(directoryForSortedResultsForGroupAndPartitionId.apply(group, partition.getPartitionId()));
                if (!fs.exists(outputDir)) {
                    LOGGER.info("Not moving data for group {}, partition id {} as the outputDir {} does not exist", group, partition.getPartitionId(), outputDir);
                } else {
                    // One .parquet file and one .parquet.crc file
                    final FileStatus[] status = fs.listStatus(outputDir, path -> path.getName().endsWith(".parquet"));
                    if (1 != status.length) {
                        LOGGER.error("Didn't find one Parquet file in path {} (found {} files)", outputDir, status.length);
                        throw new OperationException("Expected to find one Parquet file in path " + outputDir + " (found " + status.length + " files)");
                    } else {
                        final Path destination = new Path(groupDir, ParquetStore.getFile(partition.getPartitionId()));
                        LOGGER.info("Renaming {} to {}", status[0].getPath(), destination);
                        fs.rename(status[0].getPath(), destination);
                    }
                }
            }
        }
        for (final String group : schema.getEdgeGroups()) {
            final String groupDir = newDataDir + "/" + ParquetStore.getGroupSubDir(group, true);
            final List<Partition> partitions = currentGraphPartitioner.getGroupPartitionerForReversedEdges(group).getPartitions();
            for (final Partition partition : partitions) {
                final Path outputDir = new Path(directoryForSortedResultsForGroupAndPartitionIdForReversedEdges.apply(group, partition.getPartitionId()));
                if (!fs.exists(outputDir)) {
                    LOGGER.info("Not moving data for reversed edge group {}, partition id {} as the outputDir {} does not exist", group, partition.getPartitionId(), outputDir);
                } else {
                    // One .parquet file and one .parquet.crc file
                    final FileStatus[] status = fs.listStatus(outputDir, path -> path.getName().endsWith(".parquet"));
                    if (1 != status.length) {
                        LOGGER.error("Didn't find one Parquet file in path {} (found {} files)", outputDir, status.length);
                        throw new OperationException("Expected to find one Parquet file in path " + outputDir + " (found " + status.length + " files)");
                    } else {
                        final Path destination = new Path(groupDir, ParquetStore.getFile(partition.getPartitionId()));
                        LOGGER.info("Renaming {} to {}", status[0].getPath(), destination);
                        fs.rename(status[0].getPath(), destination);
                    }
                }
            }
        }
        // Delete temporary data directory
        LOGGER.info("Deleting temporary directory {}", tmpDirectory);
        fs.delete(new Path(tmpDirectory), true);
        // Write out graph partitioner (unchanged from previous one)
        final Path newGraphPartitionerPath = new Path(newDataDir + "/graphPartitioner");
        final FSDataOutputStream stream = fs.create(newGraphPartitionerPath);
        LOGGER.info("Writing graph partitioner to {}", newGraphPartitionerPath);
        new GraphPartitionerSerialiser().write(currentGraphPartitioner, stream);
        stream.close();
        // Move snapshot-tmp directory to snapshot
        final String directoryWithoutTmp = newDataDir.substring(0, newDataDir.lastIndexOf("-tmp"));
        LOGGER.info("Renaming {} to {}", newDataDir, directoryWithoutTmp);
        fs.rename(new Path(newDataDir), new Path(directoryWithoutTmp));
        // Set snapshot on store to new value
        LOGGER.info("Updating latest snapshot on store to {}", snapshot);
        store.setLatestSnapshot(snapshot);
    } catch (final IOException | StoreException e) {
        throw new OperationException("IOException or StoreException moving results files into new snapshot directory", e);
    }
}
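The comment at the top of the final try block describes the key design choice: results are staged in a directory with a -tmp suffix, and a single rename at the end publishes the finished snapshot, so a concurrent get operation never observes a half-populated directory. Below is a minimal sketch of that stage-then-rename pattern, assuming a local Hadoop FileSystem; the /data path and the snapshot= directory naming are illustrative, not Gaffer constants.

// Hedged sketch of the "-tmp then rename" pattern used by addElements above.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class AtomicSnapshotSwap {
    public static void main(final String[] args) throws Exception {
        final FileSystem fs = FileSystem.getLocal(new Configuration());
        final long snapshot = System.currentTimeMillis();
        // Stage all new files under a directory with a -tmp suffix
        final String newDataDir = "/data/snapshot=" + snapshot + "-tmp";
        fs.mkdirs(new Path(newDataDir));
        // ... move the results files into newDataDir here ...
        // A single rename publishes the finished snapshot, so readers never
        // see a directory that contains only some of the data
        final String directoryWithoutTmp = newDataDir.substring(0, newDataDir.lastIndexOf("-tmp"));
        fs.rename(new Path(newDataDir), new Path(directoryWithoutTmp));
    }
}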
Use of uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser in project Gaffer by gchq.
The class AddElementsFromRDD, method calculateAndWritePartitioner.
/**
 * Calculates the new graph partitioner and writes it to file.
 *
 * @throws OperationException if an {@link IOException} is thrown
 */
private void calculateAndWritePartitioner() throws OperationException {
    // Create new graph partitioner
    LOGGER.info("Calculating new GraphPartitioner");
    final GraphPartitioner newPartitioner;
    try {
        newPartitioner = new CalculatePartitioner(new Path(getSortedAggregatedDirectory(true, true)), store.getSchema(), fs).call();
    } catch (final IOException e) {
        throw new OperationException("IOException calculating new graph partitioner", e);
    }
    LOGGER.info("New GraphPartitioner has partitions for {} groups, {} reversed edge groups", newPartitioner.getGroups().size(), newPartitioner.getGroupsForReversedEdges().size());
    // Write out graph partitioner
    Path newGraphPartitionerPath = null;
    try {
        newGraphPartitionerPath = new Path(getSortedAggregatedDirectory(true, true) + "graphPartitioner");
        final FSDataOutputStream stream = fs.create(newGraphPartitionerPath);
        LOGGER.info("Writing graph partitioner to {}", newGraphPartitionerPath);
        new GraphPartitionerSerialiser().write(newPartitioner, stream);
        stream.close();
    } catch (final IOException e) {
        throw new OperationException("IOException writing out graph partitioner to " + newGraphPartitionerPath, e);
    }
}
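The partitioner written here can be read back with the same serialiser, as the ParquetStore.loadGraphPartitioner snippets below show. The following is a hedged sketch of a hypothetical helper (not part of AddElementsFromRDD) that reads a partitioner back and summarises it, reusing only the read, getGroups and getGroupPartitioner(group).getPartitions() calls seen elsewhere on this page; LOGGER is assumed to exist on the enclosing class.

// Hypothetical helper: read a serialised GraphPartitioner from the given path
// and log a partition count per group.
private GraphPartitioner readPartitioner(final FileSystem fs, final Path path) throws IOException {
    final FSDataInputStream stream = fs.open(path);
    final GraphPartitioner partitioner = new GraphPartitionerSerialiser().read(stream);
    stream.close();
    for (final String group : partitioner.getGroups()) {
        LOGGER.info("Group {} has {} partitions", group, partitioner.getGroupPartitioner(group).getPartitions().size());
    }
    return partitioner;
}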
Use of uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser in project Gaffer by gchq.
The class ParquetStore, method loadGraphPartitioner.
private void loadGraphPartitioner() throws StoreException {
    final String dataDir = getDataDir();
    try {
        if (fs.exists(new Path(dataDir))) {
            this.currentSnapshot = getLatestSnapshot(dataDir);
            LOGGER.info("Setting currentSnapshot to {}", this.currentSnapshot);
            final Path path = getGraphPartitionerPath();
            if (!fs.exists(path)) {
                LOGGER.info("Graph partitioner does not exist in {} so creating it", path);
                final GraphPartitioner partitioner = new CalculatePartitioner(new Path(dataDir + "/" + getSnapshotPath(this.currentSnapshot)), getSchema(), fs).call();
                LOGGER.info("Writing graph partitioner to {}", path);
                final FSDataOutputStream stream = fs.create(path);
                new GraphPartitionerSerialiser().write(partitioner, stream);
                stream.close();
            }
            LOGGER.info("Loading graph partitioner from path {}", path);
            loadGraphPartitioner(path);
        } else {
            throw new StoreException("Data directory " + dataDir + " does not exist - store is in an inconsistent state");
        }
    } catch (final IOException e) {
        throw new StoreException(e.getMessage(), e);
    }
}
Use of uk.gov.gchq.gaffer.parquetstore.partitioner.serialisation.GraphPartitionerSerialiser in project Gaffer by gchq.
The class ParquetStore, method loadGraphPartitioner (private overload taking a Path).
private void loadGraphPartitioner(final Path graphPartitionerPath) throws IOException {
    final FSDataInputStream stream = fs.open(graphPartitionerPath);
    this.graphPartitioner = new GraphPartitionerSerialiser().read(stream);
    stream.close();
}
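One design note on this snippet: the stream is closed manually, so it would not be closed if read threw an IOException. An equivalent sketch using try-with-resources (FSDataInputStream is Closeable) releases the stream on all paths:

// Equivalent sketch of the method above using try-with-resources.
private void loadGraphPartitioner(final Path graphPartitionerPath) throws IOException {
    try (final FSDataInputStream stream = fs.open(graphPartitionerPath)) {
        this.graphPartitioner = new GraphPartitionerSerialiser().read(stream);
    }
}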