Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in project Gaffer by gchq.
Class AddElementsFromRDD, method aggregateNewAndOldData.
/**
 * For each group that requires aggregation, this method aggregates the new data that has been written out to file
 * with the existing data for that group.
 *
 * @throws OperationException if an {@link IOException} or a {@link SerialisationException} is thrown
 */
private void aggregateNewAndOldData() throws OperationException {
    LOGGER.info("Creating AggregateDataForGroup tasks for groups that require aggregation");
    for (final String group : schema.getAggregatedGroups()) {
        LOGGER.info("Creating AggregateDataForGroup task for group {}", group);
        final List<String> inputFiles = new ArrayList<>();
        final String groupDirectoryNewData = getDirectory(group, false, false, false);
        final FileStatus[] newData;
        try {
            newData = fs.listStatus(new Path(groupDirectoryNewData), path -> path.getName().endsWith(".parquet"));
        } catch (final IOException e) {
            throw new OperationException("IOException finding Parquet files in " + groupDirectoryNewData, e);
        }
        Arrays.stream(newData).map(f -> f.getPath().toString()).forEach(inputFiles::add);
        final List<Path> existingData;
        try {
            existingData = store.getFilesForGroup(group);
        } catch (final IOException e) {
            throw new OperationException("IOException finding files for group " + group, e);
        }
        existingData.stream().map(Path::toString).forEach(inputFiles::add);
        final String outputDir = getDirectory(group, false, true, false);
        final AggregateDataForGroup aggregateDataForGroup;
        try {
            aggregateDataForGroup = new AggregateDataForGroup(fs, schemaUtils, group, inputFiles, outputDir, spark);
        } catch (final SerialisationException e) {
            throw new OperationException("SerialisationException creating AggregateDataForGroup task", e);
        }
        LOGGER.info("AggregateDataForGroup task for group {} is being called ({} files as input, outputting to {})",
                group, inputFiles.size(), outputDir);
        aggregateDataForGroup.call();
    }
}
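The heavy lifting here is delegated to AggregateDataForGroup, which is constructed with the combined list of new and existing Parquet files. As a rough illustration of the underlying pattern only (not Gaffer's actual implementation), a minimal Spark sketch that merges new and existing files for one group and aggregates rows sharing the same key columns could look like the following; the column names "src", "dst" and "count" and the sum aggregation are hypothetical placeholders for the schema-driven aggregators Gaffer derives from its schema.
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.sum;

public final class AggregateGroupSketch {

    // Reads every input file for a group into one Dataset, aggregates rows that
    // share the same (hypothetical) key columns, and writes the result to outputDir.
    static void aggregate(final SparkSession spark, final List<String> inputFiles, final String outputDir) {
        final Dataset<Row> all = spark.read()
                .parquet(inputFiles.toArray(new String[0])); // new files plus existing files for the group
        final Dataset<Row> aggregated = all
                .groupBy(col("src"), col("dst"))             // hypothetical key columns
                .agg(sum(col("count")).as("count"));         // hypothetical property aggregation
        aggregated.write().parquet(outputDir);
    }
}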
Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in project Gaffer by gchq.
Class CalculatePartitioner, method call.
public GraphPartitioner call() throws IOException {
    final SchemaUtils schemaUtils = new SchemaUtils(schema);
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    for (final String group : schema.getGroups()) {
        LOGGER.info("Calculating GroupPartitioner for group {}", group);
        final GafferGroupObjectConverter converter = schemaUtils.getConverter(group);
        final List<PartitionKey> partitionKeys = new ArrayList<>();
        final Path groupPath = new Path(path, ParquetStore.getGroupSubDir(group, false));
        final FileStatus[] files = fs.listStatus(groupPath, p -> p.getName().endsWith(".parquet"));
        final SortedSet<Path> sortedFiles = new TreeSet<>();
        Arrays.stream(files).map(f -> f.getPath()).forEach(sortedFiles::add);
        final Path[] sortedPaths = sortedFiles.toArray(new Path[] {});
        LOGGER.debug("Found {} files in {}", files.length, groupPath);
        for (int i = 1; i < sortedPaths.length; i++) {
            // NB Skip first file
            LOGGER.debug("Reading first line of {}", sortedPaths[i]);
            final ParquetReader<Element> reader = new ParquetElementReader.Builder<Element>(sortedPaths[i])
                    .isEntity(schema.getEntityGroups().contains(group))
                    .usingConverter(converter)
                    .build();
            // NB Should never be null as empty files are removed before this is called
            final Element element = reader.read();
            if (null == element) {
                throw new IOException("No first element in file " + sortedPaths[i] + " - empty files are supposed to be removed");
            }
            reader.close();
            final Object[] parquetObjects = converter.corePropertiesToParquetObjects(element);
            final PartitionKey key = new PartitionKey(parquetObjects);
            partitionKeys.add(key);
        }
        final GroupPartitioner groupPartitioner = new GroupPartitioner(group, partitionKeys);
        graphPartitioner.addGroupPartitioner(group, groupPartitioner);
        LOGGER.info("GroupPartitioner for group {} is {}", group, groupPartitioner);
    }
    for (final String group : schema.getEdgeGroups()) {
        LOGGER.info("Calculating GroupPartitioner for reversed edge group {}", group);
        final GafferGroupObjectConverter converter = schemaUtils.getConverter(group);
        final List<PartitionKey> partitionKeys = new ArrayList<>();
        final Path groupPath = new Path(path, ParquetStore.getGroupSubDir(group, true));
        final FileStatus[] files = fs.listStatus(groupPath, p -> p.getName().endsWith(".parquet"));
        final SortedSet<Path> sortedFiles = new TreeSet<>();
        Arrays.stream(files).map(f -> f.getPath()).forEach(sortedFiles::add);
        final Path[] sortedPaths = sortedFiles.toArray(new Path[] {});
        LOGGER.debug("Found {} files in {}", files.length, groupPath);
        for (int i = 1; i < sortedPaths.length; i++) {
            // NB Skip first file
            LOGGER.debug("Reading first line of {}", sortedPaths[i]);
            final ParquetReader<Element> reader = new ParquetElementReader.Builder<Element>(sortedPaths[i])
                    .isEntity(false)
                    .usingConverter(converter)
                    .build();
            final Edge edge = (Edge) reader.read();
            if (null == edge) {
                throw new IOException("No first edge in file " + sortedPaths[i] + " - empty files are supposed to be removed");
            }
            reader.close();
            final Object[] parquetObjects = converter.corePropertiesToParquetObjectsForReversedEdge(edge);
            final PartitionKey key = new PartitionKey(parquetObjects);
            partitionKeys.add(key);
        }
        final GroupPartitioner groupPartitioner = new GroupPartitioner(group, partitionKeys);
        graphPartitioner.addGroupPartitionerForReversedEdges(group, groupPartitioner);
    }
    return graphPartitioner;
}
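The loops above skip the first file in each sorted list because n sorted files only need n - 1 split points: the first element of every subsequent file marks the boundary between two consecutive partitions. A minimal sketch of how such split points can be used to route a key to a partition index, simplified to long keys rather than Gaffer's PartitionKey of Parquet objects, and not the actual GraphPartitioner API:
import java.util.List;

public final class SplitPointLookup {

    // splitPoints are the first keys of files 1..n-1, in ascending order.
    // A key belongs to partition i when it is >= exactly i of the split points,
    // so the partition index is the count of split points <= key.
    static int partitionFor(final long key, final List<Long> splitPoints) {
        int lo = 0;
        int hi = splitPoints.size();
        while (lo < hi) {
            final int mid = (lo + hi) >>> 1;
            if (splitPoints.get(mid) <= key) {
                lo = mid + 1;
            } else {
                hi = mid;
            }
        }
        return lo; // 0..splitPoints.size(), i.e. one partition per file
    }
}
A key smaller than every split point maps to partition 0 (the first file); a key greater than or equal to all of them maps to the last partition.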
Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in project Gaffer by gchq.
Class RetrieveElementsFromFile, method call.
@Override
public OperationException call() throws Exception {
    if (null == elementFilter) {
        elementFilter = new ViewElementDefinition.Builder().json(elementDefinitionJson).build().getPreAggregationFilter();
    }
    if (null == schemaUtils) {
        schemaUtils = new SchemaUtils(Schema.fromJson(jsonGafferSchema));
    }
    try {
        final ParquetReader<Element> fileReader = openParquetReader();
        Element e = fileReader.read();
        while (null != e) {
            if (!visibility.isEmpty()) {
                if (isVisible(e)) {
                    if (needsValidatorsAndFiltersApplying) {
                        final String group = e.getGroup();
                        final ElementFilter validatorFilter = gafferSchema.getElement(group).getValidator(false);
                        if (skipValidation || validatorFilter == null || validatorFilter.test(e)) {
                            if (elementFilter == null || elementFilter.test(e)) {
                                ViewUtil.removeProperties(view, e);
                                queue.add(e);
                            }
                        }
                    } else {
                        ViewUtil.removeProperties(view, e);
                        queue.add(e);
                    }
                }
            } else if (needsValidatorsAndFiltersApplying) {
                final String group = e.getGroup();
                final ElementFilter validatorFilter = gafferSchema.getElement(group).getValidator(false);
                if (skipValidation || validatorFilter == null || validatorFilter.test(e)) {
                    if (elementFilter == null || elementFilter.test(e)) {
                        ViewUtil.removeProperties(view, e);
                        queue.add(e);
                    }
                }
            } else {
                ViewUtil.removeProperties(view, e);
                queue.add(e);
            }
            e = fileReader.read();
        }
        fileReader.close();
    } catch (final IOException ignore) {
        LOGGER.error("IOException reading file", ignore);
        // ignore as this file does not exist
    }
    return null;
}
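The validate-filter-queue branch is repeated three times above (visible elements, elements without a visibility property, and the case where no validators or filters apply). A hedged sketch of how that duplication could be factored into a single helper on the same class; the helper name is hypothetical, but the fields and calls mirror the snippet above:
// Hypothetical helper mirroring the duplicated branches above: validates the
// element (unless validation is skipped), applies the pre-aggregation filter,
// strips properties excluded by the view, and queues the element.
private void validateFilterAndQueue(final Element e) {
    if (needsValidatorsAndFiltersApplying) {
        final ElementFilter validatorFilter = gafferSchema.getElement(e.getGroup()).getValidator(false);
        if (!skipValidation && validatorFilter != null && !validatorFilter.test(e)) {
            return;
        }
        if (elementFilter != null && !elementFilter.test(e)) {
            return;
        }
    }
    ViewUtil.removeProperties(view, e);
    queue.add(e);
}
With such a helper, the loop body reduces to: if (visibility.isEmpty() || isVisible(e)) { validateFilterAndQueue(e); }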
Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in project Gaffer by gchq.
Class GetDataFrameOfElementsHandler, method doOperation.
private Dataset<Row> doOperation(final GetDataFrameOfElements operation, final ParquetStore store, final SparkSession spark) throws OperationException {
    if (!operation.getView().equals(new View.Builder()
            .entities(store.getSchema().getEntityGroups())
            .edges(store.getSchema().getEdgeGroups())
            .build())) {
        throw new OperationException("This operation does not currently support views");
    }
    LOGGER.debug("Creating a Dataset<Row> from path {} with option mergeSchema=true", store.getGraphPath());
    final StructType schema = new SchemaUtils(store.getSchema()).getMergedSparkSchema(store.getSchema().getGroups());
    final Dataset<Row> dataframe = spark.read().schema(schema).parquet(store.getGraphPath());
    return dataframe;
}
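For orientation, a hedged usage sketch of how a caller might reach this handler through the Gaffer Graph API. Here graph, user and store are assumed to be an initialised Graph, User and ParquetStore, and builder details can vary between Gaffer versions, so treat this as an outline rather than the definitive API:
// Sketch only: exact builder options may differ between Gaffer versions.
final GetDataFrameOfElements operation = new GetDataFrameOfElements.Builder()
        .view(new View.Builder()
                .entities(store.getSchema().getEntityGroups())
                .edges(store.getSchema().getEdgeGroups())
                .build()) // the all-groups view satisfies the equality check in doOperation above
        .build();
final Dataset<Row> rows = graph.execute(operation, user);
rows.show();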
Use of uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils in project Gaffer by gchq.
Class WriteData, method call.
// Not private to allow tests to call it
void call(final Iterator<Element> elements, final int partitionId, final long taskAttemptId) throws Exception {
    final SchemaUtils schemaUtils = new SchemaUtils(Schema.fromJson(schemaAsJson));
    final Map<String, ParquetWriter<Element>> groupToWriter = new HashMap<>();
    final Map<String, Path> groupToWriterPath = new HashMap<>();
    for (final String group : schemaUtils.getGroups()) {
        groupToWriterPath.put(group, new Path(groupToDirectory.get(group) + "/input-" + partitionId + "-" + taskAttemptId + ".parquet"));
        groupToWriter.put(group, buildWriter(group, groupToWriterPath.get(group), schemaUtils));
    }
    writeData(elements, partitionId, taskAttemptId, groupToWriter);
    renameFiles(partitionId, taskAttemptId, schemaUtils.getGroups(), groupToWriterPath);
}
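writeData and renameFiles are not shown in this snippet. A hedged sketch of what the per-group routing step presumably looks like; the method name writeDataSketch is hypothetical, and the only calls assumed are ParquetWriter.write and Element.getGroup:
// Sketch of the per-group routing step (the real writeData is not shown above):
// each element is written by the ParquetWriter registered for its group, and
// every writer is closed once the iterator is exhausted.
private void writeDataSketch(final Iterator<Element> elements,
                             final Map<String, ParquetWriter<Element>> groupToWriter) throws IOException {
    while (elements.hasNext()) {
        final Element element = elements.next();
        groupToWriter.get(element.getGroup()).write(element);
    }
    for (final ParquetWriter<Element> writer : groupToWriter.values()) {
        writer.close();
    }
}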