Use of org.apache.spark.util.SerializableConfiguration in project quality-measure-and-cohort-service by Alvearie.
From class SparkCqlEvaluatorTest, method testReadContextDefinitions.
@Test
public void testReadContextDefinitions() throws Exception {
    evaluator.hadoopConfiguration = new SerializableConfiguration(SparkHadoopUtil.get().conf());

    ContextDefinitions contextDefinitions = evaluator.readContextDefinitions("src/test/resources/alltypes/metadata/context-definitions.json");

    assertNotNull(contextDefinitions);
    assertEquals(5, contextDefinitions.getContextDefinitions().size());
    assertEquals(3, contextDefinitions.getContextDefinitions().get(0).getRelationships().size());
}
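All of the snippets on this page rely on the same underlying idea: Hadoop's Configuration class is not Serializable, so it cannot be captured directly by a closure that Spark ships to executors. Wrapping it in org.apache.spark.util.SerializableConfiguration solves that, and value() unwraps it again wherever the configuration is needed. The following is a minimal, self-contained sketch of the pattern; the class name and file paths are illustrative and not taken from either project.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.SerializableConfiguration;

public class SerializableConfigurationSketch {
    public static void main(String[] args) throws Exception {
        JavaSparkContext jsc = new JavaSparkContext("local[*]", "serializable-conf-demo");

        // Configuration is not Serializable; wrap it on the driver before a closure captures it.
        SerializableConfiguration serializableConf = new SerializableConfiguration(jsc.hadoopConfiguration());

        List<String> paths = Arrays.asList("/tmp/file-a.parquet", "/tmp/file-b.parquet"); // illustrative paths
        List<Boolean> exists = jsc.parallelize(paths).map(p -> {
            // value() returns the wrapped Configuration on the executor side.
            Configuration conf = serializableConf.value();
            return FileSystem.get(conf).exists(new Path(p));
        }).collect();

        System.out.println(exists);
        jsc.stop();
    }
}

The tests below assign a SerializableConfiguration to the evaluator before calling its read methods; the production code in SparkCqlEvaluator.run does the same from the live SparkContext.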
Use of org.apache.spark.util.SerializableConfiguration in project quality-measure-and-cohort-service by Alvearie.
From class SparkCqlEvaluatorTest, method testReadCqlJobsInvalid.
@Test
public void testReadCqlJobsInvalid() throws Exception {
    evaluator.hadoopConfiguration = new SerializableConfiguration(SparkHadoopUtil.get().conf());
    assertThrows(IllegalArgumentException.class, () -> evaluator.readJobSpecification("src/test/resources/invalid/cql-jobs-invalid-global.json"));
}
Use of org.apache.spark.util.SerializableConfiguration in project hudi by apache.
From class ColumnStatsIndexHelper, method buildColumnStatsTableFor.
/**
 * Parses min/max statistics from the Parquet footers of the provided base files and composes a
 * column-stats index table in the following format, with 3 statistics (min value, max value,
 * null count) for each linear/Z-curve/Hilbert-curve-ordered column. For example, if the original
 * table contained column {@code A}:
 *
 * <pre>
 * +---------------------------+------------+------------+-------------+
 * | file                      | A_minValue | A_maxValue | A_num_nulls |
 * +---------------------------+------------+------------+-------------+
 * | one_base_file.parquet     |          1 |         10 |           0 |
 * | another_base_file.parquet |        -10 |          0 |           5 |
 * +---------------------------+------------+------------+-------------+
 * </pre>
 *
 * NOTE: {@link TimestampType} is currently not supported, since the Parquet writer
 * does not support statistics for it.
 *
 * TODO leverage metadata table after RFC-27 lands
 * @VisibleForTesting
 *
 * @param sparkSession encompassing Spark session
 * @param baseFilesPaths list of base-file paths to be sourced for the column-stats index
 * @param orderedColumnSchemas target ordered columns
 * @return Spark's {@link Dataset} holding the index table
 */
@Nonnull
public static Dataset<Row> buildColumnStatsTableFor(
        @Nonnull SparkSession sparkSession,
        @Nonnull List<String> baseFilesPaths,
        @Nonnull List<StructField> orderedColumnSchemas) {
    SparkContext sc = sparkSession.sparkContext();
    JavaSparkContext jsc = new JavaSparkContext(sc);
    SerializableConfiguration serializableConfiguration = new SerializableConfiguration(sc.hadoopConfiguration());
    int numParallelism = (baseFilesPaths.size() / 3 + 1);
    List<HoodieColumnRangeMetadata<Comparable>> colMinMaxInfos;
    String previousJobDescription = sc.getLocalProperty(SPARK_JOB_DESCRIPTION);
    try {
        jsc.setJobDescription("Listing parquet column statistics");
        colMinMaxInfos = jsc.parallelize(baseFilesPaths, numParallelism).mapPartitions(paths -> {
            ParquetUtils utils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
            Iterable<String> iterable = () -> paths;
            return StreamSupport.stream(iterable.spliterator(), false)
                .flatMap(path -> utils.readRangeFromParquetMetadata(
                        serializableConfiguration.value(),
                        new Path(path),
                        orderedColumnSchemas.stream().map(StructField::name).collect(Collectors.toList()))
                    .stream())
                .iterator();
        }).collect();
    } finally {
        jsc.setJobDescription(previousJobDescription);
    }
    // Group the column metadata by the path of the file it belongs to
    Map<String, List<HoodieColumnRangeMetadata<Comparable>>> filePathToColumnMetadataMap =
        colMinMaxInfos.stream().collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getFilePath));
    JavaRDD<Row> allMetaDataRDD = jsc.parallelize(new ArrayList<>(filePathToColumnMetadataMap.values()), 1).map(fileColumnsMetadata -> {
        int colSize = fileColumnsMetadata.size();
        if (colSize == 0) {
            return null;
        }
        String filePath = fileColumnsMetadata.get(0).getFilePath();
        List<Object> indexRow = new ArrayList<>();
        // The first column of the Z-index row is the target file path
        indexRow.add(filePath);
        // For each column, append its min value, max value, and null count
        orderedColumnSchemas.forEach(colSchema -> {
            String colName = colSchema.name();
            HoodieColumnRangeMetadata<Comparable> colMetadata = fileColumnsMetadata.stream()
                .filter(s -> s.getColumnName().trim().equalsIgnoreCase(colName))
                .findFirst()
                .orElse(null);
            DataType colType = colSchema.dataType();
            if (colMetadata == null || colType == null) {
                throw new HoodieException(String.format("Cannot collect min/max statistics for column (%s)", colSchema));
            }
            Pair<Object, Object> minMaxValue = fetchMinMaxValues(colType, colMetadata);
            indexRow.add(minMaxValue.getLeft()); // min
            indexRow.add(minMaxValue.getRight()); // max
            indexRow.add(colMetadata.getNullCount());
        });
        return Row$.MODULE$.apply(JavaConversions.asScalaBuffer(indexRow));
    }).filter(Objects::nonNull);
    StructType indexSchema = composeIndexSchema(orderedColumnSchemas);
    return sparkSession.createDataFrame(allMetaDataRDD, indexSchema);
}
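A hypothetical invocation of this helper might look like the sketch below. The base-file paths and the single ordered column A are illustrative and not taken from the Hudi codebase; the import for ColumnStatsIndexHelper is omitted because its package varies by Hudi version.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;

public class ColumnStatsIndexSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("column-stats-index-sketch")
            .getOrCreate();

        // Base files and the ordered column are illustrative.
        List<String> baseFilePaths = Arrays.asList(
            "/data/table/one_base_file.parquet",
            "/data/table/another_base_file.parquet");
        List<StructField> orderedColumns = Arrays.asList(
            new StructField("A", DataTypes.IntegerType, true, Metadata.empty()));

        Dataset<Row> columnStatsIndex =
            ColumnStatsIndexHelper.buildColumnStatsTableFor(spark, baseFilePaths, orderedColumns);

        // Expect one row per base file: file, A_minValue, A_maxValue, A_num_nulls.
        columnStatsIndex.show(false);
        spark.stop();
    }
}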
Use of org.apache.spark.util.SerializableConfiguration in project quality-measure-and-cohort-service by Alvearie.
From class SparkCqlEvaluator, method run.
public void run(PrintStream out) throws Exception {
    EvaluationSummary evaluationSummary = new EvaluationSummary();
    long startTimeMillis = System.currentTimeMillis();
    evaluationSummary.setStartTimeMillis(startTimeMillis);
    evaluationSummary.setJobStatus(JobStatus.FAIL);

    SparkSession.Builder sparkBuilder = SparkSession.builder();
    try (SparkSession spark = sparkBuilder.getOrCreate()) {
        final LongAccumulator contextAccum = spark.sparkContext().longAccumulator("Context");
        final CollectionAccumulator<EvaluationError> errorAccumulator = spark.sparkContext().collectionAccumulator("EvaluationErrors");
        try {
            spark.sparkContext().setLocalProperty("mdc." + CORRELATION_ID, MDC.get(CORRELATION_ID));
            evaluationSummary.setCorrelationId(MDC.get(CORRELATION_ID));

            boolean useJava8API = Boolean.valueOf(spark.conf().get("spark.sql.datetime.java8API.enabled"));
            this.typeConverter = new SparkTypeConverter(useJava8API);
            this.hadoopConfiguration = new SerializableConfiguration(spark.sparkContext().hadoopConfiguration());

            evaluationSummary.setApplicationId(spark.sparkContext().applicationId());

            CqlToElmTranslator cqlTranslator = getCqlTranslator();
            SparkOutputColumnEncoder columnEncoder = getSparkOutputColumnEncoder();

            ContextDefinitions contexts = readContextDefinitions(args.contextDefinitionPath);
            List<ContextDefinition> filteredContexts = contexts.getContextDefinitions();
            if (args.aggregationContexts != null && !args.aggregationContexts.isEmpty()) {
                filteredContexts = filteredContexts.stream()
                    .filter(def -> args.aggregationContexts.contains(def.getName()))
                    .collect(Collectors.toList());
            }
            if (filteredContexts.isEmpty()) {
                throw new IllegalArgumentException("At least one context definition is required (after filtering if enabled).");
            }

            Map<String, StructType> resultSchemas = calculateSparkSchema(
                filteredContexts.stream().map(ContextDefinition::getName).collect(Collectors.toList()),
                contexts,
                columnEncoder,
                cqlTranslator);

            ZonedDateTime batchRunTime = ZonedDateTime.now();

            final LongAccumulator perContextAccum = spark.sparkContext().longAccumulator("PerContext");
            CustomMetricSparkPlugin.contextAccumGauge.setAccumulator(contextAccum);
            CustomMetricSparkPlugin.perContextAccumGauge.setAccumulator(perContextAccum);
            CustomMetricSparkPlugin.totalContextsToProcessCounter.inc(filteredContexts.size());
            CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(0);

            ColumnRuleCreator columnRuleCreator = new ColumnRuleCreator(
                getFilteredJobSpecificationWithIds().getEvaluations(),
                getCqlTranslator(),
                createLibraryProvider());

            Map<String, String> dataTypeAliases = createDataTypeAliases(filteredContexts, cqlTranslator);

            for (ContextDefinition context : filteredContexts) {
                final String contextName = context.getName();
                ContextRetriever contextRetriever = new ContextRetriever(
                    args.inputPaths,
                    new DefaultDatasetRetriever(spark, args.inputFormat),
                    args.disableColumnFiltering ? null : columnRuleCreator.getDataRequirementsForContext(context));

                StructType resultsSchema = resultSchemas.get(contextName);
                if (resultsSchema == null || resultsSchema.fields().length == 0) {
                    LOG.warn("Context " + contextName + " has no defines configured. Skipping.");
                } else {
                    LOG.info("Evaluating context " + contextName);
                    long contextStartMillis = System.currentTimeMillis();

                    final String outputPath = MapUtils.getRequiredKey(args.outputPaths, context.getName(), "outputPath");

                    JavaPairRDD<Object, List<Row>> rowsByContextId = contextRetriever.retrieveContext(context);
                    CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.getValue() + 1);

                    JavaPairRDD<Object, Row> resultsByContext = rowsByContextId.flatMapToPair(
                        x -> evaluate(contextName, resultsSchema, x, dataTypeAliases, perContextAccum, errorAccumulator, batchRunTime));
                    writeResults(spark, resultsSchema, resultsByContext, outputPath);

                    long contextEndMillis = System.currentTimeMillis();
                    LOG.info(String.format("Wrote results for context %s to %s", contextName, outputPath));

                    evaluationSummary.addContextCount(contextName, perContextAccum.value());
                    evaluationSummary.addContextRuntime(contextName, contextEndMillis - contextStartMillis);

                    contextAccum.add(1);
                    perContextAccum.reset();
                }
            }
            CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(0);

            try {
                Boolean metricsEnabledStr = Boolean.valueOf(spark.conf().get("spark.ui.prometheus.enabled"));
                if (metricsEnabledStr) {
                    LOG.info("Prometheus metrics enabled, sleeping for 7 seconds to finish gathering metrics");
                    // Sleep for more than 5 seconds because Prometheus only polls every 5 seconds.
                    // If Spark finishes and goes away immediately after completing, Prometheus will
                    // never be able to poll the final set of metrics for the spark-submit. The default
                    // Prometheus config map was changed from a 2-minute scrape interval to 5 seconds for Spark pods.
                    Thread.sleep(7000);
                } else {
                    LOG.info("Prometheus metrics not enabled");
                }
            } catch (NoSuchElementException e) {
                LOG.info("spark.ui.prometheus.enabled is not set");
            }

            evaluationSummary.setJobStatus(JobStatus.SUCCESS);
        } catch (Exception e) {
            // If we experience an error that would make the program halt, capture the error
            // and report it in the batch summary file
            ByteArrayOutputStream errorDetailStream = new ByteArrayOutputStream();
            try (PrintStream printStream = new PrintStream(errorDetailStream)) {
                printStream.write(e.getMessage().getBytes());
                printStream.write('\n');
                if (e.getCause() != null) {
                    printStream.write(e.getCause().getMessage().getBytes());
                    printStream.write('\n');
                }
                e.printStackTrace(printStream);
                evaluationSummary.setErrorList(Collections.singletonList(new EvaluationError(null, null, null, errorDetailStream.toString())));
            }
            throw e;
        } finally {
            long endTimeMillis = System.currentTimeMillis();
            evaluationSummary.setEndTimeMillis(endTimeMillis);
            evaluationSummary.setRuntimeMillis(endTimeMillis - startTimeMillis);

            if (args.metadataOutputPath != null) {
                if (evaluationSummary.getErrorList() == null) {
                    evaluationSummary.setErrorList(errorAccumulator.value());
                }
                if (CollectionUtils.isNotEmpty(evaluationSummary.getErrorList())) {
                    evaluationSummary.setJobStatus(JobStatus.FAIL);
                }
                evaluationSummary.setTotalContexts(contextAccum.value());

                OutputMetadataWriter writer = getOutputMetadataWriter();
                writer.writeMetadata(evaluationSummary);
            }
        }
    }
}
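The hadoopConfiguration field assigned near the top of run is the same field the tests on this page populate directly before calling readContextDefinitions and readJobSpecification. Those readers are not shown here, but a helper of roughly the shape below, which resolves the path through the wrapped Hadoop Configuration so that local, HDFS, and object-store locations all work, is the typical way such a field gets consumed. The class and method names in this sketch are hypothetical, not the project's actual API.

import java.io.InputStream;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.util.SerializableConfiguration;

import com.fasterxml.jackson.databind.ObjectMapper;

public class HadoopJsonReader {
    private final SerializableConfiguration hadoopConfiguration;

    public HadoopJsonReader(SerializableConfiguration hadoopConfiguration) {
        this.hadoopConfiguration = hadoopConfiguration;
    }

    // Reads a JSON document from any Hadoop-supported filesystem (local, HDFS, S3, ...)
    // and binds it to the given type with Jackson.
    public <T> T readJson(String path, Class<T> type) throws Exception {
        Path location = new Path(path);
        FileSystem fs = location.getFileSystem(hadoopConfiguration.value());
        try (InputStream in = fs.open(location)) {
            return new ObjectMapper().readValue(in, type);
        }
    }
}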
Use of org.apache.spark.util.SerializableConfiguration in project quality-measure-and-cohort-service by Alvearie.
From class SparkCqlEvaluatorTest, method testReadCqlJobsSuccess.
@Test
public void testReadCqlJobsSuccess() throws Exception {
    IntervalParameter measurementPeriod = new IntervalParameter();
    measurementPeriod.setStart(new DateParameter("2020-01-01")).setEnd(new DateParameter("2021-01-01"));

    IntegerParameter minimumAge = new IntegerParameter(17);

    evaluator.hadoopConfiguration = new SerializableConfiguration(SparkHadoopUtil.get().conf());
    CqlEvaluationRequests requests = evaluator.readJobSpecification("src/test/resources/simple-job/cql-jobs.json");

    assertNotNull(requests);
    assertEquals(measurementPeriod, requests.getGlobalParameters().get("Measurement Period"));
    assertEquals(1, requests.getEvaluations().size());
    assertEquals(minimumAge, requests.getEvaluations().get(0).getParameters().get("MinimumAge"));
}