Use of com.ibm.cohort.cql.spark.errors.EvaluationError in the project quality-measure-and-cohort-service by Alvearie.
From the class SparkCqlEvaluator, method evaluate.
/**
 * Evaluate the input CQL for a single context + data pair.
 *
 * <p>Every expression of every request configured for the context is evaluated
 * in its own single-expression request, so a failure can be attributed to one
 * expression. Results are grouped by the encoded parameters of the request
 * that produced them; one output row is emitted per distinct parameter set.
 *
 * @param rowsByContext   In-memory data for all datatypes related to a single
 *                        context
 * @param contextName     Name of the context used to select measure evaluations.
 * @param resultsSchema   StructType containing the schema data for the output table
 *                        that will be created. Column 0 receives the context key,
 *                        column 1 the encoded parameters, and the remaining columns
 *                        are looked up by field name in the evaluation results.
 * @param evaluator       configured CQLEvaluator (data provider, term provider,
 *                        library provider all previously setup)
 * @param requests        CqlEvaluationRequests containing lists of libraries,
 *                        expressions, and parameters to evaluate
 * @param columnEncoder   Encoder used to calculate output column names for
 *                        evaluation results
 * @param perContextAccum Spark accumulator that tracks each individual context
 *                        evaluation
 * @param errorAccum      Spark accumulator that tracks CQL evaluation errors
 * @param batchRunTime    Single unified timestamp for all contexts
 * @return Evaluation results for all expressions evaluated keyed by the context
 *         ID. Expression names are automatically namespaced according to the
 *         library name to avoid issues arising for expression names matching
 *         between libraries (e.g. LibraryName.ExpressionName).
 * @throws RuntimeException if an expression fails to evaluate and
 *                          {@code args.haltOnError} is set; otherwise the failure
 *                          is added to {@code errorAccum} and evaluation continues
 */
protected Iterator<Tuple2<Object, Row>> evaluate(Tuple2<Object, List<Row>> rowsByContext, String contextName, StructType resultsSchema, CqlEvaluator evaluator, CqlEvaluationRequests requests, SparkOutputColumnEncoder columnEncoder, LongAccumulator perContextAccum, CollectionAccumulator<EvaluationError> errorAccum, ZonedDateTime batchRunTime) {
    perContextAccum.add(1);

    // The context id is the same for every error and every output row produced here.
    final Object contextId = rowsByContext._1();

    List<CqlEvaluationRequest> requestsForContext = requests.getEvaluationsForContext(contextName);

    // parameters json -> {columnName, result}
    Map<String, Map<String, Object>> expressionResultsByParameters = new HashMap<>();
    for (CqlEvaluationRequest request : requestsForContext) {
        String parametersJson = encodedParametersCache.getKeyParametersColumnData(request);
        Map<String, Object> expressionResults = expressionResultsByParameters.computeIfAbsent(parametersJson, x -> new HashMap<>());

        for (CqlExpressionConfiguration expression : request.getExpressions()) {
            // Evaluate one expression at a time so that a failure can be reported per expression.
            CqlEvaluationRequest singleRequest = new CqlEvaluationRequest(request);
            singleRequest.setExpressions(Collections.singleton(expression));

            try {
                CqlEvaluationResult result = evaluator.evaluate(singleRequest, args.debug ? CqlDebug.DEBUG : CqlDebug.NONE, batchRunTime);
                for (Map.Entry<String, Object> entry : result.getExpressionResults().entrySet()) {
                    String outputColumnKey = columnEncoder.getColumnName(request, entry.getKey());
                    expressionResults.put(outputColumnKey, typeConverter.toSparkType(entry.getValue()));
                }
            } catch (Throwable th) {
                if (args.haltOnError) {
                    throw new RuntimeException(String.format("CQL evaluation failed for ContextName: %s, OutputColumn: %s", String.valueOf(contextName), singleRequest.getExpressionNames()), th);
                } else {
                    errorAccum.add(new EvaluationError(contextName, contextId, singleRequest.getExpressionNames().iterator().next(), th.getMessage()));
                }
            }
        }
    }

    // StructType#fieldNames() builds a new array on every call; resolve it once
    // instead of twice per column per row.
    final String[] fieldNames = resultsSchema.fieldNames();

    List<Tuple2<Object, Row>> rows = new ArrayList<>();
    for (Map.Entry<String, Map<String, Object>> entry : expressionResultsByParameters.entrySet()) {
        Map<String, Object> results = entry.getValue();

        Object[] data = new Object[fieldNames.length];
        data[0] = contextId;
        data[1] = entry.getKey();
        // Remaining columns are filled by name; columns without a result stay null.
        for (int i = 2; i < fieldNames.length; i++) {
            data[i] = results.get(fieldNames[i]);
        }

        rows.add(new Tuple2<Object, Row>(contextId, RowFactory.create(data)));
    }
    return rows.iterator();
}
Use of com.ibm.cohort.cql.spark.errors.EvaluationError in the project quality-measure-and-cohort-service by Alvearie.
From the class SparkCqlEvaluator, method run.
/**
 * Runs the CQL evaluation job: obtains a Spark session, evaluates every
 * (optionally filtered) context definition, writes the per-context results and,
 * when {@code args.metadataOutputPath} is set, writes an evaluation summary.
 *
 * <p>The summary starts out as {@link JobStatus#FAIL} and is only switched to
 * {@link JobStatus#SUCCESS} after all contexts were processed; it is set back
 * to FAIL if any evaluation errors were collected.
 *
 * @param out print stream supplied by the caller; it is not referenced by this
 *            method body
 * @throws Exception any failure raised while running the job; the failure is
 *                   recorded in the evaluation summary before being rethrown
 */
public void run(PrintStream out) throws Exception {
    EvaluationSummary evaluationSummary = new EvaluationSummary();
    long startTimeMillis = System.currentTimeMillis();
    evaluationSummary.setStartTimeMillis(startTimeMillis);
    // Pessimistic default: only flipped to SUCCESS once everything completed.
    evaluationSummary.setJobStatus(JobStatus.FAIL);

    SparkSession.Builder sparkBuilder = SparkSession.builder();
    try (SparkSession spark = sparkBuilder.getOrCreate()) {
        final LongAccumulator contextAccum = spark.sparkContext().longAccumulator("Context");
        final CollectionAccumulator<EvaluationError> errorAccumulator = spark.sparkContext().collectionAccumulator("EvaluationErrors");
        try {
            spark.sparkContext().setLocalProperty("mdc." + CORRELATION_ID, MDC.get(CORRELATION_ID));
            evaluationSummary.setCorrelationId(MDC.get(CORRELATION_ID));

            boolean useJava8API = Boolean.parseBoolean(spark.conf().get("spark.sql.datetime.java8API.enabled"));
            this.typeConverter = new SparkTypeConverter(useJava8API);
            this.hadoopConfiguration = new SerializableConfiguration(spark.sparkContext().hadoopConfiguration());
            evaluationSummary.setApplicationId(spark.sparkContext().applicationId());

            CqlToElmTranslator cqlTranslator = getCqlTranslator();
            SparkOutputColumnEncoder columnEncoder = getSparkOutputColumnEncoder();

            ContextDefinitions contexts = readContextDefinitions(args.contextDefinitionPath);
            List<ContextDefinition> filteredContexts = contexts.getContextDefinitions();
            if (args.aggregationContexts != null && !args.aggregationContexts.isEmpty()) {
                filteredContexts = filteredContexts.stream().filter(def -> args.aggregationContexts.contains(def.getName())).collect(Collectors.toList());
            }
            if (filteredContexts.isEmpty()) {
                throw new IllegalArgumentException("At least one context definition is required (after filtering if enabled).");
            }

            Map<String, StructType> resultSchemas = calculateSparkSchema(filteredContexts.stream().map(ContextDefinition::getName).collect(Collectors.toList()), contexts, columnEncoder, cqlTranslator);
            ZonedDateTime batchRunTime = ZonedDateTime.now();

            final LongAccumulator perContextAccum = spark.sparkContext().longAccumulator("PerContext");
            CustomMetricSparkPlugin.contextAccumGauge.setAccumulator(contextAccum);
            CustomMetricSparkPlugin.perContextAccumGauge.setAccumulator(perContextAccum);
            CustomMetricSparkPlugin.totalContextsToProcessCounter.inc(filteredContexts.size());
            CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(0);

            ColumnRuleCreator columnRuleCreator = new ColumnRuleCreator(getFilteredJobSpecificationWithIds().getEvaluations(), getCqlTranslator(), createLibraryProvider());
            Map<String, String> dataTypeAliases = createDataTypeAliases(filteredContexts, cqlTranslator);

            for (ContextDefinition context : filteredContexts) {
                final String contextName = context.getName();
                ContextRetriever contextRetriever = new ContextRetriever(args.inputPaths, new DefaultDatasetRetriever(spark, args.inputFormat), args.disableColumnFiltering ? null : columnRuleCreator.getDataRequirementsForContext(context));
                StructType resultsSchema = resultSchemas.get(contextName);
                if (resultsSchema == null || resultsSchema.fields().length == 0) {
                    LOG.warn("Context " + contextName + " has no defines configured. Skipping.");
                } else {
                    LOG.info("Evaluating context " + contextName);
                    long contextStartMillis = System.currentTimeMillis();
                    final String outputPath = MapUtils.getRequiredKey(args.outputPaths, context.getName(), "outputPath");
                    JavaPairRDD<Object, List<Row>> rowsByContextId = contextRetriever.retrieveContext(context);
                    CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.getValue() + 1);
                    JavaPairRDD<Object, Row> resultsByContext = rowsByContextId.flatMapToPair(x -> evaluate(contextName, resultsSchema, x, dataTypeAliases, perContextAccum, errorAccumulator, batchRunTime));
                    writeResults(spark, resultsSchema, resultsByContext, outputPath);
                    long contextEndMillis = System.currentTimeMillis();
                    LOG.info(String.format("Wrote results for context %s to %s", contextName, outputPath));
                    evaluationSummary.addContextCount(contextName, perContextAccum.value());
                    evaluationSummary.addContextRuntime(contextName, contextEndMillis - contextStartMillis);
                    contextAccum.add(1);
                    // The per-context accumulator is reused for the next context definition.
                    perContextAccum.reset();
                }
            }
            CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(0);

            try {
                boolean metricsEnabled = Boolean.parseBoolean(spark.conf().get("spark.ui.prometheus.enabled"));
                if (metricsEnabled) {
                    LOG.info("Prometheus metrics enabled, sleeping for 7 seconds to finish gathering metrics");
                    // sleep for over 5 seconds because Prometheus only polls
                    // every 5 seconds. If spark finishes and goes away immediately after completing,
                    // Prometheus will never be able to poll for the final set of metrics for the spark-submit
                    // The default prometheus config map was changed from 2 minute scrape interval to 5 seconds for spark pods
                    Thread.sleep(7000);
                } else {
                    LOG.info("Prometheus metrics not enabled");
                }
            } catch (NoSuchElementException e) {
                LOG.info("spark.ui.prometheus.enabled is not set");
            }
            evaluationSummary.setJobStatus(JobStatus.SUCCESS);
        } catch (Exception e) {
            // If we experience an error that would make the program halt, capture the error
            // and report it in the batch summary file
            ByteArrayOutputStream errorDetailStream = new ByteArrayOutputStream();
            try (PrintStream printStream = new PrintStream(errorDetailStream)) {
                // Throwable#getMessage() may be null (e.g. a bare NullPointerException).
                // Dereferencing it here would raise a new exception that hides the
                // original failure, so fall back to toString() instead.
                String message = e.getMessage();
                printStream.print(message != null ? message : e.toString());
                printStream.write('\n');
                Throwable cause = e.getCause();
                if (cause != null) {
                    String causeMessage = cause.getMessage();
                    printStream.print(causeMessage != null ? causeMessage : cause.toString());
                    printStream.write('\n');
                }
                e.printStackTrace(printStream);
                // Make sure everything has reached the byte buffer before it is read.
                printStream.flush();
                evaluationSummary.setErrorList(Collections.singletonList(new EvaluationError(null, null, null, errorDetailStream.toString())));
            }
            throw e;
        } finally {
            long endTimeMillis = System.currentTimeMillis();
            evaluationSummary.setEndTimeMillis(endTimeMillis);
            evaluationSummary.setRuntimeMillis(endTimeMillis - startTimeMillis);
            if (args.metadataOutputPath != null) {
                // A fatal error recorded above takes precedence over accumulated evaluation errors.
                if (evaluationSummary.getErrorList() == null) {
                    evaluationSummary.setErrorList(errorAccumulator.value());
                }
                if (CollectionUtils.isNotEmpty(evaluationSummary.getErrorList())) {
                    evaluationSummary.setJobStatus(JobStatus.FAIL);
                }
                evaluationSummary.setTotalContexts(contextAccum.value());
                OutputMetadataWriter writer = getOutputMetadataWriter();
                writer.writeMetadata(evaluationSummary);
            }
        }
    }
}
Use of com.ibm.cohort.cql.spark.errors.EvaluationError in the project quality-measure-and-cohort-service by Alvearie.
From the class HadoopPathOutputMetadataWriterTest, method testNoSuccessMarkerCreated.
/**
 * Writing metadata for a summary that carries an evaluation error must not
 * create the success marker, while the batch summary file for the application
 * id must still be created.
 */
@Test
public void testNoSuccessMarkerCreated() {
    final String applicationId = "id456";
    final String outputPath = "target/output/metadata/testnosuccess";

    // Summary with a single (empty) error entry.
    EvaluationSummary summaryWithError = new EvaluationSummary();
    summaryWithError.setApplicationId(applicationId);
    summaryWithError.setErrorList(Collections.singletonList(new EvaluationError()));

    new HadoopPathOutputMetadataWriter(new Path(outputPath), new Configuration()).writeMetadata(summaryWithError);

    File successMarker = new File(outputPath, HadoopPathOutputMetadataWriter.SUCCESS_MARKER);
    File batchSummary = new File(outputPath, HadoopPathOutputMetadataWriter.BATCH_SUMMARY_PREFIX + applicationId);
    assertFalse(successMarker.exists());
    assertTrue(batchSummary.exists());
}
Aggregations