Use of com.ibm.cohort.cql.spark.data.SparkOutputColumnEncoder in the project quality-measure-and-cohort-service by Alvearie.
Class SparkCqlEvaluator, method getSparkOutputColumnEncoder.
/**
 * Returns the output column encoder held by {@code sparkOutputColumnEncoder}. When the
 * holder currently yields {@code null}, a new encoder is created from the filtered job
 * specification, the encoded-parameters cache and the configured default output column
 * delimiter, stored in the holder, and returned.
 *
 * @return the previously stored encoder, or the newly created one
 * @throws Exception if obtaining the filtered job specification or creating the encoder fails
 */
public SparkOutputColumnEncoder getSparkOutputColumnEncoder() throws Exception {
    SparkOutputColumnEncoder existing = sparkOutputColumnEncoder.get();
    if (existing != null) {
        return existing;
    }

    // Nothing stored yet: build the encoder once and remember it for later calls.
    SparkOutputColumnEncoder created = ConfigurableOutputColumnNameEncoder.create(
            getFilteredJobSpecificationWithIds(),
            encodedParametersCache,
            args.defaultOutputColumnDelimiter);
    sparkOutputColumnEncoder.set(created);
    return created;
}
Use of com.ibm.cohort.cql.spark.data.SparkOutputColumnEncoder in the project quality-measure-and-cohort-service by Alvearie.
Class SparkCqlEvaluator, method evaluate.
/**
 * Evaluates the configured CQL against the data belonging to one context ID.
 *
 * <p>The incoming Spark rows are converted to the cohort {@code DataRow} model and grouped
 * by the data type found at {@code ContextRetriever.SOURCE_FACT_IDX}. A row whose data type
 * has an entry in {@code dataTypeAliases} is additionally registered under the aliased type.
 * The grouped rows back the data provider of a freshly configured {@code CqlEvaluator}, and
 * the actual evaluation is delegated to the overload that accepts the evaluator, the
 * filtered evaluation requests and the output column encoder.
 *
 * @param libraryProvider Library provider providing CQL/ELM content
 * @param termProvider    Terminology provider providing terminology resources
 * @param funProvider     External function provider providing static CQL functions
 * @param contextName     Context name corresponding to the library context key
 *                        currently under evaluation
 * @param resultsSchema   StructType containing the schema data for the output table
 *                        that will be created
 * @param rowsByContext   Data for a single evaluation context (context ID paired with its rows)
 * @param dataTypeAliases Mapping of data type to abstract type
 * @param perContextAccum Spark accumulator that tracks each individual context evaluation
 * @param errorAccum      Spark accumulator that tracks CQL evaluation errors
 * @param batchRunTime    Single unified timestamp for all contexts
 * @return Evaluation results for all expressions evaluated, keyed by the context ID.
 *         Expression names are automatically namespaced according to the library name
 *         to avoid clashes between libraries (e.g. LibraryName.ExpressionName).
 * @throws Exception on general failure including CQL library loading issues
 */
protected Iterator<Tuple2<Object, Row>> evaluate(CqlLibraryProvider libraryProvider, CqlTerminologyProvider termProvider, ExternalFunctionProvider funProvider, String contextName, StructType resultsSchema, Tuple2<Object, List<Row>> rowsByContext, Map<String, String> dataTypeAliases, LongAccumulator perContextAccum, CollectionAccumulator<EvaluationError> errorAccum, ZonedDateTime batchRunTime) throws Exception {
    // Translate every Spark row into the cohort Java model before grouping.
    List<DataRow> converted = rowsByContext._2()
            .stream()
            .map(getDataRowFactory())
            .collect(Collectors.toList());

    // Index rows by their source data type, and also by the aliased type when one is configured.
    Map<String, List<Object>> rowsByType = new HashMap<>();
    for (DataRow row : converted) {
        String sourceType = (String) row.getValue(ContextRetriever.SOURCE_FACT_IDX);
        rowsByType.computeIfAbsent(sourceType, key -> new ArrayList<>()).add(row);
        if (dataTypeAliases.containsKey(sourceType)) {
            rowsByType.computeIfAbsent(dataTypeAliases.get(sourceType), key -> new ArrayList<>()).add(row);
        }
    }

    CqlDataProvider dataProvider = new DataRowDataProvider(
            getDataRowClass(),
            new DataRowRetrieveProvider(rowsByType, termProvider));

    CqlEvaluator evaluator = new CqlEvaluator()
            .setLibraryProvider(libraryProvider)
            .setDataProvider(dataProvider)
            .setTerminologyProvider(termProvider)
            .setExternalFunctionProvider(funProvider);

    // Arguments are evaluated left to right, so the job specification is still fetched
    // before the column encoder.
    return evaluate(
            rowsByContext,
            contextName,
            resultsSchema,
            evaluator,
            getFilteredJobSpecificationWithIds(),
            getSparkOutputColumnEncoder(),
            perContextAccum,
            errorAccum,
            batchRunTime);
}
Use of com.ibm.cohort.cql.spark.data.SparkOutputColumnEncoder in the project quality-measure-and-cohort-service by Alvearie.
Class SparkCqlEvaluator, method run.
/**
 * Runs the CQL evaluation job: obtains a Spark session, evaluates every selected context
 * definition, writes the per-context results, and finally writes a batch summary.
 *
 * <p>Flow, as implemented below:
 * <ol>
 *   <li>The summary starts with status {@code FAIL} and is only switched to {@code SUCCESS}
 *       when the main body finishes without throwing.</li>
 *   <li>Context definitions are read from {@code args.contextDefinitionPath} and, when
 *       {@code args.aggregationContexts} is non-empty, filtered down to those names.</li>
 *   <li>For each remaining context that has at least one output field in its result schema,
 *       the context data is retrieved, evaluated and written to the output path configured
 *       for that context. Contexts without output fields are skipped with a warning.</li>
 *   <li>If {@code spark.ui.prometheus.enabled} is true, the method sleeps for 7 seconds so
 *       the final metrics can still be scraped.</li>
 *   <li>In all cases, when {@code args.metadataOutputPath} is set, the summary (timings,
 *       errors, context totals) is written. A non-empty error list forces status
 *       {@code FAIL}.</li>
 * </ol>
 *
 * <p>Any exception thrown by the main body is recorded in the summary's error list (message,
 * cause message and stack trace) and then rethrown.
 *
 * @param out print stream supplied by the caller; not referenced by this method body
 * @throws IllegalArgumentException if no context definition remains after filtering
 * @throws Exception any other failure raised while evaluating or writing results
 */
public void run(PrintStream out) throws Exception {
    EvaluationSummary evaluationSummary = new EvaluationSummary();
    long startTimeMillis = System.currentTimeMillis();
    evaluationSummary.setStartTimeMillis(startTimeMillis);
    // Pessimistic default: only flipped to SUCCESS once the whole job body has completed.
    evaluationSummary.setJobStatus(JobStatus.FAIL);
    SparkSession.Builder sparkBuilder = SparkSession.builder();
    try (SparkSession spark = sparkBuilder.getOrCreate()) {
        final LongAccumulator contextAccum = spark.sparkContext().longAccumulator("Context");
        final CollectionAccumulator<EvaluationError> errorAccumulator = spark.sparkContext().collectionAccumulator("EvaluationErrors");
        try {
            spark.sparkContext().setLocalProperty("mdc." + CORRELATION_ID, MDC.get(CORRELATION_ID));
            evaluationSummary.setCorrelationId(MDC.get(CORRELATION_ID));
            boolean useJava8API = Boolean.parseBoolean(spark.conf().get("spark.sql.datetime.java8API.enabled"));
            this.typeConverter = new SparkTypeConverter(useJava8API);
            this.hadoopConfiguration = new SerializableConfiguration(spark.sparkContext().hadoopConfiguration());
            evaluationSummary.setApplicationId(spark.sparkContext().applicationId());
            CqlToElmTranslator cqlTranslator = getCqlTranslator();
            SparkOutputColumnEncoder columnEncoder = getSparkOutputColumnEncoder();
            ContextDefinitions contexts = readContextDefinitions(args.contextDefinitionPath);
            List<ContextDefinition> filteredContexts = contexts.getContextDefinitions();
            if (args.aggregationContexts != null && !args.aggregationContexts.isEmpty()) {
                filteredContexts = filteredContexts.stream().filter(def -> args.aggregationContexts.contains(def.getName())).collect(Collectors.toList());
            }
            if (filteredContexts.isEmpty()) {
                throw new IllegalArgumentException("At least one context definition is required (after filtering if enabled).");
            }
            Map<String, StructType> resultSchemas = calculateSparkSchema(filteredContexts.stream().map(ContextDefinition::getName).collect(Collectors.toList()), contexts, columnEncoder, cqlTranslator);
            // One timestamp shared by every context evaluated in this batch.
            ZonedDateTime batchRunTime = ZonedDateTime.now();
            final LongAccumulator perContextAccum = spark.sparkContext().longAccumulator("PerContext");
            CustomMetricSparkPlugin.contextAccumGauge.setAccumulator(contextAccum);
            CustomMetricSparkPlugin.perContextAccumGauge.setAccumulator(perContextAccum);
            CustomMetricSparkPlugin.totalContextsToProcessCounter.inc(filteredContexts.size());
            CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(0);
            ColumnRuleCreator columnRuleCreator = new ColumnRuleCreator(getFilteredJobSpecificationWithIds().getEvaluations(), getCqlTranslator(), createLibraryProvider());
            Map<String, String> dataTypeAliases = createDataTypeAliases(filteredContexts, cqlTranslator);
            for (ContextDefinition context : filteredContexts) {
                final String contextName = context.getName();
                // A null column rule set is passed when column filtering is disabled.
                ContextRetriever contextRetriever = new ContextRetriever(args.inputPaths, new DefaultDatasetRetriever(spark, args.inputFormat), args.disableColumnFiltering ? null : columnRuleCreator.getDataRequirementsForContext(context));
                StructType resultsSchema = resultSchemas.get(contextName);
                if (resultsSchema == null || resultsSchema.fields().length == 0) {
                    LOG.warn("Context " + contextName + " has no defines configured. Skipping.");
                } else {
                    LOG.info("Evaluating context " + contextName);
                    long contextStartMillis = System.currentTimeMillis();
                    final String outputPath = MapUtils.getRequiredKey(args.outputPaths, context.getName(), "outputPath");
                    JavaPairRDD<Object, List<Row>> rowsByContextId = contextRetriever.retrieveContext(context);
                    CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.getValue() + 1);
                    JavaPairRDD<Object, Row> resultsByContext = rowsByContextId.flatMapToPair(x -> evaluate(contextName, resultsSchema, x, dataTypeAliases, perContextAccum, errorAccumulator, batchRunTime));
                    writeResults(spark, resultsSchema, resultsByContext, outputPath);
                    long contextEndMillis = System.currentTimeMillis();
                    LOG.info(String.format("Wrote results for context %s to %s", contextName, outputPath));
                    evaluationSummary.addContextCount(contextName, perContextAccum.value());
                    evaluationSummary.addContextRuntime(contextName, contextEndMillis - contextStartMillis);
                    contextAccum.add(1);
                    // Reset so the count recorded for the next context does not include this one.
                    perContextAccum.reset();
                }
            }
            CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(0);
            try {
                boolean metricsEnabled = Boolean.parseBoolean(spark.conf().get("spark.ui.prometheus.enabled"));
                if (metricsEnabled) {
                    LOG.info("Prometheus metrics enabled, sleeping for 7 seconds to finish gathering metrics");
                    // Sleep for over 5 seconds because Prometheus only polls every 5 seconds.
                    // If Spark finishes and goes away immediately after completing, Prometheus
                    // will never be able to poll for the final set of metrics for the spark-submit.
                    // The default Prometheus config map was changed from a 2 minute scrape
                    // interval to 5 seconds for Spark pods.
                    Thread.sleep(7000);
                } else {
                    LOG.info("Prometheus metrics not enabled");
                }
            } catch (NoSuchElementException e) {
                LOG.info("spark.ui.prometheus.enabled is not set");
            }
            evaluationSummary.setJobStatus(JobStatus.SUCCESS);
        } catch (Exception e) {
            // If we experience an error that would make the program halt, capture the error
            // and report it in the batch summary file.
            ByteArrayOutputStream errorDetailStream = new ByteArrayOutputStream();
            try (PrintStream printStream = new PrintStream(errorDetailStream)) {
                // Throwable.getMessage() may be null. Guard both messages so that building
                // the report cannot raise a NullPointerException that would replace the
                // original failure and leave the error list unset. The stack trace printed
                // below still identifies the exception when a message is absent.
                String message = e.getMessage();
                if (message != null) {
                    printStream.write(message.getBytes());
                    printStream.write('\n');
                }
                Throwable cause = e.getCause();
                String causeMessage = (cause == null) ? null : cause.getMessage();
                if (causeMessage != null) {
                    printStream.write(causeMessage.getBytes());
                    printStream.write('\n');
                }
                e.printStackTrace(printStream);
                evaluationSummary.setErrorList(Collections.singletonList(new EvaluationError(null, null, null, errorDetailStream.toString())));
            }
            throw e;
        } finally {
            long endTimeMillis = System.currentTimeMillis();
            evaluationSummary.setEndTimeMillis(endTimeMillis);
            evaluationSummary.setRuntimeMillis(endTimeMillis - startTimeMillis);
            if (args.metadataOutputPath != null) {
                // A fatal error (recorded in the catch block) takes precedence; otherwise
                // report whatever per-context evaluation errors were accumulated.
                if (evaluationSummary.getErrorList() == null) {
                    evaluationSummary.setErrorList(errorAccumulator.value());
                }
                if (CollectionUtils.isNotEmpty(evaluationSummary.getErrorList())) {
                    evaluationSummary.setJobStatus(JobStatus.FAIL);
                }
                evaluationSummary.setTotalContexts(contextAccum.value());
                OutputMetadataWriter writer = getOutputMetadataWriter();
                writer.writeMetadata(evaluationSummary);
            }
        }
    }
}
Aggregations