
Example 1 with EvaluationError

Use of com.ibm.cohort.cql.spark.errors.EvaluationError in project quality-measure-and-cohort-service by Alvearie.

From class SparkCqlEvaluator, method evaluate.

/**
 * Evaluate the input CQL for a single context + data pair.
 *
 * @param rowsByContext   In-memory data for all datatypes related to a single
 *                        context
 * @param contextName     Name of the context used to select measure evaluations.
 * @param resultsSchema   StructType containing the schema data for the output table
 *                        that will be created.
 * @param evaluator       configured CQLEvaluator (data provider, term provider,
 *                        library provider all previously setup)
 * @param requests        CqlEvaluationRequests containing lists of libraries,
 *                        expressions, and parameters to evaluate
 * @param columnEncoder   Encoder used to calculate output column names for
 *                        evaluation results
 * @param perContextAccum Spark accumulator that tracks each individual context
 *                        evaluation
 * @param errorAccum      Spark accumulator that tracks CQL evaluation errors
 * @param batchRunTime    Single unified timestamp for all contexts
 * @return Evaluation results for all evaluated expressions, keyed by context
 *         ID. Expression names are automatically namespaced with the library
 *         name (e.g. LibraryName.ExpressionName) to avoid collisions when
 *         expression names match across libraries.
 */
protected Iterator<Tuple2<Object, Row>> evaluate(Tuple2<Object, List<Row>> rowsByContext, String contextName, StructType resultsSchema, CqlEvaluator evaluator, CqlEvaluationRequests requests, SparkOutputColumnEncoder columnEncoder, LongAccumulator perContextAccum, CollectionAccumulator<EvaluationError> errorAccum, ZonedDateTime batchRunTime) {
    perContextAccum.add(1);
    List<CqlEvaluationRequest> requestsForContext = requests.getEvaluationsForContext(contextName);
    // parameters json -> {columnName, result}
    Map<String, Map<String, Object>> expressionResultsByParameters = new HashMap<>();
    for (CqlEvaluationRequest request : requestsForContext) {
        String parametersJson = encodedParametersCache.getKeyParametersColumnData(request);
        Map<String, Object> expressionResults = expressionResultsByParameters.computeIfAbsent(parametersJson, x -> new HashMap<>());
        for (CqlExpressionConfiguration expression : request.getExpressions()) {
            CqlEvaluationRequest singleRequest = new CqlEvaluationRequest(request);
            singleRequest.setExpressions(Collections.singleton(expression));
            try {
                CqlEvaluationResult result = evaluator.evaluate(singleRequest, args.debug ? CqlDebug.DEBUG : CqlDebug.NONE, batchRunTime);
                for (Map.Entry<String, Object> entry : result.getExpressionResults().entrySet()) {
                    String outputColumnKey = columnEncoder.getColumnName(request, entry.getKey());
                    expressionResults.put(outputColumnKey, typeConverter.toSparkType(entry.getValue()));
                }
            } catch (Throwable th) {
                if (args.haltOnError) {
                    throw new RuntimeException(String.format("CQL evaluation failed for ContextName: %s, OutputColumn: %s", String.valueOf(contextName), singleRequest.getExpressionNames()), th);
                } else {
                    Object contextId = rowsByContext._1();
                    errorAccum.add(new EvaluationError(contextName, contextId, singleRequest.getExpressionNames().iterator().next(), th.getMessage()));
                }
            }
        }
    }
    List<Tuple2<Object, Row>> rows = new ArrayList<>();
    for (Map.Entry<String, Map<String, Object>> entry : expressionResultsByParameters.entrySet()) {
        Object contextKey = rowsByContext._1();
        Map<String, Object> results = entry.getValue();
        Object[] data = new Object[resultsSchema.fields().length];
        data[0] = contextKey;
        data[1] = entry.getKey();
        for (int i = 2; i < resultsSchema.fieldNames().length; i++) {
            data[i] = results.get(resultsSchema.fieldNames()[i]);
        }
        rows.add(new Tuple2<Object, Row>(contextKey, RowFactory.create(data)));
    }
    return rows.iterator();
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CqlEvaluationRequest(com.ibm.cohort.cql.evaluation.CqlEvaluationRequest) CqlEvaluationResult(com.ibm.cohort.cql.evaluation.CqlEvaluationResult) CqlExpressionConfiguration(com.ibm.cohort.cql.evaluation.CqlExpressionConfiguration) Tuple2(scala.Tuple2) EvaluationError(com.ibm.cohort.cql.spark.errors.EvaluationError) SparkDataRow(com.ibm.cohort.cql.spark.data.SparkDataRow) DataRow(com.ibm.cohort.datarow.model.DataRow) Row(org.apache.spark.sql.Row) Map(java.util.Map)
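
For orientation, the three constructor shapes seen on this page (the four-argument form above, the no-arg form in Example 3, and the summary-error form in Example 2) imply a small serializable value object. The sketch below is inferred from those call sites; the field names are assumptions, not the project's actual source.

// Minimal sketch of EvaluationError inferred from the call sites on this page.
// Field names are assumptions; the real class in com.ibm.cohort.cql.spark.errors
// may differ. It must be Serializable to travel through a Spark CollectionAccumulator.
public class EvaluationError implements java.io.Serializable {
    private String contextName;   // aggregation context being evaluated
    private Object contextId;     // key of the failing context row group
    private String outputColumn;  // expression/output column that failed
    private String error;         // exception message or error detail text

    public EvaluationError() {
        // no-arg constructor, as used in the metadata writer test (Example 3)
    }

    public EvaluationError(String contextName, Object contextId, String outputColumn, String error) {
        this.contextName = contextName;
        this.contextId = contextId;
        this.outputColumn = outputColumn;
        this.error = error;
    }

    // getters and setters omitted for brevity
}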

Example 2 with EvaluationError

Use of com.ibm.cohort.cql.spark.errors.EvaluationError in project quality-measure-and-cohort-service by Alvearie.

From class SparkCqlEvaluator, method run.

public void run(PrintStream out) throws Exception {
    EvaluationSummary evaluationSummary = new EvaluationSummary();
    long startTimeMillis = System.currentTimeMillis();
    evaluationSummary.setStartTimeMillis(startTimeMillis);
    evaluationSummary.setJobStatus(JobStatus.FAIL);
    SparkSession.Builder sparkBuilder = SparkSession.builder();
    try (SparkSession spark = sparkBuilder.getOrCreate()) {
        final LongAccumulator contextAccum = spark.sparkContext().longAccumulator("Context");
        final CollectionAccumulator<EvaluationError> errorAccumulator = spark.sparkContext().collectionAccumulator("EvaluationErrors");
        try {
            spark.sparkContext().setLocalProperty("mdc." + CORRELATION_ID, MDC.get(CORRELATION_ID));
            evaluationSummary.setCorrelationId(MDC.get(CORRELATION_ID));
            boolean useJava8API = Boolean.parseBoolean(spark.conf().get("spark.sql.datetime.java8API.enabled"));
            this.typeConverter = new SparkTypeConverter(useJava8API);
            this.hadoopConfiguration = new SerializableConfiguration(spark.sparkContext().hadoopConfiguration());
            evaluationSummary.setApplicationId(spark.sparkContext().applicationId());
            CqlToElmTranslator cqlTranslator = getCqlTranslator();
            SparkOutputColumnEncoder columnEncoder = getSparkOutputColumnEncoder();
            ContextDefinitions contexts = readContextDefinitions(args.contextDefinitionPath);
            List<ContextDefinition> filteredContexts = contexts.getContextDefinitions();
            if (args.aggregationContexts != null && !args.aggregationContexts.isEmpty()) {
                filteredContexts = filteredContexts.stream().filter(def -> args.aggregationContexts.contains(def.getName())).collect(Collectors.toList());
            }
            if (filteredContexts.isEmpty()) {
                throw new IllegalArgumentException("At least one context definition is required (after filtering if enabled).");
            }
            Map<String, StructType> resultSchemas = calculateSparkSchema(filteredContexts.stream().map(ContextDefinition::getName).collect(Collectors.toList()), contexts, columnEncoder, cqlTranslator);
            ZonedDateTime batchRunTime = ZonedDateTime.now();
            final LongAccumulator perContextAccum = spark.sparkContext().longAccumulator("PerContext");
            CustomMetricSparkPlugin.contextAccumGauge.setAccumulator(contextAccum);
            CustomMetricSparkPlugin.perContextAccumGauge.setAccumulator(perContextAccum);
            CustomMetricSparkPlugin.totalContextsToProcessCounter.inc(filteredContexts.size());
            CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(0);
            ColumnRuleCreator columnRuleCreator = new ColumnRuleCreator(getFilteredJobSpecificationWithIds().getEvaluations(), getCqlTranslator(), createLibraryProvider());
            Map<String, String> dataTypeAliases = createDataTypeAliases(filteredContexts, cqlTranslator);
            for (ContextDefinition context : filteredContexts) {
                final String contextName = context.getName();
                ContextRetriever contextRetriever = new ContextRetriever(args.inputPaths, new DefaultDatasetRetriever(spark, args.inputFormat), args.disableColumnFiltering ? null : columnRuleCreator.getDataRequirementsForContext(context));
                StructType resultsSchema = resultSchemas.get(contextName);
                if (resultsSchema == null || resultsSchema.fields().length == 0) {
                    LOG.warn("Context " + contextName + " has no defines configured. Skipping.");
                } else {
                    LOG.info("Evaluating context " + contextName);
                    long contextStartMillis = System.currentTimeMillis();
                    final String outputPath = MapUtils.getRequiredKey(args.outputPaths, context.getName(), "outputPath");
                    JavaPairRDD<Object, List<Row>> rowsByContextId = contextRetriever.retrieveContext(context);
                    CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.getValue() + 1);
                    JavaPairRDD<Object, Row> resultsByContext = rowsByContextId.flatMapToPair(x -> evaluate(contextName, resultsSchema, x, dataTypeAliases, perContextAccum, errorAccumulator, batchRunTime));
                    writeResults(spark, resultsSchema, resultsByContext, outputPath);
                    long contextEndMillis = System.currentTimeMillis();
                    LOG.info(String.format("Wrote results for context %s to %s", contextName, outputPath));
                    evaluationSummary.addContextCount(contextName, perContextAccum.value());
                    evaluationSummary.addContextRuntime(contextName, contextEndMillis - contextStartMillis);
                    contextAccum.add(1);
                    perContextAccum.reset();
                }
            }
            CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(0);
            try {
                boolean metricsEnabled = Boolean.parseBoolean(spark.conf().get("spark.ui.prometheus.enabled"));
                if (metricsEnabled) {
                    LOG.info("Prometheus metrics enabled, sleeping for 7 seconds to finish gathering metrics");
                    // Sleep for over 5 seconds because Prometheus only polls every 5 seconds.
                    // If Spark finishes and goes away immediately after completing, Prometheus
                    // will never be able to poll the final set of metrics for the spark-submit.
                    // The default Prometheus config map was changed from a 2-minute scrape
                    // interval to 5 seconds for Spark pods.
                    Thread.sleep(7000);
                } else {
                    LOG.info("Prometheus metrics not enabled");
                }
            } catch (NoSuchElementException e) {
                LOG.info("spark.ui.prometheus.enabled is not set");
            }
            evaluationSummary.setJobStatus(JobStatus.SUCCESS);
        } catch (Exception e) {
            // If we experience an error that would make the program halt, capture the error
            // and report it in the batch summary file
            ByteArrayOutputStream errorDetailStream = new ByteArrayOutputStream();
            try (PrintStream printStream = new PrintStream(errorDetailStream)) {
                // getMessage() can be null (e.g. for a NullPointerException), so guard before writing
                printStream.write(String.valueOf(e.getMessage()).getBytes());
                printStream.write('\n');
                if (e.getCause() != null) {
                    printStream.write(String.valueOf(e.getCause().getMessage()).getBytes());
                    printStream.write('\n');
                }
                e.printStackTrace(printStream);
                evaluationSummary.setErrorList(Collections.singletonList(new EvaluationError(null, null, null, errorDetailStream.toString())));
            }
            throw e;
        } finally {
            long endTimeMillis = System.currentTimeMillis();
            evaluationSummary.setEndTimeMillis(endTimeMillis);
            evaluationSummary.setRuntimeMillis(endTimeMillis - startTimeMillis);
            if (args.metadataOutputPath != null) {
                if (evaluationSummary.getErrorList() == null) {
                    evaluationSummary.setErrorList(errorAccumulator.value());
                }
                if (CollectionUtils.isNotEmpty(evaluationSummary.getErrorList())) {
                    evaluationSummary.setJobStatus(JobStatus.FAIL);
                }
                evaluationSummary.setTotalContexts(contextAccum.value());
                OutputMetadataWriter writer = getOutputMetadataWriter();
                writer.writeMetadata(evaluationSummary);
            }
        }
    }
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) ContextRetriever(com.ibm.cohort.cql.spark.aggregation.ContextRetriever) CqlToElmTranslator(com.ibm.cohort.cql.translation.CqlToElmTranslator) ContextDefinitions(com.ibm.cohort.cql.spark.aggregation.ContextDefinitions) ZonedDateTime(java.time.ZonedDateTime) List(java.util.List) ArrayList(java.util.ArrayList) DefaultDatasetRetriever(com.ibm.cohort.cql.spark.data.DefaultDatasetRetriever) PrintStream(java.io.PrintStream) SerializableConfiguration(org.apache.spark.util.SerializableConfiguration) SparkTypeConverter(com.ibm.cohort.cql.spark.data.SparkTypeConverter) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ContextDefinition(com.ibm.cohort.cql.spark.aggregation.ContextDefinition) FileNotFoundException(java.io.FileNotFoundException) NoSuchElementException(java.util.NoSuchElementException) IOException(java.io.IOException) LongAccumulator(org.apache.spark.util.LongAccumulator) EvaluationSummary(com.ibm.cohort.cql.spark.metadata.EvaluationSummary) HadoopPathOutputMetadataWriter(com.ibm.cohort.cql.spark.metadata.HadoopPathOutputMetadataWriter) OutputMetadataWriter(com.ibm.cohort.cql.spark.metadata.OutputMetadataWriter) ColumnRuleCreator(com.ibm.cohort.cql.spark.aggregation.ColumnRuleCreator) SparkOutputColumnEncoder(com.ibm.cohort.cql.spark.data.SparkOutputColumnEncoder) EvaluationError(com.ibm.cohort.cql.spark.errors.EvaluationError) SparkDataRow(com.ibm.cohort.cql.spark.data.SparkDataRow) DataRow(com.ibm.cohort.datarow.model.DataRow) Row(org.apache.spark.sql.Row)
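
The pattern worth isolating from run() is that executor-side CQL failures are recorded in a CollectionAccumulator instead of halting the job, and the driver inspects the accumulated list afterward to set the job status. Below is a minimal, self-contained sketch of that pattern; the class name, the String payload, and the local[*] master are illustrative choices, not the project's.

import java.util.List;

import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.CollectionAccumulator;

public class ErrorAccumulatorSketch {
    public static void main(String[] args) {
        try (SparkSession spark = SparkSession.builder()
                .master("local[*]") // illustrative; the real job gets its master from spark-submit
                .appName("error-accumulator-sketch")
                .getOrCreate()) {

            // Registered on the driver; tasks running on executors add to it.
            CollectionAccumulator<String> errorAccum =
                    spark.sparkContext().collectionAccumulator("EvaluationErrors");

            spark.range(0, 10).javaRDD().foreach(i -> {
                try {
                    if (i % 4 == 0) {
                        throw new IllegalStateException("evaluation failed for context " + i);
                    }
                } catch (Exception e) {
                    // Mirror the haltOnError=false branch of evaluate(): record and continue.
                    errorAccum.add(e.getMessage());
                }
            });

            // Back on the driver: a non-empty error list marks the whole batch failed,
            // mirroring evaluationSummary.setJobStatus(JobStatus.FAIL) above.
            List<String> errors = errorAccum.value();
            System.out.println(errors.isEmpty() ? "SUCCESS" : "FAIL: " + errors);
        }
    }
}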

Example 3 with EvaluationError

Use of com.ibm.cohort.cql.spark.errors.EvaluationError in project quality-measure-and-cohort-service by Alvearie.

From class HadoopPathOutputMetadataWriterTest, method testNoSuccessMarkerCreated.

@Test
public void testNoSuccessMarkerCreated() {
    String outputPath = "target/output/metadata/testnosuccess";
    Path metadataPath = new Path(outputPath);
    EvaluationSummary evaluationSummary = new EvaluationSummary();
    evaluationSummary.setApplicationId("id456");
    evaluationSummary.setErrorList(Collections.singletonList(new EvaluationError()));
    HadoopPathOutputMetadataWriter writer = new HadoopPathOutputMetadataWriter(metadataPath, new Configuration());
    writer.writeMetadata(evaluationSummary);
    // Errors present: the _SUCCESS marker must not exist, but the batch summary is still written.
    assertFalse(new File(outputPath, HadoopPathOutputMetadataWriter.SUCCESS_MARKER).exists());
    assertTrue(new File(outputPath, HadoopPathOutputMetadataWriter.BATCH_SUMMARY_PREFIX + "id456").exists());
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) EvaluationError(com.ibm.cohort.cql.spark.errors.EvaluationError) File(java.io.File) Test(org.junit.Test)
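
The natural companion is the success path: when the summary carries no errors, the writer should emit the _SUCCESS marker alongside the batch summary. A hedged sketch of such a test follows, reusing the constants above; it is illustrative and not taken from the project's test suite.

@Test
public void testSuccessMarkerCreated() {
    // Illustrative counterpart to the test above: no errors in the summary,
    // so both the _SUCCESS marker and the batch summary file are expected.
    String outputPath = "target/output/metadata/testsuccess";
    EvaluationSummary evaluationSummary = new EvaluationSummary();
    evaluationSummary.setApplicationId("id123");
    // error list intentionally left unset to signal a clean run
    HadoopPathOutputMetadataWriter writer =
            new HadoopPathOutputMetadataWriter(new Path(outputPath), new Configuration());
    writer.writeMetadata(evaluationSummary);
    assertTrue(new File(outputPath, HadoopPathOutputMetadataWriter.SUCCESS_MARKER).exists());
    assertTrue(new File(outputPath, HadoopPathOutputMetadataWriter.BATCH_SUMMARY_PREFIX + "id123").exists());
}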

Aggregations

EvaluationError (com.ibm.cohort.cql.spark.errors.EvaluationError)3 SparkDataRow (com.ibm.cohort.cql.spark.data.SparkDataRow)2 DataRow (com.ibm.cohort.datarow.model.DataRow)2 ArrayList (java.util.ArrayList)2 Row (org.apache.spark.sql.Row)2 CqlEvaluationRequest (com.ibm.cohort.cql.evaluation.CqlEvaluationRequest)1 CqlEvaluationResult (com.ibm.cohort.cql.evaluation.CqlEvaluationResult)1 CqlExpressionConfiguration (com.ibm.cohort.cql.evaluation.CqlExpressionConfiguration)1 ColumnRuleCreator (com.ibm.cohort.cql.spark.aggregation.ColumnRuleCreator)1 ContextDefinition (com.ibm.cohort.cql.spark.aggregation.ContextDefinition)1 ContextDefinitions (com.ibm.cohort.cql.spark.aggregation.ContextDefinitions)1 ContextRetriever (com.ibm.cohort.cql.spark.aggregation.ContextRetriever)1 DefaultDatasetRetriever (com.ibm.cohort.cql.spark.data.DefaultDatasetRetriever)1 SparkOutputColumnEncoder (com.ibm.cohort.cql.spark.data.SparkOutputColumnEncoder)1 SparkTypeConverter (com.ibm.cohort.cql.spark.data.SparkTypeConverter)1 EvaluationSummary (com.ibm.cohort.cql.spark.metadata.EvaluationSummary)1 HadoopPathOutputMetadataWriter (com.ibm.cohort.cql.spark.metadata.HadoopPathOutputMetadataWriter)1 OutputMetadataWriter (com.ibm.cohort.cql.spark.metadata.OutputMetadataWriter)1 CqlToElmTranslator (com.ibm.cohort.cql.translation.CqlToElmTranslator)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1