
Example 1 with SparkOutputColumnEncoder

Use of com.ibm.cohort.cql.spark.data.SparkOutputColumnEncoder in the project quality-measure-and-cohort-service by Alvearie.

From the class SparkCqlEvaluator, the method getSparkOutputColumnEncoder:

public SparkOutputColumnEncoder getSparkOutputColumnEncoder() throws Exception {
    // Lazily create the output column encoder on first access and cache it for subsequent calls.
    SparkOutputColumnEncoder columnEncoder = sparkOutputColumnEncoder.get();
    if (columnEncoder == null) {
        columnEncoder = ConfigurableOutputColumnNameEncoder.create(getFilteredJobSpecificationWithIds(), encodedParametersCache, args.defaultOutputColumnDelimiter);
        sparkOutputColumnEncoder.set(columnEncoder);
    }
    return columnEncoder;
}
Also used: SparkOutputColumnEncoder (com.ibm.cohort.cql.spark.data.SparkOutputColumnEncoder)
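
The getter above lazily builds the encoder on first access and caches it; the sparkOutputColumnEncoder field reads like a ThreadLocal-style holder, so each worker thread constructs the ConfigurableOutputColumnNameEncoder at most once. A minimal standalone sketch of that pattern, assuming a plain ThreadLocal and a hypothetical ColumnEncoder interface (neither is taken from the project's API):

import java.util.concurrent.atomic.AtomicInteger;

public class LazyEncoderHolder {
    // Hypothetical stand-in for SparkOutputColumnEncoder.
    interface ColumnEncoder {
        String getColumnName(String libraryId, String defineName);
    }

    private static final AtomicInteger CREATE_COUNT = new AtomicInteger();

    // One cached encoder per thread, built lazily on first use.
    private final ThreadLocal<ColumnEncoder> encoderCache = new ThreadLocal<>();

    public ColumnEncoder getColumnEncoder() {
        ColumnEncoder encoder = encoderCache.get();
        if (encoder == null) {
            // Expensive construction happens at most once per thread.
            CREATE_COUNT.incrementAndGet();
            encoder = (libraryId, defineName) -> libraryId + "|" + defineName;
            encoderCache.set(encoder);
        }
        return encoder;
    }

    public static void main(String[] args) {
        LazyEncoderHolder holder = new LazyEncoderHolder();
        System.out.println(holder.getColumnEncoder().getColumnName("MyLib", "InCohort"));
        System.out.println(holder.getColumnEncoder().getColumnName("MyLib", "InCohort"));
        System.out.println("encoders created: " + CREATE_COUNT.get()); // prints 1
    }
}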

Example 2 with SparkOutputColumnEncoder

Use of com.ibm.cohort.cql.spark.data.SparkOutputColumnEncoder in the project quality-measure-and-cohort-service by Alvearie.

From the class SparkCqlEvaluator, the method evaluate:

/**
 * Evaluate the input CQL for a single context + data pair.
 *
 * @param libraryProvider Library provider providing CQL/ELM content
 * @param termProvider    Terminology provider providing terminology resources
 * @param funProvider     External function provider providing static CQL functions
 * @param contextName     Context name corresponding to the library context key
 *                        currently under evaluation.
 * @param resultsSchema   StructType containing the schema data for the output table
 *                        that will be created.
 * @param rowsByContext   Data for a single evaluation context
 * @param dataTypeAliases Mapping of data type to abstract type
 * @param perContextAccum Spark accumulator that tracks each individual context
 *                        evaluation
 * @param errorAccum      Spark accumulator that tracks CQL evaluation errors
 * @param batchRunTime    Single unified timestamp for all contexts
 * @return Evaluation results for all expressions evaluated keyed by the context
 *         ID. Expression names are automatically namespaced with the library
 *         name (e.g. LibraryName.ExpressionName) to avoid collisions when the
 *         same expression name appears in multiple libraries.
 * @throws Exception on general failure including CQL library loading issues
 */
protected Iterator<Tuple2<Object, Row>> evaluate(CqlLibraryProvider libraryProvider, CqlTerminologyProvider termProvider, ExternalFunctionProvider funProvider, String contextName, StructType resultsSchema, Tuple2<Object, List<Row>> rowsByContext, Map<String, String> dataTypeAliases, LongAccumulator perContextAccum, CollectionAccumulator<EvaluationError> errorAccum, ZonedDateTime batchRunTime) throws Exception {
    // Convert the Spark objects to the cohort Java model
    List<DataRow> datarows = rowsByContext._2().stream().map(getDataRowFactory()).collect(Collectors.toList());
    // Group rows by data type; rows whose type has an alias are indexed under both names.
    Map<String, List<Object>> dataByDataType = new HashMap<>();
    for (DataRow datarow : datarows) {
        String dataType = (String) datarow.getValue(ContextRetriever.SOURCE_FACT_IDX);
        List<Object> mappedRows = dataByDataType.computeIfAbsent(dataType, x -> new ArrayList<>());
        mappedRows.add(datarow);
        if (dataTypeAliases.containsKey(dataType)) {
            String mappedType = dataTypeAliases.get(dataType);
            List<Object> aliasedRows = dataByDataType.computeIfAbsent(mappedType, x -> new ArrayList<>());
            aliasedRows.add(datarow);
        }
    }
    DataRowRetrieveProvider retrieveProvider = new DataRowRetrieveProvider(dataByDataType, termProvider);
    CqlDataProvider dataProvider = new DataRowDataProvider(getDataRowClass(), retrieveProvider);
    CqlEvaluator evaluator = new CqlEvaluator().setLibraryProvider(libraryProvider).setDataProvider(dataProvider).setTerminologyProvider(termProvider).setExternalFunctionProvider(funProvider);
    CqlEvaluationRequests requests = getFilteredJobSpecificationWithIds();
    SparkOutputColumnEncoder columnEncoder = getSparkOutputColumnEncoder();
    return evaluate(rowsByContext, contextName, resultsSchema, evaluator, requests, columnEncoder, perContextAccum, errorAccum, batchRunTime);
}
Also used: HashMap (java.util.HashMap), DataRowDataProvider (com.ibm.cohort.datarow.engine.DataRowDataProvider), DataRowRetrieveProvider (com.ibm.cohort.datarow.engine.DataRowRetrieveProvider), SparkDataRow (com.ibm.cohort.cql.spark.data.SparkDataRow), DataRow (com.ibm.cohort.datarow.model.DataRow), SparkOutputColumnEncoder (com.ibm.cohort.cql.spark.data.SparkOutputColumnEncoder), List (java.util.List), ArrayList (java.util.ArrayList), CqlEvaluationRequests (com.ibm.cohort.cql.evaluation.CqlEvaluationRequests), CqlDataProvider (com.ibm.cohort.cql.data.CqlDataProvider), CqlEvaluator (com.ibm.cohort.cql.evaluation.CqlEvaluator)
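
The first half of evaluate indexes every DataRow under its concrete data type and, when dataTypeAliases contains a mapping, duplicates the reference under the abstract type as well, so retrieves written against either name resolve to the same rows. A minimal standalone sketch of that indexing step, with strings standing in for DataRow instances and made-up type names (PatientV2, Patient are illustrative only):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class DataTypeAliasDemo {
    public static void main(String[] args) {
        // Alias mapping: concrete data type -> abstract type (illustrative values).
        Map<String, String> dataTypeAliases = Map.of("PatientV2", "Patient");

        // Each entry stands in for a DataRow tagged with its source data type.
        List<Map.Entry<String, String>> rows = List.of(
                Map.entry("PatientV2", "row-1"),
                Map.entry("Encounter", "row-2"));

        Map<String, List<Object>> dataByDataType = new HashMap<>();
        for (Map.Entry<String, String> row : rows) {
            String dataType = row.getKey();
            // Index the row under its concrete data type...
            dataByDataType.computeIfAbsent(dataType, x -> new ArrayList<>()).add(row.getValue());
            // ...and, when an alias is configured, under the abstract type as well,
            // so a retrieve against either name sees the same rows.
            String mappedType = dataTypeAliases.get(dataType);
            if (mappedType != null) {
                dataByDataType.computeIfAbsent(mappedType, x -> new ArrayList<>()).add(row.getValue());
            }
        }
        // Prints {PatientV2=[row-1], Patient=[row-1], Encounter=[row-2]} (key order may vary)
        System.out.println(dataByDataType);
    }
}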

Example 3 with SparkOutputColumnEncoder

Use of com.ibm.cohort.cql.spark.data.SparkOutputColumnEncoder in the project quality-measure-and-cohort-service by Alvearie.

From the class SparkCqlEvaluator, the method run:

public void run(PrintStream out) throws Exception {
    EvaluationSummary evaluationSummary = new EvaluationSummary();
    long startTimeMillis = System.currentTimeMillis();
    evaluationSummary.setStartTimeMillis(startTimeMillis);
    evaluationSummary.setJobStatus(JobStatus.FAIL);
    SparkSession.Builder sparkBuilder = SparkSession.builder();
    try (SparkSession spark = sparkBuilder.getOrCreate()) {
        final LongAccumulator contextAccum = spark.sparkContext().longAccumulator("Context");
        final CollectionAccumulator<EvaluationError> errorAccumulator = spark.sparkContext().collectionAccumulator("EvaluationErrors");
        try {
            spark.sparkContext().setLocalProperty("mdc." + CORRELATION_ID, MDC.get(CORRELATION_ID));
            evaluationSummary.setCorrelationId(MDC.get(CORRELATION_ID));
            boolean useJava8API = Boolean.valueOf(spark.conf().get("spark.sql.datetime.java8API.enabled"));
            this.typeConverter = new SparkTypeConverter(useJava8API);
            this.hadoopConfiguration = new SerializableConfiguration(spark.sparkContext().hadoopConfiguration());
            evaluationSummary.setApplicationId(spark.sparkContext().applicationId());
            CqlToElmTranslator cqlTranslator = getCqlTranslator();
            SparkOutputColumnEncoder columnEncoder = getSparkOutputColumnEncoder();
            ContextDefinitions contexts = readContextDefinitions(args.contextDefinitionPath);
            List<ContextDefinition> filteredContexts = contexts.getContextDefinitions();
            if (args.aggregationContexts != null && !args.aggregationContexts.isEmpty()) {
                filteredContexts = filteredContexts.stream().filter(def -> args.aggregationContexts.contains(def.getName())).collect(Collectors.toList());
            }
            if (filteredContexts.isEmpty()) {
                throw new IllegalArgumentException("At least one context definition is required (after filtering if enabled).");
            }
            Map<String, StructType> resultSchemas = calculateSparkSchema(filteredContexts.stream().map(ContextDefinition::getName).collect(Collectors.toList()), contexts, columnEncoder, cqlTranslator);
            ZonedDateTime batchRunTime = ZonedDateTime.now();
            final LongAccumulator perContextAccum = spark.sparkContext().longAccumulator("PerContext");
            CustomMetricSparkPlugin.contextAccumGauge.setAccumulator(contextAccum);
            CustomMetricSparkPlugin.perContextAccumGauge.setAccumulator(perContextAccum);
            CustomMetricSparkPlugin.totalContextsToProcessCounter.inc(filteredContexts.size());
            CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(0);
            ColumnRuleCreator columnRuleCreator = new ColumnRuleCreator(getFilteredJobSpecificationWithIds().getEvaluations(), getCqlTranslator(), createLibraryProvider());
            Map<String, String> dataTypeAliases = createDataTypeAliases(filteredContexts, cqlTranslator);
            for (ContextDefinition context : filteredContexts) {
                final String contextName = context.getName();
                ContextRetriever contextRetriever = new ContextRetriever(args.inputPaths, new DefaultDatasetRetriever(spark, args.inputFormat), args.disableColumnFiltering ? null : columnRuleCreator.getDataRequirementsForContext(context));
                StructType resultsSchema = resultSchemas.get(contextName);
                if (resultsSchema == null || resultsSchema.fields().length == 0) {
                    LOG.warn("Context " + contextName + " has no defines configured. Skipping.");
                } else {
                    LOG.info("Evaluating context " + contextName);
                    long contextStartMillis = System.currentTimeMillis();
                    final String outputPath = MapUtils.getRequiredKey(args.outputPaths, context.getName(), "outputPath");
                    JavaPairRDD<Object, List<Row>> rowsByContextId = contextRetriever.retrieveContext(context);
                    CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.getValue() + 1);
                    JavaPairRDD<Object, Row> resultsByContext = rowsByContextId.flatMapToPair(x -> evaluate(contextName, resultsSchema, x, dataTypeAliases, perContextAccum, errorAccumulator, batchRunTime));
                    writeResults(spark, resultsSchema, resultsByContext, outputPath);
                    long contextEndMillis = System.currentTimeMillis();
                    LOG.info(String.format("Wrote results for context %s to %s", contextName, outputPath));
                    evaluationSummary.addContextCount(contextName, perContextAccum.value());
                    evaluationSummary.addContextRuntime(contextName, contextEndMillis - contextStartMillis);
                    contextAccum.add(1);
                    perContextAccum.reset();
                }
            }
            CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(0);
            try {
                boolean metricsEnabled = Boolean.parseBoolean(spark.conf().get("spark.ui.prometheus.enabled"));
                if (metricsEnabled) {
                    LOG.info("Prometheus metrics enabled, sleeping for 7 seconds to finish gathering metrics");
                    // Sleep for more than 5 seconds because Prometheus only polls
                    // every 5 seconds. If Spark finishes and exits immediately after completing,
                    // Prometheus never gets to poll the final set of metrics for the spark-submit.
                    // The default Prometheus config map was changed from a 2 minute scrape interval to 5 seconds for Spark pods.
                    Thread.sleep(7000);
                } else {
                    LOG.info("Prometheus metrics not enabled");
                }
            } catch (NoSuchElementException e) {
                LOG.info("spark.ui.prometheus.enabled is not set");
            }
            evaluationSummary.setJobStatus(JobStatus.SUCCESS);
        } catch (Exception e) {
            // If we experience an error that would make the program halt, capture the error
            // and report it in the batch summary file
            ByteArrayOutputStream errorDetailStream = new ByteArrayOutputStream();
            try (PrintStream printStream = new PrintStream(errorDetailStream)) {
                printStream.write(e.getMessage().getBytes());
                printStream.write('\n');
                if (e.getCause() != null) {
                    printStream.write(e.getCause().getMessage().getBytes());
                    printStream.write('\n');
                }
                e.printStackTrace(printStream);
                evaluationSummary.setErrorList(Collections.singletonList(new EvaluationError(null, null, null, errorDetailStream.toString())));
            }
            throw e;
        } finally {
            long endTimeMillis = System.currentTimeMillis();
            evaluationSummary.setEndTimeMillis(endTimeMillis);
            evaluationSummary.setRuntimeMillis(endTimeMillis - startTimeMillis);
            if (args.metadataOutputPath != null) {
                if (evaluationSummary.getErrorList() == null) {
                    evaluationSummary.setErrorList(errorAccumulator.value());
                }
                if (CollectionUtils.isNotEmpty(evaluationSummary.getErrorList())) {
                    evaluationSummary.setJobStatus(JobStatus.FAIL);
                }
                evaluationSummary.setTotalContexts(contextAccum.value());
                OutputMetadataWriter writer = getOutputMetadataWriter();
                writer.writeMetadata(evaluationSummary);
            }
        }
    }
}
Also used: SparkSession (org.apache.spark.sql.SparkSession), StructType (org.apache.spark.sql.types.StructType), ContextRetriever (com.ibm.cohort.cql.spark.aggregation.ContextRetriever), CqlToElmTranslator (com.ibm.cohort.cql.translation.CqlToElmTranslator), ContextDefinitions (com.ibm.cohort.cql.spark.aggregation.ContextDefinitions), ZonedDateTime (java.time.ZonedDateTime), List (java.util.List), ArrayList (java.util.ArrayList), DefaultDatasetRetriever (com.ibm.cohort.cql.spark.data.DefaultDatasetRetriever), PrintStream (java.io.PrintStream), SerializableConfiguration (org.apache.spark.util.SerializableConfiguration), SparkTypeConverter (com.ibm.cohort.cql.spark.data.SparkTypeConverter), ByteArrayOutputStream (java.io.ByteArrayOutputStream), ContextDefinition (com.ibm.cohort.cql.spark.aggregation.ContextDefinition), FileNotFoundException (java.io.FileNotFoundException), NoSuchElementException (java.util.NoSuchElementException), IOException (java.io.IOException), LongAccumulator (org.apache.spark.util.LongAccumulator), EvaluationSummary (com.ibm.cohort.cql.spark.metadata.EvaluationSummary), HadoopPathOutputMetadataWriter (com.ibm.cohort.cql.spark.metadata.HadoopPathOutputMetadataWriter), OutputMetadataWriter (com.ibm.cohort.cql.spark.metadata.OutputMetadataWriter), ColumnRuleCreator (com.ibm.cohort.cql.spark.aggregation.ColumnRuleCreator), SparkOutputColumnEncoder (com.ibm.cohort.cql.spark.data.SparkOutputColumnEncoder), EvaluationError (com.ibm.cohort.cql.spark.errors.EvaluationError), SparkDataRow (com.ibm.cohort.cql.spark.data.SparkDataRow), DataRow (com.ibm.cohort.datarow.model.DataRow), Row (org.apache.spark.sql.Row)
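
The catch block in run funnels the failure message, the cause's message, and the full stack trace into one string so the batch summary file carries the error detail. A minimal standalone sketch of that capture technique (Java 10+ for the Charset overloads); the String.valueOf null guard is an addition for illustration, not something the original code does:

import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;

public class ErrorCaptureDemo {
    // Collect the message, cause message, and stack trace into one string for a summary record.
    static String captureErrorDetail(Throwable e) {
        ByteArrayOutputStream errorDetailStream = new ByteArrayOutputStream();
        try (PrintStream printStream = new PrintStream(errorDetailStream, true, StandardCharsets.UTF_8)) {
            // String.valueOf guards against exceptions whose message is null (assumption, not in the original).
            printStream.println(String.valueOf(e.getMessage()));
            if (e.getCause() != null) {
                printStream.println(String.valueOf(e.getCause().getMessage()));
            }
            // The full stack trace goes into the same buffer.
            e.printStackTrace(printStream);
        }
        return errorDetailStream.toString(StandardCharsets.UTF_8);
    }

    public static void main(String[] args) {
        Exception failure = new IllegalStateException("evaluation failed",
                new IllegalArgumentException("bad context definition"));
        System.out.println(captureErrorDetail(failure));
    }
}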

Aggregations

SparkOutputColumnEncoder (com.ibm.cohort.cql.spark.data.SparkOutputColumnEncoder): 3 uses
SparkDataRow (com.ibm.cohort.cql.spark.data.SparkDataRow): 2 uses
DataRow (com.ibm.cohort.datarow.model.DataRow): 2 uses
ArrayList (java.util.ArrayList): 2 uses
List (java.util.List): 2 uses
CqlDataProvider (com.ibm.cohort.cql.data.CqlDataProvider): 1 use
CqlEvaluationRequests (com.ibm.cohort.cql.evaluation.CqlEvaluationRequests): 1 use
CqlEvaluator (com.ibm.cohort.cql.evaluation.CqlEvaluator): 1 use
ColumnRuleCreator (com.ibm.cohort.cql.spark.aggregation.ColumnRuleCreator): 1 use
ContextDefinition (com.ibm.cohort.cql.spark.aggregation.ContextDefinition): 1 use
ContextDefinitions (com.ibm.cohort.cql.spark.aggregation.ContextDefinitions): 1 use
ContextRetriever (com.ibm.cohort.cql.spark.aggregation.ContextRetriever): 1 use
DefaultDatasetRetriever (com.ibm.cohort.cql.spark.data.DefaultDatasetRetriever): 1 use
SparkTypeConverter (com.ibm.cohort.cql.spark.data.SparkTypeConverter): 1 use
EvaluationError (com.ibm.cohort.cql.spark.errors.EvaluationError): 1 use
EvaluationSummary (com.ibm.cohort.cql.spark.metadata.EvaluationSummary): 1 use
HadoopPathOutputMetadataWriter (com.ibm.cohort.cql.spark.metadata.HadoopPathOutputMetadataWriter): 1 use
OutputMetadataWriter (com.ibm.cohort.cql.spark.metadata.OutputMetadataWriter): 1 use
CqlToElmTranslator (com.ibm.cohort.cql.translation.CqlToElmTranslator): 1 use
DataRowDataProvider (com.ibm.cohort.datarow.engine.DataRowDataProvider): 1 use