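org.apache.spark.util.SerializableConfiguration is a small Spark utility that wraps Hadoop's Configuration, which does not implement java.io.Serializable, so that a driver-side Hadoop configuration can be captured in a Spark closure and read back on executors via value(). The examples below construct it the same way, either from SparkHadoopUtil in tests or from the active SparkContext. First, a minimal, self-contained sketch of the idea; it is not taken from any of the projects below, and the class name and file paths are made up for illustration.

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.SerializableConfiguration;

public class SerializableConfigurationSketch {
    public static void main(String[] args) throws Exception {
        SparkSession spark = SparkSession.builder()
                .appName("serializable-configuration-sketch")
                .master("local[*]")
                .getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        // Wrap the driver's Hadoop configuration so it can travel inside the closure below.
        SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());

        // Hypothetical paths; the point is the executor-side unwrap via value().
        jsc.parallelize(Arrays.asList("/tmp/one.parquet", "/tmp/two.parquet")).foreach(pathStr -> {
            Configuration conf = serConf.value();
            FileSystem fs = new Path(pathStr).getFileSystem(conf);
            System.out.println(pathStr + " exists: " + fs.exists(new Path(pathStr)));
        });

        spark.stop();
    }
}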

Example 1 with SerializableConfiguration

Use of org.apache.spark.util.SerializableConfiguration in project quality-measure-and-cohort-service by Alvearie.

From the class SparkCqlEvaluatorTest, method testReadContextDefinitions.

@Test
public void testReadContextDefinitions() throws Exception {
    evaluator.hadoopConfiguration = new SerializableConfiguration(SparkHadoopUtil.get().conf());
    ContextDefinitions contextDefinitions = evaluator.readContextDefinitions("src/test/resources/alltypes/metadata/context-definitions.json");
    assertNotNull(contextDefinitions);
    assertEquals(5, contextDefinitions.getContextDefinitions().size());
    assertEquals(3, contextDefinitions.getContextDefinitions().get(0).getRelationships().size());
}
Also used : ContextDefinitions(com.ibm.cohort.cql.spark.aggregation.ContextDefinitions) SerializableConfiguration(org.apache.spark.util.SerializableConfiguration) Test(org.junit.Test)

Example 2 with SerializableConfiguration

Use of org.apache.spark.util.SerializableConfiguration in project quality-measure-and-cohort-service by Alvearie.

From the class SparkCqlEvaluatorTest, method testReadCqlJobsInvalid.

@Test
public void testReadCqlJobsInvalid() throws Exception {
    evaluator.hadoopConfiguration = new SerializableConfiguration(SparkHadoopUtil.get().conf());
    assertThrows(IllegalArgumentException.class, () -> evaluator.readJobSpecification("src/test/resources/invalid/cql-jobs-invalid-global.json"));
}
Also used : SerializableConfiguration(org.apache.spark.util.SerializableConfiguration) Test(org.junit.Test)

Example 3 with SerializableConfiguration

Use of org.apache.spark.util.SerializableConfiguration in project hudi by apache.

From the class ColumnStatsIndexHelper, method buildColumnStatsTableFor.

/**
 * Parses min/max statistics from Parquet footers for the provided columns and composes a
 * column-stats index table in the following format, with three statistics recorded for each
 * linear/Z-curve/Hilbert-curve-ordered column. For example, if the original table contained
 * column {@code A}:
 *
 * <pre>
 * +---------------------------+------------+------------+-------------+
 * |          file             | A_minValue | A_maxValue | A_num_nulls |
 * +---------------------------+------------+------------+-------------+
 * | one_base_file.parquet     |          1 |         10 |           0 |
 * | another_base_file.parquet |        -10 |          0 |           5 |
 * +---------------------------+------------+------------+-------------+
 * </pre>
 *
 * NOTE: Currently {@link TimestampType} is not supported, since the Parquet writer
 * does not support statistics for it.
 *
 * TODO leverage metadata table after RFC-27 lands
 * @VisibleForTesting
 *
 * @param sparkSession encompassing Spark session
 * @param baseFilesPaths list of base-files paths to be sourced for column-stats index
 * @param orderedColumnSchemas target ordered columns
 * @return Spark's {@link Dataset} holding an index table
 */
@Nonnull
public static Dataset<Row> buildColumnStatsTableFor(@Nonnull SparkSession sparkSession, @Nonnull List<String> baseFilesPaths, @Nonnull List<StructField> orderedColumnSchemas) {
    SparkContext sc = sparkSession.sparkContext();
    JavaSparkContext jsc = new JavaSparkContext(sc);
    SerializableConfiguration serializableConfiguration = new SerializableConfiguration(sc.hadoopConfiguration());
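    // Parallelism heuristic: roughly one Spark task per three base files, with at least one task.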
    int numParallelism = (baseFilesPaths.size() / 3 + 1);
    List<HoodieColumnRangeMetadata<Comparable>> colMinMaxInfos;
    String previousJobDescription = sc.getLocalProperty(SPARK_JOB_DESCRIPTION);
    try {
        jsc.setJobDescription("Listing parquet column statistics");
        colMinMaxInfos = jsc.parallelize(baseFilesPaths, numParallelism).mapPartitions(paths -> {
            ParquetUtils utils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
            Iterable<String> iterable = () -> paths;
            return StreamSupport.stream(iterable.spliterator(), false).flatMap(path -> utils.readRangeFromParquetMetadata(serializableConfiguration.value(), new Path(path), orderedColumnSchemas.stream().map(StructField::name).collect(Collectors.toList())).stream()).iterator();
        }).collect();
    } finally {
        jsc.setJobDescription(previousJobDescription);
    }
    // Group column's metadata by file-paths of the files it belongs to
    Map<String, List<HoodieColumnRangeMetadata<Comparable>>> filePathToColumnMetadataMap = colMinMaxInfos.stream().collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getFilePath));
    JavaRDD<Row> allMetaDataRDD = jsc.parallelize(new ArrayList<>(filePathToColumnMetadataMap.values()), 1).map(fileColumnsMetadata -> {
        int colSize = fileColumnsMetadata.size();
        if (colSize == 0) {
            return null;
        }
        String filePath = fileColumnsMetadata.get(0).getFilePath();
        List<Object> indexRow = new ArrayList<>();
        // The first column of the Z-index row is the target file path
        indexRow.add(filePath);
        // For each column
        orderedColumnSchemas.forEach(colSchema -> {
            String colName = colSchema.name();
            HoodieColumnRangeMetadata<Comparable> colMetadata = fileColumnsMetadata.stream().filter(s -> s.getColumnName().trim().equalsIgnoreCase(colName)).findFirst().orElse(null);
            DataType colType = colSchema.dataType();
            if (colMetadata == null || colType == null) {
                throw new HoodieException(String.format("Cannot collect min/max statistics for column (%s)", colSchema));
            }
            Pair<Object, Object> minMaxValue = fetchMinMaxValues(colType, colMetadata);
            // min
            indexRow.add(minMaxValue.getLeft());
            // max
            indexRow.add(minMaxValue.getRight());
            indexRow.add(colMetadata.getNullCount());
        });
        return Row$.MODULE$.apply(JavaConversions.asScalaBuffer(indexRow));
    }).filter(Objects::nonNull);
    StructType indexSchema = composeIndexSchema(orderedColumnSchemas);
    return sparkSession.createDataFrame(allMetaDataRDD, indexSchema);
}
Also used : BinaryType(org.apache.spark.sql.types.BinaryType) DataType(org.apache.spark.sql.types.DataType) HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) DecimalType(org.apache.spark.sql.types.DecimalType) FileStatus(org.apache.hadoop.fs.FileStatus) ByteBuffer(java.nio.ByteBuffer) Logger(org.apache.log4j.Logger) BigDecimal(java.math.BigDecimal) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) DoubleType(org.apache.spark.sql.types.DoubleType) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) DataTypeUtils.areCompatible(org.apache.hudi.util.DataTypeUtils.areCompatible) IntegerType(org.apache.spark.sql.types.IntegerType) SparkContext(org.apache.spark.SparkContext) StringType(org.apache.spark.sql.types.StringType) LongType(org.apache.spark.sql.types.LongType) UUID(java.util.UUID) TimestampType(org.apache.spark.sql.types.TimestampType) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) List(java.util.List) BooleanType(org.apache.spark.sql.types.BooleanType) Dataset(org.apache.spark.sql.Dataset) BaseFileUtils(org.apache.hudi.common.util.BaseFileUtils) SerializableConfiguration(org.apache.spark.util.SerializableConfiguration) FloatType(org.apache.spark.sql.types.FloatType) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) LongType$(org.apache.spark.sql.types.LongType$) StructType$(org.apache.spark.sql.types.StructType$) ArrayList(java.util.ArrayList) ByteType(org.apache.spark.sql.types.ByteType) StreamSupport(java.util.stream.StreamSupport) Nonnull(javax.annotation.Nonnull) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession) Metadata(org.apache.spark.sql.types.Metadata) StringType$(org.apache.spark.sql.types.StringType$) JavaConversions(scala.collection.JavaConversions) Row$(org.apache.spark.sql.Row$) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) ShortType(org.apache.spark.sql.types.ShortType) ParquetUtils(org.apache.hudi.common.util.ParquetUtils) LogManager(org.apache.log4j.LogManager) DateType(org.apache.spark.sql.types.DateType) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
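
For context, here is a hedged call-site sketch for buildColumnStatsTableFor. It is not taken from the Hudi codebase: the class name, file paths, and the single ordered column A are made up for illustration, and the import for ColumnStatsIndexHelper is omitted because its package differs across Hudi releases.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;

// Note: the import for ColumnStatsIndexHelper is omitted; its package varies by Hudi release.
public class ColumnStatsIndexUsageSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("column-stats-index-sketch")
                .master("local[*]")
                .getOrCreate();

        // Hypothetical base files and a single ordered column "A" of IntegerType.
        List<String> baseFiles = Arrays.asList(
                "/data/table/one_base_file.parquet",
                "/data/table/another_base_file.parquet");
        List<StructField> orderedColumns = Arrays.asList(
                DataTypes.createStructField("A", DataTypes.IntegerType, true));

        // Expect one row per base file with columns file, A_minValue, A_maxValue, A_num_nulls,
        // matching the table layout shown in the Javadoc above.
        Dataset<Row> colStatsIndex =
                ColumnStatsIndexHelper.buildColumnStatsTableFor(spark, baseFiles, orderedColumns);
        colStatsIndex.show(false);

        spark.stop();
    }
}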

Example 4 with SerializableConfiguration

Use of org.apache.spark.util.SerializableConfiguration in project quality-measure-and-cohort-service by Alvearie.

From the class SparkCqlEvaluator, method run.

public void run(PrintStream out) throws Exception {
    EvaluationSummary evaluationSummary = new EvaluationSummary();
    long startTimeMillis = System.currentTimeMillis();
    evaluationSummary.setStartTimeMillis(startTimeMillis);
    evaluationSummary.setJobStatus(JobStatus.FAIL);
    SparkSession.Builder sparkBuilder = SparkSession.builder();
    try (SparkSession spark = sparkBuilder.getOrCreate()) {
        final LongAccumulator contextAccum = spark.sparkContext().longAccumulator("Context");
        final CollectionAccumulator<EvaluationError> errorAccumulator = spark.sparkContext().collectionAccumulator("EvaluationErrors");
        try {
            spark.sparkContext().setLocalProperty("mdc." + CORRELATION_ID, MDC.get(CORRELATION_ID));
            evaluationSummary.setCorrelationId(MDC.get(CORRELATION_ID));
            boolean useJava8API = Boolean.valueOf(spark.conf().get("spark.sql.datetime.java8API.enabled"));
            this.typeConverter = new SparkTypeConverter(useJava8API);
            this.hadoopConfiguration = new SerializableConfiguration(spark.sparkContext().hadoopConfiguration());
            evaluationSummary.setApplicationId(spark.sparkContext().applicationId());
            CqlToElmTranslator cqlTranslator = getCqlTranslator();
            SparkOutputColumnEncoder columnEncoder = getSparkOutputColumnEncoder();
            ContextDefinitions contexts = readContextDefinitions(args.contextDefinitionPath);
            List<ContextDefinition> filteredContexts = contexts.getContextDefinitions();
            if (args.aggregationContexts != null && !args.aggregationContexts.isEmpty()) {
                filteredContexts = filteredContexts.stream().filter(def -> args.aggregationContexts.contains(def.getName())).collect(Collectors.toList());
            }
            if (filteredContexts.isEmpty()) {
                throw new IllegalArgumentException("At least one context definition is required (after filtering if enabled).");
            }
            Map<String, StructType> resultSchemas = calculateSparkSchema(filteredContexts.stream().map(ContextDefinition::getName).collect(Collectors.toList()), contexts, columnEncoder, cqlTranslator);
            ZonedDateTime batchRunTime = ZonedDateTime.now();
            final LongAccumulator perContextAccum = spark.sparkContext().longAccumulator("PerContext");
            CustomMetricSparkPlugin.contextAccumGauge.setAccumulator(contextAccum);
            CustomMetricSparkPlugin.perContextAccumGauge.setAccumulator(perContextAccum);
            CustomMetricSparkPlugin.totalContextsToProcessCounter.inc(filteredContexts.size());
            CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(0);
            ColumnRuleCreator columnRuleCreator = new ColumnRuleCreator(getFilteredJobSpecificationWithIds().getEvaluations(), getCqlTranslator(), createLibraryProvider());
            Map<String, String> dataTypeAliases = createDataTypeAliases(filteredContexts, cqlTranslator);
            for (ContextDefinition context : filteredContexts) {
                final String contextName = context.getName();
                ContextRetriever contextRetriever = new ContextRetriever(args.inputPaths, new DefaultDatasetRetriever(spark, args.inputFormat), args.disableColumnFiltering ? null : columnRuleCreator.getDataRequirementsForContext(context));
                StructType resultsSchema = resultSchemas.get(contextName);
                if (resultsSchema == null || resultsSchema.fields().length == 0) {
                    LOG.warn("Context " + contextName + " has no defines configured. Skipping.");
                } else {
                    LOG.info("Evaluating context " + contextName);
                    long contextStartMillis = System.currentTimeMillis();
                    final String outputPath = MapUtils.getRequiredKey(args.outputPaths, context.getName(), "outputPath");
                    JavaPairRDD<Object, List<Row>> rowsByContextId = contextRetriever.retrieveContext(context);
                    CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.getValue() + 1);
                    JavaPairRDD<Object, Row> resultsByContext = rowsByContextId.flatMapToPair(x -> evaluate(contextName, resultsSchema, x, dataTypeAliases, perContextAccum, errorAccumulator, batchRunTime));
                    writeResults(spark, resultsSchema, resultsByContext, outputPath);
                    long contextEndMillis = System.currentTimeMillis();
                    LOG.info(String.format("Wrote results for context %s to %s", contextName, outputPath));
                    evaluationSummary.addContextCount(contextName, perContextAccum.value());
                    evaluationSummary.addContextRuntime(contextName, contextEndMillis - contextStartMillis);
                    contextAccum.add(1);
                    perContextAccum.reset();
                }
            }
            CustomMetricSparkPlugin.currentlyEvaluatingContextGauge.setValue(0);
            try {
                Boolean metricsEnabled = Boolean.valueOf(spark.conf().get("spark.ui.prometheus.enabled"));
                if (metricsEnabled) {
                    LOG.info("Prometheus metrics enabled, sleeping for 7 seconds to finish gathering metrics");
                    // Sleep for over 5 seconds because Prometheus only polls every 5 seconds.
                    // If Spark finishes and goes away immediately after completing, Prometheus will
                    // never be able to poll the final set of metrics for the spark-submit.
                    // The default Prometheus config map was changed from a 2-minute scrape interval
                    // to 5 seconds for Spark pods.
                    Thread.sleep(7000);
                } else {
                    LOG.info("Prometheus metrics not enabled");
                }
            } catch (NoSuchElementException e) {
                LOG.info("spark.ui.prometheus.enabled is not set");
            }
            evaluationSummary.setJobStatus(JobStatus.SUCCESS);
        } catch (Exception e) {
            // If we experience an error that would make the program halt, capture the error
            // and report it in the batch summary file
            ByteArrayOutputStream errorDetailStream = new ByteArrayOutputStream();
            try (PrintStream printStream = new PrintStream(errorDetailStream)) {
                printStream.write(e.getMessage().getBytes());
                printStream.write('\n');
                if (e.getCause() != null) {
                    printStream.write(e.getCause().getMessage().getBytes());
                    printStream.write('\n');
                }
                e.printStackTrace(printStream);
                evaluationSummary.setErrorList(Collections.singletonList(new EvaluationError(null, null, null, errorDetailStream.toString())));
            }
            throw e;
        } finally {
            long endTimeMillis = System.currentTimeMillis();
            evaluationSummary.setEndTimeMillis(endTimeMillis);
            evaluationSummary.setRuntimeMillis(endTimeMillis - startTimeMillis);
            if (args.metadataOutputPath != null) {
                if (evaluationSummary.getErrorList() == null) {
                    evaluationSummary.setErrorList(errorAccumulator.value());
                }
                if (CollectionUtils.isNotEmpty(evaluationSummary.getErrorList())) {
                    evaluationSummary.setJobStatus(JobStatus.FAIL);
                }
                evaluationSummary.setTotalContexts(contextAccum.value());
                OutputMetadataWriter writer = getOutputMetadataWriter();
                writer.writeMetadata(evaluationSummary);
            }
        }
    }
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) ContextRetriever(com.ibm.cohort.cql.spark.aggregation.ContextRetriever) CqlToElmTranslator(com.ibm.cohort.cql.translation.CqlToElmTranslator) ContextDefinitions(com.ibm.cohort.cql.spark.aggregation.ContextDefinitions) ZonedDateTime(java.time.ZonedDateTime) List(java.util.List) ArrayList(java.util.ArrayList) DefaultDatasetRetriever(com.ibm.cohort.cql.spark.data.DefaultDatasetRetriever) PrintStream(java.io.PrintStream) SerializableConfiguration(org.apache.spark.util.SerializableConfiguration) SparkTypeConverter(com.ibm.cohort.cql.spark.data.SparkTypeConverter) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ContextDefinition(com.ibm.cohort.cql.spark.aggregation.ContextDefinition) FileNotFoundException(java.io.FileNotFoundException) NoSuchElementException(java.util.NoSuchElementException) IOException(java.io.IOException) LongAccumulator(org.apache.spark.util.LongAccumulator) EvaluationSummary(com.ibm.cohort.cql.spark.metadata.EvaluationSummary) HadoopPathOutputMetadataWriter(com.ibm.cohort.cql.spark.metadata.HadoopPathOutputMetadataWriter) OutputMetadataWriter(com.ibm.cohort.cql.spark.metadata.OutputMetadataWriter) ColumnRuleCreator(com.ibm.cohort.cql.spark.aggregation.ColumnRuleCreator) SparkOutputColumnEncoder(com.ibm.cohort.cql.spark.data.SparkOutputColumnEncoder) EvaluationError(com.ibm.cohort.cql.spark.errors.EvaluationError) SparkDataRow(com.ibm.cohort.cql.spark.data.SparkDataRow) DataRow(com.ibm.cohort.datarow.model.DataRow) Row(org.apache.spark.sql.Row)
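
As a side note on the spark.ui.prometheus.enabled check above: RuntimeConfig.get also accepts a default value, which avoids catching NoSuchElementException when the key is unset. Below is a small alternative sketch, not taken from the project, with a hypothetical helper class name.

import java.util.concurrent.TimeUnit;

import org.apache.spark.sql.SparkSession;

final class PrometheusScrapeWait {
    // Sleeps past one Prometheus scrape interval when metrics are enabled, so the final
    // metrics for the spark-submit can still be collected before the driver exits.
    static boolean waitForFinalScrapeIfEnabled(SparkSession spark) throws InterruptedException {
        // get(key, default) returns "false" instead of throwing when the key is not set.
        boolean metricsEnabled = Boolean.parseBoolean(
                spark.conf().get("spark.ui.prometheus.enabled", "false"));
        if (metricsEnabled) {
            // Scrape interval is assumed to be 5 seconds, as in the comment above.
            TimeUnit.SECONDS.sleep(7);
        }
        return metricsEnabled;
    }
}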

Example 5 with SerializableConfiguration

Use of org.apache.spark.util.SerializableConfiguration in project quality-measure-and-cohort-service by Alvearie.

From the class SparkCqlEvaluatorTest, method testReadCqlJobsSuccess.

@Test
public void testReadCqlJobsSuccess() throws Exception {
    IntervalParameter measurementPeriod = new IntervalParameter();
    measurementPeriod.setStart(new DateParameter("2020-01-01")).setEnd(new DateParameter("2021-01-01"));
    IntegerParameter minimumAge = new IntegerParameter(17);
    evaluator.hadoopConfiguration = new SerializableConfiguration(SparkHadoopUtil.get().conf());
    CqlEvaluationRequests requests = evaluator.readJobSpecification("src/test/resources/simple-job/cql-jobs.json");
    assertNotNull(requests);
    assertEquals(measurementPeriod, requests.getGlobalParameters().get("Measurement Period"));
    assertEquals(1, requests.getEvaluations().size());
    assertEquals(minimumAge, requests.getEvaluations().get(0).getParameters().get("MinimumAge"));
}
Also used : IntegerParameter(com.ibm.cohort.cql.evaluation.parameters.IntegerParameter) DateParameter(com.ibm.cohort.cql.evaluation.parameters.DateParameter) SerializableConfiguration(org.apache.spark.util.SerializableConfiguration) CqlEvaluationRequests(com.ibm.cohort.cql.evaluation.CqlEvaluationRequests) IntervalParameter(com.ibm.cohort.cql.evaluation.parameters.IntervalParameter) Test(org.junit.Test)

Aggregations

SerializableConfiguration (org.apache.spark.util.SerializableConfiguration): 6
Test (org.junit.Test): 4
CqlEvaluationRequests (com.ibm.cohort.cql.evaluation.CqlEvaluationRequests): 2
ContextDefinitions (com.ibm.cohort.cql.spark.aggregation.ContextDefinitions): 2
IOException (java.io.IOException): 2
ArrayList (java.util.ArrayList): 2
List (java.util.List): 2
Row (org.apache.spark.sql.Row): 2
SparkSession (org.apache.spark.sql.SparkSession): 2
DateParameter (com.ibm.cohort.cql.evaluation.parameters.DateParameter): 1
IntegerParameter (com.ibm.cohort.cql.evaluation.parameters.IntegerParameter): 1
IntervalParameter (com.ibm.cohort.cql.evaluation.parameters.IntervalParameter): 1
ColumnRuleCreator (com.ibm.cohort.cql.spark.aggregation.ColumnRuleCreator): 1
ContextDefinition (com.ibm.cohort.cql.spark.aggregation.ContextDefinition): 1
ContextRetriever (com.ibm.cohort.cql.spark.aggregation.ContextRetriever): 1
DefaultDatasetRetriever (com.ibm.cohort.cql.spark.data.DefaultDatasetRetriever): 1
SparkDataRow (com.ibm.cohort.cql.spark.data.SparkDataRow): 1
SparkOutputColumnEncoder (com.ibm.cohort.cql.spark.data.SparkOutputColumnEncoder): 1
SparkTypeConverter (com.ibm.cohort.cql.spark.data.SparkTypeConverter): 1
EvaluationError (com.ibm.cohort.cql.spark.errors.EvaluationError): 1