Example 1 with SparkContext$

Use of org.apache.spark.SparkContext$ in project kylo by Teradata.

The class Validator, method run.

private void run(@Nonnull final PrintStream out, @Nonnull final String... args) {
    // Check how many arguments were passed in
    if (args.length < 4) {
        String msg = "Proper Usage is: <targetDatabase> <entity> <partition> <path-to-policy-file>\n" + "You can optionally add: --hiveConf hive.setting=value --hiveConf hive.other.setting=value\n" + "You can optionally add: --storageLevel rdd_persistence_level_value\n" + "You can optionally add: --numPartitions number_of_rdd_partitions\n" + "You provided " + args.length + " args which are (comma separated): " + StringUtils.join(args, ",");
        out.println(msg);
        throw new IllegalArgumentException(msg);
    }
    final SparkContext sparkContext = SparkContext.getOrCreate();
    try {
        final ValidatorConfiguration params = new ValidatorConfiguration(args);
        // Initialize Spring context
        try (final ConfigurableApplicationContext ctx = new AnnotationConfigApplicationContext("com.thinkbiganalytics.spark")) {
            final DataValidator app = ctx.getBean(DataValidator.class);
            // Prepare Hive context
            final HiveContext hiveContext = new HiveContext(sparkContext);
            for (final Param param : params.getHiveParams()) {
                log.info("Adding Hive parameter {}={}", param.getName(), param.getValue());
                hiveContext.setConf(param.getName(), param.getValue());
            }
            log.info("Deployment Mode - {}", hiveContext.sparkContext().getConf().get("spark.submit.deployMode"));
            Map<String, FieldPolicy> policyMap = ctx.getBean(FieldPolicyLoader.class).loadFieldPolicy(params.getFieldPolicyJsonPath());
            // Run validation
            final DataValidatorResult results = app.validateTable(params.getTargetDatabase(), params.getFeedTableName(), params.getValidTableName(),
                                                                  params.getPartition(), params.getNumPartitions(), policyMap, hiveContext);
            log.info("Persistence level: {}", params.getStorageLevel());
            results.persist(StorageLevel.fromString(params.getStorageLevel()));
            app.saveInvalidToTable(params.getTargetDatabase(), params.getFeedTableName(), params.getInvalidTableName(), results, hiveContext);
            app.saveValidToTable(params.getTargetDatabase(), params.getFeedTableName(), params.getValidTableName(), results, hiveContext);
            app.saveProfileToTable(params.getTargetDatabase(), params.getProfileTableName(), params.getPartition(), results, hiveContext);
            results.unpersist();
        }
        log.info("Validator app finished");
    } catch (Exception e) {
        log.error("Failed to perform validation: {}", e.toString(), e);
        throw e;
    }
}
Also used: ConfigurableApplicationContext (org.springframework.context.ConfigurableApplicationContext), AnnotationConfigApplicationContext (org.springframework.context.annotation.AnnotationConfigApplicationContext), FieldPolicy (com.thinkbiganalytics.policy.FieldPolicy), FieldPolicyLoader (com.thinkbiganalytics.spark.policy.FieldPolicyLoader), SparkContext (org.apache.spark.SparkContext), HiveContext (org.apache.spark.sql.hive.HiveContext)
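
A note on the context-reuse pattern above: SparkContext.getOrCreate() returns the already-running context when the driver was launched through spark-submit and only builds a new one otherwise, which is why the Validator never configures a SparkConf itself. A minimal, self-contained sketch of the same getOrCreate-then-persist flow (the app name and storage level are illustrative, not taken from kylo):

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.storage.StorageLevel;

public class GetOrCreateSketch {

    public static void main(String[] args) {
        // getOrCreate() reuses the active context if spark-submit already
        // created one; the SparkConf is only consulted when none exists.
        final SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("validator-sketch");
        final SparkContext sparkContext = SparkContext.getOrCreate(conf);

        // StorageLevel.fromString accepts the standard level names used by
        // the --storageLevel option, e.g. "MEMORY_ONLY" or "MEMORY_AND_DISK".
        final StorageLevel level = StorageLevel.fromString("MEMORY_AND_DISK");
        System.out.println("Resolved storage level: " + level);

        sparkContext.stop();
    }
}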

Example 2 with SparkContext$

Use of org.apache.spark.SparkContext$ in project kylo by Teradata.

The class DataSourceResourceLoaderTest, method addJar.

/**
 * Verify adding a jar to the Spark context.
 */
@Test
public void addJar() {
    // Mock Spark Context
    final SparkContext sparkContext = Mockito.mock(SparkContext.class);
    Mockito.when(sparkContext.hadoopConfiguration()).thenReturn(new Configuration(false));
    // Test adding local:/ jar
    final DataSourceResourceLoader loader = DataSourceResourceLoader.create(sparkContext);
    final String jarUrl = getClass().getResource("./").toString();
    loader.addJar(jarUrl.replace("file:", "local:"));
    Mockito.verify(sparkContext, Mockito.times(1)).addJar(jarUrl);
    Assert.assertNotNull(loader.getResource("DataSourceResourceLoaderTest.class"));
}
Also used: SparkContext (org.apache.spark.SparkContext), Configuration (org.apache.hadoop.conf.Configuration), Test (org.junit.Test)
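
The verification above implies that DataSourceResourceLoader rewrites Spark's local: URI scheme (a jar promised to exist on every node's local filesystem) back to file: before handing it to SparkContext.addJar. A sketch of that rewrite, offered as an assumption about the loader's behavior rather than kylo's actual implementation:

public class LocalUriSketch {

    /**
     * Maps Spark's "local:" scheme onto "file:" so an ordinary class loader
     * can open the jar. Assumed behavior, inferred from the Mockito.verify
     * call in the test above.
     */
    static String toFileUrl(String jarUrl) {
        return jarUrl.startsWith("local:") ? "file:" + jarUrl.substring("local:".length()) : jarUrl;
    }

    public static void main(String[] args) {
        System.out.println(toFileUrl("local:/opt/libs/example.jar"));
        // Prints: file:/opt/libs/example.jar
    }
}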

Example 3 with SparkContext$

Use of org.apache.spark.SparkContext$ in project kylo by Teradata.

The class MultiSparkExecApp, method run.

private void run(@Nonnull final PrintStream out, @Nonnull final String... args) {
    log.info("MultiSparkExecApp running...");
    final SparkContext sparkContext = SparkContext.getOrCreate();
    try {
        final MultiSparkExecArguments sparkExecArgs = new MultiSparkExecArguments(args);
        final List<SparkApplicationCommand> commands = sparkExecArgs.getCommands();
        final List<Class<?>> appClasses = new ArrayList<>(sparkExecArgs.getCommands().size());
        // Get the list of all app classes, verifying that each has a main() method.
        for (SparkApplicationCommand cmd : sparkExecArgs.getCommands()) {
            appClasses.add(getApplicationClasses(cmd));
        }
        log.debug("Preparing to execute apps: {}", appClasses);
        for (int idx = 0; idx < appClasses.size(); idx++) {
            Class<?> appClass = appClasses.get(idx);
            SparkApplicationCommand cmd = commands.get(idx);
            System.out.println(">>> Beginning: " + cmd.getName() + " *****************************************************");
            executeApp(appClass, cmd);
            System.out.println("<<< Completed: " + cmd.getName() + " *****************************************************");
            // TODO Generate provenance events.
        }
        log.info("MultiSparkExecApp finished");
    } catch (Exception e) {
        log.error("Execution failed", e);
        throw e;
    } finally {
        sparkContext.stop();
    }
}
Also used: SparkContext (org.apache.spark.SparkContext), ArrayList (java.util.ArrayList), InvocationTargetException (java.lang.reflect.InvocationTargetException)
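
executeApp is not shown here, but the InvocationTargetException import suggests the app classes are launched by invoking their static main(String[]) reflectively. A hedged sketch of that pattern (class and method names are illustrative, not kylo's):

import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;

public class MainInvokerSketch {

    /** Invokes the public static main(String[]) of the given application class. */
    static void executeApp(Class<?> appClass, String[] appArgs) throws Exception {
        final Method main = appClass.getMethod("main", String[].class);
        try {
            // Cast to Object so the String[] is passed as the single
            // main() argument instead of being varargs-expanded.
            main.invoke(null, (Object) appArgs);
        } catch (InvocationTargetException e) {
            // Unwrap so callers see the application's own failure.
            final Throwable cause = e.getCause();
            throw cause instanceof Exception ? (Exception) cause : e;
        }
    }
}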

Example 4 with SparkContext$

Use of org.apache.spark.SparkContext$ in project kylo by Teradata.

The class SpringTestConfigV1, method sqlContext.

@Bean
public SQLContext sqlContext() {
    final SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("Profiler Test");
    final SparkContext sc = new SparkContext(conf);
    return new SQLContext(sc);
}
Also used: SparkContext (org.apache.spark.SparkContext), SparkConf (org.apache.spark.SparkConf), SQLContext (org.apache.spark.sql.SQLContext), Bean (org.springframework.context.annotation.Bean)
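
One way a test might consume this bean, sketched under the assumption of a JUnit 4 plus spring-test setup (the test class name is hypothetical):

import org.apache.spark.sql.SQLContext;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;

@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(classes = SpringTestConfigV1.class)
public class SqlContextSmokeTest {

    @Autowired
    private SQLContext sqlContext;

    @Test
    public void sqlContextAnswersQueries() {
        // A trivial query proves the local[*] SparkContext behind the bean is alive.
        Assert.assertEquals(1L, sqlContext.sql("SELECT 1").count());
    }
}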

Example 5 with SparkContext$

Use of org.apache.spark.SparkContext$ in project kylo by Teradata.

The class SparkScriptEngine, method executeWithSparkClassLoader.

/**
 * Executes the specified callable after replacing the current context class loader.
 *
 * <p>This is a work-around to avoid {@link ClassCastException} issues caused by conflicts between Hadoop and Kylo Spark Shell. Spark uses the context class loader when loading Hadoop components
 * for running Spark on YARN. When both Hadoop and Kylo Spark Shell provide the same class, both copies are loaded when creating a {@link SparkContext}. The fix is to set the context class
 * loader to the same class loader that was used to load the {@link SparkContext} class.</p>
 *
 * @param callable the function to be executed
 * @param <T>      the return type
 * @return the return value
 */
private <T> T executeWithSparkClassLoader(@Nonnull final Callable<T> callable) {
    // Set context class loader
    final Thread currentThread = Thread.currentThread();
    final ClassLoader contextClassLoader = currentThread.getContextClassLoader();
    final ClassLoader sparkClassLoader = new ForwardingClassLoader(SparkContext.class.getClassLoader(), getClassLoader());
    currentThread.setContextClassLoader(sparkClassLoader);
    // Execute callable
    try {
        return callable.call();
    } catch (final Exception e) {
        throw Throwables.propagate(e);
    } finally {
        // Reset context class loader
        currentThread.setContextClassLoader(contextClassLoader);
    }
}
Also used: SparkContext (org.apache.spark.SparkContext), URLClassLoader (java.net.URLClassLoader), ScriptException (javax.script.ScriptException), IOException (java.io.IOException)
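
Kylo's ForwardingClassLoader itself is not shown here; a minimal sketch of the delegation idea its name suggests, consulting a primary loader first and falling back to a second, is:

/**
 * Sketch of a class loader that tries a primary loader first and falls
 * back to a secondary one. Illustrates the delegation idea only; kylo's
 * actual ForwardingClassLoader lives in the kylo codebase.
 */
public class ForwardingClassLoaderSketch extends ClassLoader {

    private final ClassLoader fallback;

    public ForwardingClassLoaderSketch(ClassLoader primary, ClassLoader fallback) {
        super(primary);
        this.fallback = fallback;
    }

    @Override
    protected Class<?> findClass(String name) throws ClassNotFoundException {
        // Invoked only after the primary (parent) loader has failed.
        return fallback.loadClass(name);
    }
}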

Aggregations

SparkContext (org.apache.spark.SparkContext): 89
SparkConf (org.apache.spark.SparkConf): 37
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 30
Test (org.junit.Test): 16
List (java.util.List): 13
ArrayList (java.util.ArrayList): 12
Configuration (org.apache.hadoop.conf.Configuration): 12
SparkSession (org.apache.spark.sql.SparkSession): 12
File (java.io.File): 11
Row (org.apache.spark.sql.Row): 11
IOException (java.io.IOException): 10
Tuple2 (scala.Tuple2): 10
Map (java.util.Map): 9
Set (java.util.Set): 8
Function (org.apache.spark.api.java.function.Function): 8
SQLContext (org.apache.spark.sql.SQLContext): 8
Vector (org.apache.spark.mllib.linalg.Vector): 7
Test (org.junit.jupiter.api.Test): 7
Edge (org.apache.spark.graphx.Edge): 6
LifeCycleManager (com.facebook.airlift.bootstrap.LifeCycleManager): 4