Search in sources:

Example 1 with DatasourceProvider

Use of com.thinkbiganalytics.spark.shell.DatasourceProvider in project kylo by Teradata.

From class TransformServiceTest, method executeWithDatasourceProviderFactory.

/**
 * Verify executing a transformation request with a data source provider factory.
 */
@Test
@SuppressWarnings("unchecked")
public void executeWithDatasourceProviderFactory() throws Exception {
    // Mock data set
    final DataSet dataSet = Mockito.mock(DataSet.class);
    Mockito.when(dataSet.persist(Mockito.any(StorageLevel.class))).thenReturn(dataSet);
    Mockito.when(dataSet.schema()).thenReturn(new StructType());
    // Mock Spark context service
    final SparkContextService sparkContextService = Mockito.mock(SparkContextService.class);
    // Mock Spark script engine
    final SparkScriptEngine engine = Mockito.mock(SparkScriptEngine.class);
    Mockito.when(engine.eval(Mockito.anyString(), Mockito.anyListOf(NamedParam.class))).thenReturn(dataSet);
    Mockito.when(engine.getSparkContext()).thenReturn(Mockito.mock(SparkContext.class));
    // Mock data source provider factory
    final DatasourceProvider datasourceProvider = Mockito.mock(DatasourceProvider.class);
    final DatasourceProviderFactory datasourceProviderFactory = Mockito.mock(DatasourceProviderFactory.class);
    Mockito.when(datasourceProviderFactory.getDatasourceProvider(Mockito.anyCollectionOf(Datasource.class), Mockito.anyCollectionOf(DataSource.class))).thenReturn(datasourceProvider);
    // Mock profiler
    final Profiler profiler = Mockito.mock(Profiler.class);
    // Test executing a request
    final TransformRequest request = new TransformRequest();
    request.setDoProfile(true);
    request.setDatasources(Collections.singletonList(Mockito.mock(Datasource.class)));
    request.setScript("sqlContext.range(1,10)");
    final TransformService service = new TransformService(TransformScript.class, engine, sparkContextService, new MockJobTrackerService(), Mockito.mock(DataSetConverterService.class), Mockito.mock(KyloCatalogClientBuilder.class));
    service.setDatasourceProviderFactory(datasourceProviderFactory);
    service.setProfiler(profiler);
    final TransformResponse response = service.execute(request);
    Assert.assertEquals(TransformResponse.Status.PENDING, response.getStatus());
    // Test eval arguments
    final ArgumentCaptor<String> evalScript = ArgumentCaptor.forClass(String.class);
    final ArgumentCaptor<List> evalBindings = ArgumentCaptor.forClass(List.class);
    Mockito.verify(engine).eval(evalScript.capture(), evalBindings.capture());
    final String expectedScript;
    try (InputStream inputStream = getClass().getResourceAsStream("transform-service-script1.scala")) {
        expectedScript = IOUtils.toString(inputStream, "UTF-8");
    }
    Assert.assertEquals(expectedScript, evalScript.getValue());
    final List<NamedParam> bindings = evalBindings.getValue();
    Assert.assertEquals(2, bindings.size());
    Assert.assertEquals("sparkContextService", bindings.get(0).name());
    Assert.assertEquals("com.thinkbiganalytics.spark.SparkContextService", bindings.get(0).tpe());
    Assert.assertEquals(sparkContextService, bindings.get(0).value());
    Assert.assertEquals("datasourceProvider", bindings.get(1).name());
    Assert.assertEquals("com.thinkbiganalytics.spark.shell.DatasourceProvider[org.apache.spark.sql.DataFrame]", bindings.get(1).tpe());
    Assert.assertEquals(datasourceProvider, bindings.get(1).value());
}
Also used: Datasource (com.thinkbiganalytics.spark.rest.model.Datasource), DatasourceProvider (com.thinkbiganalytics.spark.shell.DatasourceProvider), StructType (org.apache.spark.sql.types.StructType), DataSet (com.thinkbiganalytics.spark.DataSet), InputStream (java.io.InputStream), DatasourceProviderFactory (com.thinkbiganalytics.spark.shell.DatasourceProviderFactory), NamedParam (scala.tools.nsc.interpreter.NamedParam), TransformRequest (com.thinkbiganalytics.spark.rest.model.TransformRequest), DataSource (com.thinkbiganalytics.kylo.catalog.rest.model.DataSource), SparkContext (org.apache.spark.SparkContext), Profiler (com.thinkbiganalytics.spark.dataprofiler.Profiler), KyloCatalogClientBuilder (com.thinkbiganalytics.kylo.catalog.api.KyloCatalogClientBuilder), SparkContextService (com.thinkbiganalytics.spark.SparkContextService), TransformResponse (com.thinkbiganalytics.spark.rest.model.TransformResponse), List (java.util.List), SparkScriptEngine (com.thinkbiganalytics.spark.repl.SparkScriptEngine), StorageLevel (org.apache.spark.storage.StorageLevel), Test (org.junit.Test)
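
For context, the two assertions at the end of this test pin down exactly what TransformService is expected to bind before calling engine.eval. A minimal sketch of those bindings, built by hand with the same NamedParamClass(name, type, value) constructor used in Example 2 below (the variable names refer to the mocks created in the test):

// Sketch only: the bindings the assertions above verify.
// NamedParamClass comes from scala.tools.nsc.interpreter, as in Example 2.
final List<NamedParam> bindings = new ArrayList<>();
bindings.add(new NamedParamClass("sparkContextService", SparkContextService.class.getName(), sparkContextService));
bindings.add(new NamedParamClass("datasourceProvider", DatasourceProvider.class.getName() + "[org.apache.spark.sql.DataFrame]", datasourceProvider));
// The generated transform script is then evaluated against these bindings:
// engine.eval(script, bindings);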

Example 2 with DatasourceProvider

Use of com.thinkbiganalytics.spark.shell.DatasourceProvider in project kylo by Teradata.

From class TransformService, method createShellTask.

/**
 * Creates a new Spark shell transformation.
 */
@Nonnull
private DataSet createShellTask(@Nonnull final TransformRequest request) throws ScriptException {
    log.entry(request);
    // Build bindings list
    final List<NamedParam> bindings = new ArrayList<>();
    bindings.add(new NamedParamClass("sparkContextService", SparkContextService.class.getName(), sparkContextService));
    if ((request.getDatasources() != null && !request.getDatasources().isEmpty()) || (request.getCatalogDataSources() != null && !request.getCatalogDataSources().isEmpty())) {
        if (datasourceProviderFactory != null) {
            List<Datasource> legacyDataSources = request.getDatasources() != null ? request.getDatasources() : new ArrayList<Datasource>();
            List<DataSource> catalogDataSources = request.getCatalogDataSources() != null ? request.getCatalogDataSources() : new ArrayList<DataSource>();
            final DatasourceProvider datasourceProvider = datasourceProviderFactory.getDatasourceProvider(legacyDataSources, catalogDataSources);
            bindings.add(new NamedParamClass("datasourceProvider", DatasourceProvider.class.getName() + "[org.apache.spark.sql.DataFrame]", datasourceProvider));
        } else {
            throw log.throwing(new ScriptException("Script cannot be executed because no data source provider factory is available."));
        }
    }
    if (request.getCatalogDatasets() != null && !request.getCatalogDatasets().isEmpty()) {
        if (catalogDataSetProviderFactory != null) {
            log.info("Creating new shell task with {} data sets", request.getCatalogDatasets().size());
            final CatalogDataSetProvider catalogDataSetProvider = catalogDataSetProviderFactory.getDataSetProvider(request.getCatalogDatasets());
            bindings.add(new NamedParamClass("catalogDataSetProvider", CatalogDataSetProvider.class.getName() + "[org.apache.spark.sql.DataFrame]", catalogDataSetProvider));
        } else {
            throw log.throwing(new ScriptException("Script cannot be executed because no catalog data set provider factory is available."));
        }
    }
    // Ensure SessionState is valid
    if (SessionState.get() == null && sessionState != null) {
        SessionState.setCurrentSessionState(sessionState);
    }
    // Execute script
    final Object result;
    try {
        result = this.engine.eval(toTransformScript(request), bindings);
    } catch (final Exception cause) {
        throw log.throwing(new ScriptException(cause));
    }
    if (result instanceof DataSet) {
        return log.exit((DataSet) result);
    } else {
        throw log.throwing(new IllegalStateException("Unexpected script result type: " + (result != null ? result.getClass() : null)));
    }
}
Also used: Datasource (com.thinkbiganalytics.spark.rest.model.Datasource), JdbcDatasource (com.thinkbiganalytics.spark.rest.model.JdbcDatasource), DatasourceProvider (com.thinkbiganalytics.spark.shell.DatasourceProvider), NamedParamClass (scala.tools.nsc.interpreter.NamedParamClass), DataSet (com.thinkbiganalytics.spark.DataSet), ArrayList (java.util.ArrayList), NamedParam (scala.tools.nsc.interpreter.NamedParam), CatalogDataSetProvider (com.thinkbiganalytics.spark.shell.CatalogDataSetProvider), TimeoutException (java.util.concurrent.TimeoutException), ScriptException (javax.script.ScriptException), ExecutionException (java.util.concurrent.ExecutionException), DataSource (com.thinkbiganalytics.kylo.catalog.rest.model.DataSource), Nonnull (javax.annotation.Nonnull)
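
Note the guard clauses above: a request that carries data sources while no matching provider factory has been injected fails fast with a ScriptException. A hedged negative-path test sketch, reusing the constructor arguments from Example 1 and assuming execute(...) propagates the exception thrown here:

// Hypothetical test; assumes execute(...) surfaces the ScriptException
// thrown by createShellTask when no DatasourceProviderFactory is set.
final TransformRequest request = new TransformRequest();
request.setDatasources(Collections.singletonList(Mockito.mock(Datasource.class)));
request.setScript("sqlContext.range(1,10)");
final TransformService service = new TransformService(TransformScript.class, engine, sparkContextService, new MockJobTrackerService(), Mockito.mock(DataSetConverterService.class), Mockito.mock(KyloCatalogClientBuilder.class));
// Deliberately no call to service.setDatasourceProviderFactory(...).
try {
    service.execute(request);
    Assert.fail("Expected a ScriptException when no factory is configured");
} catch (final ScriptException e) {
    // expected: "... no data source provider factory is available."
}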

Example 3 with DatasourceProvider

Use of com.thinkbiganalytics.spark.shell.DatasourceProvider in project kylo by Teradata.

From class App, method main.

/**
 * Evaluates a Scala file.
 *
 * @param args the command-line arguments
 * @throws Exception if an error occurs
 */
public static void main(@Nonnull String[] args) throws Exception {
    // Verify arguments
    if (args.length != 1) {
        System.err.println("error: usage: SparkShellApp file");
        System.exit(1);
    }
    // Load environment
    final AnnotationConfigApplicationContext ctx = new AnnotationConfigApplicationContext();
    ctx.register(SecurityCoreConfig.class);
    ctx.scan("com.thinkbiganalytics.spark", "com.thinkbiganalytics.kylo.catalog");
    ctx.refresh();
    File scriptFile = new File(args[0]);
    if (scriptFile.exists() && scriptFile.isFile()) {
        log.info("Loading script file at {} ", args[0]);
    } else {
        log.info("Couldn't find script file at {} will check classpath.", args[0]);
        String fileName = scriptFile.getName();
        scriptFile = new File("./" + fileName);
    }
    final String script = Files.toString(scriptFile, Charsets.UTF_8);
    // Prepare bindings
    final List<NamedParam> bindings = new ArrayList<>();
    final DatasourceProvider datasourceProvider = ctx.getBean(DatasourceProvider.class);
    bindings.add(new NamedParamClass("datasourceProvider", datasourceProvider.getClass().getName(), datasourceProvider));
    final CatalogDataSetProvider catalogDataSetProvider = ctx.getBean(CatalogDataSetProvider.class);
    bindings.add(new NamedParamClass("catalogDataSetProvider", catalogDataSetProvider.getClass().getName(), catalogDataSetProvider));
    // Execute script
    final SparkScriptEngine engine = ctx.getBean(SparkScriptEngine.class);
    engine.eval(script, bindings);
}
Also used: DatasourceProvider (com.thinkbiganalytics.spark.shell.DatasourceProvider), AnnotationConfigApplicationContext (org.springframework.context.annotation.AnnotationConfigApplicationContext), NamedParamClass (scala.tools.nsc.interpreter.NamedParamClass), ArrayList (java.util.ArrayList), NamedParam (scala.tools.nsc.interpreter.NamedParam), SparkScriptEngine (com.thinkbiganalytics.spark.repl.SparkScriptEngine), CatalogDataSetProvider (com.thinkbiganalytics.spark.shell.CatalogDataSetProvider), File (java.io.File)
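
The same wiring can evaluate an inline script instead of a file. A minimal sketch using only the calls shown above; the one-line Scala script body is illustrative:

// Sketch: reuse App.main's Spring wiring to evaluate an inline script.
final AnnotationConfigApplicationContext ctx = new AnnotationConfigApplicationContext();
ctx.register(SecurityCoreConfig.class);
ctx.scan("com.thinkbiganalytics.spark", "com.thinkbiganalytics.kylo.catalog");
ctx.refresh();
final List<NamedParam> bindings = new ArrayList<>();
final DatasourceProvider datasourceProvider = ctx.getBean(DatasourceProvider.class);
bindings.add(new NamedParamClass("datasourceProvider", datasourceProvider.getClass().getName(), datasourceProvider));
final SparkScriptEngine engine = ctx.getBean(SparkScriptEngine.class);
// The bound name "datasourceProvider" is visible to the Scala script:
engine.eval("println(datasourceProvider)", bindings);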

Aggregations

DatasourceProvider (com.thinkbiganalytics.spark.shell.DatasourceProvider): 3
NamedParam (scala.tools.nsc.interpreter.NamedParam): 3
DataSource (com.thinkbiganalytics.kylo.catalog.rest.model.DataSource): 2
DataSet (com.thinkbiganalytics.spark.DataSet): 2
SparkScriptEngine (com.thinkbiganalytics.spark.repl.SparkScriptEngine): 2
Datasource (com.thinkbiganalytics.spark.rest.model.Datasource): 2
CatalogDataSetProvider (com.thinkbiganalytics.spark.shell.CatalogDataSetProvider): 2
ArrayList (java.util.ArrayList): 2
NamedParamClass (scala.tools.nsc.interpreter.NamedParamClass): 2
KyloCatalogClientBuilder (com.thinkbiganalytics.kylo.catalog.api.KyloCatalogClientBuilder): 1
SparkContextService (com.thinkbiganalytics.spark.SparkContextService): 1
Profiler (com.thinkbiganalytics.spark.dataprofiler.Profiler): 1
JdbcDatasource (com.thinkbiganalytics.spark.rest.model.JdbcDatasource): 1
TransformRequest (com.thinkbiganalytics.spark.rest.model.TransformRequest): 1
TransformResponse (com.thinkbiganalytics.spark.rest.model.TransformResponse): 1
DatasourceProviderFactory (com.thinkbiganalytics.spark.shell.DatasourceProviderFactory): 1
File (java.io.File): 1
InputStream (java.io.InputStream): 1
List (java.util.List): 1
ExecutionException (java.util.concurrent.ExecutionException): 1