Example 11 with DataSet

Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.

From class TransformService, method createShellTask.

/**
 * Creates a new Spark shell transformation.
 */
@Nonnull
private DataSet createShellTask(@Nonnull final TransformRequest request) throws ScriptException {
    log.entry(request);
    // Build bindings list
    final List<NamedParam> bindings = new ArrayList<>();
    bindings.add(new NamedParamClass("sparkContextService", SparkContextService.class.getName(), sparkContextService));
    if (request.getDatasources() != null && !request.getDatasources().isEmpty()) {
        if (datasourceProviderFactory != null) {
            final DatasourceProvider datasourceProvider = datasourceProviderFactory.getDatasourceProvider(request.getDatasources());
            bindings.add(new NamedParamClass("datasourceProvider", DatasourceProvider.class.getName() + "[org.apache.spark.sql.DataFrame]", datasourceProvider));
        } else {
            throw log.throwing(new ScriptException("Script cannot be executed because no data source provider factory is available."));
        }
    }
    // Ensure SessionState is valid
    if (SessionState.get() == null && sessionState != null) {
        SessionState.setCurrentSessionState(sessionState);
    }
    // Execute script
    final Object result;
    try {
        result = this.engine.eval(toScript(request), bindings);
    } catch (final Exception cause) {
        throw log.throwing(new ScriptException(cause));
    }
    if (result instanceof DataSet) {
        return log.exit((DataSet) result);
    } else {
        throw log.throwing(new IllegalStateException("Unexpected script result type: " + (result != null ? result.getClass() : null)));
    }
}
Also used: DatasourceProvider(com.thinkbiganalytics.spark.shell.DatasourceProvider) ScriptException(javax.script.ScriptException) NamedParamClass(scala.tools.nsc.interpreter.NamedParamClass) DataSet(com.thinkbiganalytics.spark.DataSet) ArrayList(java.util.ArrayList) NamedParam(scala.tools.nsc.interpreter.NamedParam) TimeoutException(java.util.concurrent.TimeoutException) ExecutionException(java.util.concurrent.ExecutionException) Nonnull(javax.annotation.Nonnull)
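
The trailing instanceof check is a defensive pattern that recurs wherever scripts are evaluated. A minimal sketch of extracting it into a reusable helper (ScriptResults and expect are hypothetical names, not part of Kylo):

import javax.script.ScriptException;

final class ScriptResults {

    private ScriptResults() {
    }

    /**
     * Narrows an untyped script result to an expected type, or fails fast with the
     * same diagnostic style used in createShellTask above.
     */
    static <T> T expect(final Object result, final Class<T> type) throws ScriptException {
        if (type.isInstance(result)) {
            return type.cast(result);
        }
        throw new ScriptException("Unexpected script result type: " + (result != null ? result.getClass() : null));
    }
}

With such a helper, the tail of createShellTask reduces to a single call: ScriptResults.expect(this.engine.eval(toScript(request), bindings), DataSet.class).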

Example 12 with DataSet

Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.

From class TransformService, method saveShell.

/**
 * Executes and saves a Spark shell request.
 */
@Nonnull
public SaveResponse saveShell(@Nonnull final String id, @Nonnull final SaveRequest save) throws ScriptException {
    log.entry(id, save);
    final DataSet dataSet = createShellTask(getTransformRequest(id));
    final SaveResponse response = submitSaveJob(createSaveTask(save, new ShellTransformStage(dataSet)));
    return log.exit(response);
}
Also used: DataSet(com.thinkbiganalytics.spark.DataSet) SaveResponse(com.thinkbiganalytics.spark.rest.model.SaveResponse) ShellTransformStage(com.thinkbiganalytics.spark.metadata.ShellTransformStage) Nonnull(javax.annotation.Nonnull)
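
A hedged caller-side sketch of this flow (not from the Kylo sources; it assumes SaveRequest sits alongside SaveResponse in com.thinkbiganalytics.spark.rest.model with a no-arg constructor, and that SaveResponse exposes a getId() accessor):

import com.thinkbiganalytics.spark.rest.model.SaveRequest;
import com.thinkbiganalytics.spark.rest.model.SaveResponse;

public class SaveShellExample {

    /**
     * Submits a save for a previously created transform and logs the job id.
     * The import for TransformService is omitted because its package is not shown
     * in this excerpt; getId() is an assumption about the rest model.
     */
    public static void saveAndLog(final TransformService service, final String transformId) throws Exception {
        final SaveRequest save = new SaveRequest();
        // Real callers populate the request here (output format, target table, and so on).
        final SaveResponse response = service.saveShell(transformId, save);
        // saveShell submits the save job; the returned id identifies it for later polling.
        System.out.println("Submitted save job: " + response.getId());
    }
}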

Example 13 with DataSet

Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.

From class StandardDataValidator, method saveValidToTable.

@Override
public void saveValidToTable(@Nonnull final String databaseName, @Nonnull final String sourceTableName, @Nonnull final String targetTableName, @Nonnull final DataValidatorResult result, @Nonnull final HiveContext hiveContext) {
    // Keep only the valid rows from the cleansed-row RDD
    // noinspection serial
    JavaRDD<CleansedRowResult> validResultRDD = result.getCleansedRowResultRDD().filter(new Function<CleansedRowResult, Boolean>() {

        @Override
        public Boolean call(CleansedRowResult cleansedRowResult) throws Exception {
            return cleansedRowResult.isRowValid();
        }
    });
    // Write out the valid records (dropping the two columns)
    final StructType feedTableSchema = scs.toDataSet(hiveContext, HiveUtils.quoteIdentifier(databaseName, sourceTableName)).schema();
    StructType validTableSchema = scs.toDataSet(hiveContext, HiveUtils.quoteIdentifier(databaseName, targetTableName)).schema();
    DataSet validDataFrame = getRows(validResultRDD, ModifiedSchema.getValidTableSchema(feedTableSchema.fields(), validTableSchema.fields(), result.getPolicies()), hiveContext);
    validDataFrame = validDataFrame.drop(REJECT_REASON_COL).toDF();
    // Remove the columns from _valid that don't exist in the validTableName
    writeToTargetTable(validDataFrame, databaseName, targetTableName, hiveContext);
    log.info("wrote values to the valid Table  {}", targetTableName);
}
Also used: StructType(org.apache.spark.sql.types.StructType) DataSet(com.thinkbiganalytics.spark.DataSet)
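
For readers without the Kylo DataSet wrapper, the same filter-then-drop flow can be written against the plain Spark 1.x Java API. A sketch under that assumption (is_valid and reject_reason are hypothetical column names standing in for the validity flag and REJECT_REASON_COL):

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.StructType;

public class ValidRowsExample {

    /**
     * Filters a row RDD on a boolean validity column, then drops the bookkeeping
     * columns before the result is written out. Column names are illustrative.
     */
    public static DataFrame keepValidRows(final HiveContext hiveContext, final JavaRDD<Row> rows, final StructType schema) {
        final int validIdx = schema.fieldIndex("is_valid");
        final JavaRDD<Row> valid = rows.filter(new Function<Row, Boolean>() {

            @Override
            public Boolean call(final Row row) {
                return row.getBoolean(validIdx);
            }
        });
        // createDataFrame and drop are the Spark 1.x APIs the Kylo DataSet wraps
        return hiveContext.createDataFrame(valid, schema).drop("is_valid").drop("reject_reason");
    }
}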

Example 14 with DataSet

Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.

From class StandardDataValidator, method saveProfileToTable.

@Override
public void saveProfileToTable(@Nonnull final String databaseName, @Nonnull final String tableName, @Nonnull final String partition, @Nonnull final DataValidatorResult result, @Nonnull final HiveContext hiveContext) {
    try {
        // Create a temporary table that can be used to copy data from. Writing directly to the partition from a Spark DataFrame doesn't work.
        final String tempTable = tableName + "_" + System.currentTimeMillis();
        // TODO: refactor into something shared with the profile table logic
        @SuppressWarnings("squid:S2095") final JavaRDD<OutputRow> statsRDD = JavaSparkContext.fromSparkContext(hiveContext.sparkContext()).parallelize(getProfileStats(result));
        final DataSet df = scs.toDataSet(hiveContext, statsRDD, OutputRow.class);
        df.registerTempTable(tempTable);
        final String insertSQL = "INSERT OVERWRITE TABLE " + HiveUtils.quoteIdentifier(databaseName, tableName) + " PARTITION (processing_dttm='" + partition + "')" + " SELECT columnname, metrictype, metricvalue FROM " + HiveUtils.quoteIdentifier(tempTable);
        log.info("Writing profile stats {}", insertSQL);
        scs.sql(hiveContext, insertSQL);
    } catch (final Exception e) {
        log.error("Failed to insert validation stats", e);
        throw Throwables.propagate(e);
    }
}
Also used: DataSet(com.thinkbiganalytics.spark.DataSet) OutputRow(com.thinkbiganalytics.spark.dataprofiler.output.OutputRow)
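
The same temp-table workaround, sketched against the plain Spark 1.x API (example_db, profilestats, and the column list are illustrative and only mirror the snippet above, not any fixed Kylo schema):

import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.hive.HiveContext;

public class PartitionInsertExample {

    /**
     * Registers the DataFrame as a temp table, then copies it into the target
     * partition with HiveQL. Real code should quote identifiers and sanitize the
     * partition value, as the Kylo code does via HiveUtils.quoteIdentifier.
     */
    public static void overwritePartition(final HiveContext hiveContext, final DataFrame stats, final String partition) {
        final String tempTable = "profile_stats_" + System.currentTimeMillis();
        stats.registerTempTable(tempTable);
        hiveContext.sql("INSERT OVERWRITE TABLE `example_db`.`profilestats`"
                        + " PARTITION (processing_dttm='" + partition + "')"
                        + " SELECT columnname, metrictype, metricvalue FROM `" + tempTable + "`");
    }
}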

Aggregations

DataSet (com.thinkbiganalytics.spark.DataSet): 14
StructType (org.apache.spark.sql.types.StructType): 7
Nonnull (javax.annotation.Nonnull): 4
SparkContext (org.apache.spark.SparkContext): 4
TransformResponse (com.thinkbiganalytics.spark.rest.model.TransformResponse): 3
Row (org.apache.spark.sql.Row): 3
NamedParam (scala.tools.nsc.interpreter.NamedParam): 3
SparkContextService (com.thinkbiganalytics.spark.SparkContextService): 2
StatisticsModel (com.thinkbiganalytics.spark.dataprofiler.StatisticsModel): 2
ShellTransformStage (com.thinkbiganalytics.spark.metadata.ShellTransformStage): 2
SparkScriptEngine (com.thinkbiganalytics.spark.repl.SparkScriptEngine): 2
TransformRequest (com.thinkbiganalytics.spark.rest.model.TransformRequest): 2
DatasourceProvider (com.thinkbiganalytics.spark.shell.DatasourceProvider): 2
InputStream (java.io.InputStream): 2
ArrayList (java.util.ArrayList): 2
List (java.util.List): 2
HiveContext (org.apache.spark.sql.hive.HiveContext): 2
StructField (org.apache.spark.sql.types.StructField): 2
StorageLevel (org.apache.spark.storage.StorageLevel): 2
Test (org.junit.Test): 2