Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.
The class TransformService, method createShellTask.
/**
 * Creates a new Spark shell transformation.
 */
@Nonnull
private DataSet createShellTask(@Nonnull final TransformRequest request) throws ScriptException {
    log.entry(request);

    // Build bindings list
    final List<NamedParam> bindings = new ArrayList<>();
    bindings.add(new NamedParamClass("sparkContextService", SparkContextService.class.getName(), sparkContextService));

    if (request.getDatasources() != null && !request.getDatasources().isEmpty()) {
        if (datasourceProviderFactory != null) {
            final DatasourceProvider datasourceProvider = datasourceProviderFactory.getDatasourceProvider(request.getDatasources());
            bindings.add(new NamedParamClass("datasourceProvider", DatasourceProvider.class.getName() + "[org.apache.spark.sql.DataFrame]", datasourceProvider));
        } else {
            throw log.throwing(new ScriptException("Script cannot be executed because no data source provider factory is available."));
        }
    }

    // Ensure SessionState is valid
    if (SessionState.get() == null && sessionState != null) {
        SessionState.setCurrentSessionState(sessionState);
    }

    // Execute script
    final Object result;
    try {
        result = this.engine.eval(toScript(request), bindings);
    } catch (final Exception cause) {
        throw log.throwing(new ScriptException(cause));
    }

    if (result instanceof DataSet) {
        return log.exit((DataSet) result);
    } else {
        throw log.throwing(new IllegalStateException("Unexpected script result type: " + (result != null ? result.getClass() : null)));
    }
}
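For context, the script produced by toScript(request) is evaluated with the bindings built above in scope and must evaluate to a DataSet. A minimal caller sketch follows; the setScript mutator name and the script body are illustrative assumptions, not taken from the snippet above.

    // Hedged sketch: setter name and script body are assumptions for illustration only.
    final TransformRequest request = new TransformRequest();
    // The bound "sparkContextService" variable is visible to the script by name, and the
    // script's result must be a com.thinkbiganalytics.spark.DataSet.
    request.setScript("sparkContextService.toDataSet(sqlContext, \"example_table\")");
    final DataSet dataSet = createShellTask(request);  // throws ScriptException on failure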
Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.
The class TransformService, method saveShell.
/**
 * Executes and saves a Spark shell request.
 */
@Nonnull
public SaveResponse saveShell(@Nonnull final String id, @Nonnull final SaveRequest save) throws ScriptException {
    log.entry(id, save);
    final DataSet dataSet = createShellTask(getTransformRequest(id));
    final SaveResponse response = submitSaveJob(createSaveTask(save, new ShellTransformStage(dataSet)));
    return log.exit(response);
}
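A hypothetical invocation might look like the following; the SaveRequest and SaveResponse accessors shown are assumptions, named only to illustrate the flow.

    // Hedged usage sketch; transformService, the accessor names and the ids are illustrative.
    final SaveRequest save = new SaveRequest();
    save.setTableName("validated_output");                    // assumed mutator
    final SaveResponse response = transformService.saveShell("a1b2c3", save);
    log.info("Submitted save job {}", response.getId());      // assumed accessor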
Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.
The class StandardDataValidator, method saveValidToTable.
@Override
public void saveValidToTable(@Nonnull final String databaseName, @Nonnull final String sourceTableName, @Nonnull final String targetTableName, @Nonnull final DataValidatorResult result,
                             @Nonnull final HiveContext hiveContext) {
    // Build a new RDD containing only the valid results
    // noinspection serial
    JavaRDD<CleansedRowResult> validResultRDD = result.getCleansedRowResultRDD().filter(new Function<CleansedRowResult, Boolean>() {
        @Override
        public Boolean call(CleansedRowResult cleansedRowResult) throws Exception {
            return cleansedRowResult.isRowValid();
        }
    });

    // Write out the valid records (dropping the two columns)
    final StructType feedTableSchema = scs.toDataSet(hiveContext, HiveUtils.quoteIdentifier(databaseName, sourceTableName)).schema();
    StructType validTableSchema = scs.toDataSet(hiveContext, HiveUtils.quoteIdentifier(databaseName, targetTableName)).schema();
    DataSet validDataFrame = getRows(validResultRDD, ModifiedSchema.getValidTableSchema(feedTableSchema.fields(), validTableSchema.fields(), result.getPolicies()), hiveContext);

    // Remove the columns from _valid that don't exist in the target table
    validDataFrame = validDataFrame.drop(REJECT_REASON_COL).toDF();

    writeToTargetTable(validDataFrame, databaseName, targetTableName, hiveContext);
    log.info("Wrote values to the valid table {}", targetTableName);
}
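The anonymous Function above only keeps rows whose isRowValid() flag is true; on Java 8 the same predicate could be written as a lambda. This is a stylistic sketch, equivalent in behavior under the assumption that the lambda serializes correctly for Spark.

    // Same filter expressed as a lambda; the predicate is identical to the anonymous class above.
    JavaRDD<CleansedRowResult> validResultRDD =
            result.getCleansedRowResultRDD().filter(row -> row.isRowValid());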
Use of com.thinkbiganalytics.spark.DataSet in project kylo by Teradata.
The class StandardDataValidator, method saveProfileToTable.
@Override
public void saveProfileToTable(@Nonnull final String databaseName, @Nonnull final String tableName, @Nonnull final String partition, @Nonnull final DataValidatorResult result,
                               @Nonnull final HiveContext hiveContext) {
    try {
        // Create a temporary table that can be used to copy data from. Writing directly to the partition from a Spark dataframe doesn't work.
        final String tempTable = tableName + "_" + System.currentTimeMillis();

        // Refactor this into something common with the profile table
        @SuppressWarnings("squid:S2095") final JavaRDD<OutputRow> statsRDD = JavaSparkContext.fromSparkContext(hiveContext.sparkContext()).parallelize(getProfileStats(result));
        final DataSet df = scs.toDataSet(hiveContext, statsRDD, OutputRow.class);
        df.registerTempTable(tempTable);

        final String insertSQL = "INSERT OVERWRITE TABLE " + HiveUtils.quoteIdentifier(databaseName, tableName)
                                 + " PARTITION (processing_dttm='" + partition + "')"
                                 + " SELECT columnname, metrictype, metricvalue FROM " + HiveUtils.quoteIdentifier(tempTable);
        log.info("Writing profile stats {}", insertSQL);
        scs.sql(hiveContext, insertSQL);
    } catch (final Exception e) {
        log.error("Failed to insert validation stats", e);
        throw Throwables.propagate(e);
    }
}
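To make the string concatenation concrete, a hypothetical call and the statement it would build are sketched below; the database, table, and partition values are illustrative, and the backtick quoting reflects an assumption about HiveUtils.quoteIdentifier.

    // Hedged worked example; "validator" is an assumed StandardDataValidator instance.
    validator.saveProfileToTable("userdata", "profile", "20240101000000", result, hiveContext);
    // Assuming backtick quoting, the generated statement would resemble:
    //   INSERT OVERWRITE TABLE `userdata`.`profile` PARTITION (processing_dttm='20240101000000')
    //   SELECT columnname, metrictype, metricvalue FROM `profile_<currentTimeMillis>`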