Use of io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset in project cdap by cdapio.
The class BatchSQLEngineAdapter, method getDatasetForStage.
/**
 * Fetches the dataset for an input stage, waiting for any pending push or execute job for that stage to complete.
 *
 * @param stageName the name of the input stage
 * @return the {@link SQLDataset} produced for this stage
 */
private SQLDataset getDatasetForStage(String stageName) {
  // Wait for the previous push or execute jobs to complete
  SQLEngineJobKey pushJobKey = new SQLEngineJobKey(stageName, SQLEngineJobType.PUSH);
  SQLEngineJobKey execJobKey = new SQLEngineJobKey(stageName, SQLEngineJobType.EXECUTE);
  if (jobs.containsKey(pushJobKey)) {
    SQLEngineJob<SQLDataset> job = (SQLEngineJob<SQLDataset>) jobs.get(pushJobKey);
    waitForJobAndThrowException(job);
    return job.waitFor();
  } else if (jobs.containsKey(execJobKey)) {
    SQLEngineJob<SQLDataset> job = (SQLEngineJob<SQLDataset>) jobs.get(execJobKey);
    waitForJobAndThrowException(job);
    return job.waitFor();
  } else {
    throw new IllegalArgumentException("No SQL Engine job exists for stage " + stageName);
  }
}
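Both branches differ only in the job key they probe, so the lookup collapses to a loop over the two job types. A minimal equivalent sketch, assuming the same jobs map and helper methods as above (not the cdap source):

// Equivalent sketch: probe the PUSH job first, then the EXECUTE job.
private SQLDataset getDatasetForStage(String stageName) {
  for (SQLEngineJobType type : new SQLEngineJobType[]{SQLEngineJobType.PUSH, SQLEngineJobType.EXECUTE}) {
    SQLEngineJobKey key = new SQLEngineJobKey(stageName, type);
    if (jobs.containsKey(key)) {
      @SuppressWarnings("unchecked")
      SQLEngineJob<SQLDataset> job = (SQLEngineJob<SQLDataset>) jobs.get(key);
      waitForJobAndThrowException(job);   // surface any job failure as an exception
      return job.waitFor();               // blocks until the dataset is available
    }
  }
  throw new IllegalArgumentException("No SQL Engine job exists for stage " + stageName);
}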
Use of io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset in project cdap by cdapio.
The class BatchSQLEngineAdapter, method tryRelationalTransform.
/**
 * Called when an engine is present and willing to try performing a relational transform.
 *
 * @param stageSpec stage specification
 * @param transform transform plugin
 * @param input input collections
 * @return the resulting collection, or an empty optional if the transform can't be done with this engine
 */
public Optional<SQLEngineJob<SQLDataset>> tryRelationalTransform(StageSpec stageSpec,
                                                                 RelationalTransform transform,
                                                                 Map<String, SparkCollection<Object>> input) {
  String stageName = stageSpec.getName();
  Map<String, Relation> inputRelations = input.entrySet().stream()
    .collect(Collectors.toMap(
      Map.Entry::getKey,
      e -> sqlEngine.getRelation(new SQLRelationDefinition(e.getKey(), stageSpec.getInputSchemas().get(e.getKey())))));
  BasicRelationalTransformContext pluginContext = new BasicRelationalTransformContext(
    getSQLRelationalEngine(), inputRelations, stageSpec.getInputSchemas(), stageSpec.getOutputSchema());
  if (!transform.transform(pluginContext)) {
    // The plugin was not able to perform a relational transform with this engine
    return Optional.empty();
  }
  if (pluginContext.getOutputRelation() == null) {
    // The plugin reported a successful transformation but failed to set an output
    throw new IllegalStateException("Plugin " + transform + " did not produce a relational output");
  }
  if (!pluginContext.getOutputRelation().isValid()) {
    // The output is set to an invalid relation; some of the transforms are probably not supported by the engine
    return Optional.empty();
  }
  // Ensure input and output schemas for this stage are supported by the engine
  if (stageSpec.getInputSchemas().values().stream().anyMatch(s -> !sqlEngine.supportsInputSchema(s))) {
    return Optional.empty();
  }
  if (!sqlEngine.supportsOutputSchema(stageSpec.getOutputSchema())) {
    return Optional.empty();
  }
  // Validate the transformation definition with the engine
  SQLTransformDefinition transformDefinition = new SQLTransformDefinition(
    stageName, pluginContext.getOutputRelation(), stageSpec.getOutputSchema(),
    Collections.emptyMap(), Collections.emptyMap());
  if (!sqlEngine.canTransform(transformDefinition)) {
    return Optional.empty();
  }
  return Optional.of(runJob(stageSpec.getName(), SQLEngineJobType.EXECUTE, () -> {
    // Push all stages that need to be pushed to execute this aggregation
    input.forEach((name, collection) -> {
      if (!exists(name)) {
        push(name, stageSpec.getInputSchemas().get(name), collection);
      }
    });
    // Initialize the metrics collector
    DefaultStageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
    StageStatisticsCollector statisticsCollector = statsCollectors.get(stageName);
    // Collect input datasets and execute the transformation
    Map<String, SQLDataset> inputDatasets = input.keySet().stream()
      .collect(Collectors.toMap(Function.identity(), this::getDatasetForStage));
    // Count input records
    for (SQLDataset inputDataset : inputDatasets.values()) {
      countRecordsIn(inputDataset, statisticsCollector, stageMetrics);
    }
    // Execute the transform
    SQLTransformRequest sqlContext = new SQLTransformRequest(
      inputDatasets, stageSpec.getName(), pluginContext.getOutputRelation(), stageSpec.getOutputSchema());
    SQLDataset transformed = sqlEngine.transform(sqlContext);
    // Count output records
    countRecordsOut(transformed, statisticsCollector, stageMetrics);
    return transformed;
  }));
}
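For context, a hedged sketch of how a pipeline driver might call this method; adapter, stageSpec, transform, and inputCollections are assumed to be in scope, and both helper calls are hypothetical names for whatever the pipeline does on each path:

// Try the SQL engine first; fall back to Spark when the engine declines.
Optional<SQLEngineJob<SQLDataset>> sqlJob =
  adapter.tryRelationalTransform(stageSpec, transform, inputCollections);
if (sqlJob.isPresent()) {
  // The transform runs inside the SQL engine; downstream stages can consume
  // the resulting SQLDataset once the EXECUTE job completes.
  handleEngineResult(sqlJob.get());                          // hypothetical helper
} else {
  // Unsupported schema, relation, or transform definition: run the stage
  // through the regular Spark-based transform instead.
  runSparkTransform(stageSpec, transform, inputCollections); // hypothetical helper
}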
Use of io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset in project cdap by caskdata.
The class BatchSQLEngineAdapter, method joinInternal.
/**
 * Join implementation. This method has blocking calls and should be executed in a separate thread.
 *
 * @param joinRequest the join request
 * @return the {@link SQLDataset} resulting from the join operation
 * @throws SQLEngineException if any of the preceding jobs fails.
 */
private SQLDataset joinInternal(SQLJoinRequest joinRequest) throws SQLEngineException {
  String datasetName = joinRequest.getDatasetName();
  DefaultStageMetrics stageMetrics = new DefaultStageMetrics(metrics, datasetName);
  StageStatisticsCollector statisticsCollector = statsCollectors.get(datasetName);
  // Count input metrics for each of the preceding stages.
  for (SQLDataset inputDataset : joinRequest.getInputDatasets()) {
    countRecordsIn(inputDataset, statisticsCollector, stageMetrics);
  }
  // Execute the join job.
  SQLDataset joinDataset = (SQLDataset) sqlEngine.join(joinRequest);
  // Count output rows and complete the future.
  countRecordsOut(joinDataset, statisticsCollector, stageMetrics);
  return joinDataset;
}
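The two counting calls bracket the join so that stage metrics reflect rows consumed and produced. Purely as an illustration of that pattern (not the cdap implementation), a counter of this shape could back those calls, assuming SQLDataset reports its row count via getNumRows():

// Illustrative sketch only: record how many rows a dataset feeds into a stage.
private void countRecordsIn(SQLDataset dataset,
                            StageStatisticsCollector collector,
                            DefaultStageMetrics stageMetrics) {
  long rows = dataset.getNumRows();            // assumption: the engine reports row counts
  stageMetrics.countLong("records.in", rows);  // metric name is illustrative
  // The real adapter also updates 'collector'; omitted here for brevity.
}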
Use of io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset in project cdap by caskdata.
The class BatchSQLEngineAdapter, method pullInternal.
/**
* Pull implementation. This method has blocking calls and should be executed in a separate thread.
*
* @param dataset the dataset to pull.
* @return {@link JavaRDD} representing the records contained in this dataset.
* @throws SQLEngineException if the pull process fails.
*/
@SuppressWarnings("unchecked,raw")
private <T> JavaRDD<T> pullInternal(SQLDataset dataset) throws SQLEngineException {
  // Create a pull operation for this dataset and wait until completion
  SQLPullRequest pullRequest = new SQLPullRequest(dataset);
  // Check whether the engine can satisfy this pull request with one of its
  // capabilities. If so, process the request using a producer.
  for (PullCapability capability : sqlEngine.getPullCapabilities()) {
    SQLDatasetProducer producer = sqlEngine.getProducer(pullRequest, capability);
    // If a producer is able to produce records for this pull request, extract the RDD from this request.
    if (producer != null) {
      RecordCollection recordCollection = producer.produce(dataset);
      // Skip the collection unless it is an instance of SparkRecordCollection.
      if (recordCollection instanceof SparkRecordCollection) {
        Schema schema = dataset.getSchema();
        return (JavaRDD<T>) ((SparkRecordCollection) recordCollection).getDataFrame()
          .javaRDD()
          .map(r -> DataFrames.fromRow((Row) r, schema));
      }
    }
  }
  // If no capabilities could be used to produce records, proceed using the Pull Provider.
  SQLPullDataset<StructuredRecord, ?, ?> sqlPullDataset = sqlEngine.getPullProvider(pullRequest);
  // Run an operation to read from the InputFormatProvider supplied by this operation.
  ClassLoader classLoader = Objects.firstNonNull(Thread.currentThread().getContextClassLoader(),
                                                 getClass().getClassLoader());
  JavaPairRDD pairRDD = RDDUtils.readUsingInputFormat(jsc, sqlPullDataset, classLoader, Object.class, Object.class);
  return pairRDD.flatMap(new TransformFromPairFunction(sqlPullDataset.fromKeyValue()));
}
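Because this method blocks, the adapter presumably schedules it off the driver thread. A minimal sketch of such a wrapper, assuming an executorService field and relying on SQLEngineException being unchecked:

// Hypothetical wrapper: run the blocking pull on a separate thread and hand
// back a future for the resulting RDD.
public <T> CompletableFuture<JavaRDD<T>> pullAsync(SQLDataset dataset) {
  return CompletableFuture.supplyAsync(() -> pullInternal(dataset), executorService);
}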
Use of io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset in project cdap by caskdata.
The class BatchSQLEngineAdapter, method join.
/**
 * Executes a Join operation in the SQL engine.
 *
 * @param datasetName the dataset name to use to store the result of the join operation
 * @param joinDefinition the Join Definition
 * @return a job representing this join operation
 */
@SuppressWarnings("unchecked,raw")
public SQLEngineJob<SQLDataset> join(String datasetName, JoinDefinition joinDefinition) {
  return runJob(datasetName, SQLEngineJobType.EXECUTE, () -> {
    Collection<SQLDataset> inputDatasets = getJoinInputDatasets(joinDefinition);
    SQLJoinRequest joinRequest = new SQLJoinRequest(datasetName, joinDefinition, inputDatasets);
    if (!sqlEngine.canJoin(joinRequest)) {
      throw new IllegalArgumentException("Unable to execute this join in the SQL engine");
    }
    return joinInternal(joinRequest);
  });
}
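A minimal usage sketch, assuming an adapter instance and a previously built JoinDefinition; the dataset name is illustrative:

// Submit the join as an EXECUTE job, then block for its result.
SQLEngineJob<SQLDataset> joinJob = adapter.join("usersOrdersJoin", joinDefinition);
SQLDataset joined = joinJob.waitFor();   // blocks until the join completes or fails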