Use of io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset in project cdap by caskdata.
The class BatchSparkPipelineDriver, method handleAutoJoin.
@Override
@SuppressWarnings("unchecked")
protected SparkCollection<Object> handleAutoJoin(String stageName, JoinDefinition joinDefinition,
                                                 Map<String, SparkCollection<Object>> inputDataCollections,
                                                 @Nullable Integer numPartitions) {
  if (sqlEngineAdapter != null && canJoinOnSQLEngine(stageName, joinDefinition, inputDataCollections)) {
    // Push all input collections to the SQL engine, skipping collections that already
    // represent data that has been pushed to the SQL engine.
    for (JoinStage joinStage : joinDefinition.getStages()) {
      String joinStageName = joinStage.getStageName();
      // If the input collection is already a SQL engine collection, there's no need to push.
      if (inputDataCollections.get(joinStageName) instanceof SQLBackedCollection) {
        continue;
      }
      SparkCollection<Object> collection = inputDataCollections.get(joinStageName);
      SQLEngineJob<SQLDataset> pushJob = sqlEngineAdapter.push(joinStageName, joinStage.getSchema(), collection);
      inputDataCollections.put(joinStageName,
                               new SQLEngineCollection<>(sec, functionCacheFactory, jsc, new SQLContext(jsc),
                                                         datasetContext, sinkFactory, collection,
                                                         joinStageName, sqlEngineAdapter, pushJob));
    }
  }
  return super.handleAutoJoin(stageName, joinDefinition, inputDataCollections, numPartitions);
}
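The loop above is idempotent: a collection that is already SQL-backed is left alone, so re-entering this method never double-pushes a stage. A minimal, self-contained sketch of that wrap-or-skip idiom (the EngineBacked marker and Wrapped class are hypothetical stand-ins for SQLBackedCollection and SQLEngineCollection, not CDAP APIs):

import java.util.HashMap;
import java.util.Map;

public class WrapOrSkip {
  interface Collection {}
  interface EngineBacked extends Collection {}          // stand-in for SQLBackedCollection
  static final class Local implements Collection {}
  static final class Wrapped implements EngineBacked {  // stand-in for SQLEngineCollection
    final Collection delegate;
    Wrapped(Collection delegate) { this.delegate = delegate; }
  }

  static void pushAll(Map<String, Collection> inputs) {
    for (Map.Entry<String, Collection> e : inputs.entrySet()) {
      if (e.getValue() instanceof EngineBacked) {
        continue;                            // already on the engine; nothing to push
      }
      e.setValue(new Wrapped(e.getValue())); // in the driver, this is where push(...) is scheduled
    }
  }

  public static void main(String[] args) {
    Map<String, Collection> inputs = new HashMap<>();
    inputs.put("users", new Local());
    pushAll(inputs);
    pushAll(inputs);                         // second call is a no-op thanks to the instanceof guard
    System.out.println(inputs.get("users") instanceof EngineBacked); // true
  }
}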
Use of io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset in project cdap by caskdata.
The class BatchSQLEngineAdapter, method push.
/**
 * Creates a new job to push a SparkCollection into the SQL engine.
 *
 * @param datasetName the name of the dataset to push
 * @param schema the schema for this dataset
 * @param collection the Spark collection containing the dataset to push
 * @return a job representing this push operation
 */
@SuppressWarnings({"unchecked", "rawtypes"})
protected SQLEngineJob<SQLDataset> push(String datasetName, Schema schema, SparkCollection<?> collection) {
  // If this job already exists, return the existing instance.
  SQLEngineJobKey jobKey = new SQLEngineJobKey(datasetName, SQLEngineJobType.PUSH);
  if (jobs.containsKey(jobKey)) {
    return (SQLEngineJob<SQLDataset>) jobs.get(jobKey);
  }
  CompletableFuture<SQLDataset> future = new CompletableFuture<>();
  Runnable pushTask = () -> {
    try {
      LOG.debug("Starting push for dataset '{}'", datasetName);
      SQLDataset result = pushInternal(datasetName, schema, collection);
      LOG.debug("Completed push for dataset '{}'", datasetName);
      future.complete(result);
    } catch (Throwable t) {
      future.completeExceptionally(t);
    }
  };
  executorService.submit(pushTask);
  SQLEngineJob<SQLDataset> job = new SQLEngineJob<>(jobKey, future);
  jobs.put(jobKey, job);
  return job;
}
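push() follows a check-then-submit pattern: at most one job per (dataset, job type) key, with a CompletableFuture delivering either the result or the failure to every caller. A generic sketch of the same idea (illustrative names, not CDAP APIs); it uses computeIfAbsent so the lookup-and-create step is atomic, whereas the adapter's containsKey/put pair presumably relies on external synchronization:

import java.util.concurrent.Callable;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

final class KeyedJobCache<K, V> {
  private final ConcurrentMap<K, CompletableFuture<V>> jobs = new ConcurrentHashMap<>();
  private final ExecutorService executor = Executors.newCachedThreadPool();

  /** Submits the task once per key; later calls with the same key get the same future. */
  CompletableFuture<V> submitOnce(K key, Callable<V> task) {
    return jobs.computeIfAbsent(key, k -> {
      CompletableFuture<V> future = new CompletableFuture<>();
      executor.submit(() -> {
        try {
          future.complete(task.call());
        } catch (Throwable t) {
          future.completeExceptionally(t); // propagate failure to all waiters
        }
      });
      return future;
    });
  }
}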
Use of io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset in project cdap by caskdata.
The class BatchSQLEngineAdapter, method joinInternal.
/**
 * Join implementation. This method has blocking calls and should be executed in a separate thread.
 *
 * @param joinRequest the join request
 * @return the {@link SQLDataset} containing the result of the join operation
 * @throws SQLEngineException if any of the preceding jobs fails
 */
private SQLDataset joinInternal(SQLJoinRequest joinRequest) throws SQLEngineException {
  String datasetName = joinRequest.getDatasetName();
  DefaultStageMetrics stageMetrics = new DefaultStageMetrics(metrics, datasetName);
  StageStatisticsCollector statisticsCollector = statsCollectors.get(datasetName);
  // Count input metrics for each of the preceding stages.
  for (SQLDataset inputDataset : joinRequest.getInputDatasets()) {
    countRecordsIn(inputDataset, statisticsCollector, stageMetrics);
  }
  // Execute the join job.
  SQLDataset joinDataset = (SQLDataset) sqlEngine.join(joinRequest);
  // Count output rows and complete the future.
  countRecordsOut(joinDataset, statisticsCollector, stageMetrics);
  return joinDataset;
}
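The method brackets the blocking engine call with metric updates: inputs are counted before the join runs, outputs after it completes. Assuming the countRecords* helpers read a row count off each dataset (an assumption here; the Dataset interface below is an illustrative stand-in for SQLDataset), the shape reduces to:

import java.util.List;
import java.util.function.Function;

final class MetricBracket {
  interface Dataset { long numRows(); } // hypothetical stand-in for a dataset row count

  /** Counts rows in, runs the blocking operation, counts rows out. */
  static Dataset timedJoin(List<Dataset> inputs, Function<List<Dataset>, Dataset> blockingJoin) {
    long rowsIn = inputs.stream().mapToLong(Dataset::numRows).sum(); // "records in" metric
    Dataset out = blockingJoin.apply(inputs);                        // blocking; run on a worker thread
    long rowsOut = out.numRows();                                    // "records out" metric
    System.out.printf("join: %d rows in, %d rows out%n", rowsIn, rowsOut);
    return out;
  }
}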
Use of io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset in project cdap by caskdata.
The class BatchSQLEngineAdapter, method pullInternal.
/**
 * Pull implementation. This method has blocking calls and should be executed in a separate thread.
 *
 * @param dataset the dataset to pull
 * @return {@link JavaRDD} representing the records contained in this dataset
 * @throws SQLEngineException if the pull process fails
 */
@SuppressWarnings({"unchecked", "rawtypes"})
private <T> JavaRDD<T> pullInternal(SQLDataset dataset) throws SQLEngineException {
  // Create a pull operation for this dataset and wait until completion.
  SQLPullRequest pullRequest = new SQLPullRequest(dataset);
  // Check whether any of the engine's pull capabilities can handle this request.
  // If so, process the request using a producer.
  for (PullCapability capability : sqlEngine.getPullCapabilities()) {
    SQLDatasetProducer producer = sqlEngine.getProducer(pullRequest, capability);
    // If a producer is able to produce records for this pull request, extract the RDD from it.
    if (producer != null) {
      RecordCollection recordCollection = producer.produce(dataset);
      // If the generated collection is not an instance of SparkRecordCollection, skip it.
      if (recordCollection instanceof SparkRecordCollection) {
        Schema schema = dataset.getSchema();
        return (JavaRDD<T>) ((SparkRecordCollection) recordCollection).getDataFrame()
          .javaRDD()
          .map(r -> DataFrames.fromRow((Row) r, schema));
      }
    }
  }
  // If no capability could be used to produce records, proceed using the pull provider.
  SQLPullDataset<StructuredRecord, ?, ?> sqlPullDataset = sqlEngine.getPullProvider(pullRequest);
  // Read from the InputFormatProvider supplied by this operation.
  ClassLoader classLoader = Objects.firstNonNull(Thread.currentThread().getContextClassLoader(),
                                                 getClass().getClassLoader());
  JavaPairRDD pairRDD = RDDUtils.readUsingInputFormat(jsc, sqlPullDataset, classLoader, Object.class, Object.class);
  return pairRDD.flatMap(new TransformFromPairFunction(sqlPullDataset.fromKeyValue()));
}
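pullInternal implements a two-tier strategy: prefer a capability-specific producer, which can hand back a Spark DataFrame directly, and fall back to the generic InputFormatProvider path only when no producer applies. The same control flow in a generic, self-contained form (illustrative names and types, not CDAP APIs):

import java.util.List;
import java.util.Optional;
import java.util.function.Function;

final class CapabilityScan {
  /**
   * Tries each capability-specific producer in order and falls back to a default path
   * when none of them yields a result, mirroring the producer/provider split above.
   */
  static <R, T> T produceOrFallback(List<Function<R, Optional<T>>> producers,
                                    Function<R, T> fallback,
                                    R request) {
    for (Function<R, Optional<T>> producer : producers) {
      Optional<T> result = producer.apply(request);
      if (result.isPresent()) {
        return result.get(); // first capable producer wins
      }
    }
    return fallback.apply(request); // no capability matched; use the provider path
  }
}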
Use of io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset in project cdap by caskdata.
The class BatchSQLEngineAdapter, method join.
/**
 * Executes a Join operation in the SQL engine.
 *
 * @param datasetName the dataset name to use to store the result of the join operation
 * @param joinDefinition the join definition
 * @return Job representing this join operation
 */
@SuppressWarnings({"unchecked", "rawtypes"})
public SQLEngineJob<SQLDataset> join(String datasetName, JoinDefinition joinDefinition) {
  return runJob(datasetName, SQLEngineJobType.EXECUTE, () -> {
    Collection<SQLDataset> inputDatasets = getJoinInputDatasets(joinDefinition);
    SQLJoinRequest joinRequest = new SQLJoinRequest(datasetName, joinDefinition, inputDatasets);
    if (!sqlEngine.canJoin(joinRequest)) {
      throw new IllegalArgumentException("Unable to execute this join in the SQL engine");
    }
    return joinInternal(joinRequest);
  });
}
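Note that canJoin(...) is checked inside the supplier handed to runJob, so an unsupported join does not throw at the call site of join(...): the IllegalArgumentException completes the returned job exceptionally, and callers see it when they wait on the result. A minimal, self-contained illustration of that behavior (runJob here is a local stand-in, not the adapter's method):

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;

final class ValidateInsideJob {
  static CompletableFuture<String> runJob(java.util.concurrent.Callable<String> body) {
    CompletableFuture<String> future = new CompletableFuture<>();
    new Thread(() -> {
      try {
        future.complete(body.call());
      } catch (Throwable t) {
        future.completeExceptionally(t); // validation failures travel through the future
      }
    }).start();
    return future;
  }

  public static void main(String[] args) throws InterruptedException {
    CompletableFuture<String> job = runJob(() -> {
      throw new IllegalArgumentException("Unable to execute this join in the SQL engine");
    });
    try {
      job.get();
    } catch (ExecutionException e) {
      System.out.println("join failed asynchronously: " + e.getCause().getMessage());
    }
  }
}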