Use of io.cdap.cdap.etl.api.engine.sql.dataset.SQLPullDataset in project cdap by caskdata.
The method pullInternal of the class BatchSQLEngineAdapter.
/**
 * Pull implementation. This method has blocking calls and should be executed in a separate thread.
 *
 * <p>Two strategies are attempted:
 * <ol>
 *   <li>For every pull capability reported by the SQL engine (in the order the engine returns them),
 *       ask the engine for a producer. The first producer whose output is a
 *       {@link SparkRecordCollection} wins: its data frame is converted row by row using the
 *       dataset schema.</li>
 *   <li>If no capability yields a {@link SparkRecordCollection}, fall back to the engine's pull
 *       provider and read the records through {@code RDDUtils.readUsingInputFormat}.</li>
 * </ol>
 *
 * @param dataset the dataset to pull.
 * @param <T> element type expected by the caller. The returned RDD is cast to this type without
 *            any runtime check, so the caller is responsible for requesting the right type.
 * @return {@link JavaRDD} representing the records contained in this dataset.
 * @throws SQLEngineException if the pull process fails.
 */
// Warning keys must be listed separately; a single comma-joined string is not recognized by the
// compiler, and the key for raw type usage is "rawtypes". Both are needed here: the JavaRDD<T>
// cast is unchecked, and the fallback path uses raw JavaPairRDD / TransformFromPairFunction.
@SuppressWarnings({"unchecked", "rawtypes"})
private <T> JavaRDD<T> pullInternal(SQLDataset dataset) throws SQLEngineException {
  // Create pull operation for this dataset and wait until completion
  SQLPullRequest pullRequest = new SQLPullRequest(dataset);

  // Check whether the engine can serve this request through one of its pull capabilities.
  // If so, we will process this request using a producer.
  for (PullCapability capability : sqlEngine.getPullCapabilities()) {
    SQLDatasetProducer producer = sqlEngine.getProducer(pullRequest, capability);

    // If a producer is able to produce records for this pull request, extract the RDD from this request.
    if (producer != null) {
      RecordCollection recordCollection = producer.produce(dataset);

      // If the collection that got generated is not an instance of a SparkRecordCollection, skip it
      // and try the next capability (a null collection is skipped by the same check).
      if (recordCollection instanceof SparkRecordCollection) {
        Schema schema = dataset.getSchema();
        return (JavaRDD<T>) ((SparkRecordCollection) recordCollection)
          .getDataFrame()
          .javaRDD()
          .map(r -> DataFrames.fromRow((Row) r, schema));
      }
    }
  }

  // If no capabilities could be used to produce records, proceed using the Pull Provider.
  SQLPullDataset<StructuredRecord, ?, ?> sqlPullDataset = sqlEngine.getPullProvider(pullRequest);

  // Run operation to read from the InputFormatProvider supplied by this operation.
  // Prefer the thread's context class loader; fall back to this class's loader when none is set.
  ClassLoader classLoader = Objects.firstNonNull(Thread.currentThread().getContextClassLoader(),
                                                 getClass().getClassLoader());
  JavaPairRDD pairRDD = RDDUtils.readUsingInputFormat(jsc, sqlPullDataset, classLoader,
                                                      Object.class, Object.class);

  // Convert each key/value pair back into records using the dataset-supplied transform.
  return pairRDD.flatMap(new TransformFromPairFunction(sqlPullDataset.fromKeyValue()));
}
Aggregations