Use of io.cdap.cdap.etl.api.engine.sql.dataset.RecordCollection in project cdap by caskdata.
The method pushInternal of the class BatchSQLEngineAdapter.
/**
 * Pushes the records of a Spark collection into the SQL engine. This method has blocking calls and
 * should be executed in a separate thread.
 *
 * <p>Every push capability reported by the engine is tried in order. The first capability for which the
 * engine returns a non-null consumer handles the request: the records are converted into a Spark
 * {@code Dataset<Row>} and handed to that consumer. When no consumer is available, the records are
 * written through the engine's push provider instead.
 *
 * @param datasetName name of the dataset to push.
 * @param schema the record schema.
 * @param collection the collection containing the records to push.
 * @return {@link SQLDataset} instance representing the pushed records.
 * @throws SQLEngineException if the push operation fails.
 */
@SuppressWarnings("unchecked")
public SQLDataset pushInternal(String datasetName, Schema schema, SparkCollection<?> collection) throws SQLEngineException {
  SQLPushRequest request = new SQLPushRequest(datasetName, schema);

  // Look for a consumer that accepts this request under one of the engine's push capabilities.
  for (PushCapability pushCapability : sqlEngine.getPushCapabilities()) {
    SQLDatasetConsumer datasetConsumer = sqlEngine.getConsumer(request, pushCapability);
    if (datasetConsumer == null) {
      // No consumer for this capability; try the next one.
      continue;
    }

    // Convert the structured records into rows so they can be wrapped in a data frame.
    StructType structType = DataFrames.toDataType(schema);
    JavaRDD<StructuredRecord> records = (JavaRDD<StructuredRecord>) collection.getUnderlying();
    JavaRDD<Row> rows = records.map(record -> DataFrames.toRow(record, structType));
    Dataset<Row> dataFrame = sqlContext.createDataFrame(rows, structType);

    // Delegate execution of the push to the consumer.
    RecordCollection wrapped = new SparkRecordCollectionImpl(dataFrame);
    return datasetConsumer.consume(wrapped);
  }

  // No capability produced a consumer: fall back to the Push Provider.
  SQLPushDataset<StructuredRecord, ?, ?> provider = sqlEngine.getPushProvider(request);

  // Map each record to a key/value pair and write the pairs using the provider's output format.
  JavaRDD rawRecords = (JavaRDD) collection.getUnderlying();
  JavaPairRDD<?, ?> keyValues = rawRecords.flatMapToPair(new TransformToPairFunction<>(provider.toKeyValue()));
  RDDUtils.saveUsingOutputFormat(provider, keyValues);
  return provider;
}
Use of io.cdap.cdap.etl.api.engine.sql.dataset.RecordCollection in project cdap by caskdata.
The method pullInternal of the class BatchSQLEngineAdapter.
/**
 * Pull implementation. This method has blocking calls and should be executed in a separate thread.
 *
 * <p>Every pull capability reported by the engine is tried in order. When the engine returns a non-null
 * producer for a capability and that producer yields a {@link SparkRecordCollection}, the rows of its
 * data frame are converted back into records using the dataset schema. Otherwise the records are read
 * through the engine's pull provider.
 *
 * @param dataset the dataset to pull.
 * @param <T> element type of the returned RDD; the result is cast to it without a runtime check.
 * @return {@link JavaRDD} representing the records contained in this dataset.
 * @throws SQLEngineException if the pull process fails.
 */
// Warning keys must be separate array elements; a single comma-joined string is not a recognized key
// and suppresses nothing. "rawtypes" covers the raw JavaPairRDD / TransformFromPairFunction usage below.
@SuppressWarnings({"unchecked", "rawtypes"})
private <T> JavaRDD<T> pullInternal(SQLDataset dataset) throws SQLEngineException {
  // Create pull operation for this dataset and wait until completion
  SQLPullRequest pullRequest = new SQLPullRequest(dataset);

  // Check whether the engine offers a producer for this request under any of its pull capabilities.
  // If so, we will process this request using a producer.
  for (PullCapability capability : sqlEngine.getPullCapabilities()) {
    SQLDatasetProducer producer = sqlEngine.getProducer(pullRequest, capability);
    // If a producer is able to produce records for this pull request, extract the RDD from this request.
    if (producer != null) {
      RecordCollection recordCollection = producer.produce(dataset);
      // If the collection that got generated is not an instance of a SparkRecordCollection, skip it
      // and move on to the next capability.
      if (recordCollection instanceof SparkRecordCollection) {
        Schema schema = dataset.getSchema();
        return (JavaRDD<T>) ((SparkRecordCollection) recordCollection).getDataFrame().javaRDD().map(r -> DataFrames.fromRow((Row) r, schema));
      }
    }
  }

  // If no capabilities could be used to produce records, proceed using the Pull Provider.
  SQLPullDataset<StructuredRecord, ?, ?> sqlPullDataset = sqlEngine.getPullProvider(pullRequest);

  // Run operation to read from the InputFormatProvider supplied by this operation.
  // Prefer the thread's context class loader; fall back to this class' loader when none is set.
  ClassLoader classLoader = Objects.firstNonNull(Thread.currentThread().getContextClassLoader(), getClass().getClassLoader());
  JavaPairRDD pairRDD = RDDUtils.readUsingInputFormat(jsc, sqlPullDataset, classLoader, Object.class, Object.class);
  return pairRDD.flatMap(new TransformFromPairFunction(sqlPullDataset.fromKeyValue()));
}
Aggregations