use of io.cdap.cdap.etl.api.sql.engine.dataset.SparkRecordCollectionImpl in project cdap by caskdata.
the class BatchSQLEngineAdapter method pushInternal.
/**
* Push implementation. This method has blocking calls and should be executed in a separate thread.
*
* @param datasetName name of the dataset to push.
* @param schema the record schema.
* @param collection the collection containing the records to push.
* @return {@link SQLDataset} instance representing the pushed records.
* @throws SQLEngineException if the push operation fails.
*/
@SuppressWarnings("unchecked")
public SQLDataset pushInternal(String datasetName, Schema schema,
                               SparkCollection<?> collection) throws SQLEngineException {
  // Create push request
  SQLPushRequest pushRequest = new SQLPushRequest(datasetName, schema);

  // Check whether the engine exposes a push capability that can handle this request.
  // If so, we will process this request using a consumer.
  for (PushCapability capability : sqlEngine.getPushCapabilities()) {
    SQLDatasetConsumer consumer = sqlEngine.getConsumer(pushRequest, capability);

    // If a consumer is able to consume this request, we delegate the execution to the consumer.
    if (consumer != null) {
      StructType sparkSchema = DataFrames.toDataType(schema);
      JavaRDD<Row> rowRDD = ((JavaRDD<StructuredRecord>) collection.getUnderlying())
        .map(r -> DataFrames.toRow(r, sparkSchema));
      Dataset<Row> ds = sqlContext.createDataFrame(rowRDD, sparkSchema);
      RecordCollection recordCollection = new SparkRecordCollectionImpl(ds);
      return consumer.consume(recordCollection);
    }
  }

  // If no capability could consume the records, proceed using the Push Provider.
  SQLPushDataset<StructuredRecord, ?, ?> pushDataset = sqlEngine.getPushProvider(pushRequest);

  // Write records using the Push provider.
  JavaPairRDD<?, ?> pairRdd = ((JavaRDD) collection.getUnderlying())
    .flatMapToPair(new TransformToPairFunction<>(pushDataset.toKeyValue()));
  RDDUtils.saveUsingOutputFormat(pushDataset, pairRdd);
  return pushDataset;
}
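As the Javadoc above notes, pushInternal makes blocking calls and should run on a separate thread. The sketch below is a hypothetical caller, not part of the original snippet: it submits the push to a single-thread executor and waits for the result. The adapter, datasetName, schema, and collection names are placeholders, and a SQLEngineException(Throwable) constructor is assumed; requires java.util.concurrent imports.
// Hypothetical caller: run the blocking push off the driver thread.
ExecutorService executor = Executors.newSingleThreadExecutor();
Future<SQLDataset> pushed = executor.submit(
  () -> adapter.pushInternal(datasetName, schema, collection));
try {
  // Blocks until the push completes (or rethrows any push failure).
  SQLDataset sqlDataset = pushed.get();
} catch (InterruptedException | ExecutionException e) {
  // Assumes SQLEngineException accepts a cause; adjust to the actual constructor if it differs.
  throw new SQLEngineException(e);
} finally {
  executor.shutdown();
}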
use of io.cdap.cdap.etl.api.sql.engine.dataset.SparkRecordCollectionImpl in project cdap by caskdata.
the class MockPullProducer method produce.
@Override
public RecordCollection produce(SQLDataset dataset) {
  // Deserialize the expected records from JSON and adjust data types
  TypeToken<HashSet<StructuredRecord>> typeToken = new TypeToken<HashSet<StructuredRecord>>() { };
  Type setOfStructuredRecordType = typeToken.getType();
  Set<StructuredRecord> jsonRecords = GSON.fromJson(expected, setOfStructuredRecordType);
  Set<StructuredRecord> records = new HashSet<>();
  for (StructuredRecord jsonRecord : jsonRecords) {
    records.add(transform(jsonRecord, jsonRecord.getSchema()));
  }

  // Build an RDD from the records and wrap it in a new Record Collection.
  SparkContext sc = SparkContext.getOrCreate();
  JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
  SQLContext sqlContext = new SQLContext(sc);
  StructType sparkSchema = DataFrames.toDataType(this.datasetDescription.getSchema());
  JavaRDD<Row> rdd = jsc.parallelize(new ArrayList<>(records)).map(sr -> DataFrames.toRow(sr, sparkSchema));
  Dataset<Row> ds = sqlContext.createDataFrame(rdd.rdd(), sparkSchema);
  return new SparkRecordCollectionImpl(ds);
}
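As a hypothetical counterpart on the pull side, the sketch below unwraps the Dataset<Row> from the returned collection and maps the rows back into StructuredRecords. It assumes SparkRecordCollection exposes a getDataFrame() accessor and that DataFrames.fromRow is the inverse of the DataFrames.toRow call used above; producer, dataset, and datasetDescription are placeholder names.
// Hypothetical pull-side usage: recover StructuredRecords from the produced collection.
RecordCollection collection = producer.produce(dataset);
// Assumes the collection is Spark-backed and exposes getDataFrame().
Dataset<Row> df = ((SparkRecordCollection) collection).getDataFrame();
Schema schema = datasetDescription.getSchema();
// Map each Spark Row back to a StructuredRecord using the dataset schema.
List<StructuredRecord> pulled = df.javaRDD()
  .map(row -> DataFrames.fromRow(row, schema))
  .collect();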