Search in sources :

Example 1 with RecordCollection

use of io.cdap.cdap.etl.api.engine.sql.dataset.RecordCollection in project cdap by caskdata.

From the class BatchSQLEngineAdapter, method pushInternal.

/**
 * Push implementation. This method has blocking calls and should be executed in a separate thread.
 *
 * @param datasetName name of the dataset to push.
 * @param schema      the record schema.
 * @param collection  the collection containing the records to push.
 * @return {@link SQLDataset} instance representing the pushed records.
 * @throws SQLEngineException if the push operation fails.
 */
@SuppressWarnings("unchecked")
public SQLDataset pushInternal(String datasetName, Schema schema, SparkCollection<?> collection) throws SQLEngineException {
    // Create push request
    SQLPushRequest pushRequest = new SQLPushRequest(datasetName, schema);
    // Check whether the engine exposes a push capability whose consumer can handle this request.
    // If so, we will process this request using a consumer.
    for (PushCapability capability : sqlEngine.getPushCapabilities()) {
        SQLDatasetConsumer consumer = sqlEngine.getConsumer(pushRequest, capability);
        // If a consumer is able to consume this request, we delegate the execution to the consumer.
        if (consumer != null) {
            // Map the CDAP schema to a Spark schema and convert each StructuredRecord into a Row
            // so the records can be handed to the consumer as a Spark DataFrame.
            StructType sparkSchema = DataFrames.toDataType(schema);
            JavaRDD<Row> rowRDD = ((JavaRDD<StructuredRecord>) collection.getUnderlying()).map(r -> DataFrames.toRow(r, sparkSchema));
            Dataset<Row> ds = sqlContext.createDataFrame(rowRDD, sparkSchema);
            RecordCollection recordCollection = new SparkRecordCollectionImpl(ds);
            return consumer.consume(recordCollection);
        }
    }
    // If no capabilities could be used to produce records, proceed using the Push Provider.
    SQLPushDataset<StructuredRecord, ?, ?> pushDataset = sqlEngine.getPushProvider(pushRequest);
    // Write records using the Push provider. The cast is typed (not raw) for consistency with the
    // capability branch above; both read the same underlying RDD of StructuredRecord.
    JavaPairRDD<?, ?> pairRdd = ((JavaRDD<StructuredRecord>) collection.getUnderlying()).flatMapToPair(new TransformToPairFunction<>(pushDataset.toKeyValue()));
    RDDUtils.saveUsingOutputFormat(pushDataset, pairRdd);
    return pushDataset;
}
Also used : StructType(org.apache.spark.sql.types.StructType) SQLPushRequest(io.cdap.cdap.etl.api.engine.sql.request.SQLPushRequest) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) JavaRDD(org.apache.spark.api.java.JavaRDD) PushCapability(io.cdap.cdap.etl.api.engine.sql.capability.PushCapability) SQLDatasetConsumer(io.cdap.cdap.etl.api.engine.sql.dataset.SQLDatasetConsumer) RecordCollection(io.cdap.cdap.etl.api.engine.sql.dataset.RecordCollection) SparkRecordCollection(io.cdap.cdap.etl.api.sql.engine.dataset.SparkRecordCollection) SparkRecordCollectionImpl(io.cdap.cdap.etl.api.sql.engine.dataset.SparkRecordCollectionImpl) Row(org.apache.spark.sql.Row)

Example 2 with RecordCollection

use of io.cdap.cdap.etl.api.engine.sql.dataset.RecordCollection in project cdap by caskdata.

From the class BatchSQLEngineAdapter, method pullInternal.

/**
 * Pull implementation. This method has blocking calls and should be executed in a separate thread.
 *
 * @param dataset the dataset to pull.
 * @return {@link JavaRDD} representing the records contained in this dataset.
 * @throws SQLEngineException if the pull process fails.
 */
// Fix: "unchecked,raw" is a single token that matches no javac warning category, so the original
// annotation suppressed nothing. The valid category names are "unchecked" and "rawtypes".
@SuppressWarnings({"unchecked", "rawtypes"})
private <T> JavaRDD<T> pullInternal(SQLDataset dataset) throws SQLEngineException {
    // Create pull operation for this dataset and wait until completion
    SQLPullRequest pullRequest = new SQLPullRequest(dataset);
    // Check whether the engine exposes a pull capability whose producer can handle this request.
    // If so, we will process this request using a producer.
    for (PullCapability capability : sqlEngine.getPullCapabilities()) {
        SQLDatasetProducer producer = sqlEngine.getProducer(pullRequest, capability);
        // If a producer is able to produce records for this pull request, extract the RDD from this request.
        if (producer != null) {
            RecordCollection recordCollection = producer.produce(dataset);
            // If the collection that got generated is not an instance of a SparkRecordCollection,
            // skip it and keep checking the remaining capabilities.
            if (recordCollection instanceof SparkRecordCollection) {
                Schema schema = dataset.getSchema();
                return (JavaRDD<T>) ((SparkRecordCollection) recordCollection).getDataFrame().javaRDD().map(r -> DataFrames.fromRow((Row) r, schema));
            }
        }
    }
    // If no capabilities could be used to produce records, proceed using the Pull Provider.
    SQLPullDataset<StructuredRecord, ?, ?> sqlPullDataset = sqlEngine.getPullProvider(pullRequest);
    // Run operation to read from the InputFormatProvider supplied by this operation.
    // Fall back to this class's loader when the thread context class loader is null.
    ClassLoader classLoader = Objects.firstNonNull(Thread.currentThread().getContextClassLoader(), getClass().getClassLoader());
    JavaPairRDD pairRDD = RDDUtils.readUsingInputFormat(jsc, sqlPullDataset, classLoader, Object.class, Object.class);
    return pairRDD.flatMap(new TransformFromPairFunction(sqlPullDataset.fromKeyValue()));
}
Also used : RelationalTransform(io.cdap.cdap.etl.api.relational.RelationalTransform) SQLTransformRequest(io.cdap.cdap.etl.api.engine.sql.request.SQLTransformRequest) DataFrames(io.cdap.cdap.api.spark.sql.DataFrames) Relation(io.cdap.cdap.etl.api.relational.Relation) SQLPullRequest(io.cdap.cdap.etl.api.engine.sql.request.SQLPullRequest) LoggerFactory(org.slf4j.LoggerFactory) Constants(io.cdap.cdap.etl.common.Constants) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) SQLEngineJob(io.cdap.cdap.etl.engine.SQLEngineJob) SQLRelationDefinition(io.cdap.cdap.etl.api.engine.sql.request.SQLRelationDefinition) PullCapability(io.cdap.cdap.etl.api.engine.sql.capability.PullCapability) JavaSparkExecutionContext(io.cdap.cdap.api.spark.JavaSparkExecutionContext) StageMetrics(io.cdap.cdap.etl.api.StageMetrics) Map(java.util.Map) Objects(com.google.common.base.Objects) SQLPullDataset(io.cdap.cdap.etl.api.engine.sql.dataset.SQLPullDataset) StructType(org.apache.spark.sql.types.StructType) SQLJoinDefinition(io.cdap.cdap.etl.api.engine.sql.request.SQLJoinDefinition) Threads(org.apache.twill.common.Threads) CancellationException(java.util.concurrent.CancellationException) Engine(io.cdap.cdap.etl.api.relational.Engine) Collection(java.util.Collection) SQLTransformDefinition(io.cdap.cdap.etl.api.engine.sql.request.SQLTransformDefinition) Set(java.util.Set) CompletionException(java.util.concurrent.CompletionException) Metrics(io.cdap.cdap.api.metrics.Metrics) Collectors(java.util.stream.Collectors) Executors(java.util.concurrent.Executors) RecordCollection(io.cdap.cdap.etl.api.engine.sql.dataset.RecordCollection) SQLPushDataset(io.cdap.cdap.etl.api.engine.sql.dataset.SQLPushDataset) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) SparkCollection(io.cdap.cdap.etl.spark.SparkCollection) List(java.util.List) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) SQLDataset(io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset) 
SQLWriteRequest(io.cdap.cdap.etl.api.engine.sql.request.SQLWriteRequest) PushCapability(io.cdap.cdap.etl.api.engine.sql.capability.PushCapability) Optional(java.util.Optional) TransformToPairFunction(io.cdap.cdap.etl.spark.function.TransformToPairFunction) SparkRecordCollection(io.cdap.cdap.etl.api.sql.engine.dataset.SparkRecordCollection) TransformFromPairFunction(io.cdap.cdap.etl.spark.function.TransformFromPairFunction) Dataset(org.apache.spark.sql.Dataset) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SQLPushRequest(io.cdap.cdap.etl.api.engine.sql.request.SQLPushRequest) SQLDatasetConsumer(io.cdap.cdap.etl.api.engine.sql.dataset.SQLDatasetConsumer) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) Function(java.util.function.Function) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) SQLEngineJobKey(io.cdap.cdap.etl.engine.SQLEngineJobKey) SQLEngineWriteJobKey(io.cdap.cdap.etl.engine.SQLEngineWriteJobKey) HashSet(java.util.HashSet) SQLEngineJobType(io.cdap.cdap.etl.engine.SQLEngineJobType) SQLDatasetProducer(io.cdap.cdap.etl.api.engine.sql.dataset.SQLDatasetProducer) JavaRDD(org.apache.spark.api.java.JavaRDD) ExecutorService(java.util.concurrent.ExecutorService) Nullable(javax.annotation.Nullable) SQLWriteResult(io.cdap.cdap.etl.api.engine.sql.request.SQLWriteResult) Logger(org.slf4j.Logger) JoinStage(io.cdap.cdap.etl.api.join.JoinStage) SQLEngineException(io.cdap.cdap.etl.api.engine.sql.SQLEngineException) SQLJoinRequest(io.cdap.cdap.etl.api.engine.sql.request.SQLJoinRequest) SQLContext(org.apache.spark.sql.SQLContext) Row(org.apache.spark.sql.Row) Schema(io.cdap.cdap.api.data.schema.Schema) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) Closeable(java.io.Closeable) SQLEngineOutput(io.cdap.cdap.etl.api.engine.sql.SQLEngineOutput) SQLEngine(io.cdap.cdap.etl.api.engine.sql.SQLEngine) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) 
Collections(java.util.Collections) SparkRecordCollectionImpl(io.cdap.cdap.etl.api.sql.engine.dataset.SparkRecordCollectionImpl) SQLDatasetProducer(io.cdap.cdap.etl.api.engine.sql.dataset.SQLDatasetProducer) PullCapability(io.cdap.cdap.etl.api.engine.sql.capability.PullCapability) Schema(io.cdap.cdap.api.data.schema.Schema) TransformFromPairFunction(io.cdap.cdap.etl.spark.function.TransformFromPairFunction) SparkRecordCollection(io.cdap.cdap.etl.api.sql.engine.dataset.SparkRecordCollection) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) JavaRDD(org.apache.spark.api.java.JavaRDD) SQLPullRequest(io.cdap.cdap.etl.api.engine.sql.request.SQLPullRequest) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RecordCollection(io.cdap.cdap.etl.api.engine.sql.dataset.RecordCollection) SparkRecordCollection(io.cdap.cdap.etl.api.sql.engine.dataset.SparkRecordCollection)

Aggregations

StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)2 PushCapability (io.cdap.cdap.etl.api.engine.sql.capability.PushCapability)2 RecordCollection (io.cdap.cdap.etl.api.engine.sql.dataset.RecordCollection)2 SQLDatasetConsumer (io.cdap.cdap.etl.api.engine.sql.dataset.SQLDatasetConsumer)2 SQLPushRequest (io.cdap.cdap.etl.api.engine.sql.request.SQLPushRequest)2 SparkRecordCollection (io.cdap.cdap.etl.api.sql.engine.dataset.SparkRecordCollection)2 SparkRecordCollectionImpl (io.cdap.cdap.etl.api.sql.engine.dataset.SparkRecordCollectionImpl)2 Objects (com.google.common.base.Objects)1 Schema (io.cdap.cdap.api.data.schema.Schema)1 Metrics (io.cdap.cdap.api.metrics.Metrics)1 JavaSparkExecutionContext (io.cdap.cdap.api.spark.JavaSparkExecutionContext)1 DataFrames (io.cdap.cdap.api.spark.sql.DataFrames)1 StageMetrics (io.cdap.cdap.etl.api.StageMetrics)1 SQLEngine (io.cdap.cdap.etl.api.engine.sql.SQLEngine)1 SQLEngineException (io.cdap.cdap.etl.api.engine.sql.SQLEngineException)1 SQLEngineOutput (io.cdap.cdap.etl.api.engine.sql.SQLEngineOutput)1 PullCapability (io.cdap.cdap.etl.api.engine.sql.capability.PullCapability)1 SQLDataset (io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset)1 SQLDatasetProducer (io.cdap.cdap.etl.api.engine.sql.dataset.SQLDatasetProducer)1 SQLPullDataset (io.cdap.cdap.etl.api.engine.sql.dataset.SQLPullDataset)1