
Example 81 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class RDDCollection method join.

@SuppressWarnings("unchecked")
@Override
public SparkCollection<T> join(JoinRequest joinRequest) {
    Map<String, Dataset> collections = new HashMap<>();
    String stageName = joinRequest.getStageName();
    Function<StructuredRecord, StructuredRecord> recordsInCounter = new CountingFunction<>(stageName, sec.getMetrics(), Constants.Metrics.RECORDS_IN, sec.getDataTracer(stageName));
    StructType leftSparkSchema = DataFrames.toDataType(joinRequest.getLeftSchema());
    Dataset<Row> left = toDataset(((JavaRDD<StructuredRecord>) rdd).map(recordsInCounter), leftSparkSchema);
    collections.put(joinRequest.getLeftStage(), left);
    List<Column> leftJoinColumns = joinRequest.getLeftKey().stream().map(left::col).collect(Collectors.toList());
    /*
        This flag keeps track of whether there is at least one required stage in the join.
        This is needed in case there is a join like:

        A (optional), B (required), C (optional), D (required)

        The correct thing to do here is:

        1. A right outer join B as TMP1
        2. TMP1 left outer join C as TMP2
        3. TMP2 inner join D

        Join #1 is a straightforward join between 2 sides.
        Join #2 is a left outer because TMP1 becomes 'required', since it uses required input B.
        Join #3 is an inner join even though it contains 2 optional datasets, because 'B' is still required.
     */
    Integer joinPartitions = joinRequest.getNumPartitions();
    boolean seenRequired = joinRequest.isLeftRequired();
    Dataset<Row> joined = left;
    List<List<Column>> listOfListOfLeftCols = new ArrayList<>();
    for (JoinCollection toJoin : joinRequest.getToJoin()) {
        SparkCollection<StructuredRecord> data = (SparkCollection<StructuredRecord>) toJoin.getData();
        StructType sparkSchema = DataFrames.toDataType(toJoin.getSchema());
        Dataset<Row> right = toDataset(((JavaRDD<StructuredRecord>) data.getUnderlying()).map(recordsInCounter), sparkSchema);
        collections.put(toJoin.getStage(), right);
        List<Column> rightJoinColumns = toJoin.getKey().stream().map(right::col).collect(Collectors.toList());
        // UUID for salt column name to avoid name collisions
        String saltColumn = UUID.randomUUID().toString();
        if (joinRequest.isDistributionEnabled()) {
            boolean isLeftStageSkewed = joinRequest.getLeftStage().equals(joinRequest.getDistribution().getSkewedStageName());
            // Apply salt/explode transformations to each Dataset
            if (isLeftStageSkewed) {
                left = saltDataset(left, saltColumn, joinRequest.getDistribution().getDistributionFactor());
                right = explodeDataset(right, saltColumn, joinRequest.getDistribution().getDistributionFactor());
            } else {
                left = explodeDataset(left, saltColumn, joinRequest.getDistribution().getDistributionFactor());
                right = saltDataset(right, saltColumn, joinRequest.getDistribution().getDistributionFactor());
            }
            // Add the salt column to the join key
            leftJoinColumns.add(left.col(saltColumn));
            rightJoinColumns.add(right.col(saltColumn));
            // Update other values that will be used later in the join
            joined = left;
            sparkSchema = sparkSchema.add(saltColumn, DataTypes.IntegerType, false);
            leftSparkSchema = leftSparkSchema.add(saltColumn, DataTypes.IntegerType, false);
        }
        Column joinOn;
        // Make the list effectively final so it can be used in the stream lambdas below
        List<Column> finalLeftJoinColumns = leftJoinColumns;
        if (seenRequired) {
            joinOn = IntStream.range(0, leftJoinColumns.size()).mapToObj(i -> eq(finalLeftJoinColumns.get(i), rightJoinColumns.get(i), joinRequest.isNullSafe())).reduce((a, b) -> a.and(b)).get();
        } else {
            // When no required stage has been seen yet, all joins so far are outer joins. Collect the left
            // keys at each level (each iteration), coalesce the keys collected so far, and compare the result
            // with the right keys.
            joinOn = IntStream.range(0, leftJoinColumns.size()).mapToObj(i -> {
                collectLeftJoinOnCols(listOfListOfLeftCols, i, finalLeftJoinColumns.get(i));
                return eq(getLeftJoinOnCoalescedColumn(finalLeftJoinColumns.get(i), i, listOfListOfLeftCols), rightJoinColumns.get(i), joinRequest.isNullSafe());
            }).reduce((a, b) -> a.and(b)).get();
        }
        String joinType;
        if (seenRequired && toJoin.isRequired()) {
            joinType = "inner";
        } else if (seenRequired && !toJoin.isRequired()) {
            joinType = "leftouter";
        } else if (!seenRequired && toJoin.isRequired()) {
            joinType = "rightouter";
        } else {
            joinType = "outer";
        }
        seenRequired = seenRequired || toJoin.isRequired();
        if (toJoin.isBroadcast()) {
            right = functions.broadcast(right);
        }
        // explicitly repartition on the join keys when a partition count is set, unless partitioning is
        // ignored via spark.cdap.pipeline.aggregate.dataset.partitions.ignore or the right side is broadcast
        if (!ignorePartitionsDuringDatasetAggregation && joinPartitions != null && !toJoin.isBroadcast()) {
            List<String> rightKeys = new ArrayList<>(toJoin.getKey());
            List<String> leftKeys = new ArrayList<>(joinRequest.getLeftKey());
            // when distribution is enabled, the salt column is part of the join key, so it must also be part of the partition key
            if (joinRequest.isDistributionEnabled()) {
                rightKeys.add(saltColumn);
                leftKeys.add(saltColumn);
            }
            right = partitionOnKey(right, rightKeys, joinRequest.isNullSafe(), sparkSchema, joinPartitions);
            // only repartition the left side before the first join, as intermediate joins will already be partitioned on the key
            if (joined == left) {
                joined = partitionOnKey(joined, leftKeys, joinRequest.isNullSafe(), leftSparkSchema, joinPartitions);
            }
        }
        joined = joined.join(right, joinOn, joinType);
        /*
           Additionally, if no required dataset has been seen so far, all of the joins are outer joins.
           In that case we also need to pass the join columns forward, because the comparison uses a
           coalesce of all previous left columns against the right dataset's columns.
       */
        if (toJoin.isRequired() || !seenRequired) {
            leftJoinColumns = rightJoinColumns;
        }
    }
    // select and alias fields in the expected order
    List<Column> outputColumns = new ArrayList<>(joinRequest.getFields().size());
    for (JoinField field : joinRequest.getFields()) {
        Column column = collections.get(field.getStageName()).col(field.getFieldName());
        if (field.getAlias() != null) {
            column = column.alias(field.getAlias());
        }
        outputColumns.add(column);
    }
    Seq<Column> outputColumnSeq = JavaConversions.asScalaBuffer(outputColumns).toSeq();
    joined = joined.select(outputColumnSeq);
    Schema outputSchema = joinRequest.getOutputSchema();
    JavaRDD<StructuredRecord> output = joined.javaRDD().map(r -> DataFrames.fromRow(r, outputSchema)).map(new CountingFunction<>(stageName, sec.getMetrics(), Constants.Metrics.RECORDS_OUT, sec.getDataTracer(stageName)));
    return (SparkCollection<T>) wrap(output);
}
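
The join type chosen at each iteration follows the seenRequired rule spelled out in the comment block near the top of the method. The following is a minimal standalone sketch (a hypothetical helper, not part of the CDAP codebase) that replays the A (optional), B (required), C (optional), D (required) example and prints the join type selected at each step:

// Hypothetical helper that mirrors the joinType selection in the join loop above.
public class JoinTypeRule {

    // Same decision table as the if/else chain in RDDCollection#join.
    static String joinType(boolean seenRequired, boolean rightRequired) {
        if (seenRequired && rightRequired) {
            return "inner";
        } else if (seenRequired) {
            return "leftouter";
        } else if (rightRequired) {
            return "rightouter";
        } else {
            return "outer";
        }
    }

    public static void main(String[] args) {
        // A (optional) is the left side; B, C, and D are joined in order.
        boolean seenRequired = false;             // A is optional
        boolean[] required = {true, false, true}; // B (required), C (optional), D (required)
        String[] names = {"B", "C", "D"};
        for (int i = 0; i < required.length; i++) {
            System.out.println("join with " + names[i] + ": " + joinType(seenRequired, required[i]));
            seenRequired = seenRequired || required[i];
        }
    }
}

Running the sketch prints rightouter, leftouter, inner, matching joins #1 through #3 described in the comment.
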
Also used : DataType(org.apache.spark.sql.types.DataType) org.apache.spark.sql.functions.coalesce(org.apache.spark.sql.functions.coalesce) Arrays(java.util.Arrays) DataFrames(io.cdap.cdap.api.spark.sql.DataFrames) DatasetAggregationReduceFunction(io.cdap.cdap.etl.spark.function.DatasetAggregationReduceFunction) JoinExpressionRequest(io.cdap.cdap.etl.spark.join.JoinExpressionRequest) PluginFunctionContext(io.cdap.cdap.etl.spark.function.PluginFunctionContext) LoggerFactory(org.slf4j.LoggerFactory) CountingFunction(io.cdap.cdap.etl.spark.function.CountingFunction) Constants(io.cdap.cdap.etl.common.Constants) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) JavaSparkExecutionContext(io.cdap.cdap.api.spark.JavaSparkExecutionContext) DatasetContext(io.cdap.cdap.api.data.DatasetContext) StorageLevel(org.apache.spark.storage.StorageLevel) Map(java.util.Map) MapFunction(org.apache.spark.api.java.function.MapFunction) FunctionCache(io.cdap.cdap.etl.spark.function.FunctionCache) DataTypes(org.apache.spark.sql.types.DataTypes) StructType(org.apache.spark.sql.types.StructType) JoinField(io.cdap.cdap.etl.api.join.JoinField) Seq(scala.collection.Seq) RecordInfo(io.cdap.cdap.etl.common.RecordInfo) UUID(java.util.UUID) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) SparkCollection(io.cdap.cdap.etl.spark.SparkCollection) List(java.util.List) JoinRequest(io.cdap.cdap.etl.spark.join.JoinRequest) Encoder(org.apache.spark.sql.Encoder) DatasetAggregationFinalizeFunction(io.cdap.cdap.etl.spark.function.DatasetAggregationFinalizeFunction) Function(org.apache.spark.api.java.function.Function) org.apache.spark.sql.functions(org.apache.spark.sql.functions) IntStream(java.util.stream.IntStream) LiteralsBridge(io.cdap.cdap.etl.spark.plugin.LiteralsBridge) Dataset(org.apache.spark.sql.Dataset) DatasetAggregationAccumulator(io.cdap.cdap.etl.spark.function.DatasetAggregationAccumulator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) DatasetAggregationGetKeyFunction(io.cdap.cdap.etl.spark.function.DatasetAggregationGetKeyFunction) JavaRDD(org.apache.spark.api.java.JavaRDD) Nullable(javax.annotation.Nullable) JavaConversions(scala.collection.JavaConversions) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) JoinCollection(io.cdap.cdap.etl.spark.join.JoinCollection) Column(org.apache.spark.sql.Column) SQLContext(org.apache.spark.sql.SQLContext) Row(org.apache.spark.sql.Row) Schema(io.cdap.cdap.api.data.schema.Schema) Encoders(org.apache.spark.sql.Encoders) org.apache.spark.sql.functions.floor(org.apache.spark.sql.functions.floor) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec)

Example 82 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class RDDCollection method join.

@SuppressWarnings("unchecked")
@Override
public SparkCollection<T> join(JoinExpressionRequest joinRequest) {
    Function<StructuredRecord, StructuredRecord> recordsInCounter = new CountingFunction<>(joinRequest.getStageName(), sec.getMetrics(), Constants.Metrics.RECORDS_IN, sec.getDataTracer(joinRequest.getStageName()));
    JoinCollection leftInfo = joinRequest.getLeft();
    StructType leftSchema = DataFrames.toDataType(leftInfo.getSchema());
    Dataset<Row> leftDF = toDataset(((JavaRDD<StructuredRecord>) rdd).map(recordsInCounter), leftSchema);
    JoinCollection rightInfo = joinRequest.getRight();
    SparkCollection<?> rightData = rightInfo.getData();
    StructType rightSchema = DataFrames.toDataType(rightInfo.getSchema());
    Dataset<Row> rightDF = toDataset(((JavaRDD<StructuredRecord>) rightData.getUnderlying()).map(recordsInCounter), rightSchema);
    // if this is not a broadcast join, Spark will reprocess each side multiple times, depending on the number
    // of partitions. If the left side has N partitions and the right side has M partitions,
    // the left side gets reprocessed M times and the right side gets reprocessed N times.
    // Cache the input to prevent confusing metrics and potential source re-reading.
    // this is only necessary for inner joins, since outer joins are automatically changed to
    // BroadcastNestedLoopJoins by Spark
    boolean isInner = joinRequest.getLeft().isRequired() && joinRequest.getRight().isRequired();
    boolean isBroadcast = joinRequest.getLeft().isBroadcast() || joinRequest.getRight().isBroadcast();
    if (isInner && !isBroadcast) {
        leftDF = leftDF.persist(StorageLevel.DISK_ONLY());
        rightDF = rightDF.persist(StorageLevel.DISK_ONLY());
    }
    // register using unique names to avoid collisions.
    String leftId = UUID.randomUUID().toString().replaceAll("-", "");
    String rightId = UUID.randomUUID().toString().replaceAll("-", "");
    leftDF.registerTempTable(leftId);
    rightDF.registerTempTable(rightId);
    /*
        Suppose the join was originally:

          select P.id as id, users.name as username
          from purchases as P join users
          on P.user_id = users.id or P.user_id = 0

        After registering purchases as uuid0 and users as uuid1,
        the query needs to be rewritten to replace the original names with the new generated ids,
        as the query needs to be:

          select P.id as id, uuid1.name as username
          from uuid0 as P join uuid1
          on P.user_id = uuid1.id or P.user_id = 0
     */
    String sql = getSQL(joinRequest.rename(leftId, rightId));
    LOG.debug("Executing join stage {} using SQL: \n{}", joinRequest.getStageName(), sql);
    Dataset<Row> joined = sqlContext.sql(sql);
    Schema outputSchema = joinRequest.getOutputSchema();
    JavaRDD<StructuredRecord> output = joined.javaRDD().map(r -> DataFrames.fromRow(r, outputSchema)).map(new CountingFunction<>(joinRequest.getStageName(), sec.getMetrics(), Constants.Metrics.RECORDS_OUT, sec.getDataTracer(joinRequest.getStageName())));
    return (SparkCollection<T>) wrap(output);
}
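
The query rewriting described in the comment comes down to registering each input under a generated, collision-free name and then running plain Spark SQL against those names. Below is a minimal, self-contained Spark sketch of that register-then-query pattern; the table names, sample data, and the use of createOrReplaceTempView (instead of the deprecated registerTempTable called above) are illustrative assumptions, not CDAP code:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class RegisterAndQuerySketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").appName("join-sql-sketch").getOrCreate();

        StructType purchasesSchema = new StructType()
            .add("id", DataTypes.IntegerType)
            .add("user_id", DataTypes.IntegerType);
        StructType usersSchema = new StructType()
            .add("id", DataTypes.IntegerType)
            .add("name", DataTypes.StringType);

        List<Row> purchases = Arrays.asList(RowFactory.create(1, 10), RowFactory.create(2, 0));
        List<Row> users = Arrays.asList(RowFactory.create(10, "alice"));

        // Register each side under a generated, collision-free name, then refer only to those names in the SQL.
        Dataset<Row> purchasesDF = spark.createDataFrame(purchases, purchasesSchema);
        Dataset<Row> usersDF = spark.createDataFrame(users, usersSchema);
        purchasesDF.createOrReplaceTempView("uuid0");
        usersDF.createOrReplaceTempView("uuid1");

        // The rewritten query from the comment above, using the generated names.
        Dataset<Row> joined = spark.sql(
            "select P.id as id, uuid1.name as username "
                + "from uuid0 as P join uuid1 on P.user_id = uuid1.id or P.user_id = 0");
        joined.show();

        spark.stop();
    }
}
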
Also used : DataType(org.apache.spark.sql.types.DataType) org.apache.spark.sql.functions.coalesce(org.apache.spark.sql.functions.coalesce) Arrays(java.util.Arrays) DataFrames(io.cdap.cdap.api.spark.sql.DataFrames) DatasetAggregationReduceFunction(io.cdap.cdap.etl.spark.function.DatasetAggregationReduceFunction) JoinExpressionRequest(io.cdap.cdap.etl.spark.join.JoinExpressionRequest) PluginFunctionContext(io.cdap.cdap.etl.spark.function.PluginFunctionContext) LoggerFactory(org.slf4j.LoggerFactory) CountingFunction(io.cdap.cdap.etl.spark.function.CountingFunction) Constants(io.cdap.cdap.etl.common.Constants) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) JavaSparkExecutionContext(io.cdap.cdap.api.spark.JavaSparkExecutionContext) DatasetContext(io.cdap.cdap.api.data.DatasetContext) StorageLevel(org.apache.spark.storage.StorageLevel) Map(java.util.Map) MapFunction(org.apache.spark.api.java.function.MapFunction) FunctionCache(io.cdap.cdap.etl.spark.function.FunctionCache) DataTypes(org.apache.spark.sql.types.DataTypes) StructType(org.apache.spark.sql.types.StructType) JoinField(io.cdap.cdap.etl.api.join.JoinField) Seq(scala.collection.Seq) RecordInfo(io.cdap.cdap.etl.common.RecordInfo) UUID(java.util.UUID) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) SparkCollection(io.cdap.cdap.etl.spark.SparkCollection) List(java.util.List) JoinRequest(io.cdap.cdap.etl.spark.join.JoinRequest) Encoder(org.apache.spark.sql.Encoder) DatasetAggregationFinalizeFunction(io.cdap.cdap.etl.spark.function.DatasetAggregationFinalizeFunction) Function(org.apache.spark.api.java.function.Function) org.apache.spark.sql.functions(org.apache.spark.sql.functions) IntStream(java.util.stream.IntStream) LiteralsBridge(io.cdap.cdap.etl.spark.plugin.LiteralsBridge) Dataset(org.apache.spark.sql.Dataset) DatasetAggregationAccumulator(io.cdap.cdap.etl.spark.function.DatasetAggregationAccumulator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) DatasetAggregationGetKeyFunction(io.cdap.cdap.etl.spark.function.DatasetAggregationGetKeyFunction) JavaRDD(org.apache.spark.api.java.JavaRDD) Nullable(javax.annotation.Nullable) JavaConversions(scala.collection.JavaConversions) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) JoinCollection(io.cdap.cdap.etl.spark.join.JoinCollection) Column(org.apache.spark.sql.Column) SQLContext(org.apache.spark.sql.SQLContext) Row(org.apache.spark.sql.Row) Schema(io.cdap.cdap.api.data.schema.Schema) Encoders(org.apache.spark.sql.Encoders) org.apache.spark.sql.functions.floor(org.apache.spark.sql.functions.floor) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec)

Example 83 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class BatchSQLEngineAdapter method pushInternal.

/**
 * Push implementation. This method has blocking calls and should be executed in a separate thread.
 *
 * @param datasetName name of the dataset to push.
 * @param schema      the record schema.
 * @param collection  the collection containing the records to push.
 * @return {@link SQLDataset} instance representing the pushed records.
 * @throws SQLEngineException if the push operation fails.
 */
@SuppressWarnings("unchecked")
public SQLDataset pushInternal(String datasetName, Schema schema, SparkCollection<?> collection) throws SQLEngineException {
    // Create push request
    SQLPushRequest pushRequest = new SQLPushRequest(datasetName, schema);
    // Check whether any of the engine's push capabilities can consume this request.
    // If so, we will process this request using a consumer.
    for (PushCapability capability : sqlEngine.getPushCapabilities()) {
        SQLDatasetConsumer consumer = sqlEngine.getConsumer(pushRequest, capability);
        // If a consumer is able to consume this request, we delegate the execution to the consumer.
        if (consumer != null) {
            StructType sparkSchema = DataFrames.toDataType(schema);
            JavaRDD<Row> rowRDD = ((JavaRDD<StructuredRecord>) collection.getUnderlying()).map(r -> DataFrames.toRow(r, sparkSchema));
            Dataset<Row> ds = sqlContext.createDataFrame(rowRDD, sparkSchema);
            RecordCollection recordCollection = new SparkRecordCollectionImpl(ds);
            return consumer.consume(recordCollection);
        }
    }
    // If no push capability could consume the records, proceed using the Push Provider.
    SQLPushDataset<StructuredRecord, ?, ?> pushDataset = sqlEngine.getPushProvider(pushRequest);
    // Write records using the Push provider.
    JavaPairRDD<?, ?> pairRdd = ((JavaRDD) collection.getUnderlying()).flatMapToPair(new TransformToPairFunction<>(pushDataset.toKeyValue()));
    RDDUtils.saveUsingOutputFormat(pushDataset, pairRdd);
    return pushDataset;
}
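
When a consumer is found, the records are handed over as a Spark DataFrame. The following is a hedged sketch of just that conversion step, assuming the caller already has a JavaRDD<StructuredRecord> named input, a CDAP Schema, and an SQLContext; the engine and consumer plumbing of pushInternal is deliberately omitted:

// Sketch only: converts CDAP StructuredRecords into a Spark DataFrame, as the capability branch above does.
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.api.spark.sql.DataFrames;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.StructType;

public final class PushConversionSketch {

    // Assumes input, schema, and sqlContext are supplied by the caller.
    static Dataset<Row> toDataFrame(JavaRDD<StructuredRecord> input, Schema schema, SQLContext sqlContext) {
        // Translate the CDAP schema into the equivalent Spark schema.
        StructType sparkSchema = DataFrames.toDataType(schema);
        // Map each StructuredRecord into a Spark Row with that schema.
        JavaRDD<Row> rows = input.map(r -> DataFrames.toRow(r, sparkSchema));
        // Build the DataFrame that is wrapped into a RecordCollection for the consumer.
        return sqlContext.createDataFrame(rows, sparkSchema);
    }

    private PushConversionSketch() {
    }
}
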
Also used : StructType(org.apache.spark.sql.types.StructType) SQLPushRequest(io.cdap.cdap.etl.api.engine.sql.request.SQLPushRequest) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) JavaRDD(org.apache.spark.api.java.JavaRDD) PushCapability(io.cdap.cdap.etl.api.engine.sql.capability.PushCapability) SQLDatasetConsumer(io.cdap.cdap.etl.api.engine.sql.dataset.SQLDatasetConsumer) RecordCollection(io.cdap.cdap.etl.api.engine.sql.dataset.RecordCollection) SparkRecordCollection(io.cdap.cdap.etl.api.sql.engine.dataset.SparkRecordCollection) SparkRecordCollectionImpl(io.cdap.cdap.etl.api.sql.engine.dataset.SparkRecordCollectionImpl) Row(org.apache.spark.sql.Row)

Example 84 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class SampleResponseCodecTest method testCodec.

@Test
public void testCodec() throws Exception {
    // schema with all types
    Schema schema = Schema.recordOf("schema", Schema.Field.of("f1", Schema.of(Schema.Type.INT)), Schema.Field.of("f2", Schema.of(Schema.Type.STRING)), Schema.Field.of("f3", Schema.of(Schema.Type.LONG)), Schema.Field.of("f4", Schema.of(Schema.Type.DOUBLE)), Schema.Field.of("f5", Schema.of(Schema.Type.BYTES)), Schema.Field.of("f6", Schema.of(Schema.Type.BOOLEAN)), Schema.Field.of("f7", Schema.of(Schema.Type.FLOAT)), Schema.Field.of("f8", Schema.of(Schema.LogicalType.DATE)), Schema.Field.of("f9", Schema.of(Schema.LogicalType.TIMESTAMP_MICROS)), Schema.Field.of("f10", Schema.of(Schema.LogicalType.TIMESTAMP_MILLIS)), Schema.Field.of("f11", Schema.of(Schema.LogicalType.TIME_MICROS)), Schema.Field.of("f12", Schema.of(Schema.LogicalType.TIME_MILLIS)), Schema.Field.of("f13", Schema.decimalOf(3, 2)), Schema.Field.of("f14", Schema.of(Schema.LogicalType.DATETIME)), Schema.Field.of("n1", Schema.nullableOf(Schema.of(Schema.Type.INT))), Schema.Field.of("n2", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("n3", Schema.nullableOf(Schema.of(Schema.Type.LONG))), Schema.Field.of("n4", Schema.nullableOf(Schema.of(Schema.Type.DOUBLE))), Schema.Field.of("n5", Schema.nullableOf(Schema.of(Schema.Type.BYTES))), Schema.Field.of("n6", Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN))), Schema.Field.of("n7", Schema.nullableOf(Schema.of(Schema.Type.FLOAT))), Schema.Field.of("n8", Schema.nullableOf(Schema.of(Schema.LogicalType.DATE))), Schema.Field.of("n9", Schema.nullableOf(Schema.of(Schema.LogicalType.TIMESTAMP_MICROS))), Schema.Field.of("n10", Schema.nullableOf(Schema.of(Schema.LogicalType.TIMESTAMP_MILLIS))), Schema.Field.of("n11", Schema.nullableOf(Schema.of(Schema.LogicalType.TIME_MICROS))), Schema.Field.of("n12", Schema.nullableOf(Schema.of(Schema.LogicalType.TIME_MILLIS))), Schema.Field.of("n13", Schema.nullableOf(Schema.decimalOf(3, 2))), Schema.Field.of("n14", Schema.nullableOf(Schema.of(Schema.LogicalType.DATETIME))));
    // all nullable fields are null
    StructuredRecord record1 = StructuredRecord.builder(schema).set("f1", 1).set("f2", "aaa").set("f3", 1L).set("f4", 0d).set("f5", ByteBuffer.wrap("test".getBytes(Charsets.UTF_8))).set("f6", true).set("f7", 0f).setDate("f8", LocalDate.now()).setTimestamp("f9", ZonedDateTime.now()).setTimestamp("f10", ZonedDateTime.now()).setTime("f11", LocalTime.now()).setTime("f12", LocalTime.now()).set("f13", ByteBuffer.wrap(new BigDecimal(new BigInteger("111"), 2).unscaledValue().toByteArray())).setDateTime("f14", LocalDateTime.now()).build();
    // all fields are filled
    StructuredRecord record2 = StructuredRecord.builder(schema).set("f1", 1).set("f2", "aaa").set("f3", 1L).set("f4", 0d).set("f5", ByteBuffer.wrap("test".getBytes(Charsets.UTF_8))).set("f6", true).set("f7", 0f).setDate("f8", LocalDate.now()).setTimestamp("f9", ZonedDateTime.now()).setTimestamp("f10", ZonedDateTime.now()).setTime("f11", LocalTime.now()).setTime("f12", LocalTime.now()).set("f13", ByteBuffer.wrap(new BigDecimal(new BigInteger("111"), 2).unscaledValue().toByteArray())).setDateTime("f14", LocalDateTime.now()).set("n1", 1).set("n2", "aaa").set("n3", 1L).set("n4", 0d).set("n5", ByteBuffer.wrap("test".getBytes(Charsets.UTF_8))).set("n6", true).set("n7", 0f).setDate("n8", LocalDate.now()).setTimestamp("n9", ZonedDateTime.now()).setTimestamp("n10", ZonedDateTime.now()).setTime("n11", LocalTime.now()).setTime("n12", LocalTime.now()).set("n13", ByteBuffer.wrap(new BigDecimal(new BigInteger("111"), 2).unscaledValue().toByteArray())).setDateTime("n14", LocalDateTime.now()).build();
    List<StructuredRecord> sample = ImmutableList.of(record1, record2);
    SampleResponse sampleResponse = new SampleResponse(new ConnectorDetail(ImmutableSet.of(new PluginDetail("file", "batchsource", ImmutableMap.of("k1", "v1", "k2", "v2"), new ArtifactSelectorConfig(), schema))), schema, sample);
    String jsonString = GSON.toJson(sampleResponse);
    SampleResponse deserialized = GSON.fromJson(jsonString, SampleResponse.class);
    Assert.assertEquals(sampleResponse, deserialized);
}
Also used : ArtifactSelectorConfig(io.cdap.cdap.etl.proto.ArtifactSelectorConfig) Schema(io.cdap.cdap.api.data.schema.Schema) BigInteger(java.math.BigInteger) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) BigDecimal(java.math.BigDecimal) Test(org.junit.Test)

Example 85 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class ErrorCollector method transform.

@Override
public void transform(ErrorRecord<StructuredRecord> input, Emitter<StructuredRecord> emitter) throws Exception {
    StructuredRecord invalidRecord = input.getRecord();
    StructuredRecord.Builder output = StructuredRecord.builder(getOutputSchema(config, invalidRecord.getSchema()));
    for (Schema.Field field : invalidRecord.getSchema().getFields()) {
        output.set(field.getName(), invalidRecord.get(field.getName()));
    }
    if (config.messageField != null) {
        output.set(config.messageField, input.getErrorMessage());
    }
    if (config.codeField != null) {
        output.set(config.codeField, input.getErrorCode());
    }
    if (config.stageField != null) {
        output.set(config.stageField, input.getStageName());
    }
    emitter.emit(output.build());
}
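
The transform copies every input field and then appends whichever error fields are configured, so the output schema must contain those extra fields. Below is a hedged sketch of the kind of schema derivation a helper like getOutputSchema could perform; the appended field types and nullability are illustrative assumptions, not the plugin's exact implementation:

// Hypothetical sketch of deriving an ErrorCollector-style output schema from the input schema.
import java.util.ArrayList;
import java.util.List;

import io.cdap.cdap.api.data.schema.Schema;

final class ErrorSchemaSketch {

    // Appends optional error metadata fields to the incoming record schema.
    static Schema withErrorFields(Schema inputSchema, String messageField, String codeField, String stageField) {
        List<Schema.Field> fields = new ArrayList<>(inputSchema.getFields());
        if (messageField != null) {
            fields.add(Schema.Field.of(messageField, Schema.nullableOf(Schema.of(Schema.Type.STRING))));
        }
        if (codeField != null) {
            fields.add(Schema.Field.of(codeField, Schema.nullableOf(Schema.of(Schema.Type.INT))));
        }
        if (stageField != null) {
            fields.add(Schema.Field.of(stageField, Schema.nullableOf(Schema.of(Schema.Type.STRING))));
        }
        return Schema.recordOf(inputSchema.getRecordName() + ".error", fields);
    }

    private ErrorSchemaSketch() {
    }
}
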
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord)

Aggregations

StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 210
Schema (io.cdap.cdap.api.data.schema.Schema): 169
Test (org.junit.Test): 119
Table (io.cdap.cdap.api.dataset.table.Table): 76
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 73
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 73
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 68
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 68
ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig): 59
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 54
HashSet (java.util.HashSet): 50
ArrayList (java.util.ArrayList): 44
KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 40
HashMap (java.util.HashMap): 25
File (java.io.File): 17
ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin): 16
FormatSpecification (io.cdap.cdap.api.data.format.FormatSpecification): 15
DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig): 14
SparkManager (io.cdap.cdap.test.SparkManager): 12
Map (java.util.Map): 12