Example 51 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

In class FlowletProgramRunner, method processSpecificationFactory:

private ProcessSpecificationFactory processSpecificationFactory(final BasicFlowletContext flowletContext, final DataFabricFacade dataFabricFacade, final QueueReaderFactory queueReaderFactory, final String flowletName, final Table<Node, String, Set<QueueSpecification>> queueSpecs, final ImmutableList.Builder<ConsumerSupplier<?>> queueConsumerSupplierBuilder, final SchemaCache schemaCache) {
    return new ProcessSpecificationFactory() {

        @Override
        public <T> ProcessSpecification create(Set<String> inputNames, Schema schema, TypeToken<T> dataType, ProcessMethod<T> method, ConsumerConfig consumerConfig, int batchSize, Tick tickAnnotation) throws Exception {
            List<QueueReader<T>> queueReaders = Lists.newLinkedList();
            for (Map.Entry<Node, Set<QueueSpecification>> entry : queueSpecs.column(flowletName).entrySet()) {
                for (QueueSpecification queueSpec : entry.getValue()) {
                    final QueueName queueName = queueSpec.getQueueName();
                    if (queueSpec.getInputSchema().equals(schema) && (inputNames.contains(queueName.getSimpleName()) || inputNames.contains(FlowletDefinition.ANY_INPUT))) {
                        Node sourceNode = entry.getKey();
                        if (sourceNode.getType() == FlowletConnection.Type.STREAM) {
                            ConsumerSupplier<StreamConsumer> consumerSupplier = ConsumerSupplier.create(flowletContext.getOwners(), runtimeUsageRegistry, dataFabricFacade, queueName, consumerConfig);
                            queueConsumerSupplierBuilder.add(consumerSupplier);
                            // No decoding is needed, as a process method consuming from a stream can only take StreamEvent as its input type
                            Function<StreamEvent, T> decoder = wrapInputDecoder(flowletContext, null, queueName, new Function<StreamEvent, T>() {

                                @Override
                                @SuppressWarnings("unchecked")
                                public T apply(StreamEvent input) {
                                    return (T) input;
                                }
                            });
                            queueReaders.add(queueReaderFactory.createStreamReader(queueName.toStreamId(), consumerSupplier, batchSize, decoder));
                        } else {
                            int numGroups = getNumGroups(Iterables.concat(queueSpecs.row(entry.getKey()).values()), queueName);
                            Function<ByteBuffer, T> decoder = wrapInputDecoder(flowletContext,
                                // the name of the producer flowlet
                                entry.getKey().getName(),
                                queueName, createInputDatumDecoder(dataType, schema, schemaCache));
                            ConsumerSupplier<QueueConsumer> consumerSupplier = ConsumerSupplier.create(flowletContext.getOwners(), runtimeUsageRegistry, dataFabricFacade, queueName, consumerConfig, numGroups);
                            queueConsumerSupplierBuilder.add(consumerSupplier);
                            queueReaders.add(queueReaderFactory.createQueueReader(consumerSupplier, batchSize, decoder));
                        }
                    }
                }
            }
            // If inputs are required but no input queue is available, return null
            if (!inputNames.isEmpty() && queueReaders.isEmpty()) {
                return null;
            }
            return new ProcessSpecification<>(new RoundRobinQueueReader<>(queueReaders), method, tickAnnotation);
        }
    };
}
Also used : QueueReader(co.cask.cdap.app.queue.QueueReader) RoundRobinQueueReader(co.cask.cdap.internal.app.queue.RoundRobinQueueReader) Set(java.util.Set) ImmutableSet(com.google.common.collect.ImmutableSet) Schema(co.cask.cdap.api.data.schema.Schema) Node(co.cask.cdap.app.queue.QueueSpecificationGenerator.Node) ConsumerConfig(co.cask.cdap.data2.queue.ConsumerConfig) Tick(co.cask.cdap.api.annotation.Tick) QueueName(co.cask.cdap.common.queue.QueueName) StreamConsumer(co.cask.cdap.data2.transaction.stream.StreamConsumer) StreamEvent(co.cask.cdap.api.flow.flowlet.StreamEvent) ByteBuffer(java.nio.ByteBuffer) QueueConsumer(co.cask.cdap.data2.queue.QueueConsumer) TypeToken(com.google.common.reflect.TypeToken) QueueSpecification(co.cask.cdap.app.queue.QueueSpecification) Map(java.util.Map)
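
The matching above hinges on structural Schema equality: queueSpec.getInputSchema().equals(schema) pairs a queue with a process method only when the two schemas describe the same shape. A minimal sketch of that check, assuming the co.cask.cdap.api.data.schema.Schema API used throughout these examples (the record name "purchase" and its fields are hypothetical):

import co.cask.cdap.api.data.schema.Schema;

public class SchemaEqualitySketch {

    public static void main(String[] args) {
        // Two schemas built independently but with identical structure.
        Schema a = Schema.recordOf("purchase",
            Schema.Field.of("item", Schema.of(Schema.Type.STRING)),
            Schema.Field.of("price", Schema.of(Schema.Type.DOUBLE)));
        Schema b = Schema.recordOf("purchase",
            Schema.Field.of("item", Schema.of(Schema.Type.STRING)),
            Schema.Field.of("price", Schema.of(Schema.Type.DOUBLE)));
        // This is the same equals() that queueSpec.getInputSchema().equals(schema) relies on.
        System.out.println(a.equals(b)); // expected: true
    }
}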

Example 52 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

In class FlowletProgramRunner, method createSchemaCache:

private SchemaCache createSchemaCache(Program program) throws Exception {
    ImmutableSet.Builder<Schema> schemas = ImmutableSet.builder();
    for (FlowSpecification flowSpec : program.getApplicationSpecification().getFlows().values()) {
        for (FlowletDefinition flowletDef : flowSpec.getFlowlets().values()) {
            schemas.addAll(Iterables.concat(flowletDef.getInputs().values()));
            schemas.addAll(Iterables.concat(flowletDef.getOutputs().values()));
        }
    }
    // Temp fix for ENG-3949. Always add the old stream event schema.
    // TODO: Remove this later. The right thing to do is to store schema history so that schema
    // evolution is supported. By design, as long as the schema cache is populated with the old
    // schema, the type projection logic in the decoder will handle it correctly.
    schemas.add(schemaGenerator.generate(StreamEventData.class));
    return new SchemaCache(schemas.build(), program.getClassLoader());
}
Also used : FlowletDefinition(co.cask.cdap.api.flow.FlowletDefinition) ImmutableSet(com.google.common.collect.ImmutableSet) FlowSpecification(co.cask.cdap.api.flow.FlowSpecification) Schema(co.cask.cdap.api.data.schema.Schema) StreamEventData(co.cask.cdap.api.stream.StreamEventData)
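
createSchemaCache pre-computes every schema a flow can encounter, including one generated by reflection for StreamEventData. As a hedged illustration of that generation step, here is a minimal sketch using ReflectionSchemaGenerator on a hypothetical POJO (the Purchase class below is invented for the example):

import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.data.schema.UnsupportedTypeException;
import co.cask.cdap.internal.io.ReflectionSchemaGenerator;

public class SchemaGenSketch {

    // Hypothetical POJO standing in for a flowlet's input/output type.
    public static class Purchase {
        String item;
        double price;
    }

    public static void main(String[] args) throws UnsupportedTypeException {
        // Derive a Schema from the class, as createSchemaCache does for StreamEventData.
        Schema schema = new ReflectionSchemaGenerator().generate(Purchase.class);
        System.out.println(schema);
    }
}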

Example 53 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

In class StreamBatchSource, method transform:

@Override
public void transform(KeyValue<LongWritable, Object> input, Emitter<StructuredRecord> emitter) throws Exception {
    // if no format spec was given, the value is a StreamEvent
    if (Strings.isNullOrEmpty(streamBatchConfig.format)) {
        StreamEvent event = (StreamEvent) input.getValue();
        Map<String, String> headers = Objects.firstNonNull(event.getHeaders(), ImmutableMap.<String, String>of());
        StructuredRecord output = StructuredRecord.builder(DEFAULT_SCHEMA).set("ts", input.getKey().get()).set("headers", headers).set("body", event.getBody()).build();
        emitter.emit(output);
    } else {
        // otherwise, it will be a GenericStreamEventData
        @SuppressWarnings("unchecked") GenericStreamEventData<StructuredRecord> event = (GenericStreamEventData<StructuredRecord>) input.getValue();
        StructuredRecord record = event.getBody();
        Schema inputSchema = record.getSchema();
        Schema outputSchema = schemaCache.get(inputSchema);
        // if we haven't seen this schema before, generate the output schema (add ts and header fields)
        if (outputSchema == null) {
            List<Schema.Field> fields = Lists.newArrayList();
            fields.add(DEFAULT_SCHEMA.getField("ts"));
            fields.add(DEFAULT_SCHEMA.getField("headers"));
            fields.addAll(inputSchema.getFields());
            outputSchema = Schema.recordOf(inputSchema.getRecordName(), fields);
            schemaCache.put(inputSchema, outputSchema);
        }
        // easier to just deal with an empty map than deal with nullables, so the headers field is non-nullable.
        Map<String, String> headers = Objects.firstNonNull(event.getHeaders(), ImmutableMap.<String, String>of());
        StructuredRecord.Builder builder = StructuredRecord.builder(outputSchema);
        builder.set("ts", input.getKey().get());
        builder.set("headers", headers);
        for (Schema.Field field : inputSchema.getFields()) {
            String fieldName = field.getName();
            builder.set(fieldName, record.get(fieldName));
        }
        emitter.emit(builder.build());
    }
}
Also used : StreamEvent(co.cask.cdap.api.flow.flowlet.StreamEvent) Schema(co.cask.cdap.api.data.schema.Schema) GenericStreamEventData(co.cask.cdap.api.stream.GenericStreamEventData) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord)
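
The interesting schema work in this transform is wrapping the record schema parsed from the stream body with the ts and headers fields. A standalone sketch of that construction, assuming the Schema API shown above (the input record name and field are illustrative):

import java.util.List;

import co.cask.cdap.api.data.schema.Schema;
import com.google.common.collect.Lists;

public class WrapperSchemaSketch {

    public static void main(String[] args) {
        // Stand-in for the schema parsed out of the stream body.
        Schema inputSchema = Schema.recordOf("event",
            Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
        // Prepend the ts and headers fields, mirroring the transform above.
        List<Schema.Field> fields = Lists.newArrayList(
            Schema.Field.of("ts", Schema.of(Schema.Type.LONG)),
            Schema.Field.of("headers",
                Schema.mapOf(Schema.of(Schema.Type.STRING), Schema.of(Schema.Type.STRING))));
        fields.addAll(inputSchema.getFields());
        Schema outputSchema = Schema.recordOf(inputSchema.getRecordName(), fields);
        System.out.println(outputSchema);
    }
}

Caching the input-to-output schema mapping, as the transform does, avoids rebuilding the record schema for every event whose shape has already been seen.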

Example 54 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

In class LookupTransform, method transform:

@Override
public void transform(StructuredRecord input, Emitter<StructuredRecord> emitter) throws Exception {
    T lookedUpValue = lookup.lookup((String) input.get(config.lookupKey));
    // for the output schema, copy all the input fields, and add the 'destinationField'
    List<Schema.Field> outFields = new ArrayList<>();
    for (Schema.Field field : input.getSchema().getFields()) {
        outFields.add(field);
    }
    if (lookedUpValue instanceof String) {
        outFields.add(Schema.Field.of(config.destinationField, Schema.of(Schema.Type.STRING)));
    } else if (lookedUpValue instanceof Row) {
        Row lookedupRow = (Row) lookedUpValue;
        for (byte[] column : lookedupRow.getColumns().keySet()) {
            outFields.add(Schema.Field.of(Bytes.toString(column), Schema.of(Schema.Type.STRING)));
        }
    } else {
        throw new IllegalArgumentException("Unexpected value type: " + lookedUpValue.getClass());
    }
    Schema outSchema = Schema.recordOf(input.getSchema().getRecordName(), outFields);
    // copy all the values
    StructuredRecord.Builder outputBuilder = StructuredRecord.builder(outSchema);
    for (Schema.Field inField : input.getSchema().getFields()) {
        if (inField.getName().equals(config.lookupKey)) {
            if (lookedUpValue instanceof String) {
                outputBuilder.set(config.destinationField, lookedUpValue);
            } else {
                // due to the check above, we know it's a Row
                Row lookedupRow = (Row) lookedUpValue;
                for (Map.Entry<byte[], byte[]> entry : lookedupRow.getColumns().entrySet()) {
                    outputBuilder.set(Bytes.toString(entry.getKey()), Bytes.toString(entry.getValue()));
                }
            }
        }
        // what if the destinationField already exists?
        outputBuilder.set(inField.getName(), input.get(inField.getName()));
    }
    emitter.emit(outputBuilder.build());
}
Also used : Schema(co.cask.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) PluginPropertyField(co.cask.cdap.api.plugin.PluginPropertyField) Row(co.cask.cdap.api.dataset.table.Row) HashMap(java.util.HashMap) Map(java.util.Map)
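
The comment in the loop flags an open question: the output schema is built without checking whether destinationField collides with an existing input field. A hedged sketch of a defensive variant (the guard is an assumption, not part of the original LookupTransform):

import java.util.ArrayList;
import java.util.List;

import co.cask.cdap.api.data.schema.Schema;

public class LookupSchemaSketch {

    // Append destinationField as a string field, skipping it if the name is already taken.
    static Schema withDestinationField(Schema inputSchema, String destinationField) {
        List<Schema.Field> outFields = new ArrayList<>(inputSchema.getFields());
        if (inputSchema.getField(destinationField) == null) {
            outFields.add(Schema.Field.of(destinationField, Schema.of(Schema.Type.STRING)));
        }
        return Schema.recordOf(inputSchema.getRecordName(), outFields);
    }
}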

Example 55 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

In class DupeFlagger, method merge:

@Override
public StructuredRecord merge(StructuredRecord joinKey, Iterable<JoinElement<StructuredRecord>> joinRow) {
    StructuredRecord record = null;
    boolean containsDupe = false;
    for (JoinElement<StructuredRecord> element : joinRow) {
        if (element.getStageName().equals(config.keep)) {
            record = element.getInputRecord();
        } else {
            containsDupe = true;
        }
    }
    if (record == null) {
        // can only happen if 'keep' was a macro and did not evaluate to one of the inputs
        throw new IllegalArgumentException("No record for " + config.keep + " was found.");
    }
    Schema outputSchema = getOutputSchema(record.getSchema());
    StructuredRecord.Builder outputBuilder = StructuredRecord.builder(outputSchema).set(config.flagField, containsDupe);
    for (Schema.Field field : record.getSchema().getFields()) {
        outputBuilder.set(field.getName(), record.get(field.getName()));
    }
    return outputBuilder.build();
}
Also used : Schema(co.cask.cdap.api.data.schema.Schema) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord)
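
getOutputSchema is not shown in this excerpt; based on how merge uses it, it presumably extends the kept record's schema with the boolean flag field. A minimal sketch under that assumption (the ".flagged" record-name suffix is invented for the example):

import java.util.ArrayList;
import java.util.List;

import co.cask.cdap.api.data.schema.Schema;

public class FlagSchemaSketch {

    // Copy the input fields and append a boolean flag field named by config.flagField.
    static Schema getOutputSchema(Schema inputSchema, String flagField) {
        List<Schema.Field> fields = new ArrayList<>(inputSchema.getFields());
        fields.add(Schema.Field.of(flagField, Schema.of(Schema.Type.BOOLEAN)));
        return Schema.recordOf(inputSchema.getRecordName() + ".flagged", fields);
    }
}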

Aggregations

Class (fully qualified name): usage count

Schema (co.cask.cdap.api.data.schema.Schema): 210
Test (org.junit.Test): 92
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 69
Table (co.cask.cdap.api.dataset.table.Table): 38
ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 35
ApplicationId (co.cask.cdap.proto.id.ApplicationId): 34
FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification): 32
ApplicationManager (co.cask.cdap.test.ApplicationManager): 30
AppRequest (co.cask.cdap.proto.artifact.AppRequest): 29
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 24
IOException (java.io.IOException): 23
ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig): 22
ReflectionSchemaGenerator (co.cask.cdap.internal.io.ReflectionSchemaGenerator): 22
ArrayList (java.util.ArrayList): 22
WorkflowManager (co.cask.cdap.test.WorkflowManager): 20
Map (java.util.Map): 18
Set (java.util.Set): 14
UnsupportedTypeException (co.cask.cdap.api.data.schema.UnsupportedTypeException): 12
HashMap (java.util.HashMap): 12
HashSet (java.util.HashSet): 11