Search in sources :

Example 46 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

the class GrokRecordFormat method validateSchema.

/**
 * Checks that the fields of the desired schema have types allowed by this format.
 *
 * <p>Each field must be a simple type or a nullable simple type. The only exception is the final field, which may
 * instead be a schema accepted by {@code isStringArray}.
 *
 * @param desiredSchema schema whose fields are checked
 * @throws UnsupportedTypeException if a field has a type that is not allowed at its position
 */
@Override
protected void validateSchema(Schema desiredSchema) throws UnsupportedTypeException {
    Iterator<Schema.Field> remaining = desiredSchema.getFields().iterator();
    while (remaining.hasNext()) {
        Schema.Field current = remaining.next();
        Schema fieldSchema = current.getSchema();
        if (fieldSchema.getType().isSimpleType() || fieldSchema.isNullableSimple()) {
            // simple and nullable-simple fields are accepted at any position
            continue;
        }
        // any other type is tolerated only as a string array in the very last position
        boolean isLastField = !remaining.hasNext();
        if (!isLastField || !isStringArray(fieldSchema)) {
            throw new UnsupportedTypeException("Field " + current.getName() + " is of invalid type.");
        }
    }
}
Also used : Schema(co.cask.cdap.api.data.schema.Schema) UnsupportedTypeException(co.cask.cdap.api.data.schema.UnsupportedTypeException)

Example 47 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

the class RecordPutTransformer method toPut.

/**
 * Converts a structured record into a {@link Put}.
 *
 * <p>The put is created from the record's key field; every other field is then written with {@code setField},
 * except fields that are missing from {@code outputSchema} when an output schema is set.
 *
 * @param record the record to convert; its schema must be of type {@code RECORD} and contain the key field
 * @return the put holding the record's non-key fields
 * @throws IllegalArgumentException if the schema is not a record or the key field cannot be found
 */
public Put toPut(StructuredRecord record) {
    Schema recordSchema = record.getSchema();
    Preconditions.checkArgument(recordSchema.getType() == Schema.Type.RECORD, "input must be a record.");
    Schema.Field keyField = getKeyField(recordSchema);
    Preconditions.checkArgument(keyField != null, "Could not find key field in record.");
    Put output = createPut(record, keyField);
    String keyName = keyField.getName();
    for (Schema.Field candidate : recordSchema.getFields()) {
        String name = candidate.getName();
        // write the field unless it is the key, or an output schema is set and does not contain it
        if (!name.equals(keyName) && (outputSchema == null || outputSchema.getField(name) != null)) {
            setField(output, candidate, record.get(name));
        }
    }
    return output;
}
Also used : Schema(co.cask.cdap.api.data.schema.Schema) Put(co.cask.cdap.api.dataset.table.Put)

Example 48 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

the class StructuredRecordDatumReader method read.

/**
 * Reads a datum, keeping {@code currentSchema} in step with the Avro schema while descending into a union.
 *
 * <p>Non-union schemas are delegated to the superclass. For a union, the branch index is read from the decoder,
 * {@code currentSchema} is switched to the matching union branch for the duration of the nested read, and is
 * restored afterwards even if the nested read throws.
 *
 * @param old value passed through to the nested or superclass read
 * @param expected the Avro schema of the datum to read
 * @param in decoder to read from
 * @return the datum that was read
 * @throws IOException if reading from the decoder fails
 */
@Override
protected Object read(Object old, org.apache.avro.Schema expected, ResolvingDecoder in) throws IOException {
    if (expected.getType() == org.apache.avro.Schema.Type.UNION) {
        Schema enclosingSchema = currentSchema;
        try {
            int branch = in.readIndex();
            currentSchema = currentSchema.getUnionSchema(branch);
            org.apache.avro.Schema branchSchema = expected.getTypes().get(branch);
            return read(old, branchSchema, in);
        } finally {
            // always return to the schema that was current before entering the union
            currentSchema = enclosingSchema;
        }
    }
    return super.read(old, expected, in);
}
Also used : Schema(co.cask.cdap.api.data.schema.Schema)

Example 49 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

the class StructuredRecordDatumReader method readRecord.

/**
 * Reads a record into a {@link StructuredRecord} built against {@code currentSchema}.
 *
 * <p>Fields are read in the order reported by the decoder. While a field is being read, {@code currentSchema} is
 * switched to that field's schema, and it is reset to the record schema afterwards even if the read throws.
 *
 * @param old not used by this implementation
 * @param expected not used directly; field schemas come from the decoder's field order
 * @param in decoder to read from
 * @return the record that was built
 * @throws IOException if reading from the decoder fails
 */
@Override
protected Object readRecord(Object old, org.apache.avro.Schema expected, ResolvingDecoder in) throws IOException {
    // currentSchema is restored after every field, so the record schema can be captured once up front
    final Schema recordSchema = currentSchema;
    StructuredRecord.Builder recordBuilder = StructuredRecord.builder(recordSchema);
    for (org.apache.avro.Schema.Field avroField : in.readFieldOrder()) {
        String fieldName = avroField.name();
        try {
            currentSchema = getFieldSchema(fieldName, recordSchema);
            Object value = read(null, avroField.schema(), in);
            recordBuilder.set(fieldName, value);
        } finally {
            currentSchema = recordSchema;
        }
    }
    return recordBuilder.build();
}
Also used : Schema(co.cask.cdap.api.data.schema.Schema) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord)

Example 50 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

the class FlowletProgramRunner method outputEmitterFactory.

/**
 * Builds an {@link OutputEmitterFactory} for a flowlet.
 *
 * <p>Each call to the factory's {@code create} method generates a schema for the requested type, scans the
 * flowlet's row of {@code queueSpecs} for queue specifications whose simple queue name equals the output name and
 * whose output schema equals the generated schema, registers a new {@link ProducerSupplier} on
 * {@code producerBuilder}, and returns a {@link DatumOutputEmitter} backed by that supplier.
 *
 * @param flowletContext source of the program metrics context and the flowlet id used for metrics
 * @param flowletName name used to select the flowlet's row in {@code queueSpecs}
 * @param queueClientFactory passed to every {@link ProducerSupplier} that is created
 * @param producerBuilder receives every {@link ProducerSupplier} that is created
 * @param queueSpecs table of queue specifications; only the row for the given flowlet is read
 * @return a factory that creates output emitters for the flowlet
 */
private OutputEmitterFactory outputEmitterFactory(final BasicFlowletContext flowletContext, final String flowletName, final QueueClientFactory queueClientFactory, final ImmutableList.Builder<ProducerSupplier> producerBuilder, final Table<Node, String, Set<QueueSpecification>> queueSpecs) {
    return new OutputEmitterFactory() {

        @Override
        public <T> OutputEmitter<T> create(String outputName, TypeToken<T> type) {
            try {
                // first iterate over all queue specifications to find the queue name and all consumer flowlet ids
                QueueName queueName = null;
                List<String> consumerFlowlets = Lists.newLinkedList();
                Node flowlet = Node.flowlet(flowletName);
                Schema schema = schemaGenerator.generate(type.getType());
                for (Map.Entry<String, Set<QueueSpecification>> entry : queueSpecs.row(flowlet).entrySet()) {
                    for (QueueSpecification queueSpec : entry.getValue()) {
                        if (queueSpec.getQueueName().getSimpleName().equals(outputName) && queueSpec.getOutputSchema().equals(schema)) {
                            // the last match wins for queueName; the entry key is recorded as a consumer
                            queueName = queueSpec.getQueueName();
                            consumerFlowlets.add(entry.getKey());
                            // only leaves the inner loop: at most one match per entry, remaining entries are still scanned
                            break;
                        }
                    }
                }
                if (queueName == null) {
                    throw new IllegalArgumentException(String.format("No queue specification found for %s, %s", flowletName, type));
                }
                // create a metric collector for this queue, and also one for each consumer flowlet
                final MetricsContext metrics = flowletContext.getProgramMetrics().childContext(Constants.Metrics.Tag.FLOWLET_QUEUE, outputName);
                final MetricsContext producerMetrics = metrics.childContext(Constants.Metrics.Tag.PRODUCER, flowletContext.getFlowletId());
                // NOTE(review): Guava's Iterables.transform returns a lazy view, so childContext appears to be
                // invoked again for every consumer on each iteration in emitEnqueue - confirm this is intended
                final Iterable<MetricsContext> consumerMetrics = Iterables.transform(consumerFlowlets, new Function<String, MetricsContext>() {

                    @Override
                    public MetricsContext apply(String consumer) {
                        return producerMetrics.childContext(Constants.Metrics.Tag.CONSUMER, consumer);
                    }
                });
                // create a queue metrics emitter that emit to all of the above collectors
                ProducerSupplier producerSupplier = new ProducerSupplier(queueName, queueClientFactory, new QueueMetrics() {

                    @Override
                    public void emitEnqueue(int count) {
                        // count the events on the queue-level context and as pending for every consumer
                        metrics.increment("process.events.out", count);
                        for (MetricsContext collector : consumerMetrics) {
                            collector.increment("queue.pending", count);
                        }
                    }

                    @Override
                    public void emitEnqueueBytes(int bytes) {
                        // no-op: enqueued bytes are not recorded by this emitter
                    }
                });
                producerBuilder.add(producerSupplier);
                return new DatumOutputEmitter<>(producerSupplier, schema, datumWriterFactory.create(type, schema));
            } catch (Exception e) {
                // per Guava, propagate rethrows unchecked exceptions as-is (including the
                // IllegalArgumentException above) and wraps checked ones in a RuntimeException
                throw Throwables.propagate(e);
            }
        }
    };
}
Also used : Set(java.util.Set) ImmutableSet(com.google.common.collect.ImmutableSet) Node(co.cask.cdap.app.queue.QueueSpecificationGenerator.Node) Schema(co.cask.cdap.api.data.schema.Schema) MetricsContext(co.cask.cdap.api.metrics.MetricsContext) UnsupportedTypeException(co.cask.cdap.api.data.schema.UnsupportedTypeException) IOException(java.io.IOException) QueueMetrics(co.cask.cdap.data2.transaction.queue.QueueMetrics) TypeToken(com.google.common.reflect.TypeToken) QueueSpecification(co.cask.cdap.app.queue.QueueSpecification) QueueName(co.cask.cdap.common.queue.QueueName) Map(java.util.Map)

Aggregations

Schema (co.cask.cdap.api.data.schema.Schema): 210, Test (org.junit.Test): 92, StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 69, Table (co.cask.cdap.api.dataset.table.Table): 38, ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 35, ApplicationId (co.cask.cdap.proto.id.ApplicationId): 34, FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification): 32, ApplicationManager (co.cask.cdap.test.ApplicationManager): 30, AppRequest (co.cask.cdap.proto.artifact.AppRequest): 29, KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 24, IOException (java.io.IOException): 23, ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig): 22, ReflectionSchemaGenerator (co.cask.cdap.internal.io.ReflectionSchemaGenerator): 22, ArrayList (java.util.ArrayList): 22, WorkflowManager (co.cask.cdap.test.WorkflowManager): 20, Map (java.util.Map): 18, Set (java.util.Set): 14, UnsupportedTypeException (co.cask.cdap.api.data.schema.UnsupportedTypeException): 12, HashMap (java.util.HashMap): 12, HashSet (java.util.HashSet): 11