Search in sources:

Example 1 with GenericStreamEventData

Use of co.cask.cdap.api.stream.GenericStreamEventData in project cdap by caskdata.

From the class StreamBatchSource, the transform method:

@Override
public void transform(KeyValue<LongWritable, Object> input, Emitter<StructuredRecord> emitter) throws Exception {
    // if no format spec was given, the value is a StreamEvent
    if (Strings.isNullOrEmpty(streamBatchConfig.format)) {
        StreamEvent event = (StreamEvent) input.getValue();
        Map<String, String> headers = Objects.firstNonNull(event.getHeaders(), ImmutableMap.<String, String>of());
        StructuredRecord output = StructuredRecord.builder(DEFAULT_SCHEMA).set("ts", input.getKey().get()).set("headers", headers).set("body", event.getBody()).build();
        emitter.emit(output);
    } else {
        // otherwise, it will be a GenericStreamEventData
        @SuppressWarnings("unchecked") GenericStreamEventData<StructuredRecord> event = (GenericStreamEventData<StructuredRecord>) input.getValue();
        StructuredRecord record = event.getBody();
        Schema inputSchema = record.getSchema();
        Schema outputSchema = schemaCache.get(inputSchema);
        // if we haven't seen this schema before, generate the output schema (add ts and header fields)
        if (outputSchema == null) {
            List<Schema.Field> fields = Lists.newArrayList();
            fields.add(DEFAULT_SCHEMA.getField("ts"));
            fields.add(DEFAULT_SCHEMA.getField("headers"));
            fields.addAll(inputSchema.getFields());
            outputSchema = Schema.recordOf(inputSchema.getRecordName(), fields);
            schemaCache.put(inputSchema, outputSchema);
        }
        // easier to just deal with an empty map than deal with nullables, so the headers field is non-nullable.
        Map<String, String> headers = Objects.firstNonNull(event.getHeaders(), ImmutableMap.<String, String>of());
        StructuredRecord.Builder builder = StructuredRecord.builder(outputSchema);
        builder.set("ts", input.getKey().get());
        builder.set("headers", headers);
        for (Schema.Field field : inputSchema.getFields()) {
            String fieldName = field.getName();
            builder.set(fieldName, record.get(fieldName));
        }
        emitter.emit(builder.build());
    }
}
Also used: StreamEvent (co.cask.cdap.api.flow.flowlet.StreamEvent), Schema (co.cask.cdap.api.data.schema.Schema), GenericStreamEventData (co.cask.cdap.api.stream.GenericStreamEventData), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord)
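
The transform method above references a DEFAULT_SCHEMA constant and a schemaCache map that are declared elsewhere in StreamBatchSource. A minimal sketch of what those declarations might look like, with the field names taken from the snippet and the exact types assumed:

// Hypothetical sketch of the fields the transform method relies on; the
// actual declarations in StreamBatchSource may differ.
// DEFAULT_SCHEMA covers the no-format case: the event timestamp, the stream
// headers, and the raw body bytes.
private static final Schema DEFAULT_SCHEMA = Schema.recordOf(
    "streamEvent",
    Schema.Field.of("ts", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("headers", Schema.mapOf(Schema.of(Schema.Type.STRING),
                                            Schema.of(Schema.Type.STRING))),
    Schema.Field.of("body", Schema.of(Schema.Type.BYTES)));

// Maps each distinct input schema to its output schema (the input fields plus
// ts and headers), so the output schema is built only once per input schema.
private final Map<Schema, Schema> schemaCache = Maps.newHashMap();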

Example 2 with GenericStreamEventData

Use of co.cask.cdap.api.stream.GenericStreamEventData in project cdap by caskdata.

From the class StreamInputFormatTest, the testFormatStreamRecordReader method:

@Test
public void testFormatStreamRecordReader() throws IOException, InterruptedException {
    File inputDir = tmpFolder.newFolder();
    File partition = new File(inputDir, "1.1000");
    partition.mkdirs();
    File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
    File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());
    // write 1 event
    StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile), Files.newOutputStreamSupplier(indexFile), 100L);
    StreamEvent streamEvent = new StreamEvent(ImmutableMap.of("header1", "value1", "header2", "value2"), Charsets.UTF_8.encode("hello world"), 1000);
    writer.append(streamEvent);
    writer.close();
    FormatSpecification formatSpec = new FormatSpecification(TextRecordFormat.class.getName(), Schema.recordOf("event", Schema.Field.of("body", Schema.of(Schema.Type.STRING))), Collections.<String, String>emptyMap());
    Configuration conf = new Configuration();
    AbstractStreamInputFormat.setStreamId(conf, DUMMY_ID);
    AbstractStreamInputFormat.setBodyFormatSpecification(conf, formatSpec);
    AbstractStreamInputFormat.setStreamPath(conf, inputDir.toURI());
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    AbstractStreamInputFormat format = new AbstractStreamInputFormat() {

        @Override
        public AuthorizationEnforcer getAuthorizationEnforcer(TaskAttemptContext context) {
            return new NoOpAuthorizer();
        }

        @Override
        public AuthenticationContext getAuthenticationContext(TaskAttemptContext context) {
            return new AuthenticationTestContext();
        }
    };
    // read all splits and store the results in the list
    List<GenericStreamEventData<StructuredRecord>> recordsRead = Lists.newArrayList();
    List<InputSplit> inputSplits = format.getSplits(context);
    for (InputSplit split : inputSplits) {
        RecordReader<LongWritable, GenericStreamEventData<StructuredRecord>> recordReader = format.createRecordReader(split, context);
        recordReader.initialize(split, context);
        while (recordReader.nextKeyValue()) {
            recordsRead.add(recordReader.getCurrentValue());
        }
    }
    // should only have read 1 record
    Assert.assertEquals(1, recordsRead.size());
    GenericStreamEventData<StructuredRecord> eventData = recordsRead.get(0);
    Assert.assertEquals(streamEvent.getHeaders(), eventData.getHeaders());
    Assert.assertEquals("hello world", eventData.getBody().get("body"));
}
Also used: TextRecordFormat (co.cask.cdap.format.TextRecordFormat), Configuration (org.apache.hadoop.conf.Configuration), TaskAttemptID (org.apache.hadoop.mapred.TaskAttemptID), StreamEvent (co.cask.cdap.api.flow.flowlet.StreamEvent), FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification), AuthenticationTestContext (co.cask.cdap.security.auth.context.AuthenticationTestContext), NoOpAuthorizer (co.cask.cdap.security.spi.authorization.NoOpAuthorizer), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), GenericStreamEventData (co.cask.cdap.api.stream.GenericStreamEventData), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl), LongWritable (org.apache.hadoop.io.LongWritable), File (java.io.File), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Test (org.junit.Test)
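
The test decodes each event body with TextRecordFormat into a single "body" string field, but the same setup applies to any supported record format via the format specification. A hedged variation, assuming comma-separated bodies and the built-in "csv" format name that Example 3 below uses:

// Hypothetical variation on the setup above: decode each event body as CSV
// into two typed fields instead of one "body" string.
Schema csvSchema = Schema.recordOf(
    "event",
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("age", Schema.of(Schema.Type.INT)));
FormatSpecification csvSpec = new FormatSpecification(
    "csv", csvSchema, Collections.<String, String>emptyMap());
AbstractStreamInputFormat.setBodyFormatSpecification(conf, csvSpec);
// Each value read back is then a GenericStreamEventData<StructuredRecord>
// whose body exposes record.get("name") and record.get("age").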

Example 3 with GenericStreamEventData

Use of co.cask.cdap.api.stream.GenericStreamEventData in project cdap by caskdata.

From the class StreamFormatSpecSpark, the run method:

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();
    SQLContext sqlContext = new SQLContext(jsc);
    // Read from CSV stream and turn it into a DataFrame
    String streamName = sec.getRuntimeArguments().get("stream.name");
    Schema schema = Schema.recordOf("record", ImmutableList.of(Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("age", Schema.of(Schema.Type.INT))));
    FormatSpecification formatSpec = new FormatSpecification("csv", schema);
    JavaPairRDD<Long, GenericStreamEventData<StructuredRecord>> rdd = sec.fromStream(streamName, formatSpec, StructuredRecord.class);
    JavaRDD<Person> personRDD = rdd.values().map(new Function<GenericStreamEventData<StructuredRecord>, Person>() {

        @Override
        public Person call(GenericStreamEventData<StructuredRecord> data) throws Exception {
            StructuredRecord record = data.getBody();
            return new Person(record.<String>get("name"), record.<Integer>get("age"));
        }
    });
    sqlContext.createDataFrame(personRDD, Person.class).registerTempTable("people");
    // Execute a SQL on the table and save the result
    JavaPairRDD<String, Integer> resultRDD = sqlContext.sql(sec.getRuntimeArguments().get("sql.statement")).toJavaRDD().mapToPair(new PairFunction<Row, String, Integer>() {

        @Override
        public Tuple2<String, Integer> call(Row row) throws Exception {
            return new Tuple2<>(row.getString(0), row.getInt(1));
        }
    });
    sec.saveAsDataset(resultRDD, sec.getRuntimeArguments().get("output.dataset"));
}
Also used: Schema (co.cask.cdap.api.data.schema.Schema), FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification), GenericStreamEventData (co.cask.cdap.api.stream.GenericStreamEventData), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), Tuple2 (scala.Tuple2), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), Row (org.apache.spark.sql.Row), SQLContext (org.apache.spark.sql.SQLContext)
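
The call sqlContext.createDataFrame(personRDD, Person.class) infers the DataFrame columns by introspecting Person as a Java bean, so Person needs getters for name and age and must be serializable to ship with the job. A minimal sketch of what that class might look like; the actual class in the CDAP example may differ:

// Hypothetical Person bean assumed by the Spark program above. Spark SQL
// derives the "name" and "age" columns from the getters.
public static class Person implements java.io.Serializable {
    private final String name;
    private final int age;

    public Person(String name, int age) {
        this.name = name;
        this.age = age;
    }

    public String getName() { return name; }
    public int getAge() { return age; }
}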

Aggregations

StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 3
GenericStreamEventData (co.cask.cdap.api.stream.GenericStreamEventData): 3
FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification): 2
Schema (co.cask.cdap.api.data.schema.Schema): 2
StreamEvent (co.cask.cdap.api.flow.flowlet.StreamEvent): 2
TextRecordFormat (co.cask.cdap.format.TextRecordFormat): 1
AuthenticationTestContext (co.cask.cdap.security.auth.context.AuthenticationTestContext): 1
NoOpAuthorizer (co.cask.cdap.security.spi.authorization.NoOpAuthorizer): 1
File (java.io.File): 1
Configuration (org.apache.hadoop.conf.Configuration): 1
LongWritable (org.apache.hadoop.io.LongWritable): 1
TaskAttemptID (org.apache.hadoop.mapred.TaskAttemptID): 1
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 1
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 1
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 1
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 1
Row (org.apache.spark.sql.Row): 1
SQLContext (org.apache.spark.sql.SQLContext): 1
Test (org.junit.Test): 1
Tuple2 (scala.Tuple2): 1