Use of co.cask.cdap.api.stream.GenericStreamEventData in project cdap by caskdata.
The class StreamBatchSource, method transform().
@Override
public void transform(KeyValue<LongWritable, Object> input, Emitter<StructuredRecord> emitter) throws Exception {
  // if no format spec was given, the value is a StreamEvent
  if (Strings.isNullOrEmpty(streamBatchConfig.format)) {
    StreamEvent event = (StreamEvent) input.getValue();
    Map<String, String> headers = Objects.firstNonNull(event.getHeaders(), ImmutableMap.<String, String>of());
    StructuredRecord output = StructuredRecord.builder(DEFAULT_SCHEMA)
      .set("ts", input.getKey().get())
      .set("headers", headers)
      .set("body", event.getBody())
      .build();
    emitter.emit(output);
  } else {
    // otherwise, it will be a GenericStreamEventData
    @SuppressWarnings("unchecked")
    GenericStreamEventData<StructuredRecord> event = (GenericStreamEventData<StructuredRecord>) input.getValue();
    StructuredRecord record = event.getBody();
    Schema inputSchema = record.getSchema();
    Schema outputSchema = schemaCache.get(inputSchema);
    // if we haven't seen this schema before, generate the output schema (add ts and headers fields)
    if (outputSchema == null) {
      List<Schema.Field> fields = Lists.newArrayList();
      fields.add(DEFAULT_SCHEMA.getField("ts"));
      fields.add(DEFAULT_SCHEMA.getField("headers"));
      fields.addAll(inputSchema.getFields());
      outputSchema = Schema.recordOf(inputSchema.getRecordName(), fields);
      schemaCache.put(inputSchema, outputSchema);
    }
    // easier to deal with an empty map than with nullables, so the headers field is non-nullable
    Map<String, String> headers = Objects.firstNonNull(event.getHeaders(), ImmutableMap.<String, String>of());
    StructuredRecord.Builder builder = StructuredRecord.builder(outputSchema);
    builder.set("ts", input.getKey().get());
    builder.set("headers", headers);
    for (Schema.Field field : inputSchema.getFields()) {
      String fieldName = field.getName();
      builder.set(fieldName, record.get(fieldName));
    }
    emitter.emit(builder.build());
  }
}
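The builder above relies on a DEFAULT_SCHEMA constant and a schemaCache map defined elsewhere in StreamBatchSource. A minimal sketch of the schema, inferred from the three fields set in the snippet (ts from the LongWritable key, headers as a string-to-string map, body from StreamEvent.getBody()); the exact definition in the project may differ:

private static final Schema DEFAULT_SCHEMA = Schema.recordOf(
  "event",
  // ts comes from the LongWritable key of the input KeyValue
  Schema.Field.of("ts", Schema.of(Schema.Type.LONG)),
  // headers are always set to a (possibly empty) map, so the field is non-nullable
  Schema.Field.of("headers", Schema.mapOf(Schema.of(Schema.Type.STRING), Schema.of(Schema.Type.STRING))),
  // body holds the raw event payload when no format spec is configured
  Schema.Field.of("body", Schema.of(Schema.Type.BYTES)));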
Use of co.cask.cdap.api.stream.GenericStreamEventData in project cdap by caskdata.
The class StreamInputFormatTest, method testFormatStreamRecordReader().
@Test
public void testFormatStreamRecordReader() throws IOException, InterruptedException {
  File inputDir = tmpFolder.newFolder();
  File partition = new File(inputDir, "1.1000");
  partition.mkdirs();
  File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
  File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());
  // write 1 event
  StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile),
                                                         Files.newOutputStreamSupplier(indexFile), 100L);
  StreamEvent streamEvent = new StreamEvent(ImmutableMap.of("header1", "value1", "header2", "value2"),
                                            Charsets.UTF_8.encode("hello world"), 1000);
  writer.append(streamEvent);
  writer.close();
  FormatSpecification formatSpec =
    new FormatSpecification(TextRecordFormat.class.getName(),
                            Schema.recordOf("event", Schema.Field.of("body", Schema.of(Schema.Type.STRING))),
                            Collections.<String, String>emptyMap());
  Configuration conf = new Configuration();
  AbstractStreamInputFormat.setStreamId(conf, DUMMY_ID);
  AbstractStreamInputFormat.setBodyFormatSpecification(conf, formatSpec);
  AbstractStreamInputFormat.setStreamPath(conf, inputDir.toURI());
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  AbstractStreamInputFormat format = new AbstractStreamInputFormat() {

    @Override
    public AuthorizationEnforcer getAuthorizationEnforcer(TaskAttemptContext context) {
      return new NoOpAuthorizer();
    }

    @Override
    public AuthenticationContext getAuthenticationContext(TaskAttemptContext context) {
      return new AuthenticationTestContext();
    }
  };
  // read all splits and store the results in the list
  List<GenericStreamEventData<StructuredRecord>> recordsRead = Lists.newArrayList();
  List<InputSplit> inputSplits = format.getSplits(context);
  for (InputSplit split : inputSplits) {
    RecordReader<LongWritable, GenericStreamEventData<StructuredRecord>> recordReader =
      format.createRecordReader(split, context);
    recordReader.initialize(split, context);
    while (recordReader.nextKeyValue()) {
      recordsRead.add(recordReader.getCurrentValue());
    }
  }
  // should only have read 1 record
  Assert.assertEquals(1, recordsRead.size());
  GenericStreamEventData<StructuredRecord> eventData = recordsRead.get(0);
  Assert.assertEquals(streamEvent.getHeaders(), eventData.getHeaders());
  Assert.assertEquals("hello world", eventData.getBody().get("body"));
}
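The test collects only the values; the keys produced by the same record reader are LongWritable values that, as in the transform() snippet above, carry the event timestamp. A small sketch, not part of the original test, of also collecting and checking the keys, assuming the key is the timestamp the event was appended with (1000):

// inside the split loop, also keep the keys
List<Long> timestampsRead = Lists.newArrayList();
while (recordReader.nextKeyValue()) {
  timestampsRead.add(recordReader.getCurrentKey().get());  // LongWritable -> long timestamp
  recordsRead.add(recordReader.getCurrentValue());
}

// after reading all splits: the single event was appended with timestamp 1000
Assert.assertEquals(1000L, timestampsRead.get(0).longValue());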
Use of co.cask.cdap.api.stream.GenericStreamEventData in project cdap by caskdata.
The class StreamFormatSpecSpark, method run().
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  SQLContext sqlContext = new SQLContext(jsc);
  // Read from the CSV stream and turn it into a DataFrame
  String streamName = sec.getRuntimeArguments().get("stream.name");
  Schema schema = Schema.recordOf("record",
                                  ImmutableList.of(Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
                                                   Schema.Field.of("age", Schema.of(Schema.Type.INT))));
  FormatSpecification formatSpec = new FormatSpecification("csv", schema);
  JavaPairRDD<Long, GenericStreamEventData<StructuredRecord>> rdd =
    sec.fromStream(streamName, formatSpec, StructuredRecord.class);
  JavaRDD<Person> personRDD = rdd.values().map(new Function<GenericStreamEventData<StructuredRecord>, Person>() {

    @Override
    public Person call(GenericStreamEventData<StructuredRecord> data) throws Exception {
      StructuredRecord record = data.getBody();
      return new Person(record.<String>get("name"), record.<Integer>get("age"));
    }
  });
  sqlContext.createDataFrame(personRDD, Person.class).registerTempTable("people");
  // Execute a SQL query on the table and save the result
  JavaPairRDD<String, Integer> resultRDD = sqlContext.sql(sec.getRuntimeArguments().get("sql.statement"))
    .toJavaRDD()
    .mapToPair(new PairFunction<Row, String, Integer>() {

      @Override
      public Tuple2<String, Integer> call(Row row) throws Exception {
        return new Tuple2<>(row.getString(0), row.getInt(1));
      }
    });
  sec.saveAsDataset(resultRDD, sec.getRuntimeArguments().get("output.dataset"));
}
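createDataFrame(personRDD, Person.class) requires Person to be a JavaBean so that Spark SQL can infer the "name" and "age" columns by reflection. The class itself is not shown in this snippet; a minimal sketch consistent with the fields used above (the actual class in the example may differ):

public static class Person implements java.io.Serializable {
  private String name;
  private int age;

  public Person(String name, int age) {
    this.name = name;
    this.age = age;
  }

  // JavaBean accessors let Spark SQL derive the schema by reflection
  public String getName() { return name; }
  public void setName(String name) { this.name = name; }
  public int getAge() { return age; }
  public void setAge(int age) { this.age = age; }
}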