Example 21 with StructuredRecord

Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

From the class CombinedLogRecordFormatTest, the method testCLFLogWithNull:

@Test
public void testCLFLogWithNull() throws UnsupportedTypeException, UnexpectedFormatException {
    CombinedLogRecordFormat format = new CombinedLogRecordFormat();
    FormatSpecification spec = new FormatSpecification(CombinedLogRecordFormat.class.getCanonicalName(), null, ImmutableMap.<String, String>of());
    format.initialize(spec);
    String data = "10.10.10.10 - - [01/Feb/2015:09:58:24 +0000] \"-\" 408 - \"-\" \"-\"";
    StructuredRecord output = format.read(new StreamEvent(ByteBuffer.wrap(Bytes.toBytes(data))));
    Assert.assertEquals("10.10.10.10", output.get("remote_host"));
    Assert.assertNull(output.get("remote_login"));
    Assert.assertNull(output.get("auth_user"));
    Assert.assertEquals("01/Feb/2015:09:58:24 +0000", output.get("request_time"));
    Assert.assertNull(output.get("request"));
    Assert.assertEquals(408, output.get("status"));
    Assert.assertNull(output.get("content_length"));
    Assert.assertNull(output.get("referrer"));
    Assert.assertNull(output.get("user_agent"));
}
Also used: StreamEvent (co.cask.cdap.api.flow.flowlet.StreamEvent), FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), Test (org.junit.Test)
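
For contrast, a minimal sketch of the fully populated case, reusing the classes from the test above. The log line, the test method name, and the expected values are hypothetical; they assume the format maps standard combined-log fields to the names asserted above:

@Test
public void testCLFLogFullyPopulated() throws UnsupportedTypeException, UnexpectedFormatException {
    CombinedLogRecordFormat format = new CombinedLogRecordFormat();
    FormatSpecification spec = new FormatSpecification(CombinedLogRecordFormat.class.getCanonicalName(), null, ImmutableMap.<String, String>of());
    format.initialize(spec);
    // hypothetical line with every combined-log field present
    String data = "10.10.10.10 - frank [01/Feb/2015:09:58:24 +0000] \"GET /index.html HTTP/1.1\" 200 2326 \"http://example.com/start\" \"Mozilla/5.0\"";
    StructuredRecord output = format.read(new StreamEvent(ByteBuffer.wrap(Bytes.toBytes(data))));
    Assert.assertEquals("frank", output.get("auth_user"));
    Assert.assertEquals("GET /index.html HTTP/1.1", output.get("request"));
    Assert.assertEquals(200, output.get("status"));
    Assert.assertEquals("http://example.com/start", output.get("referrer"));
    Assert.assertEquals("Mozilla/5.0", output.get("user_agent"));
}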

Example 22 with StructuredRecord

Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

From the class StreamBatchSource, the method transform:

@Override
public void transform(KeyValue<LongWritable, Object> input, Emitter<StructuredRecord> emitter) throws Exception {
    // if no format spec was given, the value is a StreamEvent
    if (Strings.isNullOrEmpty(streamBatchConfig.format)) {
        StreamEvent event = (StreamEvent) input.getValue();
        Map<String, String> headers = Objects.firstNonNull(event.getHeaders(), ImmutableMap.<String, String>of());
        StructuredRecord output = StructuredRecord.builder(DEFAULT_SCHEMA)
            .set("ts", input.getKey().get())
            .set("headers", headers)
            .set("body", event.getBody())
            .build();
        emitter.emit(output);
    } else {
        // otherwise, it will be a GenericStreamEventData
        @SuppressWarnings("unchecked")
        GenericStreamEventData<StructuredRecord> event = (GenericStreamEventData<StructuredRecord>) input.getValue();
        StructuredRecord record = event.getBody();
        Schema inputSchema = record.getSchema();
        Schema outputSchema = schemaCache.get(inputSchema);
        // if we haven't seen this schema before, generate the output schema (add ts and header fields)
        if (outputSchema == null) {
            List<Schema.Field> fields = Lists.newArrayList();
            fields.add(DEFAULT_SCHEMA.getField("ts"));
            fields.add(DEFAULT_SCHEMA.getField("headers"));
            fields.addAll(inputSchema.getFields());
            outputSchema = Schema.recordOf(inputSchema.getRecordName(), fields);
            schemaCache.put(inputSchema, outputSchema);
        }
        // easier to just deal with an empty map than deal with nullables, so the headers field is non-nullable.
        Map<String, String> headers = Objects.firstNonNull(event.getHeaders(), ImmutableMap.<String, String>of());
        StructuredRecord.Builder builder = StructuredRecord.builder(outputSchema);
        builder.set("ts", input.getKey().get());
        builder.set("headers", headers);
        for (Schema.Field field : inputSchema.getFields()) {
            String fieldName = field.getName();
            builder.set(fieldName, record.get(fieldName));
        }
        emitter.emit(builder.build());
    }
}
Also used: StreamEvent (co.cask.cdap.api.flow.flowlet.StreamEvent), Schema (co.cask.cdap.api.data.schema.Schema), GenericStreamEventData (co.cask.cdap.api.stream.GenericStreamEventData), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord)
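
The ts/headers prefixing is easiest to see in isolation. Below is a minimal sketch of the schema merge; DEFAULT_SCHEMA is reconstructed here from how transform() uses it, so its exact shape (and the sample event schema) are assumptions:

// assumed shape of DEFAULT_SCHEMA, inferred from the ts/headers/body accesses above
Schema defaultSchema = Schema.recordOf("streamEvent",
    Schema.Field.of("ts", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("headers", Schema.mapOf(Schema.of(Schema.Type.STRING), Schema.of(Schema.Type.STRING))),
    Schema.Field.of("body", Schema.of(Schema.Type.BYTES)));

// hypothetical schema of a formatted event body
Schema inputSchema = Schema.recordOf("event", Schema.Field.of("msg", Schema.of(Schema.Type.STRING)));

// prepend ts and headers, then append every field of the event schema
List<Schema.Field> fields = Lists.newArrayList(defaultSchema.getField("ts"), defaultSchema.getField("headers"));
fields.addAll(inputSchema.getFields());
Schema outputSchema = Schema.recordOf(inputSchema.getRecordName(), fields);
// outputSchema fields are now: ts, headers, msg

The schemaCache lookup in transform() exists so this merge runs once per distinct input schema rather than once per record.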

Example 23 with StructuredRecord

Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

From the class LookupTransform, the method transform:

@Override
public void transform(StructuredRecord input, Emitter<StructuredRecord> emitter) throws Exception {
    T lookedUpValue = lookup.lookup((String) input.get(config.lookupKey));
    // for the output schema, copy all the input fields, and add the 'destinationField'
    List<Schema.Field> outFields = new ArrayList<>();
    for (Schema.Field field : input.getSchema().getFields()) {
        outFields.add(field);
    }
    if (lookedUpValue instanceof String) {
        outFields.add(Schema.Field.of(config.destinationField, Schema.of(Schema.Type.STRING)));
    } else if (lookedUpValue instanceof Row) {
        Row lookedupRow = (Row) lookedUpValue;
        for (byte[] column : lookedupRow.getColumns().keySet()) {
            outFields.add(Schema.Field.of(Bytes.toString(column), Schema.of(Schema.Type.STRING)));
        }
    } else {
        throw new IllegalArgumentException("Unexpected value type: " + lookedUpValue.getClass());
    }
    Schema outSchema = Schema.recordOf(input.getSchema().getRecordName(), outFields);
    // copy all the values
    StructuredRecord.Builder outputBuilder = StructuredRecord.builder(outSchema);
    for (Schema.Field inField : input.getSchema().getFields()) {
        if (inField.getName().equals(config.lookupKey)) {
            if (lookedUpValue instanceof String) {
                outputBuilder.set(config.destinationField, lookedUpValue);
            } else {
                // due to the check above, we know it's a Row
                Row lookedupRow = (Row) lookedUpValue;
                for (Map.Entry<byte[], byte[]> entry : lookedupRow.getColumns().entrySet()) {
                    outputBuilder.set(Bytes.toString(entry.getKey()), Bytes.toString(entry.getValue()));
                }
            }
        }
        // what if the destinationField already exists?
        outputBuilder.set(inField.getName(), input.get(inField.getName()));
    }
    emitter.emit(outputBuilder.build());
}
Also used: Schema (co.cask.cdap.api.data.schema.Schema), ArrayList (java.util.ArrayList), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), PluginPropertyField (co.cask.cdap.api.plugin.PluginPropertyField), Row (co.cask.cdap.api.dataset.table.Row), HashMap (java.util.HashMap), Map (java.util.Map)
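
To make the String branch concrete, here is a sketch of the record shapes involved; the schemas, field names, config values, and lookup result are all hypothetical:

// hypothetical input record whose 'name' field is the lookup key
Schema inSchema = Schema.recordOf("in",
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("age", Schema.of(Schema.Type.INT)));
StructuredRecord input = StructuredRecord.builder(inSchema)
    .set("name", "alice")
    .set("age", 30)
    .build();

// with config.lookupKey = "name", config.destinationField = "nickname",
// and a lookup table mapping "alice" -> "al", transform() emits a record
// whose schema is the input schema plus a STRING 'nickname' field:
Schema outSchema = Schema.recordOf("in",
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("age", Schema.of(Schema.Type.INT)),
    Schema.Field.of("nickname", Schema.of(Schema.Type.STRING)));
StructuredRecord expected = StructuredRecord.builder(outSchema)
    .set("name", "alice")
    .set("age", 30)
    .set("nickname", "al")
    .build();

Note the open question flagged in the source comment: if destinationField collides with an existing input field, the unconditional set in the copy loop can overwrite the looked-up value.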

Example 24 with StructuredRecord

Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

From the class DupeFlagger, the method merge:

@Override
public StructuredRecord merge(StructuredRecord joinKey, Iterable<JoinElement<StructuredRecord>> joinRow) {
    StructuredRecord record = null;
    boolean containsDupe = false;
    for (JoinElement<StructuredRecord> element : joinRow) {
        if (element.getStageName().equals(config.keep)) {
            record = element.getInputRecord();
        } else {
            containsDupe = true;
        }
    }
    if (record == null) {
        // can only happen if 'keep' was a macro and did not evaluate to one of the inputs
        throw new IllegalArgumentException("No record for " + config.keep + " was found.");
    }
    Schema outputSchema = getOutputSchema(record.getSchema());
    StructuredRecord.Builder outputBuilder = StructuredRecord.builder(outputSchema).set(config.flagField, containsDupe);
    for (Schema.Field field : record.getSchema().getFields()) {
        outputBuilder.set(field.getName(), record.get(field.getName()));
    }
    return outputBuilder.build();
}
Also used: Schema (co.cask.cdap.api.data.schema.Schema), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord)
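
A sketch of the merge semantics, assuming JoinElement exposes a (stageName, inputRecord) constructor matching the getters used above; the schema, stage names, and config values are hypothetical:

// hypothetical record from the stage we want to keep
Schema schema = Schema.recordOf("user", Schema.Field.of("id", Schema.of(Schema.Type.LONG)));
StructuredRecord user = StructuredRecord.builder(schema).set("id", 1L).build();

// with config.keep = "users" and config.flagField = "is_dupe":
List<JoinElement<StructuredRecord>> joinRow = ImmutableList.of(
    new JoinElement<>("users", user),    // matched by stage name, becomes the output record
    new JoinElement<>("other", user));   // any element from another stage flips containsDupe

// merge(joinKey, joinRow) returns 'user' extended with a boolean is_dupe = true;
// a join row containing only the "users" element would yield is_dupe = false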

Example 25 with StructuredRecord

Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

From the class MockRuntimeDatasetSink, the method readOutput:

/**
   * Used to read the records written by this sink.
   *
   * @param tableManager dataset manager used to get the sink dataset to read from
   * @return the records written to the sink, one per table row
   */
public static List<StructuredRecord> readOutput(DataSetManager<Table> tableManager) throws Exception {
    Table table = tableManager.get();
    try (Scanner scanner = table.scan(null, null)) {
        List<StructuredRecord> records = new ArrayList<>();
        Row row;
        while ((row = scanner.next()) != null) {
            Schema schema = Schema.parseJson(row.getString(SCHEMA_COL));
            String recordStr = row.getString(RECORD_COL);
            records.add(StructuredRecordStringConverter.fromJsonString(recordStr, schema));
        }
        return records;
    }
}
Also used: Scanner (co.cask.cdap.api.dataset.table.Scanner), Table (co.cask.cdap.api.dataset.table.Table), KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable), Schema (co.cask.cdap.api.data.schema.Schema), ArrayList (java.util.ArrayList), Row (co.cask.cdap.api.dataset.table.Row), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord)
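
A hypothetical write side that this reader would pair with. The writeOutput helper, the random row-key scheme, and the use of Schema.toString() to produce the schema JSON are all assumptions; only SCHEMA_COL, RECORD_COL, and the JSON round-trip via StructuredRecordStringConverter come from the source:

/**
 * Hypothetical counterpart to readOutput(): writes one record per table row,
 * storing the schema JSON and record JSON in the columns readOutput() scans.
 */
private static void writeOutput(DataSetManager<Table> tableManager, StructuredRecord record) throws Exception {
    Table table = tableManager.get();
    // readOutput() scans the whole table, so the row key only needs to be unique
    Put put = new Put(Bytes.toBytes(UUID.randomUUID().toString()));
    put.add(SCHEMA_COL, record.getSchema().toString());
    put.add(RECORD_COL, StructuredRecordStringConverter.toJsonString(record));
    table.put(put);
    tableManager.flush();
}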

Aggregations

StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 97 usages
Schema (co.cask.cdap.api.data.schema.Schema): 71 usages
Test (org.junit.Test): 51 usages
Table (co.cask.cdap.api.dataset.table.Table): 36 usages
ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 36 usages
ApplicationId (co.cask.cdap.proto.id.ApplicationId): 36 usages
ApplicationManager (co.cask.cdap.test.ApplicationManager): 33 usages
AppRequest (co.cask.cdap.proto.artifact.AppRequest): 31 usages
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 25 usages
ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig): 25 usages
WorkflowManager (co.cask.cdap.test.WorkflowManager): 23 usages
ArrayList (java.util.ArrayList): 20 usages
StreamEvent (co.cask.cdap.api.flow.flowlet.StreamEvent): 19 usages
FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification): 18 usages
HashSet (java.util.HashSet): 10 usages
DataStreamsConfig (co.cask.cdap.etl.proto.v2.DataStreamsConfig): 8 usages
File (java.io.File): 8 usages
TimeoutException (java.util.concurrent.TimeoutException): 8 usages
Put (co.cask.cdap.api.dataset.table.Put): 7 usages
ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin): 7 usages