Search in sources :

Example 51 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class JoinerBridge method getJoinKeys.

/**
 * Builds the join key record(s) for a single input record coming from the given stage.
 *
 * <p>When the join definition has no distribution configured, exactly one key record is returned.
 * When distribution is configured, a record from the skewed stage yields one key whose
 * {@code SALT_COLUMN} is set to {@code saltGenerator.nextInt(distributionFactor)}, and a record from
 * any other stage yields {@code distributionFactor} keys carrying the salts
 * {@code 0 .. distributionFactor - 1}.
 *
 * @param stageName name of the stage the record was received from
 * @param record the input record; must be a {@link StructuredRecord}
 * @return the key records computed for the input record
 * @throws IllegalArgumentException if the record is null or not a {@link StructuredRecord}, or if no
 *     join keys are registered for {@code stageName}
 * @throws Exception as declared by the overridden method
 */
@Override
public Collection<StructuredRecord> getJoinKeys(String stageName, INPUT_RECORD record) throws Exception {
    if (!(record instanceof StructuredRecord)) {
        // A non-StructuredRecord input is not expected, but it is technically possible.
        // A null record also fails the instanceof check, so the type name is resolved null-safely;
        // otherwise building the message would throw a NullPointerException and hide the real problem.
        String recordType = record == null ? "null" : record.getClass().getName();
        throw new IllegalArgumentException(String.format("Received an input record of unsupported type '%s' from stage '%s'.", recordType, stageName));
    }
    List<String> key = joinKeys.get(stageName);
    if (key == null) {
        // this should not happen, it should be caught by the pipeline app at configure or prepare time and failed then
        throw new IllegalArgumentException(String.format("Received data from stage '%s', but the stage was not included as part of the join. " + "Check the plugin to make sure it is including all input stages.", stageName));
    }
    StructuredRecord inputRecord = (StructuredRecord) record;
    // The key schema is computed lazily, the first time it is found unset.
    if (keySchema == null) {
        keySchema = getKeySchema(stageName, inputRecord.getSchema(), key);
    }
    JoinDistribution distribution = joinDefinition.getDistribution();
    List<StructuredRecord> keyRecords = new ArrayList<>();
    // NOTE(review): this builder is only consumed by the two early-return paths below; the exploded
    // path creates a fresh builder per salt value.
    StructuredRecord.Builder keyRecord = getKeyRecordBuilder(key, inputRecord);
    // If distribution is not enabled then return the record without any changes
    if (distribution == null) {
        keyRecords.add(keyRecord.build());
        return keyRecords;
    }
    int distributionFactor = distribution.getDistributionFactor();
    // If this is the skewed stage then we need to add salt
    if (stageName.equals(distribution.getSkewedStageName())) {
        keyRecord.set(SALT_COLUMN, saltGenerator.nextInt(distributionFactor));
        keyRecords.add(keyRecord.build());
        return keyRecords;
    }
    // This is not the skewed stage so we need to explode it: one key per possible salt value
    for (int i = 0; i < distributionFactor; i++) {
        StructuredRecord.Builder recordBuilder = getKeyRecordBuilder(key, inputRecord);
        recordBuilder.set(SALT_COLUMN, i);
        keyRecords.add(recordBuilder.build());
    }
    return keyRecords;
}
Also used : ArrayList(java.util.ArrayList) JoinDistribution(io.cdap.cdap.etl.api.join.JoinDistribution) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord)

Example 52 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class JoinerBridge method generateOutputSchema.

/**
 * Derives the schema of the joined output from the records being joined.
 *
 * <p>The schema of each element's input record is looked up by stage name, and one output field is
 * produced per selected field of the join definition. The output field is named after the selected
 * field's alias when one is set, otherwise after the original field name.
 *
 * @param elements the join elements whose input records supply the per-stage schemas
 * @return a record schema named "joined" containing the selected fields
 * @throws IllegalArgumentException if a selected field refers to a stage with no element, or to a
 *     field that is missing from that stage's schema
 */
private Schema generateOutputSchema(Iterable<JoinElement<INPUT_RECORD>> elements) {
    // Index the schema of every participating record by the stage it came from.
    Map<String, Schema> schemaByStage = new HashMap<>();
    for (JoinElement<INPUT_RECORD> element : elements) {
        StructuredRecord stageRecord = (StructuredRecord) element.getInputRecord();
        schemaByStage.put(element.getStageName(), stageRecord.getSchema());
    }

    List<Schema.Field> outputFields = new ArrayList<>(joinDefinition.getSelectedFields().size());
    for (JoinField selected : joinDefinition.getSelectedFields()) {
        String sourceName = selected.getFieldName();
        // An alias, when present, replaces the original name in the output.
        String targetName = sourceName;
        if (selected.getAlias() != null) {
            targetName = selected.getAlias();
        }

        Schema sourceSchema = schemaByStage.get(selected.getStageName());
        if (sourceSchema == null) {
            // should not be possible, should be validated earlier
            throw new IllegalArgumentException(String.format("Unable to select field '%s' from stage '%s' because data for the stage could not be found.", sourceName, selected.getStageName()));
        }

        Schema.Field sourceField = sourceSchema.getField(sourceName);
        if (sourceField == null) {
            // should not be possible, should be validated earlier
            throw new IllegalArgumentException(String.format("Unable to select field '%s' from stage '%s' because the field for the stage could not be found.", sourceName, selected.getStageName()));
        }

        Schema fieldSchema = sourceField.getSchema();
        outputFields.add(Schema.Field.of(targetName, fieldSchema));
    }
    return Schema.recordOf("joined", outputFields);
}
Also used : HashMap(java.util.HashMap) Schema(io.cdap.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) JoinField(io.cdap.cdap.etl.api.join.JoinField) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord)

Example 53 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class ReflectionTableTest method testStructuredRecordProjection.

/**
 * Writes {@code SAMUEL} to a table row using the schema generated from {@code User}, reads the row
 * back through a reader built with the schema generated from {@code User2}, and checks the
 * resulting record against the expected {@code User2} value.
 *
 * <p>The table instance is created at the start and deleted in a {@code finally} block.
 */
@Test
public void testStructuredRecordProjection() throws Exception {
    DatasetProperties tableProps = DatasetProperties.builder().build();
    dsFrameworkUtil.createInstance("table", users, tableProps);
    try {
        final Table table = dsFrameworkUtil.getInstance(users);
        final byte[] key = Bytes.toBytes(123);
        final User2 expected = new User2("Samuel L.", 123L, ((Float) 50000000.02f).doubleValue(), Double.MAX_VALUE, ByteBuffer.wrap(new byte[] { 0, 1, 2 }));
        // Schema used for writing the row, and the schema the row is read back with.
        final Schema writeSchema = new ReflectionSchemaGenerator().generate(User.class);
        final Schema readSchema = new ReflectionSchemaGenerator().generate(User2.class);

        TransactionExecutor.Subroutine writeThenReadBack = new TransactionExecutor.Subroutine() {

            @Override
            public void apply() throws Exception {
                // Store the user under the row key using the write schema.
                Put write = new Put(key);
                new ReflectionPutWriter<User>(writeSchema).write(SAMUEL, write);
                table.put(write);

                // Read the same row back through the second schema and compare.
                Row stored = table.get(key);
                ReflectionRowRecordReader reader = new ReflectionRowRecordReader(readSchema, null);
                StructuredRecord projectedRecord = reader.read(stored, writeSchema);
                assertRecordEqualsUser(expected, projectedRecord);
            }
        };

        // TableDataset is not accessible here, but we know that's the underlying implementation...
        TransactionExecutor executor = dsFrameworkUtil.newTransactionExecutor((TransactionAware) table);
        executor.execute(writeThenReadBack);
    } finally {
        dsFrameworkUtil.deleteInstance(users);
    }
}
Also used : Table(io.cdap.cdap.api.dataset.table.Table) Schema(io.cdap.cdap.api.data.schema.Schema) TransactionExecutor(org.apache.tephra.TransactionExecutor) ReflectionSchemaGenerator(io.cdap.cdap.internal.io.ReflectionSchemaGenerator) Put(io.cdap.cdap.api.dataset.table.Put) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) ReflectionPutWriter(io.cdap.cdap.internal.io.ReflectionPutWriter) Row(io.cdap.cdap.api.dataset.table.Row) ReflectionRowRecordReader(io.cdap.cdap.internal.io.ReflectionRowRecordReader) Test(org.junit.Test)

Example 54 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class FlattenErrorTransform method transform.

/**
 * Emits a record that carries every field of the failed input record plus the error details.
 *
 * <p>The output record is built against {@code getOutputSchema(...)} of the failed record's schema.
 * All original field values are copied over, then {@code errMsg}, {@code errCode} and
 * {@code errStage} are set from the error record.
 *
 * @param input the error record wrapping the failed record and its error information
 * @param emitter receives the single flattened output record
 */
@Override
public void transform(ErrorRecord<StructuredRecord> input, Emitter<StructuredRecord> emitter) throws Exception {
    StructuredRecord failed = input.getRecord();
    Schema failedSchema = failed.getSchema();
    StructuredRecord.Builder flattened = StructuredRecord.builder(getOutputSchema(failedSchema));

    // Carry over every field of the failed record unchanged.
    for (Schema.Field original : failedSchema.getFields()) {
        String name = original.getName();
        flattened.set(name, failed.get(name));
    }

    // Append the error details and emit the finished record.
    StructuredRecord result = flattened
        .set("errMsg", input.getErrorMessage())
        .set("errCode", input.getErrorCode())
        .set("errStage", input.getStageName())
        .build();
    emitter.emit(result);
}
Also used : Schema(io.cdap.cdap.api.data.schema.Schema) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord)

Example 55 with StructuredRecord

use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class DelimitedStringsRecordFormatTest method testTSV.

/**
 * Reads a tab-separated line through a format created from a {@code Formats.TSV} specification and
 * checks that the record's "body" field equals the line split on tabs.
 */
@Test
public void testTSV() throws Exception {
    String line = "userX\tactionY\titemZ";

    FormatSpecification tsvSpec = new FormatSpecification(Formats.TSV, null, Collections.<String, String>emptyMap());
    RecordFormat<ByteBuffer, StructuredRecord> tsvFormat = RecordFormats.createInitializedFormat(tsvSpec);

    ByteBuffer encoded = ByteBuffer.wrap(Bytes.toBytes(line));
    StructuredRecord parsed = tsvFormat.read(encoded);

    String[] fromRecord = parsed.get("body");
    String[] fromSplit = line.split("\t");
    Assert.assertArrayEquals(fromSplit, fromRecord);
}
Also used : FormatSpecification(io.cdap.cdap.api.data.format.FormatSpecification) ByteBuffer(java.nio.ByteBuffer) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) Test(org.junit.Test)

Aggregations

StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord) 210, Schema (io.cdap.cdap.api.data.schema.Schema) 169, Test (org.junit.Test) 119, Table (io.cdap.cdap.api.dataset.table.Table) 76, ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage) 73, ApplicationId (io.cdap.cdap.proto.id.ApplicationId) 73, AppRequest (io.cdap.cdap.proto.artifact.AppRequest) 68, ApplicationManager (io.cdap.cdap.test.ApplicationManager) 68, ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig) 59, WorkflowManager (io.cdap.cdap.test.WorkflowManager) 54, HashSet (java.util.HashSet) 50, ArrayList (java.util.ArrayList) 44, KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable) 40, HashMap (java.util.HashMap) 25, File (java.io.File) 17, ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin) 16, FormatSpecification (io.cdap.cdap.api.data.format.FormatSpecification) 15, DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig) 14, SparkManager (io.cdap.cdap.test.SparkManager) 12, Map (java.util.Map) 12