use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class JoinerBridge method getJoinKeys.
/**
 * Computes the join key record(s) for one input record coming from the given stage.
 *
 * <p>When the join definition has no distribution, exactly one key record is returned. When a
 * distribution is configured, a record from the skewed stage gets a single key whose
 * {@code SALT_COLUMN} is a value drawn from {@code saltGenerator} in {@code [0, distributionFactor)},
 * while a record from any other stage is emitted once for each salt value in that range.
 *
 * @param stageName name of the stage that produced the record
 * @param record the input record; must be a non-null {@link StructuredRecord}
 * @return the key records derived from the input record
 * @throws IllegalArgumentException if the record is null or not a {@link StructuredRecord}, or if
 *     no join keys are registered for the stage
 */
@Override
public Collection<StructuredRecord> getJoinKeys(String stageName, INPUT_RECORD record) throws Exception {
  if (!(record instanceof StructuredRecord)) {
    // Only StructuredRecord input is supported. instanceof is also false for null, so do not call
    // getClass() blindly here: that would replace this descriptive error with a NullPointerException.
    String recordType = record == null ? "null" : record.getClass().getName();
    throw new IllegalArgumentException(String.format("Received an input record of unsupported type '%s' from stage '%s'.", recordType, stageName));
  }
  List<String> key = joinKeys.get(stageName);
  if (key == null) {
    // this should not happen, it should be caught by the pipeline app at configure or prepare time and failed then
    throw new IllegalArgumentException(String.format("Received data from stage '%s', but the stage was not included as part of the join. " + "Check the plugin to make sure it is including all input stages.", stageName));
  }
  StructuredRecord inputRecord = (StructuredRecord) record;
  if (keySchema == null) {
    // The key schema is derived lazily from the first record seen and reused afterwards.
    keySchema = getKeySchema(stageName, inputRecord.getSchema(), key);
  }
  JoinDistribution distribution = joinDefinition.getDistribution();
  List<StructuredRecord> keyRecords = new ArrayList<>();
  // If distribution is not enabled then return the key record without any salt
  if (distribution == null) {
    keyRecords.add(getKeyRecordBuilder(key, inputRecord).build());
    return keyRecords;
  }
  int distributionFactor = distribution.getDistributionFactor();
  // If this is the skewed stage then we need to add salt
  if (stageName.equals(distribution.getSkewedStageName())) {
    StructuredRecord.Builder saltedKey = getKeyRecordBuilder(key, inputRecord);
    saltedKey.set(SALT_COLUMN, saltGenerator.nextInt(distributionFactor));
    keyRecords.add(saltedKey.build());
    return keyRecords;
  }
  // This is not the skewed stage so we need to explode it: one key record per salt value.
  // Each key needs its own builder, so no builder is created ahead of the loop.
  for (int salt = 0; salt < distributionFactor; salt++) {
    StructuredRecord.Builder explodedKey = getKeyRecordBuilder(key, inputRecord);
    explodedKey.set(SALT_COLUMN, salt);
    keyRecords.add(explodedKey.build());
  }
  return keyRecords;
}
use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class JoinerBridge method generateOutputSchema.
/**
 * Builds the schema of the joined output record from the schemas of the given input records.
 *
 * <p>The schema of each element's input record is indexed by the element's stage name. Every
 * field selected by the join definition is then looked up in the schema of its stage and added to
 * the output under its alias, or under its original name when no alias is set.
 *
 * @param elements join elements whose input records are cast to {@link StructuredRecord}
 * @return a record schema named "joined" containing the selected fields in selection order
 * @throws IllegalArgumentException if a selected field refers to a stage with no element, or to a
 *     field that is missing from that stage's schema
 */
private Schema generateOutputSchema(Iterable<JoinElement<INPUT_RECORD>> elements) {
  // Index the record schema of every element by the stage it came from.
  Map<String, Schema> schemaByStage = new HashMap<>();
  for (JoinElement<INPUT_RECORD> element : elements) {
    StructuredRecord stageRecord = (StructuredRecord) element.getInputRecord();
    schemaByStage.put(element.getStageName(), stageRecord.getSchema());
  }

  List<Schema.Field> outputFields = new ArrayList<>(joinDefinition.getSelectedFields().size());
  for (JoinField selected : joinDefinition.getSelectedFields()) {
    String fieldName = selected.getFieldName();
    String stage = selected.getStageName();

    Schema schema = schemaByStage.get(stage);
    if (schema == null) {
      // should not be possible, should be validated earlier
      throw new IllegalArgumentException(String.format("Unable to select field '%s' from stage '%s' because data for the stage could not be found.", fieldName, stage));
    }

    Schema.Field sourceField = schema.getField(fieldName);
    if (sourceField == null) {
      // should not be possible, should be validated earlier
      throw new IllegalArgumentException(String.format("Unable to select field '%s' from stage '%s' because the field for the stage could not be found.", fieldName, stage));
    }

    // The alias, when present, replaces the original field name in the output.
    String alias = selected.getAlias();
    String outputName;
    if (alias != null) {
      outputName = alias;
    } else {
      outputName = fieldName;
    }
    outputFields.add(Schema.Field.of(outputName, sourceField.getSchema()));
  }
  return Schema.recordOf("joined", outputFields);
}
use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class ReflectionTableTest method testStructuredRecordProjection.
/**
 * Writes a {@code User} row using the schema generated from {@code User}, reads it back through
 * a reader built on the schema generated from {@code User2}, and checks that the resulting record
 * matches the expected {@code User2} projection.
 */
@Test
public void testStructuredRecordProjection() throws Exception {
  DatasetProperties noProperties = DatasetProperties.builder().build();
  dsFrameworkUtil.createInstance("table", users, noProperties);
  try {
    final Table table = dsFrameworkUtil.getInstance(users);
    final byte[] key = Bytes.toBytes(123);

    // Schema used to write the row, and the schema the row is read back with.
    final Schema writeSchema = new ReflectionSchemaGenerator().generate(User.class);
    final Schema readSchema = new ReflectionSchemaGenerator().generate(User2.class);

    // The values the record read back is compared against.
    final double widenedFloat = (double) 50000000.02f;
    final ByteBuffer expectedBytes = ByteBuffer.wrap(new byte[] { 0, 1, 2 });
    final User2 expected = new User2("Samuel L.", 123L, widenedFloat, Double.MAX_VALUE, expectedBytes);

    // TableDataset is not accessible here, but we know that's the underlying implementation...
    TransactionExecutor executor = dsFrameworkUtil.newTransactionExecutor((TransactionAware) table);
    executor.execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        // Write SAMUEL with the write schema.
        Put rowPut = new Put(key);
        ReflectionPutWriter<User> writer = new ReflectionPutWriter<>(writeSchema);
        writer.write(SAMUEL, rowPut);
        table.put(rowPut);

        // Read the row back through a reader created with the read schema.
        Row stored = table.get(key);
        ReflectionRowRecordReader reader = new ReflectionRowRecordReader(readSchema, null);
        StructuredRecord projectedRecord = reader.read(stored, writeSchema);
        assertRecordEqualsUser(expected, projectedRecord);
      }
    });
  } finally {
    // Runs whether or not the assertions above passed.
    dsFrameworkUtil.deleteInstance(users);
  }
}
use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class FlattenErrorTransform method transform.
/**
 * Flattens an error record into a single output record.
 *
 * <p>Every field of the errored record is copied into a builder created from
 * {@code getOutputSchema} of the record's schema. The error message, error code and stage name of
 * the input are then set as {@code errMsg}, {@code errCode} and {@code errStage}, and the built
 * record is emitted.
 *
 * @param input the error record wrapping the invalid record
 * @param emitter receives the flattened record
 */
@Override
public void transform(ErrorRecord<StructuredRecord> input, Emitter<StructuredRecord> emitter) throws Exception {
  StructuredRecord failed = input.getRecord();
  Schema failedSchema = failed.getSchema();
  StructuredRecord.Builder flattened = StructuredRecord.builder(getOutputSchema(failedSchema));

  // Carry over all original fields unchanged.
  for (Schema.Field sourceField : failedSchema.getFields()) {
    String fieldName = sourceField.getName();
    flattened.set(fieldName, failed.get(fieldName));
  }

  // Append the error details.
  flattened.set("errMsg", input.getErrorMessage());
  flattened.set("errCode", input.getErrorCode());
  flattened.set("errStage", input.getStageName());
  emitter.emit(flattened.build());
}
use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class DelimitedStringsRecordFormatTest method testTSV.
/**
 * Reads a tab-separated line through a format created from a {@code Formats.TSV} specification
 * and checks that the "body" field of the resulting record holds the individual values.
 */
@Test
public void testTSV() throws Exception {
  FormatSpecification tsvSpec = new FormatSpecification(Formats.TSV, null, Collections.<String, String>emptyMap());
  RecordFormat<ByteBuffer, StructuredRecord> tsvFormat = RecordFormats.createInitializedFormat(tsvSpec);

  String line = "userX\tactionY\titemZ";
  String[] expectedValues = line.split("\t");

  ByteBuffer encodedLine = ByteBuffer.wrap(Bytes.toBytes(line));
  StructuredRecord parsed = tsvFormat.read(encodedLine);
  String[] actualValues = parsed.get("body");

  Assert.assertArrayEquals(expectedValues, actualValues);
}
Aggregations