Usage of org.apache.beam.sdk.schemas.FieldAccessDescriptor in the Apache Beam project: class SelectHelpersTest, method testSelectIterableOfRow.
@Test
public void testSelectIterableOfRow() {
  // Selecting an iterable-of-row field should keep the iterable wrapper and project
  // the element rows, both in the output schema and in the selected row value.
  FieldAccessDescriptor access =
      FieldAccessDescriptor.withFieldNames("rowIter").resolve(ITERABLE_SCHEMA);

  Schema expectedSchema =
      Schema.builder().addIterableField("rowIter", FieldType.row(FLAT_SCHEMA)).build();
  assertEquals(expectedSchema, SelectHelpers.getOutputSchema(ITERABLE_SCHEMA, access));

  Row expectedRow =
      Row.withSchema(expectedSchema).addIterable(ImmutableList.of(FLAT_ROW, FLAT_ROW)).build();
  assertEquals(expectedRow, selectRow(ITERABLE_SCHEMA, access, ITERABLE_ROW));
}
Usage of org.apache.beam.sdk.schemas.FieldAccessDescriptor in the Apache Beam project: class SelectHelpersTest, method testNullableSchemaMap.
@Test
public void testNullableSchemaMap() {
  // Case 1: select a single value field through a nullable map-of-rows field.
  FieldAccessDescriptor singleFieldAccess =
      FieldAccessDescriptor.withFieldNames("nestedMap.field1").resolve(NESTED_NULLABLE_SCHEMA);
  Schema singleFieldSchema =
      SelectHelpers.getOutputSchema(NESTED_NULLABLE_SCHEMA, singleFieldAccess);
  assertEquals(
      Schema.builder()
          .addNullableField("field1", FieldType.map(FieldType.STRING, FieldType.STRING))
          .build(),
      singleFieldSchema);

  // Case 2: a wildcard select distributes over every value field, producing one
  // nullable map per selected field.
  FieldAccessDescriptor wildcardAccess =
      FieldAccessDescriptor.withFieldNames("nestedMap.*").resolve(NESTED_NULLABLE_SCHEMA);
  Schema wildcardSchema = SelectHelpers.getOutputSchema(NESTED_NULLABLE_SCHEMA, wildcardAccess);
  assertEquals(
      Schema.builder()
          .addNullableField("field1", FieldType.map(FieldType.STRING, FieldType.STRING))
          .addNullableField("field2", FieldType.map(FieldType.STRING, FieldType.INT32))
          .addNullableField("field3", FieldType.map(FieldType.STRING, FieldType.DOUBLE))
          .addNullableField("field_extra", FieldType.map(FieldType.STRING, FieldType.STRING))
          .build(),
      wildcardSchema);
}
Usage of org.apache.beam.sdk.schemas.FieldAccessDescriptor in the Apache Beam project: class SelectHelpers, method selectIntoRowWithQualifiers.
/**
 * Recursively walks the qualifier chain of a field access (e.g. {@code list[].map{}.field}),
 * selecting into {@code output}. Once all qualifiers are consumed, the remaining value must be a
 * {@link Row} and is selected directly. For LIST and MAP qualifiers the select is "distributed":
 * selecting multiple subfields under one collection produces one collection per selected subfield,
 * preserving the invariant that each selected name appears at the top level of the output schema.
 *
 * @param qualifiers the full qualifier chain being traversed
 * @param qualifierPosition index of the qualifier to process at this recursion depth
 * @param value the current (possibly null) collection or row being selected from
 * @param output builder the distributed selection results are appended to, in schema order
 * @param fieldAccessDescriptor the resolved descriptor of fields to select under the qualifiers
 * @param inputType the schema type of {@code value}
 * @param outputType the corresponding selected output type
 */
private static void selectIntoRowWithQualifiers(List<Qualifier> qualifiers, int qualifierPosition, Object value, Row.Builder output, FieldAccessDescriptor fieldAccessDescriptor, FieldType inputType, FieldType outputType) {
  if (qualifierPosition >= qualifiers.size()) {
    // We have already constructed all arrays and maps. What remains must be a Row.
    Row row = (Row) value;
    selectIntoRow(inputType.getRowSchema(), row, output, fieldAccessDescriptor);
    return;
  }
  Qualifier qualifier = qualifiers.get(qualifierPosition);
  switch(qualifier.getKind()) {
    case LIST:
      {
        FieldType nestedInputType = checkNotNull(inputType.getCollectionElementType());
        FieldType nestedOutputType = checkNotNull(outputType.getCollectionElementType());
        @SuppressWarnings("unchecked") // runtime value of a LIST/ITERABLE field is an Iterable
        Iterable<Object> iterable = (Iterable<Object>) value;
        // When selecting multiple subelements under a list, we distribute the select
        // resulting in multiple lists. For example, if there is a field "list" with type
        // {a: string, b: int}[], selecting list.a, list.b results in a schema of type
        // {a: string[], b: int[]}. This preserves the invariant that the name selected always
        // appears in the top-level schema.
        // Wrap the element type in a single-field temp schema so the nested descriptor can be
        // resolved and the distributed output schema computed.
        Schema tempSchema = Schema.builder().addField("a", nestedInputType).build();
        FieldAccessDescriptor tempAccessDescriptor = FieldAccessDescriptor.create().withNestedField("a", fieldAccessDescriptor).resolve(tempSchema);
        Schema nestedSchema = getOutputSchema(tempSchema, tempAccessDescriptor);
        // One output list per selected subfield; a null input collection yields null outputs.
        List<List<Object>> selectedLists = Lists.newArrayListWithExpectedSize(nestedSchema.getFieldCount());
        for (int i = 0; i < nestedSchema.getFieldCount(); i++) {
          if (iterable == null) {
            selectedLists.add(null);
          } else {
            selectedLists.add(Lists.newArrayListWithCapacity(Iterables.size(iterable)));
          }
        }
        if (iterable != null) {
          for (Object o : iterable) {
            // Select the subfields out of each element, then scatter them into the
            // per-subfield output lists.
            Row.Builder selectElementBuilder = Row.withSchema(nestedSchema);
            selectIntoRowWithQualifiers(qualifiers, qualifierPosition + 1, o, selectElementBuilder, fieldAccessDescriptor, nestedInputType, nestedOutputType);
            Row elementBeforeDistribution = selectElementBuilder.build();
            for (int i = 0; i < nestedSchema.getFieldCount(); ++i) {
              selectedLists.get(i).add(elementBeforeDistribution.getValue(i));
            }
          }
        }
        for (List<Object> selectedList : selectedLists) {
          output.addValue(selectedList);
        }
        break;
      }
    case MAP:
      {
        FieldType nestedInputType = checkNotNull(inputType.getMapValueType());
        FieldType nestedOutputType = checkNotNull(outputType.getMapValueType());
        // When selecting multiple subelements under a map, we distribute the select
        // resulting in multiple maps. The semantics are the same as for lists above (except we
        // only support subelement select for map values, not for map keys).
        Schema tempSchema = Schema.builder().addField("a", nestedInputType).build();
        FieldAccessDescriptor tempAccessDescriptor = FieldAccessDescriptor.create().withNestedField("a", fieldAccessDescriptor).resolve(tempSchema);
        Schema nestedSchema = getOutputSchema(tempSchema, tempAccessDescriptor);
        // One output map per selected subfield; a null input map yields null outputs.
        List<Map<Object, Object>> selectedMaps = Lists.newArrayListWithExpectedSize(nestedSchema.getFieldCount());
        for (int i = 0; i < nestedSchema.getFieldCount(); ++i) {
          if (value == null) {
            selectedMaps.add(null);
          } else {
            selectedMaps.add(Maps.newHashMap());
          }
        }
        if (value != null) {
          @SuppressWarnings("unchecked") // runtime value of a MAP field is a Map
          Map<Object, Object> map = (Map<Object, Object>) value;
          for (Map.Entry<Object, Object> entry : map.entrySet()) {
            // Select the subfields out of each map value, keyed by the original key.
            Row.Builder selectValueBuilder = Row.withSchema(nestedSchema);
            selectIntoRowWithQualifiers(qualifiers, qualifierPosition + 1, entry.getValue(), selectValueBuilder, fieldAccessDescriptor, nestedInputType, nestedOutputType);
            Row valueBeforeDistribution = selectValueBuilder.build();
            for (int i = 0; i < nestedSchema.getFieldCount(); ++i) {
              selectedMaps.get(i).put(entry.getKey(), valueBeforeDistribution.getValue(i));
            }
          }
        }
        for (Map<Object, Object> selectedMap : selectedMaps) {
          output.addValue(selectedMap);
        }
        break;
      }
    default:
      throw new RuntimeException("Unexpected type " + qualifier.getKind());
  }
}
Usage of org.apache.beam.sdk.schemas.FieldAccessDescriptor in the Apache Beam project: class Cast, method expand.
@Override
public PCollection<Row> expand(PCollection<T> input) {
  // Validate up front that the input schema can be cast to the target schema.
  final Schema inputSchema = input.getSchema();
  verifyCompatibility(inputSchema);

  // Per-element cast: convert each incoming Row to the output schema.
  DoFn<T, Row> castFn =
      new DoFn<T, Row>() {
        // TODO: This should be the same as resolved so that Beam knows which fields
        // are being accessed. Currently Beam only supports wildcard descriptors.
        // Once BEAM-4457 is fixed, fix this.
        @FieldAccess("filterFields")
        final FieldAccessDescriptor fieldAccessDescriptor = FieldAccessDescriptor.withAllFields();

        @ProcessElement
        public void process(@FieldAccess("filterFields") @Element Row element, OutputReceiver<Row> receiver) {
          receiver.output(castRow(element, inputSchema, outputSchema()));
        }
      };

  return input.apply(ParDo.of(castFn)).setRowSchema(outputSchema());
}
Usage of org.apache.beam.sdk.schemas.FieldAccessDescriptor in the Apache Beam project: class ParDoSchemaTest, method testFieldAccessSchemaPipeline.
@Test
@Category(ValidatesRunner.class)
public void testFieldAccessSchemaPipeline() {
  // Build a schema-aware PCollection of pojos, then read fields via a Row in a DoFn.
  List<MyPojo> pojos =
      Lists.newArrayList(new MyPojo("a", 1), new MyPojo("b", 2), new MyPojo("c", 3));
  Schema schema =
      Schema.builder().addStringField("string_field").addInt32Field("integer_field").build();

  PCollection<String> result =
      pipeline
          .apply(
              Create.of(pojos)
                  .withSchema(
                      schema,
                      TypeDescriptor.of(MyPojo.class),
                      pojo ->
                          Row.withSchema(schema)
                              .addValues(pojo.stringField, pojo.integerField)
                              .build(),
                      row ->
                          new MyPojo(row.getString("string_field"), row.getInt32("integer_field"))))
          .apply(
              ParDo.of(
                  new DoFn<MyPojo, String>() {
                    @FieldAccess("foo")
                    final FieldAccessDescriptor fieldAccess = FieldAccessDescriptor.withAllFields();

                    @ProcessElement
                    public void process(@FieldAccess("foo") Row row, OutputReceiver<String> r) {
                      r.output(row.getString(0) + ":" + row.getInt32(1));
                    }
                  }));

  PAssert.that(result).containsInAnyOrder("a:1", "b:2", "c:3");
  pipeline.run();
}
Aggregations