Usage of org.apache.beam.sdk.schemas.FieldAccessDescriptor in the Apache Beam project: class SelectHelpersTest, method testSelectIterableOfRow.
@Test
public void testSelectIterableOfRow() {
  // Selecting an iterable-of-row field should keep the iterable wrapper and project
  // the element rows, both in the output schema and in the selected row value.
  FieldAccessDescriptor access =
      FieldAccessDescriptor.withFieldNames("rowIter").resolve(ITERABLE_SCHEMA);

  Schema expectedSchema =
      Schema.builder().addIterableField("rowIter", FieldType.row(FLAT_SCHEMA)).build();
  assertEquals(expectedSchema, SelectHelpers.getOutputSchema(ITERABLE_SCHEMA, access));

  Row expectedRow =
      Row.withSchema(expectedSchema).addIterable(ImmutableList.of(FLAT_ROW, FLAT_ROW)).build();
  assertEquals(expectedRow, selectRow(ITERABLE_SCHEMA, access, ITERABLE_ROW));
}
Usage of org.apache.beam.sdk.schemas.FieldAccessDescriptor in the Apache Beam project: class SelectHelpersTest, method testNullableSchemaMap.
@Test
public void testNullableSchemaMap() {
  // Case 1: select a single value field through a nullable map-of-rows field.
  FieldAccessDescriptor singleFieldAccess =
      FieldAccessDescriptor.withFieldNames("nestedMap.field1").resolve(NESTED_NULLABLE_SCHEMA);
  Schema singleFieldSchema =
      SelectHelpers.getOutputSchema(NESTED_NULLABLE_SCHEMA, singleFieldAccess);
  assertEquals(
      Schema.builder()
          .addNullableField("field1", FieldType.map(FieldType.STRING, FieldType.STRING))
          .build(),
      singleFieldSchema);

  // Case 2: a wildcard select distributes over every value field, producing one
  // nullable map per selected field.
  FieldAccessDescriptor wildcardAccess =
      FieldAccessDescriptor.withFieldNames("nestedMap.*").resolve(NESTED_NULLABLE_SCHEMA);
  Schema wildcardSchema = SelectHelpers.getOutputSchema(NESTED_NULLABLE_SCHEMA, wildcardAccess);
  assertEquals(
      Schema.builder()
          .addNullableField("field1", FieldType.map(FieldType.STRING, FieldType.STRING))
          .addNullableField("field2", FieldType.map(FieldType.STRING, FieldType.INT32))
          .addNullableField("field3", FieldType.map(FieldType.STRING, FieldType.DOUBLE))
          .addNullableField("field_extra", FieldType.map(FieldType.STRING, FieldType.STRING))
          .build(),
      wildcardSchema);
}
Usage of org.apache.beam.sdk.schemas.FieldAccessDescriptor in the Apache Beam project: class SelectHelpers, method selectIntoRowWithQualifiers.
/**
 * Recursively walks the qualifier chain of a field access (e.g. {@code list[].map{}.field}),
 * selecting into {@code output}. Once all qualifiers are consumed, the remaining value must be a
 * {@link Row} and is selected directly. For LIST and MAP qualifiers the select is "distributed":
 * selecting multiple subfields under one collection produces one collection per selected subfield,
 * preserving the invariant that each selected name appears at the top level of the output schema.
 *
 * @param qualifiers the full qualifier chain being traversed
 * @param qualifierPosition index of the qualifier to process at this recursion depth
 * @param value the current (possibly null) collection or row being selected from
 * @param output builder the distributed selection results are appended to, in schema order
 * @param fieldAccessDescriptor the resolved descriptor of fields to select under the qualifiers
 * @param inputType the schema type of {@code value}
 * @param outputType the corresponding selected output type
 */
private static void selectIntoRowWithQualifiers(List<Qualifier> qualifiers, int qualifierPosition, Object value, Row.Builder output, FieldAccessDescriptor fieldAccessDescriptor, FieldType inputType, FieldType outputType) {
  if (qualifierPosition >= qualifiers.size()) {
    // We have already constructed all arrays and maps. What remains must be a Row.
    Row row = (Row) value;
    selectIntoRow(inputType.getRowSchema(), row, output, fieldAccessDescriptor);
    return;
  }
  Qualifier qualifier = qualifiers.get(qualifierPosition);
  switch(qualifier.getKind()) {
    case LIST:
      {
        FieldType nestedInputType = checkNotNull(inputType.getCollectionElementType());
        FieldType nestedOutputType = checkNotNull(outputType.getCollectionElementType());
        @SuppressWarnings("unchecked") // runtime value of a LIST/ITERABLE field is an Iterable
        Iterable<Object> iterable = (Iterable<Object>) value;
        // When selecting multiple subelements under a list, we distribute the select
        // resulting in multiple lists. For example, if there is a field "list" with type
        // {a: string, b: int}[], selecting list.a, list.b results in a schema of type
        // {a: string[], b: int[]}. This preserves the invariant that the name selected always
        // appears in the top-level schema.
        // Wrap the element type in a single-field temp schema so the nested descriptor can be
        // resolved and the distributed output schema computed.
        Schema tempSchema = Schema.builder().addField("a", nestedInputType).build();
        FieldAccessDescriptor tempAccessDescriptor = FieldAccessDescriptor.create().withNestedField("a", fieldAccessDescriptor).resolve(tempSchema);
        Schema nestedSchema = getOutputSchema(tempSchema, tempAccessDescriptor);
        // One output list per selected subfield; a null input collection yields null outputs.
        List<List<Object>> selectedLists = Lists.newArrayListWithExpectedSize(nestedSchema.getFieldCount());
        for (int i = 0; i < nestedSchema.getFieldCount(); i++) {
          if (iterable == null) {
            selectedLists.add(null);
          } else {
            selectedLists.add(Lists.newArrayListWithCapacity(Iterables.size(iterable)));
          }
        }
        if (iterable != null) {
          for (Object o : iterable) {
            // Select the subfields out of each element, then scatter them into the
            // per-subfield output lists.
            Row.Builder selectElementBuilder = Row.withSchema(nestedSchema);
            selectIntoRowWithQualifiers(qualifiers, qualifierPosition + 1, o, selectElementBuilder, fieldAccessDescriptor, nestedInputType, nestedOutputType);
            Row elementBeforeDistribution = selectElementBuilder.build();
            for (int i = 0; i < nestedSchema.getFieldCount(); ++i) {
              selectedLists.get(i).add(elementBeforeDistribution.getValue(i));
            }
          }
        }
        for (List<Object> selectedList : selectedLists) {
          output.addValue(selectedList);
        }
        break;
      }
    case MAP:
      {
        FieldType nestedInputType = checkNotNull(inputType.getMapValueType());
        FieldType nestedOutputType = checkNotNull(outputType.getMapValueType());
        // When selecting multiple subelements under a map, we distribute the select
        // resulting in multiple maps. The semantics are the same as for lists above (except we
        // only support subelement select for map values, not for map keys).
        Schema tempSchema = Schema.builder().addField("a", nestedInputType).build();
        FieldAccessDescriptor tempAccessDescriptor = FieldAccessDescriptor.create().withNestedField("a", fieldAccessDescriptor).resolve(tempSchema);
        Schema nestedSchema = getOutputSchema(tempSchema, tempAccessDescriptor);
        // One output map per selected subfield; a null input map yields null outputs.
        List<Map<Object, Object>> selectedMaps = Lists.newArrayListWithExpectedSize(nestedSchema.getFieldCount());
        for (int i = 0; i < nestedSchema.getFieldCount(); ++i) {
          if (value == null) {
            selectedMaps.add(null);
          } else {
            selectedMaps.add(Maps.newHashMap());
          }
        }
        if (value != null) {
          @SuppressWarnings("unchecked") // runtime value of a MAP field is a Map
          Map<Object, Object> map = (Map<Object, Object>) value;
          for (Map.Entry<Object, Object> entry : map.entrySet()) {
            // Select the subfields out of each map value, keyed by the original key.
            Row.Builder selectValueBuilder = Row.withSchema(nestedSchema);
            selectIntoRowWithQualifiers(qualifiers, qualifierPosition + 1, entry.getValue(), selectValueBuilder, fieldAccessDescriptor, nestedInputType, nestedOutputType);
            Row valueBeforeDistribution = selectValueBuilder.build();
            for (int i = 0; i < nestedSchema.getFieldCount(); ++i) {
              selectedMaps.get(i).put(entry.getKey(), valueBeforeDistribution.getValue(i));
            }
          }
        }
        for (Map<Object, Object> selectedMap : selectedMaps) {
          output.addValue(selectedMap);
        }
        break;
      }
    default:
      throw new RuntimeException("Unexpected type " + qualifier.getKind());
  }
}
Usage of org.apache.beam.sdk.schemas.FieldAccessDescriptor in the Apache Beam project: class Cast, method expand.
@Override
public PCollection<Row> expand(PCollection<T> input) {
  // Validate up front that the input schema can be cast to the target schema.
  final Schema inputSchema = input.getSchema();
  verifyCompatibility(inputSchema);

  // Per-element cast: convert each incoming Row to the output schema.
  DoFn<T, Row> castFn =
      new DoFn<T, Row>() {
        // TODO: This should be the same as resolved so that Beam knows which fields
        // are being accessed. Currently Beam only supports wildcard descriptors.
        // Once BEAM-4457 is fixed, fix this.
        @FieldAccess("filterFields")
        final FieldAccessDescriptor fieldAccessDescriptor = FieldAccessDescriptor.withAllFields();

        @ProcessElement
        public void process(@FieldAccess("filterFields") @Element Row element, OutputReceiver<Row> receiver) {
          receiver.output(castRow(element, inputSchema, outputSchema()));
        }
      };

  return input.apply(ParDo.of(castFn)).setRowSchema(outputSchema());
}
Usage of org.apache.beam.sdk.schemas.FieldAccessDescriptor in the Apache Beam project: class ParDoSchemaTest, method testFieldAccessSchemaPipeline.
@Test
@Category(ValidatesRunner.class)
public void testFieldAccessSchemaPipeline() {
  // Build a schema-aware PCollection of pojos, then read fields via a Row in a DoFn.
  List<MyPojo> pojos =
      Lists.newArrayList(new MyPojo("a", 1), new MyPojo("b", 2), new MyPojo("c", 3));
  Schema schema =
      Schema.builder().addStringField("string_field").addInt32Field("integer_field").build();

  PCollection<String> result =
      pipeline
          .apply(
              Create.of(pojos)
                  .withSchema(
                      schema,
                      TypeDescriptor.of(MyPojo.class),
                      pojo ->
                          Row.withSchema(schema)
                              .addValues(pojo.stringField, pojo.integerField)
                              .build(),
                      row ->
                          new MyPojo(row.getString("string_field"), row.getInt32("integer_field"))))
          .apply(
              ParDo.of(
                  new DoFn<MyPojo, String>() {
                    @FieldAccess("foo")
                    final FieldAccessDescriptor fieldAccess = FieldAccessDescriptor.withAllFields();

                    @ProcessElement
                    public void process(@FieldAccess("foo") Row row, OutputReceiver<String> r) {
                      r.output(row.getString(0) + ":" + row.getInt32(1));
                    }
                  }));

  PAssert.that(result).containsInAnyOrder("a:1", "b:2", "c:3");
  pipeline.run();
}
Aggregations