Search in sources :

Example 1 with PartitionMetadata

use of com.google.cloud.teleport.v2.values.PartitionMetadata in project DataflowTemplates by GoogleCloudPlatform.

the class GenericRecordsToGcsPartitionedTest method testHourlyPartitioning.

/**
 * Runs {@link GenericRecordsToGcsPartitioned} with {@code PartitioningSchema.HOURLY} over four
 * records whose "date" values fall into three distinct hours, then asserts that one
 * {@link PartitionMetadata} per hour directory is emitted and checks each directory's records
 * via {@code verifyRecordsExists}.
 */
@Test
public void testHourlyPartitioning() {
    // Two records share 2010-01-01 hour 1; the other two each land in their own hour.
    Record jan1Hour1True = new Record(SCHEMA);
    jan1Hour1True.put("x", true);
    jan1Hour1True.put("date", dateToEpochMillis(2010, 1, 1, 1));

    Record jan1Hour1False = new Record(SCHEMA);
    jan1Hour1False.put("x", false);
    jan1Hour1False.put("date", dateToEpochMillis(2010, 1, 1, 1));

    Record jan1Hour2 = new Record(SCHEMA);
    jan1Hour2.put("x", true);
    jan1Hour2.put("date", dateToEpochMillis(2010, 1, 1, 2));

    Record jan2Hour1 = new Record(SCHEMA);
    jan2Hour1.put("x", true);
    jan2Hour1.put("date", dateToEpochMillis(2010, 1, 2, 1));

    // Expected partition directories, shared by the metadata assertion and the file checks.
    String rootDir = temporaryFolder.getRoot().getAbsolutePath();
    String jan1Hour1Dir = rootDir + "/year=2010/month=1/day=1/hour=1";
    String jan1Hour2Dir = rootDir + "/year=2010/month=1/day=1/hour=2";
    String jan2Hour1Dir = rootDir + "/year=2010/month=1/day=2/hour=1";

    PCollection<GenericRecord> input =
        mainPipeline.apply(
            Create.<GenericRecord>of(jan1Hour1True, jan1Hour1False, jan1Hour2, jan2Hour1)
                .withCoder(AvroCoder.of(SCHEMA)));
    PCollection<PartitionMetadata> partitions =
        input.apply(
            "GenericRecordsToGCS",
            new GenericRecordsToGcsPartitioned(
                rootDir, SERIALIZED_SCHEMA, "date", PartitioningSchema.HOURLY, FileFormatOptions.AVRO));

    PartitionMetadata expectedJan1Hour1 =
        PartitionMetadata.builder()
            .setLocation(jan1Hour1Dir)
            .setValues(ImmutableList.of("2010", "1", "1", "1"))
            .build();
    PartitionMetadata expectedJan1Hour2 =
        PartitionMetadata.builder()
            .setLocation(jan1Hour2Dir)
            .setValues(ImmutableList.of("2010", "1", "1", "2"))
            .build();
    PartitionMetadata expectedJan2Hour1 =
        PartitionMetadata.builder()
            .setLocation(jan2Hour1Dir)
            .setValues(ImmutableList.of("2010", "1", "2", "1"))
            .build();
    PAssert.that(partitions)
        .containsInAnyOrder(expectedJan1Hour1, expectedJan1Hour2, expectedJan2Hour1);

    mainPipeline.run();

    verifyRecordsExists(jan1Hour1Dir + "/*", jan1Hour1True, jan1Hour1False);
    verifyRecordsExists(jan1Hour2Dir + "/*", jan1Hour2);
    verifyRecordsExists(jan2Hour1Dir + "/*", jan2Hour1);
}
Also used : PartitionMetadata(com.google.cloud.teleport.v2.values.PartitionMetadata) GenericRecord(org.apache.avro.generic.GenericRecord) Record(org.apache.avro.generic.GenericData.Record) Test(org.junit.Test)

Example 2 with PartitionMetadata

use of com.google.cloud.teleport.v2.values.PartitionMetadata in project DataflowTemplates by GoogleCloudPlatform.

the class DataplexJdbcIngestion method buildGcsPipeline.

/**
 * Adds to {@code pipeline} the steps that read rows over JDBC, convert them to Avro
 * {@link GenericRecord}s and write them under {@code targetRootPath} through
 * {@link GenericRecordsToGcsPartitioned}.
 *
 * <p>The Beam schema is inferred from the data source and the configured query, and the Avro
 * schema is derived from that Beam schema. The write transform's {@code PartitionMetadata}
 * output is not captured because nothing in this method consumes it.
 *
 * @param pipeline pipeline to which the read, convert and write steps are applied
 * @param options supplies the query, partition column, partitioning scheme and file format
 * @param dataSourceConfig JDBC data source configuration used for schema inference and reading
 * @param targetRootPath root path handed to the GCS write transform
 */
@VisibleForTesting
static void buildGcsPipeline(Pipeline pipeline, DataplexJdbcIngestionOptions options, DynamicDataSourceConfiguration dataSourceConfig, String targetRootPath) {
    // Auto inferring beam schema
    Schema beamSchema = Schemas.jdbcSchemaToBeamSchema(dataSourceConfig.buildDatasource(), options.getQuery());
    // Convert to Avro Schema
    org.apache.avro.Schema avroSchema = AvroUtils.toAvroSchema(beamSchema);
    // Read from JdbcIO and convert ResultSet to Beam Row
    PCollection<Row> resultRows = pipeline.apply("Read from JdbcIO", DynamicJdbcIO.<Row>read().withDataSourceConfiguration(dataSourceConfig).withQuery(options.getQuery()).withCoder(RowCoder.of(beamSchema)).withRowMapper(BeamSchemaUtil.of(beamSchema)));
    // Convert Beam Row to GenericRecord
    PCollection<GenericRecord> genericRecords = resultRows.apply("convert to GenericRecord", ParDo.of(new BeamRowToGenericRecordFn(avroSchema))).setCoder(AvroCoder.of(avroSchema));
    // Write to GCS bucket. The accessor name getParitionColumn (sic) is declared on the options
    // interface outside this block, so it is used exactly as spelled there.
    genericRecords.apply("Write to GCS", new GenericRecordsToGcsPartitioned(targetRootPath, Schemas.serialize(avroSchema), options.getParitionColumn(), options.getPartitioningScheme(), options.getFileFormat()));
}
Also used : BeamRowToGenericRecordFn(com.google.cloud.teleport.v2.transforms.BeamRowToGenericRecordFn) GenericRecordsToGcsPartitioned(com.google.cloud.teleport.v2.transforms.GenericRecordsToGcsPartitioned) Schema(org.apache.beam.sdk.schemas.Schema) PartitionMetadata(com.google.cloud.teleport.v2.values.PartitionMetadata) TableRow(com.google.api.services.bigquery.model.TableRow) Row(org.apache.beam.sdk.values.Row) GenericRecord(org.apache.avro.generic.GenericRecord) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 3 with PartitionMetadata

use of com.google.cloud.teleport.v2.values.PartitionMetadata in project DataflowTemplates by GoogleCloudPlatform.

the class GenericRecordsToGcsPartitioned method expand.

/**
 * Writes the input records under {@code gcsPath} in the configured output file format and
 * returns a {@link PartitionMetadata} for each output filename reported by the write.
 *
 * <p>When {@code partitionColumnName} or {@code partitioningSchema} is null the records are
 * written without partitioning and every emitted metadata element carries the single value
 * {@code "1"}. Otherwise each record is routed to a destination computed by
 * {@code partitioningSchema.toPartition(...)} from the record's partition column value, and the
 * metadata values are the string forms of that destination's entries.
 *
 * @param input records to write
 * @return metadata whose location is {@code withoutFileName(...)} of each output filename
 *     (presumably the containing directory; confirm against that helper)
 * @throws UnsupportedOperationException if {@code outputFileFormat} is neither PARQUET nor AVRO
 */
@Override
public PCollection<PartitionMetadata> expand(PCollection<GenericRecord> input) {
    // The Avro schema is held in serialized form and parsed here.
    Schema schema = SchemaUtils.parseAvroSchema(serializedAvroSchema);
    // Pick the file sink that matches the requested output format.
    Sink<GenericRecord> sink;
    switch(outputFileFormat) {
        case PARQUET:
            sink = ParquetIO.sink(schema);
            break;
        case AVRO:
            sink = new AvroSinkWithJodaDatesConversion<>(schema);
            break;
        default:
            throw new UnsupportedOperationException("Output format is not implemented: " + outputFileFormat);
    }
    // Unpartitioned path: both the column name and the partitioning schema are needed to partition.
    if (partitionColumnName == null || partitioningSchema == null) {
        LOG.info("PartitionColumnName or/and PartitioningSchema not provided. " + "Writing to GCS without partition");
        // Write everything to gcsPath, keep only the filename from each (destination, filename)
        // pair, then turn each filename into a PartitionMetadata with the fixed value "1".
        return input.apply("Write to Storage with No Partition", FileIO.<GenericRecord>write().withSuffix(outputFileFormat.getFileSuffix()).via(sink).to(gcsPath)).getPerDestinationOutputFilenames().apply("MapFileNames", MapElements.into(TypeDescriptors.strings()).via((SerializableFunction<KV<Void, String>, String>) KV::getValue)).apply(MapElements.via(new SimpleFunction<String, PartitionMetadata>() {

            @Override
            public PartitionMetadata apply(String path) {
                return PartitionMetadata.builder().setValues(ImmutableList.of("1")).setLocation(withoutFileName(path)).build();
            }
        }));
    }
    // Zone used to interpret the partition column's epoch millis; derived from the schema by
    // getZoneId (how it is derived is not visible in this block).
    ZoneId zoneId = getZoneId(schema);
    // Partitioned path: the destination is a list of (String, Integer) pairs produced by
    // partitioningSchema.toPartition; file names are built from partitionToPath(destination) and
    // the format's suffix; each (destination, filename) pair becomes one PartitionMetadata.
    return input.apply(FileIO.<List<KV<String, Integer>>, GenericRecord>writeDynamic().by((GenericRecord r) -> partitioningSchema.toPartition(Instant.ofEpochMilli(partitionColumnValueToMillis(r.get(partitionColumnName))).atZone(zoneId))).withDestinationCoder(ListCoder.of(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()))).via(sink).to(gcsPath).withNumShards(// must be 1 as we can only have 1 file per Dataplex partition
    1).withNaming(p -> Write.defaultNaming(partitionToPath(p), outputFileFormat.getFileSuffix()))).getPerDestinationOutputFilenames().apply(MapElements.via(new SimpleFunction<KV<List<KV<String, Integer>>, String>, PartitionMetadata>() {

        @Override
        public PartitionMetadata apply(KV<List<KV<String, Integer>>, String> partitionAndPath) {
            // Fail with a descriptive message rather than a bare NullPointerException below.
            if (partitionAndPath.getKey() == null) {
                throw new IllegalStateException("Partition is null for path " + partitionAndPath.getValue());
            }
            if (partitionAndPath.getValue() == null) {
                throw new IllegalStateException("Path is null for partition " + partitionAndPath.getKey());
            }
            // Values are the Integer component of each partition entry rendered as a string, in
            // list order; location comes from the output filename.
            return PartitionMetadata.builder().setValues(partitionAndPath.getKey().stream().map(e -> String.valueOf(e.getValue())).collect(toImmutableList())).setLocation(withoutFileName(partitionAndPath.getValue())).build();
        }
    }));
}
Also used : FileIO(org.apache.beam.sdk.io.FileIO) KV(org.apache.beam.sdk.values.KV) ZonedDateTime(java.time.ZonedDateTime) PartitionMetadata(com.google.cloud.teleport.v2.values.PartitionMetadata) Sink(org.apache.beam.sdk.io.FileIO.Sink) LoggerFactory(org.slf4j.LoggerFactory) ListCoder(org.apache.beam.sdk.coders.ListCoder) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) Function(java.util.function.Function) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) PTransform(org.apache.beam.sdk.transforms.PTransform) ImmutableList(com.google.common.collect.ImmutableList) LogicalTypes(org.apache.avro.LogicalTypes) Write(org.apache.beam.sdk.io.FileIO.Write) FileFormatOptions(com.google.cloud.teleport.v2.utils.FileFormat.FileFormatOptions) ZoneOffset(java.time.ZoneOffset) Nullable(javax.annotation.Nullable) MapElements(org.apache.beam.sdk.transforms.MapElements) GenericRecord(org.apache.avro.generic.GenericRecord) KvCoder(org.apache.beam.sdk.coders.KvCoder) Schema(org.apache.avro.Schema) Logger(org.slf4j.Logger) ImmutableMap(com.google.common.collect.ImmutableMap) LogicalType(org.apache.avro.LogicalType) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ReadableInstant(org.joda.time.ReadableInstant) Instant(java.time.Instant) PCollection(org.apache.beam.sdk.values.PCollection) Collectors(java.util.stream.Collectors) ParquetIO(org.apache.beam.sdk.io.parquet.ParquetIO) ZoneId(java.time.ZoneId) List(java.util.List) SchemaUtils(com.google.cloud.teleport.v2.utils.SchemaUtils) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) VarIntCoder(org.apache.beam.sdk.coders.VarIntCoder) AvroSinkWithJodaDatesConversion(com.google.cloud.teleport.v2.io.AvroSinkWithJodaDatesConversion) ZoneId(java.time.ZoneId) Schema(org.apache.avro.Schema) KV(org.apache.beam.sdk.values.KV) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) 
ImmutableList(com.google.common.collect.ImmutableList) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) List(java.util.List) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 4 with PartitionMetadata

use of com.google.cloud.teleport.v2.values.PartitionMetadata in project DataflowTemplates by GoogleCloudPlatform.

the class GenericRecordsToGcsPartitionedTest method testDailyPartitioning.

/**
 * Runs {@link GenericRecordsToGcsPartitioned} with {@code PartitioningSchema.DAILY} over four
 * records whose "date" values fall on three distinct days, then asserts that one
 * {@link PartitionMetadata} per day directory is emitted and checks each directory's records
 * via {@code verifyRecordsExists}.
 */
@Test
public void testDailyPartitioning() {
    // Two records share 2010-01-01; the other two each land on their own day.
    Record jan1True = new Record(SCHEMA);
    jan1True.put("x", true);
    jan1True.put("date", dateToEpochMillis(2010, 1, 1));

    Record jan1False = new Record(SCHEMA);
    jan1False.put("x", false);
    jan1False.put("date", dateToEpochMillis(2010, 1, 1));

    Record jan2 = new Record(SCHEMA);
    jan2.put("x", true);
    jan2.put("date", dateToEpochMillis(2010, 1, 2));

    Record feb1 = new Record(SCHEMA);
    feb1.put("x", true);
    feb1.put("date", dateToEpochMillis(2010, 2, 1));

    // Expected partition directories, shared by the metadata assertion and the file checks.
    String rootDir = temporaryFolder.getRoot().getAbsolutePath();
    String jan1Dir = rootDir + "/year=2010/month=1/day=1";
    String jan2Dir = rootDir + "/year=2010/month=1/day=2";
    String feb1Dir = rootDir + "/year=2010/month=2/day=1";

    PCollection<GenericRecord> input =
        mainPipeline.apply(
            Create.<GenericRecord>of(jan1True, jan1False, jan2, feb1)
                .withCoder(AvroCoder.of(SCHEMA)));
    PCollection<PartitionMetadata> partitions =
        input.apply(
            "GenericRecordsToGCS",
            new GenericRecordsToGcsPartitioned(
                rootDir, SERIALIZED_SCHEMA, "date", PartitioningSchema.DAILY, FileFormatOptions.AVRO));

    PartitionMetadata expectedJan1 =
        PartitionMetadata.builder()
            .setLocation(jan1Dir)
            .setValues(ImmutableList.of("2010", "1", "1"))
            .build();
    PartitionMetadata expectedJan2 =
        PartitionMetadata.builder()
            .setLocation(jan2Dir)
            .setValues(ImmutableList.of("2010", "1", "2"))
            .build();
    PartitionMetadata expectedFeb1 =
        PartitionMetadata.builder()
            .setLocation(feb1Dir)
            .setValues(ImmutableList.of("2010", "2", "1"))
            .build();
    PAssert.that(partitions).containsInAnyOrder(expectedJan1, expectedJan2, expectedFeb1);

    mainPipeline.run();

    verifyRecordsExists(jan1Dir + "/*", jan1True, jan1False);
    verifyRecordsExists(jan2Dir + "/*", jan2);
    verifyRecordsExists(feb1Dir + "/*", feb1);
}
Also used : PartitionMetadata(com.google.cloud.teleport.v2.values.PartitionMetadata) GenericRecord(org.apache.avro.generic.GenericRecord) Record(org.apache.avro.generic.GenericData.Record) Test(org.junit.Test)

Example 5 with PartitionMetadata

use of com.google.cloud.teleport.v2.values.PartitionMetadata in project DataflowTemplates by GoogleCloudPlatform.

the class GenericRecordsToGcsPartitionedTest method testMonthlyPartitioning.

/**
 * Runs {@link GenericRecordsToGcsPartitioned} with {@code PartitioningSchema.MONTHLY} over three
 * records whose "date" values fall in two distinct months, then asserts that one
 * {@link PartitionMetadata} per month directory is emitted and checks each directory's records
 * via {@code verifyRecordsExists}.
 */
@Test
public void testMonthlyPartitioning() {
    // Two records share 2010-01; the third lands in 2010-02.
    Record janTrue = new Record(SCHEMA);
    janTrue.put("x", true);
    janTrue.put("date", dateToEpochMillis(2010, 1));

    Record janFalse = new Record(SCHEMA);
    janFalse.put("x", false);
    janFalse.put("date", dateToEpochMillis(2010, 1));

    Record feb = new Record(SCHEMA);
    feb.put("x", true);
    feb.put("date", dateToEpochMillis(2010, 2));

    // Expected partition directories, shared by the metadata assertion and the file checks.
    String rootDir = temporaryFolder.getRoot().getAbsolutePath();
    String janDir = rootDir + "/year=2010/month=1";
    String febDir = rootDir + "/year=2010/month=2";

    PCollection<GenericRecord> input =
        mainPipeline.apply(
            Create.<GenericRecord>of(janTrue, janFalse, feb).withCoder(AvroCoder.of(SCHEMA)));
    PCollection<PartitionMetadata> partitions =
        input.apply(
            "GenericRecordsToGCS",
            new GenericRecordsToGcsPartitioned(
                rootDir, SERIALIZED_SCHEMA, "date", PartitioningSchema.MONTHLY, FileFormatOptions.AVRO));

    PartitionMetadata expectedJan =
        PartitionMetadata.builder()
            .setLocation(janDir)
            .setValues(ImmutableList.of("2010", "1"))
            .build();
    PartitionMetadata expectedFeb =
        PartitionMetadata.builder()
            .setLocation(febDir)
            .setValues(ImmutableList.of("2010", "2"))
            .build();
    PAssert.that(partitions).containsInAnyOrder(expectedJan, expectedFeb);

    mainPipeline.run();

    verifyRecordsExists(janDir + "/*", janTrue, janFalse);
    verifyRecordsExists(febDir + "/*", feb);
}
Also used : PartitionMetadata(com.google.cloud.teleport.v2.values.PartitionMetadata) GenericRecord(org.apache.avro.generic.GenericRecord) Record(org.apache.avro.generic.GenericData.Record) Test(org.junit.Test)

Aggregations

PartitionMetadata (com.google.cloud.teleport.v2.values.PartitionMetadata)5 GenericRecord (org.apache.avro.generic.GenericRecord)5 Record (org.apache.avro.generic.GenericData.Record)3 Test (org.junit.Test)3 TableRow (com.google.api.services.bigquery.model.TableRow)1 AvroSinkWithJodaDatesConversion (com.google.cloud.teleport.v2.io.AvroSinkWithJodaDatesConversion)1 BeamRowToGenericRecordFn (com.google.cloud.teleport.v2.transforms.BeamRowToGenericRecordFn)1 GenericRecordsToGcsPartitioned (com.google.cloud.teleport.v2.transforms.GenericRecordsToGcsPartitioned)1 FileFormatOptions (com.google.cloud.teleport.v2.utils.FileFormat.FileFormatOptions)1 SchemaUtils (com.google.cloud.teleport.v2.utils.SchemaUtils)1 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 Instant (java.time.Instant)1 ZoneId (java.time.ZoneId)1 ZoneOffset (java.time.ZoneOffset)1 ZonedDateTime (java.time.ZonedDateTime)1 List (java.util.List)1 Function (java.util.function.Function)1