Search in sources :

Example 1 with GenericRecordsToGcsPartitioned

use of com.google.cloud.teleport.v2.transforms.GenericRecordsToGcsPartitioned in project DataflowTemplates by GoogleCloudPlatform.

the class GenericRecordsToGcsPartitionedTest method testHourlyPartitioning.

/**
 * Checks HOURLY partitioning: four records spread over three distinct hours must yield three
 * {@link PartitionMetadata} entries with {@code year=/month=/day=/hour=} locations, and each
 * record must be found under the directory of its own hour.
 */
@Test
public void testHourlyPartitioning() {
    // Two records share the first partition; dateToEpochMillis arguments appear to be
    // (year, month, day, hour), which is what the expected paths below rely on.
    Record firstHourA = new Record(SCHEMA);
    firstHourA.put("x", true);
    firstHourA.put("date", dateToEpochMillis(2010, 1, 1, 1));

    Record firstHourB = new Record(SCHEMA);
    firstHourB.put("x", false);
    firstHourB.put("date", dateToEpochMillis(2010, 1, 1, 1));

    // Same day, one hour later.
    Record secondHour = new Record(SCHEMA);
    secondHour.put("x", true);
    secondHour.put("date", dateToEpochMillis(2010, 1, 1, 2));

    // Next day, same hour as the first partition.
    Record nextDay = new Record(SCHEMA);
    nextDay.put("x", true);
    nextDay.put("date", dateToEpochMillis(2010, 1, 2, 1));

    String tmpRootPath = temporaryFolder.getRoot().getAbsolutePath();
    String firstHourDir = tmpRootPath + "/year=2010/month=1/day=1/hour=1";
    String secondHourDir = tmpRootPath + "/year=2010/month=1/day=1/hour=2";
    String nextDayDir = tmpRootPath + "/year=2010/month=1/day=2/hour=1";

    PCollection<GenericRecord> input =
        mainPipeline.apply(
            Create.<GenericRecord>of(firstHourA, firstHourB, secondHour, nextDay)
                .withCoder(AvroCoder.of(SCHEMA)));
    PCollection<PartitionMetadata> result =
        input.apply(
            "GenericRecordsToGCS",
            new GenericRecordsToGcsPartitioned(
                tmpRootPath,
                SERIALIZED_SCHEMA,
                "date",
                PartitioningSchema.HOURLY,
                FileFormatOptions.AVRO));

    PartitionMetadata firstHourMeta =
        PartitionMetadata.builder()
            .setLocation(firstHourDir)
            .setValues(ImmutableList.of("2010", "1", "1", "1"))
            .build();
    PartitionMetadata secondHourMeta =
        PartitionMetadata.builder()
            .setLocation(secondHourDir)
            .setValues(ImmutableList.of("2010", "1", "1", "2"))
            .build();
    PartitionMetadata nextDayMeta =
        PartitionMetadata.builder()
            .setLocation(nextDayDir)
            .setValues(ImmutableList.of("2010", "1", "2", "1"))
            .build();

    // The assertion is registered on the pipeline before it is run.
    PAssert.that(result).containsInAnyOrder(firstHourMeta, secondHourMeta, nextDayMeta);

    mainPipeline.run();

    verifyRecordsExists(firstHourDir + "/*", firstHourA, firstHourB);
    verifyRecordsExists(secondHourDir + "/*", secondHour);
    verifyRecordsExists(nextDayDir + "/*", nextDay);
}
Also used : PartitionMetadata(com.google.cloud.teleport.v2.values.PartitionMetadata) GenericRecord(org.apache.avro.generic.GenericRecord) Record(org.apache.avro.generic.GenericData.Record) Test(org.junit.Test)

Example 2 with GenericRecordsToGcsPartitioned

use of com.google.cloud.teleport.v2.transforms.GenericRecordsToGcsPartitioned in project DataflowTemplates by GoogleCloudPlatform.

the class DataplexJdbcIngestion method buildGcsPipeline.

/**
 * Wires the JDBC-to-GCS steps onto the given pipeline: infers a Beam schema from the query,
 * reads rows through {@code DynamicJdbcIO}, converts them to Avro {@link GenericRecord}s and
 * applies {@link GenericRecordsToGcsPartitioned} to write them under {@code targetRootPath}.
 *
 * <p>The method only adds transforms to {@code pipeline}; it does not run it.
 *
 * @param pipeline pipeline the transforms are applied to
 * @param options supplies the query, partition column, partitioning scheme and file format
 * @param dataSourceConfig JDBC data source configuration used for schema inference and reading
 * @param targetRootPath root path handed to {@link GenericRecordsToGcsPartitioned}
 */
@VisibleForTesting
static void buildGcsPipeline(Pipeline pipeline, DataplexJdbcIngestionOptions options, DynamicDataSourceConfiguration dataSourceConfig, String targetRootPath) {
    // Auto inferring beam schema
    Schema beamSchema = Schemas.jdbcSchemaToBeamSchema(dataSourceConfig.buildDatasource(), options.getQuery());
    // Convert to Avro Schema
    org.apache.avro.Schema avroSchema = AvroUtils.toAvroSchema(beamSchema);
    // Read from JdbcIO and convert ResultSet to Beam Row
    PCollection<Row> resultRows =
        pipeline.apply(
            "Read from JdbcIO",
            DynamicJdbcIO.<Row>read()
                .withDataSourceConfiguration(dataSourceConfig)
                .withQuery(options.getQuery())
                .withCoder(RowCoder.of(beamSchema))
                .withRowMapper(BeamSchemaUtil.of(beamSchema)));
    // Convert Beam Row to GenericRecord
    PCollection<GenericRecord> genericRecords =
        resultRows
            .apply("convert to GenericRecord", ParDo.of(new BeamRowToGenericRecordFn(avroSchema)))
            .setCoder(AvroCoder.of(avroSchema));
    // Write to GCS bucket. The resulting PCollection<PartitionMetadata> is not consumed here,
    // so it is not bound to a local. "getParitionColumn" is the accessor's actual spelling on
    // the options interface.
    genericRecords.apply(
        "Write to GCS",
        new GenericRecordsToGcsPartitioned(
            targetRootPath,
            Schemas.serialize(avroSchema),
            options.getParitionColumn(),
            options.getPartitioningScheme(),
            options.getFileFormat()));
}
Also used : BeamRowToGenericRecordFn(com.google.cloud.teleport.v2.transforms.BeamRowToGenericRecordFn) GenericRecordsToGcsPartitioned(com.google.cloud.teleport.v2.transforms.GenericRecordsToGcsPartitioned) Schema(org.apache.beam.sdk.schemas.Schema) PartitionMetadata(com.google.cloud.teleport.v2.values.PartitionMetadata) TableRow(com.google.api.services.bigquery.model.TableRow) Row(org.apache.beam.sdk.values.Row) GenericRecord(org.apache.avro.generic.GenericRecord) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 3 with GenericRecordsToGcsPartitioned

use of com.google.cloud.teleport.v2.transforms.GenericRecordsToGcsPartitioned in project DataflowTemplates by GoogleCloudPlatform.

the class GenericRecordsToGcsPartitionedTest method testDailyPartitioning.

/**
 * Checks DAILY partitioning: four records spread over three distinct days must yield three
 * {@link PartitionMetadata} entries with {@code year=/month=/day=} locations, and each record
 * must be found under the directory of its own day.
 */
@Test
public void testDailyPartitioning() {
    // Two records share the first partition; dateToEpochMillis arguments appear to be
    // (year, month, day), which is what the expected paths below rely on.
    Record firstDayA = new Record(SCHEMA);
    firstDayA.put("x", true);
    firstDayA.put("date", dateToEpochMillis(2010, 1, 1));

    Record firstDayB = new Record(SCHEMA);
    firstDayB.put("x", false);
    firstDayB.put("date", dateToEpochMillis(2010, 1, 1));

    // Same month, following day.
    Record secondDay = new Record(SCHEMA);
    secondDay.put("x", true);
    secondDay.put("date", dateToEpochMillis(2010, 1, 2));

    // Following month, first day.
    Record nextMonth = new Record(SCHEMA);
    nextMonth.put("x", true);
    nextMonth.put("date", dateToEpochMillis(2010, 2, 1));

    String tmpRootPath = temporaryFolder.getRoot().getAbsolutePath();
    String firstDayDir = tmpRootPath + "/year=2010/month=1/day=1";
    String secondDayDir = tmpRootPath + "/year=2010/month=1/day=2";
    String nextMonthDir = tmpRootPath + "/year=2010/month=2/day=1";

    PCollection<GenericRecord> input =
        mainPipeline.apply(
            Create.<GenericRecord>of(firstDayA, firstDayB, secondDay, nextMonth)
                .withCoder(AvroCoder.of(SCHEMA)));
    PCollection<PartitionMetadata> result =
        input.apply(
            "GenericRecordsToGCS",
            new GenericRecordsToGcsPartitioned(
                tmpRootPath,
                SERIALIZED_SCHEMA,
                "date",
                PartitioningSchema.DAILY,
                FileFormatOptions.AVRO));

    PartitionMetadata firstDayMeta =
        PartitionMetadata.builder()
            .setLocation(firstDayDir)
            .setValues(ImmutableList.of("2010", "1", "1"))
            .build();
    PartitionMetadata secondDayMeta =
        PartitionMetadata.builder()
            .setLocation(secondDayDir)
            .setValues(ImmutableList.of("2010", "1", "2"))
            .build();
    PartitionMetadata nextMonthMeta =
        PartitionMetadata.builder()
            .setLocation(nextMonthDir)
            .setValues(ImmutableList.of("2010", "2", "1"))
            .build();

    // The assertion is registered on the pipeline before it is run.
    PAssert.that(result).containsInAnyOrder(firstDayMeta, secondDayMeta, nextMonthMeta);

    mainPipeline.run();

    verifyRecordsExists(firstDayDir + "/*", firstDayA, firstDayB);
    verifyRecordsExists(secondDayDir + "/*", secondDay);
    verifyRecordsExists(nextMonthDir + "/*", nextMonth);
}
Also used : PartitionMetadata(com.google.cloud.teleport.v2.values.PartitionMetadata) GenericRecord(org.apache.avro.generic.GenericRecord) Record(org.apache.avro.generic.GenericData.Record) Test(org.junit.Test)

Example 4 with GenericRecordsToGcsPartitioned

use of com.google.cloud.teleport.v2.transforms.GenericRecordsToGcsPartitioned in project DataflowTemplates by GoogleCloudPlatform.

the class GenericRecordsToGcsPartitionedTest method testMonthlyPartitioning.

/**
 * Checks MONTHLY partitioning: three records spread over two distinct months must yield two
 * {@link PartitionMetadata} entries with {@code year=/month=} locations, and each record must
 * be found under the directory of its own month.
 */
@Test
public void testMonthlyPartitioning() {
    // Two records share the first partition; dateToEpochMillis arguments appear to be
    // (year, month), which is what the expected paths below rely on.
    Record firstMonthA = new Record(SCHEMA);
    firstMonthA.put("x", true);
    firstMonthA.put("date", dateToEpochMillis(2010, 1));

    Record firstMonthB = new Record(SCHEMA);
    firstMonthB.put("x", false);
    firstMonthB.put("date", dateToEpochMillis(2010, 1));

    // Following month.
    Record secondMonth = new Record(SCHEMA);
    secondMonth.put("x", true);
    secondMonth.put("date", dateToEpochMillis(2010, 2));

    String tmpRootPath = temporaryFolder.getRoot().getAbsolutePath();
    String firstMonthDir = tmpRootPath + "/year=2010/month=1";
    String secondMonthDir = tmpRootPath + "/year=2010/month=2";

    PCollection<GenericRecord> input =
        mainPipeline.apply(
            Create.<GenericRecord>of(firstMonthA, firstMonthB, secondMonth)
                .withCoder(AvroCoder.of(SCHEMA)));
    PCollection<PartitionMetadata> result =
        input.apply(
            "GenericRecordsToGCS",
            new GenericRecordsToGcsPartitioned(
                tmpRootPath,
                SERIALIZED_SCHEMA,
                "date",
                PartitioningSchema.MONTHLY,
                FileFormatOptions.AVRO));

    PartitionMetadata firstMonthMeta =
        PartitionMetadata.builder()
            .setLocation(firstMonthDir)
            .setValues(ImmutableList.of("2010", "1"))
            .build();
    PartitionMetadata secondMonthMeta =
        PartitionMetadata.builder()
            .setLocation(secondMonthDir)
            .setValues(ImmutableList.of("2010", "2"))
            .build();

    // The assertion is registered on the pipeline before it is run.
    PAssert.that(result).containsInAnyOrder(firstMonthMeta, secondMonthMeta);

    mainPipeline.run();

    verifyRecordsExists(firstMonthDir + "/*", firstMonthA, firstMonthB);
    verifyRecordsExists(secondMonthDir + "/*", secondMonth);
}
Also used : PartitionMetadata(com.google.cloud.teleport.v2.values.PartitionMetadata) GenericRecord(org.apache.avro.generic.GenericRecord) Record(org.apache.avro.generic.GenericData.Record) Test(org.junit.Test)

Aggregations

PartitionMetadata (com.google.cloud.teleport.v2.values.PartitionMetadata): 4, GenericRecord (org.apache.avro.generic.GenericRecord): 4, Record (org.apache.avro.generic.GenericData.Record): 3, Test (org.junit.Test): 3, TableRow (com.google.api.services.bigquery.model.TableRow): 1, BeamRowToGenericRecordFn (com.google.cloud.teleport.v2.transforms.BeamRowToGenericRecordFn): 1, GenericRecordsToGcsPartitioned (com.google.cloud.teleport.v2.transforms.GenericRecordsToGcsPartitioned): 1, VisibleForTesting (com.google.common.annotations.VisibleForTesting): 1, Schema (org.apache.beam.sdk.schemas.Schema): 1, Row (org.apache.beam.sdk.values.Row): 1