
Example 86 with WorkUnitState

Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

From the class EnvelopePayloadConverterTest, method testConverter:

@Test
public void testConverter() throws IOException, DataConversionException, SchemaRegistryException {
    Schema inputSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/envelope.avsc"));
    GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>(inputSchema);
    File tmp = File.createTempFile(getClass().getSimpleName(), null);
    FileUtils.copyInputStreamToFile(getClass().getResourceAsStream("/converter/envelope.avro"), tmp);
    DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(tmp, datumReader);
    GenericRecord inputRecord = dataFileReader.next();
    Schema latestPayloadSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/record.avsc"));
    when(mockRegistry.getLatestSchemaByTopic(any())).thenReturn(latestPayloadSchema);
    when(mockRegistry.getSchemaByKey(any())).thenReturn(inputSchema.getField("nestedRecord").schema());
    WorkUnitState workUnitState = new WorkUnitState();
    workUnitState.setProp(BaseEnvelopeSchemaConverter.PAYLOAD_SCHEMA_TOPIC, "test");
    workUnitState.setProp(BaseEnvelopeSchemaConverter.PAYLOAD_SCHEMA_ID_FIELD, "metadata.payloadSchemaId");
    workUnitState.setProp(BaseEnvelopeSchemaConverter.KAFKA_REGISTRY_FACTORY, MockKafkaAvroSchemaRegistryFactory.class.getName());
    EnvelopePayloadConverter converter = new EnvelopePayloadConverter();
    converter.init(workUnitState);
    Schema outputSchema = converter.convertSchema(inputSchema, workUnitState);
    List<GenericRecord> outputRecords = new ArrayList<>();
    Iterables.addAll(outputRecords, converter.convertRecord(outputSchema, inputRecord, workUnitState));
    Assert.assertTrue(outputRecords.size() == 1);
    GenericRecord outputRecord = outputRecords.get(0);
    GenericRecord payload = (GenericRecord) outputRecord.get("payload");
    // When building the test envelope Avro record, its nestedRecord field was intentionally set to the deserialized payload
    GenericRecord expectedPayload = (GenericRecord) outputRecord.get("nestedRecord");
    Schema payloadSchema = payload.getSchema();
    Schema expectedPayloadSchema = expectedPayload.getSchema();
    // The expected payload schema has the same fields as the payload schema, but in a different order
    Assert.assertTrue(expectedPayloadSchema.getName().equals(payloadSchema.getName()));
    Assert.assertTrue(expectedPayloadSchema.getNamespace().equals(payloadSchema.getNamespace()));
    Assert.assertTrue(expectedPayloadSchema.getFields().size() == payloadSchema.getFields().size());
    for (Schema.Field field : payload.getSchema().getFields()) {
        Assert.assertTrue(expectedPayload.get(field.name()).equals(payload.get(field.name())));
    }
}
Also used: GenericDatumReader(org.apache.avro.generic.GenericDatumReader) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) DataFileReader(org.apache.avro.file.DataFileReader) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) Test(org.testng.annotations.Test)
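
The test above relies on a mockRegistry field and a MockKafkaAvroSchemaRegistryFactory defined elsewhere in the test class (Example 87 below references its own nested equivalent). A minimal sketch of how such a factory could hand the Mockito mock to the converter follows; the KafkaAvroSchemaRegistryFactory base type, its create(Properties) signature, and the placement of the mock inside the factory are all assumptions, not confirmed against the Gobblin source.

import java.util.Properties;
import org.apache.gobblin.metrics.kafka.KafkaAvroSchemaRegistry;
import org.apache.gobblin.metrics.kafka.KafkaAvroSchemaRegistryFactory;
import static org.mockito.Mockito.mock;

// A minimal sketch, assuming the converter instantiates this factory reflectively from the
// KAFKA_REGISTRY_FACTORY property and asks it for a registry. The base type and the
// create(Properties) override shown here are assumptions.
public class MockKafkaAvroSchemaRegistryFactory extends KafkaAvroSchemaRegistryFactory {

    // The Mockito mock that the test stubs via getLatestSchemaByTopic/getSchemaByKey.
    static final KafkaAvroSchemaRegistry mockRegistry = mock(KafkaAvroSchemaRegistry.class);

    @Override
    public KafkaAvroSchemaRegistry create(Properties props) {
        return mockRegistry;
    }
}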

Example 87 with WorkUnitState

Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

From the class EnvelopePayloadExtractingConverterTest, method testConverter:

@Test
public void testConverter() throws Exception {
    Schema inputSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/envelope.avsc"));
    GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>(inputSchema);
    File tmp = File.createTempFile(getClass().getSimpleName(), null);
    FileUtils.copyInputStreamToFile(getClass().getResourceAsStream("/converter/envelope.avro"), tmp);
    DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(tmp, datumReader);
    GenericRecord inputRecord = dataFileReader.next();
    Schema latestPayloadSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/record.avsc"));
    when(mockRegistry.getLatestSchemaByTopic(any())).thenReturn(latestPayloadSchema);
    when(mockRegistry.getSchemaByKey(any())).thenReturn(inputSchema.getField("nestedRecord").schema());
    WorkUnitState workUnitState = new WorkUnitState();
    workUnitState.setProp(BaseEnvelopeSchemaConverter.PAYLOAD_SCHEMA_TOPIC, "test");
    workUnitState.setProp(BaseEnvelopeSchemaConverter.PAYLOAD_SCHEMA_ID_FIELD, "metadata.payloadSchemaId");
    workUnitState.setProp(BaseEnvelopeSchemaConverter.KAFKA_REGISTRY_FACTORY, EnvelopePayloadExtractingConverterTest.MockKafkaAvroSchemaRegistryFactory.class.getName());
    EnvelopePayloadExtractingConverter converter = new EnvelopePayloadExtractingConverter();
    converter.init(workUnitState);
    Schema outputSchema = converter.convertSchema(inputSchema, workUnitState);
    Assert.assertTrue(outputSchema.equals(latestPayloadSchema));
    List<GenericRecord> outputRecords = new ArrayList<>();
    Iterables.addAll(outputRecords, converter.convertRecord(outputSchema, inputRecord, workUnitState));
    Assert.assertTrue(outputRecords.size() == 1);
    GenericRecord payload = outputRecords.get(0);
    // When building the test envelope Avro input record, its nestedRecord field was intentionally set to the deserialized payload
    GenericRecord expectedPayload = (GenericRecord) inputRecord.get("nestedRecord");
    Schema payloadSchema = payload.getSchema();
    Schema expectedPayloadSchema = expectedPayload.getSchema();
    // The expected payload schema has the same fields as the payload schema, but in a different order
    Assert.assertTrue(expectedPayloadSchema.getName().equals(payloadSchema.getName()));
    Assert.assertTrue(expectedPayloadSchema.getNamespace().equals(payloadSchema.getNamespace()));
    Assert.assertTrue(expectedPayloadSchema.getFields().size() == payloadSchema.getFields().size());
    for (Schema.Field field : payload.getSchema().getFields()) {
        Assert.assertTrue(expectedPayload.get(field.name()).equals(payload.get(field.name())));
    }
}
Also used: GenericDatumReader(org.apache.avro.generic.GenericDatumReader) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) DataFileReader(org.apache.avro.file.DataFileReader) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) Test(org.testng.annotations.Test)

Example 88 with WorkUnitState

Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

From the class MetadataConverterWrapperTest, method testConvertsMetadataMultiOutput:

@Test
public void testConvertsMetadataMultiOutput() throws DataConversionException {
    final int numRecordsToReturn = 2;
    DummyConverter dummyConverter = new DummyConverter(numRecordsToReturn);
    MetadataConverterWrapper<String, String, String, String> wrapper = new MetadataConverterWrapper<>(dummyConverter);
    Iterable<RecordWithMetadata<String>> records1 = wrapper.convertRecord("foo", new RecordWithMetadata<String>("bar", buildMetadata(1)), new WorkUnitState());
    Iterable<RecordWithMetadata<String>> records2 = wrapper.convertRecord("foo", new RecordWithMetadata<String>("baz", buildMetadata(2)), new WorkUnitState());
    Iterator<RecordWithMetadata<String>> record1It = records1.iterator();
    Iterator<RecordWithMetadata<String>> record2It = records2.iterator();
    for (int i = 0; i < numRecordsToReturn; i++) {
        RecordWithMetadata<String> record1 = record1It.next();
        Assert.assertEquals(record1.getRecord(), "converted" + String.valueOf(i));
        Assert.assertEquals(record1.getMetadata().getGlobalMetadata().getDatasetUrn(), "dataset-id:1");
        RecordWithMetadata<String> record2 = record2It.next();
        Assert.assertEquals(record2.getRecord(), "converted" + String.valueOf(i));
        Assert.assertEquals(record2.getMetadata().getGlobalMetadata().getDatasetUrn(), "dataset-id:2");
    }
}
Also used: RecordWithMetadata(org.apache.gobblin.type.RecordWithMetadata) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Test(org.testng.annotations.Test)
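
The DummyConverter and buildMetadata helpers used above are defined elsewhere in the test class. A plausible reconstruction, consistent with the assertions (each convertRecord call yields numRecordsToReturn records named "converted0", "converted1", and so on, and the metadata carries a "dataset-id:<n>" dataset URN), is sketched below; the Metadata type and its setDatasetUrn accessor are assumptions.

import java.util.ArrayList;
import java.util.List;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.converter.Converter;
import org.apache.gobblin.metadata.types.Metadata;

// Hypothetical helpers reconstructed from the assertions above; the real test class
// defines its own versions, which may differ in detail.
static class DummyConverter extends Converter<String, String, String, String> {

    private final int numRecords;

    DummyConverter(int numRecords) {
        this.numRecords = numRecords;
    }

    @Override
    public String convertSchema(String inputSchema, WorkUnitState workUnit) {
        return inputSchema;
    }

    @Override
    public Iterable<String> convertRecord(String outputSchema, String inputRecord, WorkUnitState workUnit) {
        List<String> converted = new ArrayList<>();
        for (int i = 0; i < numRecords; i++) {
            converted.add("converted" + i);
        }
        return converted;
    }
}

private Metadata buildMetadata(int id) {
    Metadata metadata = new Metadata();
    // setDatasetUrn is an assumed accessor; the test reads the value back through
    // getMetadata().getGlobalMetadata().getDatasetUrn().
    metadata.getGlobalMetadata().setDatasetUrn("dataset-id:" + id);
    return metadata;
}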

Example 89 with WorkUnitState

Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

From the class AvroToJdbcEntryConverterTest, method testFlattening:

@Test
public void testFlattening() throws IOException, SchemaConversionException, SQLException, URISyntaxException, DataConversionException {
    final String db = "db";
    final String table = "users";
    Map<String, JdbcType> dateColums = new HashMap<>();
    dateColums.put("date_of_birth", JdbcType.DATE);
    dateColums.put("last_modified", JdbcType.TIME);
    dateColums.put("created", JdbcType.TIMESTAMP);
    JdbcWriterCommands mockWriterCommands = mock(JdbcWriterCommands.class);
    when(mockWriterCommands.retrieveDateColumns(db, table)).thenReturn(dateColums);
    JdbcWriterCommandsFactory factory = mock(JdbcWriterCommandsFactory.class);
    when(factory.newInstance(any(State.class), any(Connection.class))).thenReturn(mockWriterCommands);
    List<JdbcEntryMetaDatum> jdbcEntryMetaData = new ArrayList<>();
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("name", JdbcType.VARCHAR));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("favorite_number", JdbcType.VARCHAR));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("favorite_color", JdbcType.VARCHAR));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("date_of_birth", JdbcType.DATE));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("last_modified", JdbcType.TIME));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("created", JdbcType.TIMESTAMP));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("nested1_nested1_string", JdbcType.VARCHAR));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("nested1_nested1_int", JdbcType.INTEGER));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("nested1_nested2_union_nested2_string", JdbcType.VARCHAR));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("nested1_nested2_union_nested2_int", JdbcType.INTEGER));
    JdbcEntrySchema expected = new JdbcEntrySchema(jdbcEntryMetaData);
    Schema inputSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/pickfields_nested_with_union.avsc"));
    WorkUnitState workUnitState = new WorkUnitState();
    workUnitState.appendToListProp(JdbcPublisher.JDBC_PUBLISHER_FINAL_TABLE_NAME, table);
    AvroToJdbcEntryConverter converter = new AvroToJdbcEntryConverter(workUnitState);
    Map<String, JdbcType> dateColumnMapping = Maps.newHashMap();
    dateColumnMapping.put("date_of_birth", JdbcType.DATE);
    dateColumnMapping.put("last_modified", JdbcType.TIME);
    dateColumnMapping.put("created", JdbcType.TIMESTAMP);
    workUnitState.appendToListProp(AvroToJdbcEntryConverter.CONVERTER_AVRO_JDBC_DATE_FIELDS, new Gson().toJson(dateColumnMapping));
    JdbcEntrySchema actualSchema = converter.convertSchema(inputSchema, workUnitState);
    Assert.assertEquals(expected, actualSchema);
    try (DataFileReader<GenericRecord> srcDataFileReader = new DataFileReader<GenericRecord>(new File(getClass().getResource("/converter/pickfields_nested_with_union.avro").toURI()), new GenericDatumReader<GenericRecord>(inputSchema))) {
        List<JdbcEntryData> entries = new ArrayList<>();
        while (srcDataFileReader.hasNext()) {
            JdbcEntryData actualData = converter.convertRecord(actualSchema, srcDataFileReader.next(), workUnitState).iterator().next();
            entries.add(actualData);
        }
        final JsonSerializer<JdbcEntryDatum> datumSer = new JsonSerializer<JdbcEntryDatum>() {

            @Override
            public JsonElement serialize(JdbcEntryDatum datum, Type typeOfSrc, JsonSerializationContext context) {
                JsonObject jso = new JsonObject();
                if (datum.getVal() == null) {
                    jso.add(datum.getColumnName(), null);
                    return jso;
                }
                if (datum.getVal() instanceof Date) {
                    jso.addProperty(datum.getColumnName(), ((Date) datum.getVal()).getTime());
                } else if (datum.getVal() instanceof Timestamp) {
                    jso.addProperty(datum.getColumnName(), ((Timestamp) datum.getVal()).getTime());
                } else if (datum.getVal() instanceof Time) {
                    jso.addProperty(datum.getColumnName(), ((Time) datum.getVal()).getTime());
                } else {
                    jso.addProperty(datum.getColumnName(), datum.getVal().toString());
                }
                return jso;
            }
        };
        JsonSerializer<JdbcEntryData> serializer = new JsonSerializer<JdbcEntryData>() {

            @Override
            public JsonElement serialize(JdbcEntryData src, Type typeOfSrc, JsonSerializationContext context) {
                JsonArray arr = new JsonArray();
                for (JdbcEntryDatum datum : src) {
                    arr.add(datumSer.serialize(datum, datum.getClass(), context));
                }
                return arr;
            }
        };
        Gson gson = new GsonBuilder().registerTypeAdapter(JdbcEntryData.class, serializer).serializeNulls().create();
        JsonElement actualSerialized = gson.toJsonTree(entries);
        JsonElement expectedSerialized = new JsonParser().parse(new InputStreamReader(getClass().getResourceAsStream("/converter/pickfields_nested_with_union.json")));
        Assert.assertEquals(actualSerialized, expectedSerialized);
    }
    converter.close();
}
Also used: HashMap(java.util.HashMap) JdbcWriterCommands(org.apache.gobblin.writer.commands.JdbcWriterCommands) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) Gson(com.google.gson.Gson) JsonObject(com.google.gson.JsonObject) Time(java.sql.Time) JsonSerializer(com.google.gson.JsonSerializer) Timestamp(java.sql.Timestamp) DataFileReader(org.apache.avro.file.DataFileReader) GenericRecord(org.apache.avro.generic.GenericRecord) JdbcWriterCommandsFactory(org.apache.gobblin.writer.commands.JdbcWriterCommandsFactory) JsonParser(com.google.gson.JsonParser) InputStreamReader(java.io.InputStreamReader) GsonBuilder(com.google.gson.GsonBuilder) Connection(java.sql.Connection) Date(java.sql.Date) JsonArray(com.google.gson.JsonArray) DestinationType(org.apache.gobblin.writer.Destination.DestinationType) Type(java.lang.reflect.Type) State(org.apache.gobblin.configuration.State) JsonElement(com.google.gson.JsonElement) JsonSerializationContext(com.google.gson.JsonSerializationContext) File(java.io.File) Test(org.testng.annotations.Test)
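
The expected column names above (nested1_nested1_string, nested1_nested2_union_nested2_string, and so on) imply a flattening convention in which nested record fields are joined with underscores and union branches are flattened through under the union field's prefix. The sketch below illustrates that naming rule only; it is not the actual AvroToJdbcEntryConverter traversal, which additionally maps Avro types to JdbcType and honors the configured date columns.

import java.util.List;
import org.apache.avro.Schema;

// A minimal sketch of the underscore-joining rule suggested by the expected
// JdbcEntryMetaDatum names above; an illustration, not the converter's real logic.
static void collectFlattenedNames(Schema schema, String prefix, List<String> out) {
    switch (schema.getType()) {
        case RECORD:
            for (Schema.Field field : schema.getFields()) {
                String name = prefix.isEmpty() ? field.name() : prefix + "_" + field.name();
                collectFlattenedNames(field.schema(), name, out);
            }
            break;
        case UNION:
            // Non-null branches continue under the same prefix, which is how a name like
            // nested1_nested2_union_nested2_string arises when a union branch is a record.
            for (Schema branch : schema.getTypes()) {
                if (branch.getType() != Schema.Type.NULL) {
                    collectFlattenedNames(branch, prefix, out);
                }
            }
            break;
        default:
            out.add(prefix);
    }
}

Applied to pickfields_nested_with_union.avsc with an empty prefix, this rule would be expected to reproduce the ten column names listed in jdbcEntryMetaData above.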

Example 90 with WorkUnitState

Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

From the class AvroToJdbcEntryConverterTest, method testDateConversion:

@Test
public void testDateConversion() throws IOException, SchemaConversionException, SQLException {
    final String db = "db";
    final String table = "users";
    Map<String, JdbcType> dateColums = new HashMap<>();
    dateColums.put("date_of_birth", JdbcType.DATE);
    dateColums.put("last_modified", JdbcType.TIME);
    dateColums.put("created", JdbcType.TIMESTAMP);
    JdbcWriterCommands mockWriterCommands = mock(JdbcWriterCommands.class);
    when(mockWriterCommands.retrieveDateColumns(db, table)).thenReturn(dateColums);
    JdbcWriterCommandsFactory factory = mock(JdbcWriterCommandsFactory.class);
    when(factory.newInstance(any(State.class), any(Connection.class))).thenReturn(mockWriterCommands);
    List<JdbcEntryMetaDatum> jdbcEntryMetaData = new ArrayList<>();
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("name", JdbcType.VARCHAR));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("favorite_number", JdbcType.VARCHAR));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("favorite_color", JdbcType.VARCHAR));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("date_of_birth", JdbcType.DATE));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("last_modified", JdbcType.TIME));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("created", JdbcType.TIMESTAMP));
    JdbcEntrySchema expected = new JdbcEntrySchema(jdbcEntryMetaData);
    Schema inputSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/fieldPickInput.avsc"));
    WorkUnitState workUnitState = new WorkUnitState();
    workUnitState.appendToListProp(JdbcPublisher.JDBC_PUBLISHER_FINAL_TABLE_NAME, table);
    AvroToJdbcEntryConverter converter = new AvroToJdbcEntryConverter(workUnitState);
    Map<String, JdbcType> dateColumnMapping = Maps.newHashMap();
    dateColumnMapping.put("date_of_birth", JdbcType.DATE);
    dateColumnMapping.put("last_modified", JdbcType.TIME);
    dateColumnMapping.put("created", JdbcType.TIMESTAMP);
    workUnitState.appendToListProp(AvroToJdbcEntryConverter.CONVERTER_AVRO_JDBC_DATE_FIELDS, new Gson().toJson(dateColumnMapping));
    JdbcEntrySchema actual = converter.convertSchema(inputSchema, workUnitState);
    Assert.assertEquals(expected, actual);
}
Also used: HashMap(java.util.HashMap) JdbcWriterCommands(org.apache.gobblin.writer.commands.JdbcWriterCommands) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Schema(org.apache.avro.Schema) Connection(java.sql.Connection) ArrayList(java.util.ArrayList) Gson(com.google.gson.Gson) State(org.apache.gobblin.configuration.State) JdbcWriterCommandsFactory(org.apache.gobblin.writer.commands.JdbcWriterCommandsFactory) Test(org.testng.annotations.Test)
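
Both tests above hand the date-column mapping to the converter as JSON through new Gson().toJson(dateColumnMapping); Gson serializes enum values such as JdbcType.DATE by name. The standalone demo below shows the resulting JSON shape; it uses plain strings so it runs without Gobblin's JdbcType on the classpath, and a LinkedHashMap so the printed key order is deterministic (the tests use a HashMap, whose iteration order is not guaranteed).

import java.util.LinkedHashMap;
import java.util.Map;
import com.google.gson.Gson;

// Standalone illustration of the JSON passed as CONVERTER_AVRO_JDBC_DATE_FIELDS.
public class DateFieldsJsonDemo {
    public static void main(String[] args) {
        Map<String, String> dateColumnMapping = new LinkedHashMap<>();
        dateColumnMapping.put("date_of_birth", "DATE");
        dateColumnMapping.put("last_modified", "TIME");
        dateColumnMapping.put("created", "TIMESTAMP");
        // Prints: {"date_of_birth":"DATE","last_modified":"TIME","created":"TIMESTAMP"}
        System.out.println(new Gson().toJson(dateColumnMapping));
    }
}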

Aggregations

WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 222
Test (org.testng.annotations.Test): 143
State (org.apache.gobblin.configuration.State): 48
SourceState (org.apache.gobblin.configuration.SourceState): 39
WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 39
Schema (org.apache.avro.Schema): 29
Path (org.apache.hadoop.fs.Path): 26
GenericRecord (org.apache.avro.generic.GenericRecord): 19
JsonObject (com.google.gson.JsonObject): 17
ArrayList (java.util.ArrayList): 16
File (java.io.File): 14
TaskState (org.apache.hadoop.mapreduce.v2.api.records.TaskState): 12
List (java.util.List): 11
Configuration (org.apache.hadoop.conf.Configuration): 11
IOException (java.io.IOException): 10
LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark): 10
Extract (org.apache.gobblin.source.workunit.Extract): 10
FileSystem (org.apache.hadoop.fs.FileSystem): 10
Closer (com.google.common.io.Closer): 8
JsonParser (com.google.gson.JsonParser): 8