Example 21 with DataFileReader

Use of org.apache.avro.file.DataFileReader in project incubator-gobblin by apache.

From class EnvelopePayloadConverterTest, method testConverter:

@Test
public void testConverter() throws IOException, DataConversionException, SchemaRegistryException {
    Schema inputSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/envelope.avsc"));
    GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>(inputSchema);
    File tmp = File.createTempFile(getClass().getSimpleName(), null);
    FileUtils.copyInputStreamToFile(getClass().getResourceAsStream("/converter/envelope.avro"), tmp);
    DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(tmp, datumReader);
    GenericRecord inputRecord = dataFileReader.next();
    Schema latestPayloadSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/record.avsc"));
    when(mockRegistry.getLatestSchemaByTopic(any())).thenReturn(latestPayloadSchema);
    when(mockRegistry.getSchemaByKey(any())).thenReturn(inputSchema.getField("nestedRecord").schema());
    WorkUnitState workUnitState = new WorkUnitState();
    workUnitState.setProp(BaseEnvelopeSchemaConverter.PAYLOAD_SCHEMA_TOPIC, "test");
    workUnitState.setProp(BaseEnvelopeSchemaConverter.PAYLOAD_SCHEMA_ID_FIELD, "metadata.payloadSchemaId");
    workUnitState.setProp(BaseEnvelopeSchemaConverter.KAFKA_REGISTRY_FACTORY, MockKafkaAvroSchemaRegistryFactory.class.getName());
    EnvelopePayloadConverter converter = new EnvelopePayloadConverter();
    converter.init(workUnitState);
    Schema outputSchema = converter.convertSchema(inputSchema, workUnitState);
    List<GenericRecord> outputRecords = new ArrayList<>();
    Iterables.addAll(outputRecords, converter.convertRecord(outputSchema, inputRecord, workUnitState));
    Assert.assertTrue(outputRecords.size() == 1);
    GenericRecord outputRecord = outputRecords.get(0);
    GenericRecord payload = (GenericRecord) outputRecord.get("payload");
    // The test envelope Avro record was built with its nestedRecord intentionally set to the deserialized payload
    GenericRecord expectedPayload = (GenericRecord) outputRecord.get("nestedRecord");
    Schema payloadSchema = payload.getSchema();
    Schema expectedPayloadSchema = expectedPayload.getSchema();
    // The expected payload schema has the same fields as the payload schema, just in a different order
    Assert.assertTrue(expectedPayloadSchema.getName().equals(payloadSchema.getName()));
    Assert.assertTrue(expectedPayloadSchema.getNamespace().equals(payloadSchema.getNamespace()));
    Assert.assertTrue(expectedPayloadSchema.getFields().size() == payloadSchema.getFields().size());
    for (Schema.Field field : payload.getSchema().getFields()) {
        Assert.assertTrue(expectedPayload.get(field.name()).equals(payload.get(field.name())));
    }
}
Also used: GenericDatumReader (org.apache.avro.generic.GenericDatumReader), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), Schema (org.apache.avro.Schema), ArrayList (java.util.ArrayList), DataFileReader (org.apache.avro.file.DataFileReader), GenericRecord (org.apache.avro.generic.GenericRecord), File (java.io.File), Test (org.testng.annotations.Test)
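
For reference, the core read pattern all of these tests depend on is simply: open the container file with a DatumReader, iterate, close. A minimal standalone sketch (the input path is a placeholder; unlike the test above, which never closes its reader, this sketch uses try-with-resources):

import java.io.File;
import java.io.IOException;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class ReadAvroFile {
    public static void main(String[] args) throws IOException {
        // No schema is passed: DataFileReader picks up the writer schema from the file header
        GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
        try (DataFileReader<GenericRecord> reader =
                new DataFileReader<>(new File("/tmp/envelope.avro"), datumReader)) {
            while (reader.hasNext()) {
                GenericRecord record = reader.next();
                System.out.println(record);
            }
        }
    }
}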

Example 22 with DataFileReader

Use of org.apache.avro.file.DataFileReader in project incubator-gobblin by apache.

From class EnvelopePayloadExtractingConverterTest, method testConverter:

@Test
public void testConverter() throws Exception {
    Schema inputSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/envelope.avsc"));
    GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>(inputSchema);
    File tmp = File.createTempFile(getClass().getSimpleName(), null);
    FileUtils.copyInputStreamToFile(getClass().getResourceAsStream("/converter/envelope.avro"), tmp);
    DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(tmp, datumReader);
    GenericRecord inputRecord = dataFileReader.next();
    Schema latestPayloadSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/record.avsc"));
    when(mockRegistry.getLatestSchemaByTopic(any())).thenReturn(latestPayloadSchema);
    when(mockRegistry.getSchemaByKey(any())).thenReturn(inputSchema.getField("nestedRecord").schema());
    WorkUnitState workUnitState = new WorkUnitState();
    workUnitState.setProp(BaseEnvelopeSchemaConverter.PAYLOAD_SCHEMA_TOPIC, "test");
    workUnitState.setProp(BaseEnvelopeSchemaConverter.PAYLOAD_SCHEMA_ID_FIELD, "metadata.payloadSchemaId");
    workUnitState.setProp(BaseEnvelopeSchemaConverter.KAFKA_REGISTRY_FACTORY, EnvelopePayloadExtractingConverterTest.MockKafkaAvroSchemaRegistryFactory.class.getName());
    EnvelopePayloadExtractingConverter converter = new EnvelopePayloadExtractingConverter();
    converter.init(workUnitState);
    Schema outputSchema = converter.convertSchema(inputSchema, workUnitState);
    Assert.assertTrue(outputSchema.equals(latestPayloadSchema));
    List<GenericRecord> outputRecords = new ArrayList<>();
    Iterables.addAll(outputRecords, converter.convertRecord(outputSchema, inputRecord, workUnitState));
    Assert.assertTrue(outputRecords.size() == 1);
    GenericRecord payload = outputRecords.get(0);
    // The test envelope Avro input record was built with its nestedRecord intentionally set to the deserialized payload
    GenericRecord expectedPayload = (GenericRecord) inputRecord.get("nestedRecord");
    Schema payloadSchema = payload.getSchema();
    Schema expectedPayloadSchema = expectedPayload.getSchema();
    // The expected payload schema has the same fields as the payload schema, just in a different order
    Assert.assertTrue(expectedPayloadSchema.getName().equals(payloadSchema.getName()));
    Assert.assertTrue(expectedPayloadSchema.getNamespace().equals(payloadSchema.getNamespace()));
    Assert.assertTrue(expectedPayloadSchema.getFields().size() == payloadSchema.getFields().size());
    for (Schema.Field field : payload.getSchema().getFields()) {
        Assert.assertTrue(expectedPayload.get(field.name()).equals(payload.get(field.name())));
    }
}
Also used: GenericDatumReader (org.apache.avro.generic.GenericDatumReader), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), Schema (org.apache.avro.Schema), ArrayList (java.util.ArrayList), DataFileReader (org.apache.avro.file.DataFileReader), GenericRecord (org.apache.avro.generic.GenericRecord), File (java.io.File), Test (org.testng.annotations.Test)
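
In both tests the parsed envelope schema is handed to GenericDatumReader, where it serves as the expected (reader) schema; the writer schema always comes from the file header, and Avro resolves each record from one to the other. A sketch of reading with an explicit reader schema, assuming hypothetical evolved.avsc and data.avro files:

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class ReadWithReaderSchema {
    public static void main(String[] args) throws IOException {
        // Hypothetical evolved schema, e.g. one that adds a field with a default value
        Schema readerSchema = new Schema.Parser().parse(new File("evolved.avsc"));
        // The schema passed here becomes the expected (reader) schema; the writer
        // schema is read from the file header and records are resolved against it
        GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>(readerSchema);
        try (DataFileReader<GenericRecord> reader =
                new DataFileReader<>(new File("data.avro"), datumReader)) {
            while (reader.hasNext()) {
                System.out.println(reader.next());
            }
        }
    }
}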

Example 23 with DataFileReader

Use of org.apache.avro.file.DataFileReader in project incubator-gobblin by apache.

From class AvroToJdbcEntryConverterTest, method testFlattening:

@Test
public void testFlattening() throws IOException, SchemaConversionException, SQLException, URISyntaxException, DataConversionException {
    final String db = "db";
    final String table = "users";
    Map<String, JdbcType> dateColumns = new HashMap<>();
    dateColumns.put("date_of_birth", JdbcType.DATE);
    dateColumns.put("last_modified", JdbcType.TIME);
    dateColumns.put("created", JdbcType.TIMESTAMP);
    JdbcWriterCommands mockWriterCommands = mock(JdbcWriterCommands.class);
    when(mockWriterCommands.retrieveDateColumns(db, table)).thenReturn(dateColumns);
    JdbcWriterCommandsFactory factory = mock(JdbcWriterCommandsFactory.class);
    when(factory.newInstance(any(State.class), any(Connection.class))).thenReturn(mockWriterCommands);
    List<JdbcEntryMetaDatum> jdbcEntryMetaData = new ArrayList<>();
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("name", JdbcType.VARCHAR));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("favorite_number", JdbcType.VARCHAR));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("favorite_color", JdbcType.VARCHAR));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("date_of_birth", JdbcType.DATE));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("last_modified", JdbcType.TIME));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("created", JdbcType.TIMESTAMP));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("nested1_nested1_string", JdbcType.VARCHAR));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("nested1_nested1_int", JdbcType.INTEGER));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("nested1_nested2_union_nested2_string", JdbcType.VARCHAR));
    jdbcEntryMetaData.add(new JdbcEntryMetaDatum("nested1_nested2_union_nested2_int", JdbcType.INTEGER));
    JdbcEntrySchema expected = new JdbcEntrySchema(jdbcEntryMetaData);
    Schema inputSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/pickfields_nested_with_union.avsc"));
    WorkUnitState workUnitState = new WorkUnitState();
    workUnitState.appendToListProp(JdbcPublisher.JDBC_PUBLISHER_FINAL_TABLE_NAME, table);
    AvroToJdbcEntryConverter converter = new AvroToJdbcEntryConverter(workUnitState);
    Map<String, JdbcType> dateColumnMapping = Maps.newHashMap();
    dateColumnMapping.put("date_of_birth", JdbcType.DATE);
    dateColumnMapping.put("last_modified", JdbcType.TIME);
    dateColumnMapping.put("created", JdbcType.TIMESTAMP);
    workUnitState.appendToListProp(AvroToJdbcEntryConverter.CONVERTER_AVRO_JDBC_DATE_FIELDS, new Gson().toJson(dateColumnMapping));
    JdbcEntrySchema actualSchema = converter.convertSchema(inputSchema, workUnitState);
    Assert.assertEquals(expected, actualSchema);
    try (DataFileReader<GenericRecord> srcDataFileReader = new DataFileReader<GenericRecord>(new File(getClass().getResource("/converter/pickfields_nested_with_union.avro").toURI()), new GenericDatumReader<GenericRecord>(inputSchema))) {
        List<JdbcEntryData> entries = new ArrayList<>();
        while (srcDataFileReader.hasNext()) {
            JdbcEntryData actualData = converter.convertRecord(actualSchema, srcDataFileReader.next(), workUnitState).iterator().next();
            entries.add(actualData);
        }
        final JsonSerializer<JdbcEntryDatum> datumSer = new JsonSerializer<JdbcEntryDatum>() {

            @Override
            public JsonElement serialize(JdbcEntryDatum datum, Type typeOfSrc, JsonSerializationContext context) {
                JsonObject jso = new JsonObject();
                if (datum.getVal() == null) {
                    jso.add(datum.getColumnName(), null);
                    return jso;
                }
                if (datum.getVal() instanceof Date) {
                    jso.addProperty(datum.getColumnName(), ((Date) datum.getVal()).getTime());
                } else if (datum.getVal() instanceof Timestamp) {
                    jso.addProperty(datum.getColumnName(), ((Timestamp) datum.getVal()).getTime());
                } else if (datum.getVal() instanceof Time) {
                    jso.addProperty(datum.getColumnName(), ((Time) datum.getVal()).getTime());
                } else {
                    jso.addProperty(datum.getColumnName(), datum.getVal().toString());
                }
                return jso;
            }
        };
        JsonSerializer<JdbcEntryData> serializer = new JsonSerializer<JdbcEntryData>() {

            @Override
            public JsonElement serialize(JdbcEntryData src, Type typeOfSrc, JsonSerializationContext context) {
                JsonArray arr = new JsonArray();
                for (JdbcEntryDatum datum : src) {
                    arr.add(datumSer.serialize(datum, datum.getClass(), context));
                }
                return arr;
            }
        };
        Gson gson = new GsonBuilder().registerTypeAdapter(JdbcEntryData.class, serializer).serializeNulls().create();
        JsonElement actualSerialized = gson.toJsonTree(entries);
        JsonElement expectedSerialized = new JsonParser().parse(new InputStreamReader(getClass().getResourceAsStream("/converter/pickfields_nested_with_union.json")));
        Assert.assertEquals(actualSerialized, expectedSerialized);
    }
    converter.close();
}
Also used: HashMap (java.util.HashMap), JdbcWriterCommands (org.apache.gobblin.writer.commands.JdbcWriterCommands), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), Schema (org.apache.avro.Schema), ArrayList (java.util.ArrayList), Gson (com.google.gson.Gson), JsonObject (com.google.gson.JsonObject), Time (java.sql.Time), JsonSerializer (com.google.gson.JsonSerializer), Timestamp (java.sql.Timestamp), DataFileReader (org.apache.avro.file.DataFileReader), GenericRecord (org.apache.avro.generic.GenericRecord), JdbcWriterCommandsFactory (org.apache.gobblin.writer.commands.JdbcWriterCommandsFactory), JsonParser (com.google.gson.JsonParser), InputStreamReader (java.io.InputStreamReader), GsonBuilder (com.google.gson.GsonBuilder), Connection (java.sql.Connection), Date (java.sql.Date), JsonArray (com.google.gson.JsonArray), DestinationType (org.apache.gobblin.writer.Destination.DestinationType), Type (java.lang.reflect.Type), State (org.apache.gobblin.configuration.State), JsonElement (com.google.gson.JsonElement), JsonSerializationContext (com.google.gson.JsonSerializationContext), File (java.io.File), Test (org.testng.annotations.Test)
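
Example 23 drains the file with an explicit hasNext()/next() loop inside try-with-resources. DataFileReader also implements Iterable, so the same loop can be written as a for-each; a sketch with a placeholder file name:

import java.io.File;
import java.io.IOException;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class ForEachRead {
    public static void main(String[] args) throws IOException {
        try (DataFileReader<GenericRecord> reader =
                new DataFileReader<>(new File("records.avro"), new GenericDatumReader<GenericRecord>())) {
            // DataFileReader implements Iterable; note iterator() returns the reader
            // itself, so the file can only be traversed once per open
            for (GenericRecord record : reader) {
                System.out.println(record);
            }
        }
    }
}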

Example 24 with DataFileReader

Use of org.apache.avro.file.DataFileReader in project parquet-mr by apache.

From class TestStringBehavior, method testReflect:

@Test
public void testReflect() throws IOException {
    Schema reflectSchema = ReflectData.get().getSchema(ReflectRecord.class);
    ReflectRecord avroRecord;
    try (DataFileReader<ReflectRecord> avro = new DataFileReader<>(avroFile, new ReflectDatumReader<>(reflectSchema))) {
        avroRecord = avro.next();
    }
    ReflectRecord parquetRecord;
    Configuration conf = new Configuration();
    conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, false);
    AvroReadSupport.setAvroDataSupplier(conf, ReflectDataSupplier.class);
    AvroReadSupport.setAvroReadSchema(conf, reflectSchema);
    try (ParquetReader<ReflectRecord> parquet = AvroParquetReader.<ReflectRecord>builder(parquetFile).withConf(conf).build()) {
        parquetRecord = parquet.read();
    }
    Assert.assertEquals("Avro default string class should be String", String.class, avroRecord.default_class.getClass());
    Assert.assertEquals("Parquet default string class should be String", String.class, parquetRecord.default_class.getClass());
    Assert.assertEquals("Avro avro.java.string=String class should be String", String.class, avroRecord.string_class.getClass());
    Assert.assertEquals("Parquet avro.java.string=String class should be String", String.class, parquetRecord.string_class.getClass());
    Assert.assertEquals("Avro stringable class should be BigDecimal", BigDecimal.class, avroRecord.stringable_class.getClass());
    Assert.assertEquals("Parquet stringable class should be BigDecimal", BigDecimal.class, parquetRecord.stringable_class.getClass());
    Assert.assertEquals("Should have the correct BigDecimal value", BIG_DECIMAL, parquetRecord.stringable_class);
    Assert.assertEquals("Avro map default string class should be String", String.class, keyClass(avroRecord.default_map));
    Assert.assertEquals("Parquet map default string class should be String", String.class, keyClass(parquetRecord.default_map));
    Assert.assertEquals("Avro map avro.java.string=String class should be String", String.class, keyClass(avroRecord.string_map));
    Assert.assertEquals("Parquet map avro.java.string=String class should be String", String.class, keyClass(parquetRecord.string_map));
    Assert.assertEquals("Avro map stringable class should be BigDecimal", BigDecimal.class, keyClass(avroRecord.stringable_map));
    Assert.assertEquals("Parquet map stringable class should be BigDecimal", BigDecimal.class, keyClass(parquetRecord.stringable_map));
}
Also used: DataFileReader (org.apache.avro.file.DataFileReader), Configuration (org.apache.hadoop.conf.Configuration), Schema (org.apache.avro.Schema), AvroSchema (org.apache.avro.reflect.AvroSchema), Test (org.junit.Test)
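
ReflectDatumReader, used here and in Example 25, maps each record onto a plain Java class via reflection instead of a GenericRecord. A minimal sketch with a hypothetical POJO (the write side is omitted; users.avro is assumed to exist with a compatible schema):

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.reflect.ReflectData;
import org.apache.avro.reflect.ReflectDatumReader;

public class ReflectRead {
    // Hypothetical record class; ReflectData derives an Avro schema from its fields
    public static class User {
        public String name;
        public int favoriteNumber;
    }

    public static void main(String[] args) throws IOException {
        Schema schema = ReflectData.get().getSchema(User.class);
        try (DataFileReader<User> reader =
                new DataFileReader<>(new File("users.avro"), new ReflectDatumReader<>(schema))) {
            while (reader.hasNext()) {
                User user = reader.next();
                System.out.println(user.name + ": " + user.favoriteNumber);
            }
        }
    }
}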

Example 25 with DataFileReader

Use of org.apache.avro.file.DataFileReader in project parquet-mr by apache.

From class TestStringBehavior, method testReflectJavaClass:

@Test
public void testReflectJavaClass() throws IOException {
    Schema reflectSchema = ReflectData.get().getSchema(ReflectRecordJavaClass.class);
    System.err.println("Schema: " + reflectSchema.toString(true));
    ReflectRecordJavaClass avroRecord;
    try (DataFileReader<ReflectRecordJavaClass> avro = new DataFileReader<>(avroFile, new ReflectDatumReader<>(reflectSchema))) {
        avroRecord = avro.next();
    }
    ReflectRecordJavaClass parquetRecord;
    Configuration conf = new Configuration();
    conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, false);
    AvroReadSupport.setAvroDataSupplier(conf, ReflectDataSupplier.class);
    AvroReadSupport.setAvroReadSchema(conf, reflectSchema);
    AvroReadSupport.setRequestedProjection(conf, reflectSchema);
    try (ParquetReader<ReflectRecordJavaClass> parquet = AvroParquetReader.<ReflectRecordJavaClass>builder(parquetFile).withConf(conf).build()) {
        parquetRecord = parquet.read();
    }
    // Avro uses String even if CharSequence is set
    Assert.assertEquals("Avro default string class should be String", String.class, avroRecord.default_class.getClass());
    Assert.assertEquals("Parquet default string class should be String", String.class, parquetRecord.default_class.getClass());
    Assert.assertEquals("Avro stringable class should be BigDecimal", BigDecimal.class, avroRecord.stringable_class.getClass());
    Assert.assertEquals("Parquet stringable class should be BigDecimal", BigDecimal.class, parquetRecord.stringable_class.getClass());
    Assert.assertEquals("Should have the correct BigDecimal value", BIG_DECIMAL, parquetRecord.stringable_class);
}
Also used: DataFileReader (org.apache.avro.file.DataFileReader), Configuration (org.apache.hadoop.conf.Configuration), Schema (org.apache.avro.Schema), AvroSchema (org.apache.avro.reflect.AvroSchema), Test (org.junit.Test)
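
The string-class assertions in Examples 24 and 25 revolve around the avro.java.string schema property: without it, generic readers decode Avro strings as org.apache.avro.util.Utf8, and with it set to String they return java.lang.String. A brief sketch of how that property gets onto a schema, using GenericData.setStringType:

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;

public class StringTypeDemo {
    public static void main(String[] args) {
        Schema stringSchema = Schema.create(Schema.Type.STRING);
        // Mark the schema so generic readers decode this string as java.lang.String
        // rather than the default Utf8
        GenericData.setStringType(stringSchema, GenericData.StringType.String);
        // The schema JSON now carries "avro.java.string":"String"
        System.out.println(stringSchema.toString());
    }
}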

Aggregations

DataFileReader (org.apache.avro.file.DataFileReader): 46
GenericRecord (org.apache.avro.generic.GenericRecord): 28
File (java.io.File): 26
GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 21
Schema (org.apache.avro.Schema): 20
Test (org.junit.Test): 10
ArrayList (java.util.ArrayList): 9
IOException (java.io.IOException): 8
Test (org.testng.annotations.Test): 7
SeekableInput (org.apache.avro.file.SeekableInput): 6
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 6
Configuration (org.apache.hadoop.conf.Configuration): 6
ReflectDatumReader (org.apache.avro.reflect.ReflectDatumReader): 5
SeekableByteArrayInput (org.apache.avro.file.SeekableByteArrayInput): 4
FsInput (org.apache.avro.mapred.FsInput): 4
SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader): 4
Utf8 (org.apache.avro.util.Utf8): 4
JsonObject (com.google.gson.JsonObject): 2
AvroDag (edu.snu.mist.formats.avro.AvroDag): 2
Date (java.sql.Date): 2
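
The aggregation list also shows DataFileReader paired with SeekableInput implementations other than java.io.File: SeekableByteArrayInput for Avro data already in memory, and FsInput for files on a Hadoop filesystem. Two sketches (the byte array and the HDFS path are placeholders):

import java.io.IOException;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.SeekableByteArrayInput;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.FsInput;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class OtherInputs {

    // Read an Avro container file already held in a byte array
    static void readFromBytes(byte[] avroBytes) throws IOException {
        try (DataFileReader<GenericRecord> reader = new DataFileReader<>(
                new SeekableByteArrayInput(avroBytes), new GenericDatumReader<GenericRecord>())) {
            while (reader.hasNext()) {
                System.out.println(reader.next());
            }
        }
    }

    // Read from HDFS: FsInput adapts a Hadoop Path to Avro's SeekableInput
    static void readFromHdfs() throws IOException {
        try (DataFileReader<GenericRecord> reader = new DataFileReader<>(
                new FsInput(new Path("hdfs:///data/records.avro"), new Configuration()),
                new GenericDatumReader<GenericRecord>())) {
            while (reader.hasNext()) {
                System.out.println(reader.next());
            }
        }
    }
}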