Use of org.apache.avro.file.DataFileReader in project incubator-gobblin by apache.
From the class EnvelopePayloadConverterTest, method testConverter:
@Test
public void testConverter() throws IOException, DataConversionException, SchemaRegistryException {
  Schema inputSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/envelope.avsc"));
  GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>(inputSchema);
  File tmp = File.createTempFile(getClass().getSimpleName(), null);
  FileUtils.copyInputStreamToFile(getClass().getResourceAsStream("/converter/envelope.avro"), tmp);
  DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(tmp, datumReader);
  GenericRecord inputRecord = dataFileReader.next();
  Schema latestPayloadSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/record.avsc"));
  when(mockRegistry.getLatestSchemaByTopic(any())).thenReturn(latestPayloadSchema);
  when(mockRegistry.getSchemaByKey(any())).thenReturn(inputSchema.getField("nestedRecord").schema());
  WorkUnitState workUnitState = new WorkUnitState();
  workUnitState.setProp(BaseEnvelopeSchemaConverter.PAYLOAD_SCHEMA_TOPIC, "test");
  workUnitState.setProp(BaseEnvelopeSchemaConverter.PAYLOAD_SCHEMA_ID_FIELD, "metadata.payloadSchemaId");
  workUnitState.setProp(BaseEnvelopeSchemaConverter.KAFKA_REGISTRY_FACTORY, MockKafkaAvroSchemaRegistryFactory.class.getName());
  EnvelopePayloadConverter converter = new EnvelopePayloadConverter();
  converter.init(workUnitState);
  Schema outputSchema = converter.convertSchema(inputSchema, workUnitState);
  List<GenericRecord> outputRecords = new ArrayList<>();
  Iterables.addAll(outputRecords, converter.convertRecord(outputSchema, inputRecord, workUnitState));
  Assert.assertTrue(outputRecords.size() == 1);
  GenericRecord outputRecord = outputRecords.get(0);
  GenericRecord payload = (GenericRecord) outputRecord.get("payload");
  // While making the test envelope Avro record, its nestedRecord was intentionally set to the deserialized payload
  GenericRecord expectedPayload = (GenericRecord) outputRecord.get("nestedRecord");
  Schema payloadSchema = payload.getSchema();
  Schema expectedPayloadSchema = expectedPayload.getSchema();
  // The expected payload schema has the same number of fields as the payload schema, but in a different order
  Assert.assertTrue(expectedPayloadSchema.getName().equals(payloadSchema.getName()));
  Assert.assertTrue(expectedPayloadSchema.getNamespace().equals(payloadSchema.getNamespace()));
  Assert.assertTrue(expectedPayloadSchema.getFields().size() == payloadSchema.getFields().size());
  for (Schema.Field field : payload.getSchema().getFields()) {
    Assert.assertTrue(expectedPayload.get(field.name()).equals(payload.get(field.name())));
  }
}
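For reference, the test above opens the Avro container file with DataFileReader and reads only the first record, without closing the reader. Below is a minimal sketch, not taken from the test, of the more typical pattern: try-with-resources plus iteration over all records (the schema and file paths are placeholders):

// Minimal sketch; the /tmp paths are hypothetical, not test resources.
Schema schema = new Schema.Parser().parse(new File("/tmp/data.avsc"));
GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>(schema);
try (DataFileReader<GenericRecord> reader = new DataFileReader<>(new File("/tmp/data.avro"), datumReader)) {
  for (GenericRecord record : reader) { // DataFileReader is Iterable over the records in the file
    System.out.println(record);
  }
}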
Use of org.apache.avro.file.DataFileReader in project incubator-gobblin by apache.
From the class EnvelopePayloadExtractingConverterTest, method testConverter:
@Test
public void testConverter() throws Exception {
  Schema inputSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/envelope.avsc"));
  GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>(inputSchema);
  File tmp = File.createTempFile(getClass().getSimpleName(), null);
  FileUtils.copyInputStreamToFile(getClass().getResourceAsStream("/converter/envelope.avro"), tmp);
  DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(tmp, datumReader);
  GenericRecord inputRecord = dataFileReader.next();
  Schema latestPayloadSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/record.avsc"));
  when(mockRegistry.getLatestSchemaByTopic(any())).thenReturn(latestPayloadSchema);
  when(mockRegistry.getSchemaByKey(any())).thenReturn(inputSchema.getField("nestedRecord").schema());
  WorkUnitState workUnitState = new WorkUnitState();
  workUnitState.setProp(BaseEnvelopeSchemaConverter.PAYLOAD_SCHEMA_TOPIC, "test");
  workUnitState.setProp(BaseEnvelopeSchemaConverter.PAYLOAD_SCHEMA_ID_FIELD, "metadata.payloadSchemaId");
  workUnitState.setProp(BaseEnvelopeSchemaConverter.KAFKA_REGISTRY_FACTORY, EnvelopePayloadExtractingConverterTest.MockKafkaAvroSchemaRegistryFactory.class.getName());
  EnvelopePayloadExtractingConverter converter = new EnvelopePayloadExtractingConverter();
  converter.init(workUnitState);
  Schema outputSchema = converter.convertSchema(inputSchema, workUnitState);
  Assert.assertTrue(outputSchema.equals(latestPayloadSchema));
  List<GenericRecord> outputRecords = new ArrayList<>();
  Iterables.addAll(outputRecords, converter.convertRecord(outputSchema, inputRecord, workUnitState));
  Assert.assertTrue(outputRecords.size() == 1);
  GenericRecord payload = outputRecords.get(0);
  // While making the test envelope Avro input record, its nestedRecord was intentionally set to the deserialized payload
  GenericRecord expectedPayload = (GenericRecord) inputRecord.get("nestedRecord");
  Schema payloadSchema = payload.getSchema();
  Schema expectedPayloadSchema = expectedPayload.getSchema();
  // The expected payload schema has the same number of fields as the payload schema, but in a different order
  Assert.assertTrue(expectedPayloadSchema.getName().equals(payloadSchema.getName()));
  Assert.assertTrue(expectedPayloadSchema.getNamespace().equals(payloadSchema.getNamespace()));
  Assert.assertTrue(expectedPayloadSchema.getFields().size() == payloadSchema.getFields().size());
  for (Schema.Field field : payload.getSchema().getFields()) {
    Assert.assertTrue(expectedPayload.get(field.name()).equals(payload.get(field.name())));
  }
}
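The fixture /converter/envelope.avro consumed by both converter tests is an ordinary Avro container file. As a minimal sketch of how such a file can be produced with DataFileWriter, the write-side counterpart of DataFileReader (the inline schema and output path below are stand-ins, not the project's envelope schema):

// Minimal sketch; the schema and the /tmp path are placeholders.
Schema schema = new Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"Example\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}");
GenericRecord record = new GenericData.Record(schema);
record.put("id", 1L);
try (DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
  writer.create(schema, new File("/tmp/example.avro")); // writes the container-file header, then records
  writer.append(record);
}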
Use of org.apache.avro.file.DataFileReader in project incubator-gobblin by apache.
From the class AvroToJdbcEntryConverterTest, method testFlattening:
@Test
public void testFlattening() throws IOException, SchemaConversionException, SQLException, URISyntaxException, DataConversionException {
  final String db = "db";
  final String table = "users";
  Map<String, JdbcType> dateColumns = new HashMap<>();
  dateColumns.put("date_of_birth", JdbcType.DATE);
  dateColumns.put("last_modified", JdbcType.TIME);
  dateColumns.put("created", JdbcType.TIMESTAMP);
  JdbcWriterCommands mockWriterCommands = mock(JdbcWriterCommands.class);
  when(mockWriterCommands.retrieveDateColumns(db, table)).thenReturn(dateColumns);
  JdbcWriterCommandsFactory factory = mock(JdbcWriterCommandsFactory.class);
  when(factory.newInstance(any(State.class), any(Connection.class))).thenReturn(mockWriterCommands);
  List<JdbcEntryMetaDatum> jdbcEntryMetaData = new ArrayList<>();
  jdbcEntryMetaData.add(new JdbcEntryMetaDatum("name", JdbcType.VARCHAR));
  jdbcEntryMetaData.add(new JdbcEntryMetaDatum("favorite_number", JdbcType.VARCHAR));
  jdbcEntryMetaData.add(new JdbcEntryMetaDatum("favorite_color", JdbcType.VARCHAR));
  jdbcEntryMetaData.add(new JdbcEntryMetaDatum("date_of_birth", JdbcType.DATE));
  jdbcEntryMetaData.add(new JdbcEntryMetaDatum("last_modified", JdbcType.TIME));
  jdbcEntryMetaData.add(new JdbcEntryMetaDatum("created", JdbcType.TIMESTAMP));
  jdbcEntryMetaData.add(new JdbcEntryMetaDatum("nested1_nested1_string", JdbcType.VARCHAR));
  jdbcEntryMetaData.add(new JdbcEntryMetaDatum("nested1_nested1_int", JdbcType.INTEGER));
  jdbcEntryMetaData.add(new JdbcEntryMetaDatum("nested1_nested2_union_nested2_string", JdbcType.VARCHAR));
  jdbcEntryMetaData.add(new JdbcEntryMetaDatum("nested1_nested2_union_nested2_int", JdbcType.INTEGER));
  JdbcEntrySchema expected = new JdbcEntrySchema(jdbcEntryMetaData);
  Schema inputSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/pickfields_nested_with_union.avsc"));
  WorkUnitState workUnitState = new WorkUnitState();
  workUnitState.appendToListProp(JdbcPublisher.JDBC_PUBLISHER_FINAL_TABLE_NAME, table);
  AvroToJdbcEntryConverter converter = new AvroToJdbcEntryConverter(workUnitState);
  Map<String, JdbcType> dateColumnMapping = Maps.newHashMap();
  dateColumnMapping.put("date_of_birth", JdbcType.DATE);
  dateColumnMapping.put("last_modified", JdbcType.TIME);
  dateColumnMapping.put("created", JdbcType.TIMESTAMP);
  workUnitState.appendToListProp(AvroToJdbcEntryConverter.CONVERTER_AVRO_JDBC_DATE_FIELDS, new Gson().toJson(dateColumnMapping));
  JdbcEntrySchema actualSchema = converter.convertSchema(inputSchema, workUnitState);
  Assert.assertEquals(expected, actualSchema);
  try (DataFileReader<GenericRecord> srcDataFileReader = new DataFileReader<GenericRecord>(
      new File(getClass().getResource("/converter/pickfields_nested_with_union.avro").toURI()),
      new GenericDatumReader<GenericRecord>(inputSchema))) {
    List<JdbcEntryData> entries = new ArrayList<>();
    while (srcDataFileReader.hasNext()) {
      JdbcEntryData actualData = converter.convertRecord(actualSchema, srcDataFileReader.next(), workUnitState).iterator().next();
      entries.add(actualData);
    }
    final JsonSerializer<JdbcEntryDatum> datumSer = new JsonSerializer<JdbcEntryDatum>() {
      @Override
      public JsonElement serialize(JdbcEntryDatum datum, Type typeOfSrc, JsonSerializationContext context) {
        JsonObject jso = new JsonObject();
        if (datum.getVal() == null) {
          jso.add(datum.getColumnName(), null);
          return jso;
        }
        if (datum.getVal() instanceof Date) {
          jso.addProperty(datum.getColumnName(), ((Date) datum.getVal()).getTime());
        } else if (datum.getVal() instanceof Timestamp) {
          jso.addProperty(datum.getColumnName(), ((Timestamp) datum.getVal()).getTime());
        } else if (datum.getVal() instanceof Time) {
          jso.addProperty(datum.getColumnName(), ((Time) datum.getVal()).getTime());
        } else {
          jso.addProperty(datum.getColumnName(), datum.getVal().toString());
        }
        return jso;
      }
    };
    JsonSerializer<JdbcEntryData> serializer = new JsonSerializer<JdbcEntryData>() {
      @Override
      public JsonElement serialize(JdbcEntryData src, Type typeOfSrc, JsonSerializationContext context) {
        JsonArray arr = new JsonArray();
        for (JdbcEntryDatum datum : src) {
          arr.add(datumSer.serialize(datum, datum.getClass(), context));
        }
        return arr;
      }
    };
    Gson gson = new GsonBuilder().registerTypeAdapter(JdbcEntryData.class, serializer).serializeNulls().create();
    JsonElement actualSerialized = gson.toJsonTree(entries);
    JsonElement expectedSerialized = new JsonParser().parse(new InputStreamReader(getClass().getResourceAsStream("/converter/pickfields_nested_with_union.json")));
    Assert.assertEquals(actualSerialized, expectedSerialized);
  }
  converter.close();
}
Use of org.apache.avro.file.DataFileReader in project parquet-mr by apache.
From the class TestStringBehavior, method testReflect:
@Test
public void testReflect() throws IOException {
  Schema reflectSchema = ReflectData.get().getSchema(ReflectRecord.class);
  ReflectRecord avroRecord;
  try (DataFileReader<ReflectRecord> avro = new DataFileReader<>(avroFile, new ReflectDatumReader<>(reflectSchema))) {
    avroRecord = avro.next();
  }
  ReflectRecord parquetRecord;
  Configuration conf = new Configuration();
  conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, false);
  AvroReadSupport.setAvroDataSupplier(conf, ReflectDataSupplier.class);
  AvroReadSupport.setAvroReadSchema(conf, reflectSchema);
  try (ParquetReader<ReflectRecord> parquet = AvroParquetReader.<ReflectRecord>builder(parquetFile).withConf(conf).build()) {
    parquetRecord = parquet.read();
  }
  Assert.assertEquals("Avro default string class should be String", String.class, avroRecord.default_class.getClass());
  Assert.assertEquals("Parquet default string class should be String", String.class, parquetRecord.default_class.getClass());
  Assert.assertEquals("Avro avro.java.string=String class should be String", String.class, avroRecord.string_class.getClass());
  Assert.assertEquals("Parquet avro.java.string=String class should be String", String.class, parquetRecord.string_class.getClass());
  Assert.assertEquals("Avro stringable class should be BigDecimal", BigDecimal.class, avroRecord.stringable_class.getClass());
  Assert.assertEquals("Parquet stringable class should be BigDecimal", BigDecimal.class, parquetRecord.stringable_class.getClass());
  Assert.assertEquals("Should have the correct BigDecimal value", BIG_DECIMAL, parquetRecord.stringable_class);
  Assert.assertEquals("Avro map default string class should be String", String.class, keyClass(avroRecord.default_map));
  Assert.assertEquals("Parquet map default string class should be String", String.class, keyClass(parquetRecord.default_map));
  Assert.assertEquals("Avro map avro.java.string=String class should be String", String.class, keyClass(avroRecord.string_map));
  Assert.assertEquals("Parquet map avro.java.string=String class should be String", String.class, keyClass(parquetRecord.string_map));
  Assert.assertEquals("Avro map stringable class should be BigDecimal", BigDecimal.class, keyClass(avroRecord.stringable_map));
  Assert.assertEquals("Parquet map stringable class should be BigDecimal", BigDecimal.class, keyClass(parquetRecord.stringable_map));
}
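The reflect-based read above boils down to pairing DataFileReader with a ReflectDatumReader built from ReflectData. A minimal sketch of that pattern on its own, assuming a hypothetical POJO User and a local users.avro file written with the matching reflect schema (neither is part of TestStringBehavior):

// Minimal sketch; User and users.avro are hypothetical.
Schema schema = ReflectData.get().getSchema(User.class);
try (DataFileReader<User> reader =
    new DataFileReader<>(new File("users.avro"), new ReflectDatumReader<User>(schema))) {
  while (reader.hasNext()) {
    User user = reader.next();
    // process user
  }
}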
Use of org.apache.avro.file.DataFileReader in project parquet-mr by apache.
From the class TestStringBehavior, method testReflectJavaClass:
@Test
public void testReflectJavaClass() throws IOException {
  Schema reflectSchema = ReflectData.get().getSchema(ReflectRecordJavaClass.class);
  System.err.println("Schema: " + reflectSchema.toString(true));
  ReflectRecordJavaClass avroRecord;
  try (DataFileReader<ReflectRecordJavaClass> avro = new DataFileReader<>(avroFile, new ReflectDatumReader<>(reflectSchema))) {
    avroRecord = avro.next();
  }
  ReflectRecordJavaClass parquetRecord;
  Configuration conf = new Configuration();
  conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, false);
  AvroReadSupport.setAvroDataSupplier(conf, ReflectDataSupplier.class);
  AvroReadSupport.setAvroReadSchema(conf, reflectSchema);
  AvroReadSupport.setRequestedProjection(conf, reflectSchema);
  try (ParquetReader<ReflectRecordJavaClass> parquet = AvroParquetReader.<ReflectRecordJavaClass>builder(parquetFile).withConf(conf).build()) {
    parquetRecord = parquet.read();
  }
  // Avro uses String even if CharSequence is set
  Assert.assertEquals("Avro default string class should be String", String.class, avroRecord.default_class.getClass());
  Assert.assertEquals("Parquet default string class should be String", String.class, parquetRecord.default_class.getClass());
  Assert.assertEquals("Avro stringable class should be BigDecimal", BigDecimal.class, avroRecord.stringable_class.getClass());
  Assert.assertEquals("Parquet stringable class should be BigDecimal", BigDecimal.class, parquetRecord.stringable_class.getClass());
  Assert.assertEquals("Should have the correct BigDecimal value", BIG_DECIMAL, parquetRecord.stringable_class);
}