Use of org.apache.avro.generic.GenericDatumReader in project beam by apache.
Class AvroIOTest, method testAvroIOCompressedWriteAndReadASingleFile.
@Test
@SuppressWarnings("unchecked")
@Category(NeedsRunner.class)
public void testAvroIOCompressedWriteAndReadASingleFile() throws Throwable {
    List<GenericClass> values =
        ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar"));
    File outputFile = tmpFolder.newFile("output.avro");
    // Write a single deflate-compressed shard.
    p.apply(Create.of(values))
        .apply(AvroIO.write(GenericClass.class)
            .to(outputFile.getAbsolutePath())
            .withoutSharding()
            .withCodec(CodecFactory.deflateCodec(9)));
    p.run();
    // Read the file back and assert on its contents.
    PCollection<GenericClass> input =
        p.apply(AvroIO.read(GenericClass.class).from(outputFile.getAbsolutePath()));
    PAssert.that(input).containsInAnyOrder(values);
    p.run();
    // Inspect the container file directly to confirm the codec that was used.
    DataFileStream dataFileStream =
        new DataFileStream(new FileInputStream(outputFile), new GenericDatumReader());
    assertEquals("deflate", dataFileStream.getMetaString("avro.codec"));
}
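The final assertion opens the written file with the plain Avro API rather than through Beam. A minimal standalone sketch of that inspection step, assuming a local file named output.avro (the name is a placeholder, not part of the test), could look like this:

    try (DataFileStream<GenericRecord> stream =
            new DataFileStream<>(new FileInputStream(new File("output.avro")),
                new GenericDatumReader<GenericRecord>())) {
        // The container header carries the codec and the writer schema.
        System.out.println("codec:  " + stream.getMetaString("avro.codec"));
        System.out.println("schema: " + stream.getSchema());
        while (stream.hasNext()) {
            System.out.println(stream.next());
        }
    }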
Use of org.apache.avro.generic.GenericDatumReader in project beam by apache.
Class AvroPipelineTest, method readGenericFile.
private List<GenericRecord> readGenericFile() throws IOException {
    List<GenericRecord> records = Lists.newArrayList();
    GenericDatumReader<GenericRecord> genericDatumReader = new GenericDatumReader<>();
    try (DataFileReader<GenericRecord> dataFileReader =
        new DataFileReader<>(new File(outputDir + "-00000-of-00001"), genericDatumReader)) {
        for (GenericRecord record : dataFileReader) {
            records.add(record);
        }
    }
    return records;
}
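For context, here is a sketch of the writer side that would produce a single shard like the one readGenericFile consumes, using the plain Avro file API rather than the Beam pipeline from the test; the record schema is a made-up example, not the schema the test actually writes:

    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"Example\","
            + "\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}");
    GenericRecord rec = new GenericData.Record(schema);
    rec.put("id", 1L);
    try (DataFileWriter<GenericRecord> writer =
            new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
        // DataFileWriter embeds the schema in the file header, which is what lets
        // the no-argument GenericDatumReader above work without an explicit schema.
        writer.create(schema, new File(outputDir + "-00000-of-00001"));
        writer.append(rec);
    }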
Use of org.apache.avro.generic.GenericDatumReader in project voldemort by voldemort.
Class ClientConfigUtil, method readSingleClientConfigAvro.
/**
 * Parses a string that contains a single fat client config in Avro format.
 *
 * @param configAvro Input string in Avro format containing the fat client config
 * @return Properties of the single fat client config
 */
@SuppressWarnings("unchecked")
public static Properties readSingleClientConfigAvro(String configAvro) {
    Properties props = new Properties();
    try {
        JsonDecoder decoder = new JsonDecoder(CLIENT_CONFIG_AVRO_SCHEMA, configAvro);
        GenericDatumReader<Object> datumReader =
            new GenericDatumReader<Object>(CLIENT_CONFIG_AVRO_SCHEMA);
        Map<Utf8, Utf8> flowMap = (Map<Utf8, Utf8>) datumReader.read(null, decoder);
        for (Utf8 key : flowMap.keySet()) {
            props.put(key.toString(), flowMap.get(key).toString());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return props;
}
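A sketch of the decode step in isolation, using the same Avro API vintage as the method above and assuming the client config schema is a plain Avro map of strings (CLIENT_CONFIG_AVRO_SCHEMA is defined elsewhere in ClientConfigUtil and may differ); the property names in the sample JSON are illustrative only:

    Schema mapSchema = Schema.parse("{\"type\": \"map\", \"values\": \"string\"}");
    String configAvro = "{\"connection_timeout_ms\": \"1000\", \"socket_timeout_ms\": \"5000\"}";
    JsonDecoder decoder = new JsonDecoder(mapSchema, configAvro);
    GenericDatumReader<Object> datumReader = new GenericDatumReader<Object>(mapSchema);
    @SuppressWarnings("unchecked")
    Map<Utf8, Utf8> decoded = (Map<Utf8, Utf8>) datumReader.read(null, decoder);
    // Keys and values come back as Utf8 and need toString() before going into Properties.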
Use of org.apache.avro.generic.GenericDatumReader in project flink by apache.
Class AvroRecordInputFormatTest, method testDeserializeToGenericType.
/**
 * Test that the Flink serialization stack can properly process GenericData.Record types.
 * Usually, users of Avro generate classes (POJOs) from Avro schemas.
 * However, if generated classes are not available, one can also use GenericData.Record,
 * an untyped key-value record that uses a schema to validate the correctness of the data.
 *
 * Using GenericData.Record with Flink is not recommended; use generated POJOs instead.
 */
@Test
public void testDeserializeToGenericType() throws IOException {
    DatumReader<GenericData.Record> datumReader = new GenericDatumReader<>(userSchema);
    try (FileReader<GenericData.Record> dataFileReader = DataFileReader.openReader(testFile, datumReader)) {
        // Initialize the record by reading it from disk (that's easier than creating it by hand).
        GenericData.Record rec = new GenericData.Record(userSchema);
        dataFileReader.next(rec);
        // Check that the record has been read correctly.
        assertNotNull(rec);
        assertEquals("name not equal", TEST_NAME, rec.get("name").toString());
        assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), rec.get("type_enum").toString());
        // The field is null for the first record.
        assertEquals(null, rec.get("type_long_test"));
        // Now serialize it with our framework:
        TypeInformation<GenericData.Record> te = TypeExtractor.createTypeInfo(GenericData.Record.class);
        ExecutionConfig ec = new ExecutionConfig();
        Assert.assertEquals(GenericTypeInfo.class, te.getClass());
        Serializers.recursivelyRegisterType(te.getTypeClass(), ec, new HashSet<Class<?>>());
        TypeSerializer<GenericData.Record> tser = te.createSerializer(ec);
        Assert.assertEquals(1, ec.getDefaultKryoSerializerClasses().size());
        Assert.assertTrue(
            ec.getDefaultKryoSerializerClasses().containsKey(Schema.class)
                && ec.getDefaultKryoSerializerClasses().get(Schema.class).equals(Serializers.AvroSchemaSerializer.class));
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (DataOutputViewStreamWrapper outView = new DataOutputViewStreamWrapper(out)) {
            tser.serialize(rec, outView);
        }
        GenericData.Record newRec;
        try (DataInputViewStreamWrapper inView =
            new DataInputViewStreamWrapper(new ByteArrayInputStream(out.toByteArray()))) {
            newRec = tser.deserialize(inView);
        }
        // Check that it is still the same.
        assertNotNull(newRec);
        assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), newRec.get("type_enum").toString());
        assertEquals("name not equal", TEST_NAME, newRec.get("name").toString());
        assertEquals(null, newRec.get("type_long_test"));
    }
}
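The comment above notes that reading the record from disk is easier than building it by hand. For completeness, here is a sketch of the hand-built alternative, using a made-up two-field schema rather than the test's userSchema:

    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
            + "{\"name\":\"name\",\"type\":\"string\"},"
            + "{\"name\":\"type_long_test\",\"type\":[\"null\",\"long\"],\"default\":null}]}");
    GenericData.Record rec = new GenericData.Record(schema);
    rec.put("name", TEST_NAME);
    rec.put("type_long_test", null);  // nullable field, null as in the first record above
    // GenericData.get().validate(schema, rec) can confirm the record matches the schema.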
Use of org.apache.avro.generic.GenericDatumReader in project hive by apache.
Class TestThatEvolvedSchemasActAsWeWant, method resolvedSchemasShouldReturnReaderSchema.
@Test
public void resolvedSchemasShouldReturnReaderSchema() throws IOException {
    // Need to verify that when reading a datum with an updated reader schema
    // that the datum then returns the reader schema as its own, since we
    // depend on this behavior in order to avoid re-encoding the datum
    // in the serde.
    String v0 = "{\n"
        + " \"namespace\": \"org.apache.hadoop.hive\",\n"
        + " \"name\": \"SomeStuff\",\n"
        + " \"type\": \"record\",\n"
        + " \"fields\": [\n"
        + " {\n"
        + " \"name\":\"v0\",\n"
        + " \"type\":\"string\"\n"
        + " }\n"
        + " ]\n"
        + "}";
    String v1 = "{\n"
        + " \"namespace\": \"org.apache.hadoop.hive\",\n"
        + " \"name\": \"SomeStuff\",\n"
        + " \"type\": \"record\",\n"
        + " \"fields\": [\n"
        + " {\n"
        + " \"name\":\"v0\",\n"
        + " \"type\":\"string\"\n"
        + " },\n"
        + " {\n"
        + " \"name\":\"v1\",\n"
        + " \"type\":\"string\",\n"
        + " \"default\":\"v1_default\""
        + " }\n"
        + " ]\n"
        + "}";
    Schema[] schemas = { AvroSerdeUtils.getSchemaFor(v0), AvroSerdeUtils.getSchemaFor(v1) };
    // Encode a record with v0, write out.
    GenericRecord record = new GenericData.Record(schemas[0]);
    record.put("v0", "v0 value");
    assertTrue(GenericData.get().validate(schemas[0], record));
    // Write datum out to a stream
    GenericDatumWriter<GenericRecord> gdw = new GenericDatumWriter<GenericRecord>(schemas[0]);
    DataFileWriter<GenericRecord> dfw = new DataFileWriter<GenericRecord>(gdw);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    dfw.create(schemas[0], baos);
    dfw.append(record);
    dfw.close();
    // Read it back with the evolved (reader) schema set as the expected schema.
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>();
    gdr.setExpected(schemas[1]);
    DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, gdr);
    assertTrue(dfs.hasNext());
    GenericRecord next = dfs.next();
    assertEquals("v0 value", next.get("v0").toString());
    assertEquals("v1_default", next.get("v1").toString());
    // Now the most important check - when we query this record for its schema,
    // we should get back the latest, reader schema:
    assertEquals(schemas[1], next.getSchema());
}
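For comparison, here is a sketch of the same reader-schema resolution without the container-file layer, passing the writer and reader schemas to the two-argument GenericDatumReader constructor instead of calling setExpected(); it reuses the record and schemas variables from the test above:

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
    new GenericDatumWriter<GenericRecord>(schemas[0]).write(record, encoder);
    encoder.flush();
    GenericDatumReader<GenericRecord> resolvingReader =
        new GenericDatumReader<GenericRecord>(schemas[0], schemas[1]);
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(out.toByteArray(), null);
    GenericRecord resolved = resolvingReader.read(null, decoder);
    // resolved.getSchema() is schemas[1]; the new "v1" field gets its declared default value.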