Search in sources :

Example 81 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project beam by apache.

the class AvroSourceTest method testSchemaStringIsInterned.

/**
 * Reads the schema string of one generated Avro file twice, checks that the two strings are
 * distinct instances, and then checks that sources configured with either string expose the
 * very same schema object, including a source copied via {@code SerializableUtils.clone}.
 */
@Test
public void testSchemaStringIsInterned() throws Exception {
    List<Bird> records = createRandomRecords(100);
    String path = generateTestFile("tmp.avro", records, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), DataFileConstants.NULL_CODEC);
    Metadata metadata = FileSystems.matchSingleFileSpec(path);

    // Precondition: the two metadata reads hand back different String instances.
    String firstSchemaString = AvroSource.readMetadataFromFile(metadata.resourceId()).getSchemaString();
    String secondSchemaString = AvroSource.readMetadataFromFile(metadata.resourceId()).getSchemaString();
    assertNotSame(firstSchemaString, secondSchemaString);

    // Sources built from either string must report the identical schema object.
    AvroSource<GenericRecord> firstSource = AvroSource.from(path).withSchema(firstSchemaString);
    AvroSource<GenericRecord> secondSource = AvroSource.from(path).withSchema(secondSchemaString);
    assertSame(firstSource.getSchema(), secondSource.getSchema());

    // A cloned (serialized and deserialized) source must still share that schema object.
    AvroSource<GenericRecord> clonedSource = SerializableUtils.clone(secondSource);
    assertSame(firstSource.getSchema(), clonedSource.getSchema());
}
Also used : Metadata(org.apache.beam.sdk.io.fs.MatchResult.Metadata) AvroMetadata(org.apache.beam.sdk.io.AvroSource.AvroMetadata) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)

Example 82 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project beam by apache.

the class FakeJobService method writeRowsHelper.

/**
 * Writes {@code rows} as Avro generic records into one shard file.
 *
 * <p>The file name is {@code destinationPattern} with every {@code "*"} replaced by the
 * {@code shard} number, zero-padded to 12 digits.
 *
 * @param rows rows to write; each key/value entry of a row is set as a field on the record
 * @param avroSchema schema used for both the record builder and the data file writer
 * @param destinationPattern file name pattern in which {@code "*"} is substituted
 * @param shard shard number substituted into the pattern
 * @throws IllegalStateException wrapping any {@link IOException} raised while creating,
 *     writing or closing the file
 */
private void writeRowsHelper(List<TableRow> rows, Schema avroSchema, String destinationPattern, int shard) throws IOException {
    String paddedShard = String.format("%012d", shard);
    String filename = destinationPattern.replace("*", paddedShard);
    try (WritableByteChannel out = FileSystems.create(FileSystems.matchNewResource(filename, false), MimeTypes.BINARY);
        DataFileWriter<GenericRecord> avroWriter = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(avroSchema)).create(avroSchema, Channels.newOutputStream(out))) {
        for (Map<String, Object> row : rows) {
            // Copy every column of the row onto a fresh record builder.
            GenericRecordBuilder builder = new GenericRecordBuilder(avroSchema);
            for (Map.Entry<String, Object> column : row.entrySet()) {
                String fieldName = column.getKey();
                Object fieldValue = column.getValue();
                builder.set(fieldName, fieldValue);
            }
            GenericRecord converted = builder.build();
            avroWriter.append(converted);
        }
    } catch (IOException cause) {
        throw new IllegalStateException(String.format("Could not create destination for extract job %s", filename), cause);
    }
}
Also used : WritableByteChannel(java.nio.channels.WritableByteChannel) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) IOException(java.io.IOException) GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder) GenericRecord(org.apache.avro.generic.GenericRecord) Map(java.util.Map)

Example 83 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project beam by apache.

the class BigQueryAvroUtilsTest method testConvertGenericRecordToTableRow.

/**
 * Converts generic records built against the {@code Bird} Avro schema and compares each result
 * with a hand-built {@code TableRow}. Three scenarios are covered in separate scopes: a record
 * with only one field set, a record exercising the per-type conversions, and a record with a
 * repeated nested record.
 */
@Test
public void testConvertGenericRecordToTableRow() throws Exception {
    TableSchema bqSchema = new TableSchema();
    bqSchema.setFields(fields);
    Schema birdSchema = AvroCoder.of(Bird.class).getSchema();
    {
        // Nullable fields: only "number" is populated on the input record.
        GenericRecord input = new GenericData.Record(birdSchema);
        input.put("number", 5L);
        TableRow actual = BigQueryAvroUtils.convertGenericRecordToTableRow(input, bqSchema);

        TableRow expected = new TableRow();
        expected.set("number", "5");
        expected.set("associates", new ArrayList<TableRow>());
        assertEquals(expected, actual);
    }
    {
        // Type conversion for:
        // INTEGER, FLOAT, TIMESTAMP, BOOLEAN, BYTES, DATE, DATETIME, TIME.
        GenericRecord input = new GenericData.Record(birdSchema);
        byte[] chirpBytes = "chirp,chirp".getBytes();
        ByteBuffer chirpBuffer = ByteBuffer.wrap(chirpBytes);
        chirpBuffer.rewind();
        input.put("number", 5L);
        input.put("quality", 5.0);
        input.put("birthday", 5L);
        input.put("flighted", Boolean.TRUE);
        input.put("sound", chirpBuffer);
        input.put("anniversaryDate", new Utf8("2000-01-01"));
        // This field is supplied as a java.lang.String rather than a Utf8.
        input.put("anniversaryDatetime", new String("2000-01-01 00:00:00.000005"));
        input.put("anniversaryTime", new Utf8("00:00:00.000005"));
        TableRow actual = BigQueryAvroUtils.convertGenericRecordToTableRow(input, bqSchema);

        TableRow expected = new TableRow();
        expected.set("number", "5");
        expected.set("birthday", "1970-01-01 00:00:00.000005 UTC");
        expected.set("quality", 5.0);
        expected.set("associates", new ArrayList<TableRow>());
        expected.set("flighted", Boolean.TRUE);
        expected.set("sound", BaseEncoding.base64().encode(chirpBytes));
        expected.set("anniversaryDate", "2000-01-01");
        expected.set("anniversaryDatetime", "2000-01-01 00:00:00.000005");
        expected.set("anniversaryTime", "00:00:00.000005");
        assertEquals(expected, actual);
    }
    {
        // Repeated fields: "associates" holds one nested SubBird record.
        Schema nestedSchema = AvroCoder.of(Bird.SubBird.class).getSchema();
        GenericRecord associate = new GenericData.Record(nestedSchema);
        associate.put("species", "other");
        GenericRecord input = new GenericData.Record(birdSchema);
        input.put("number", 5L);
        input.put("associates", Lists.<GenericRecord>newArrayList(associate));
        TableRow actual = BigQueryAvroUtils.convertGenericRecordToTableRow(input, bqSchema);

        TableRow expectedAssociate = new TableRow().set("species", "other");
        TableRow expected = new TableRow();
        expected.set("associates", Lists.<TableRow>newArrayList(expectedAssociate));
        expected.set("number", "5");
        assertEquals(expected, actual);
    }
}
Also used : TableSchema(com.google.api.services.bigquery.model.TableSchema) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Schema(org.apache.avro.Schema) TableSchema(com.google.api.services.bigquery.model.TableSchema) TableRow(com.google.api.services.bigquery.model.TableRow) ArrayList(java.util.ArrayList) Utf8(org.apache.avro.util.Utf8) GenericRecord(org.apache.avro.generic.GenericRecord) GenericData(org.apache.avro.generic.GenericData) ByteBuffer(java.nio.ByteBuffer) Test(org.junit.Test)

Example 84 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project beam by apache.

the class AvroPipelineTest method readGenericFile.

/**
 * Reads every record from the Avro file at {@code outputDir + "-00000-of-00001"}.
 *
 * @return the records, in the order the data file reader yields them
 * @throws IOException propagated from opening or closing the data file reader
 */
private List<GenericRecord> readGenericFile() throws IOException {
    File shardFile = new File(outputDir + "-00000-of-00001");
    GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    List<GenericRecord> result = Lists.newArrayList();
    try (DataFileReader<GenericRecord> reader = new DataFileReader<>(shardFile, datumReader)) {
        // The reader is its own iterator, so drain it directly.
        while (reader.hasNext()) {
            result.add(reader.next());
        }
    }
    return result;
}
Also used : DataFileReader(org.apache.avro.file.DataFileReader) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)

Example 85 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project beam by apache.

the class AvroPipelineTest method populateGenericFile.

/**
 * Writes {@code genericRecords} to {@code this.inputFile} as an Avro data file.
 *
 * @param genericRecords records to append, in iteration order
 * @param schema schema used for the datum writer and for the data file header
 * @throws IOException if the file cannot be opened, written or closed
 */
private void populateGenericFile(List<GenericRecord> genericRecords, Schema schema) throws IOException {
    GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<>(schema);
    // Both resources live in the try-with-resources header so the file stream is released even
    // when create() or append() throws. Resources close in reverse order, so the writer is
    // closed (and flushed) before the stream; closing the stream a second time is harmless.
    try (FileOutputStream outputStream = new FileOutputStream(this.inputFile);
        DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(genericDatumWriter)) {
        dataFileWriter.create(schema, outputStream);
        for (GenericRecord record : genericRecords) {
            dataFileWriter.append(record);
        }
    }
}
Also used : FileOutputStream(java.io.FileOutputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord)

Aggregations

GenericRecord (org.apache.avro.generic.GenericRecord)262 Schema (org.apache.avro.Schema)101 Test (org.junit.Test)80 GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter)46 File (java.io.File)35 IOException (java.io.IOException)34 GenericData (org.apache.avro.generic.GenericData)30 GenericDatumReader (org.apache.avro.generic.GenericDatumReader)30 ArrayList (java.util.ArrayList)29 ByteArrayOutputStream (java.io.ByteArrayOutputStream)27 DataFileWriter (org.apache.avro.file.DataFileWriter)20 HashMap (java.util.HashMap)19 ByteBuffer (java.nio.ByteBuffer)18 BinaryEncoder (org.apache.avro.io.BinaryEncoder)17 Field (org.apache.avro.Schema.Field)14 DataFileStream (org.apache.avro.file.DataFileStream)14 GenericRecordBuilder (org.apache.avro.generic.GenericRecordBuilder)14 Utf8 (org.apache.avro.util.Utf8)14 Encoder (org.apache.avro.io.Encoder)12 DatasetRepository (com.cloudera.cdk.data.DatasetRepository)11