Search in sources :

Example 61 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project beam by apache.

the class AvroPipelineTest method populateGenericFile.

private void populateGenericFile(List<GenericRecord> genericRecords, Schema schema) throws IOException {
    FileOutputStream outputStream = new FileOutputStream(this.inputFile);
    GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<>(schema);
    try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(genericDatumWriter)) {
        dataFileWriter.create(schema, outputStream);
        for (GenericRecord record : genericRecords) {
            dataFileWriter.append(record);
        }
    }
    outputStream.close();
}
Also used : FileOutputStream(java.io.FileOutputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 62 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project beam by apache.

the class BigQueryIOWriteTest method testWriteAvroWithCustomWriter.

@Test
public void testWriteAvroWithCustomWriter() throws Exception {
    if (useStorageApi || useStreaming) {
        return;
    }
    SerializableFunction<AvroWriteRequest<InputRecord>, GenericRecord> formatFunction = r -> {
        GenericRecord rec = new GenericData.Record(r.getSchema());
        InputRecord i = r.getElement();
        rec.put("strVal", i.strVal());
        rec.put("longVal", i.longVal());
        rec.put("doubleVal", i.doubleVal());
        rec.put("instantVal", i.instantVal().getMillis() * 1000);
        return rec;
    };
    SerializableFunction<org.apache.avro.Schema, DatumWriter<GenericRecord>> customWriterFactory = s -> new GenericDatumWriter<GenericRecord>() {

        @Override
        protected void writeString(org.apache.avro.Schema schema, Object datum, Encoder out) throws IOException {
            super.writeString(schema, datum.toString() + "_custom", out);
        }
    };
    p.apply(Create.of(InputRecord.create("test", 1, 1.0, Instant.parse("2019-01-01T00:00:00Z")), InputRecord.create("test2", 2, 2.0, Instant.parse("2019-02-01T00:00:00Z"))).withCoder(INPUT_RECORD_CODER)).apply(BigQueryIO.<InputRecord>write().to("dataset-id.table-id").withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED).withSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("strVal").setType("STRING"), new TableFieldSchema().setName("longVal").setType("INTEGER"), new TableFieldSchema().setName("doubleVal").setType("FLOAT"), new TableFieldSchema().setName("instantVal").setType("TIMESTAMP")))).withTestServices(fakeBqServices).withAvroWriter(formatFunction, customWriterFactory).withoutValidation());
    p.run();
    assertThat(fakeDatasetService.getAllRows("project-id", "dataset-id", "table-id"), containsInAnyOrder(new TableRow().set("strVal", "test_custom").set("longVal", "1").set("doubleVal", 1.0D).set("instantVal", "2019-01-01 00:00:00 UTC"), new TableRow().set("strVal", "test2_custom").set("longVal", "2").set("doubleVal", 2.0D).set("instantVal", "2019-02-01 00:00:00 UTC")));
}
Also used : ExpectedLogs(org.apache.beam.sdk.testing.ExpectedLogs) SerializableCoder(org.apache.beam.sdk.coders.SerializableCoder) ValueInSingleWindow(org.apache.beam.sdk.values.ValueInSingleWindow) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) Encoder(org.apache.avro.io.Encoder) ResultCoder(org.apache.beam.sdk.io.gcp.bigquery.WritePartition.ResultCoder) Matcher(java.util.regex.Matcher) DoFnTester(org.apache.beam.sdk.transforms.DoFnTester) Create(org.apache.beam.sdk.transforms.Create) Map(java.util.Map) Window(org.apache.beam.sdk.transforms.windowing.Window) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) FakeBigQueryServices(org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices) EnumSet(java.util.EnumSet) ValueProvider(org.apache.beam.sdk.options.ValueProvider) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) KvCoder(org.apache.beam.sdk.coders.KvCoder) Matchers.allOf(org.hamcrest.Matchers.allOf) Set(java.util.Set) WindowFn(org.apache.beam.sdk.transforms.windowing.WindowFn) FieldType(org.apache.beam.sdk.schemas.Schema.FieldType) Serializable(java.io.Serializable) IncompatibleWindowException(org.apache.beam.sdk.transforms.windowing.IncompatibleWindowException) Assert.assertFalse(org.junit.Assert.assertFalse) AutoValue(com.google.auto.value.AutoValue) TestStream(org.apache.beam.sdk.testing.TestStream) Matchers.is(org.hamcrest.Matchers.is) DisplayDataMatchers.hasDisplayItem(org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem) Write(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write) Method(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method) Preconditions.checkNotNull(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkNotNull) KV(org.apache.beam.sdk.values.KV) FakeDatasetService(org.apache.beam.sdk.io.gcp.testing.FakeDatasetService) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) View(org.apache.beam.sdk.transforms.View) ArrayList(java.util.ArrayList) GenericData(org.apache.avro.generic.GenericData) Distinct(org.apache.beam.sdk.transforms.Distinct) Multimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Multimap) TupleTag(org.apache.beam.sdk.values.TupleTag) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) Maps(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Maps) StreamSupport(java.util.stream.StreamSupport) JavaFieldSchema(org.apache.beam.sdk.schemas.JavaFieldSchema) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) Row(org.apache.beam.sdk.values.Row) Result(org.apache.beam.sdk.io.gcp.bigquery.WriteTables.Result) Before(org.junit.Before) TableReference(com.google.api.services.bigquery.model.TableReference) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Files(java.nio.file.Files) PAssert(org.apache.beam.sdk.testing.PAssert) NonMergingWindowFn(org.apache.beam.sdk.transforms.windowing.NonMergingWindowFn) Parameter(org.junit.runners.Parameterized.Parameter) Assert.assertTrue(org.junit.Assert.assertTrue) IOException(java.io.IOException) ShardedKeyCoder(org.apache.beam.sdk.coders.ShardedKeyCoder) Test(org.junit.Test) Schema(org.apache.beam.sdk.schemas.Schema) File(java.io.File) Assert.assertNull(org.junit.Assert.assertNull) Paths(java.nio.file.Paths) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) AtomicCoder(org.apache.beam.sdk.coders.AtomicCoder) DefaultSchema(org.apache.beam.sdk.schemas.annotations.DefaultSchema) FakeJobService(org.apache.beam.sdk.io.gcp.testing.FakeJobService) Assert.assertEquals(org.junit.Assert.assertEquals) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) TimePartitioning(com.google.api.services.bigquery.model.TimePartitioning) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) After(org.junit.After) TableRow(com.google.api.services.bigquery.model.TableRow) Assert.fail(org.junit.Assert.fail) TableSchema(com.google.api.services.bigquery.model.TableSchema) ArrayListMultimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ArrayListMultimap) ShardedKey(org.apache.beam.sdk.values.ShardedKey) Parameterized(org.junit.runners.Parameterized) MapElements(org.apache.beam.sdk.transforms.MapElements) DatumWriter(org.apache.avro.io.DatumWriter) Collection(java.util.Collection) GenerateSequence(org.apache.beam.sdk.io.GenerateSequence) CreateDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition) Description(org.junit.runner.Description) Collectors(java.util.stream.Collectors) List(java.util.List) Clustering(com.google.api.services.bigquery.model.Clustering) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) TableDataInsertAllResponse(com.google.api.services.bigquery.model.TableDataInsertAllResponse) Matchers.equalTo(org.hamcrest.Matchers.equalTo) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Pattern(java.util.regex.Pattern) ErrorProto(com.google.api.services.bigquery.model.ErrorProto) Statement(org.junit.runners.model.Statement) TestRule(org.junit.rules.TestRule) Parameters(org.junit.runners.Parameterized.Parameters) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) SerializableFunctions(org.apache.beam.sdk.transforms.SerializableFunctions) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) SchemaUpdateOption(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption) WindowMappingFn(org.apache.beam.sdk.transforms.windowing.WindowMappingFn) SchemaCreate(org.apache.beam.sdk.schemas.annotations.SchemaCreate) Job(com.google.api.services.bigquery.model.Job) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ExpectedException(org.junit.rules.ExpectedException) Nullable(org.checkerframework.checker.nullness.qual.Nullable) Matchers.hasEntry(org.hamcrest.Matchers.hasEntry) OutputStream(java.io.OutputStream) DisplayData(org.apache.beam.sdk.transforms.display.DisplayData) GenericRecord(org.apache.avro.generic.GenericRecord) Lists(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists) Matchers(org.hamcrest.Matchers) PCollection(org.apache.beam.sdk.values.PCollection) Table(com.google.api.services.bigquery.model.Table) Rule(org.junit.Rule) Instant(org.joda.time.Instant) Collections(java.util.Collections) JobConfigurationLoad(com.google.api.services.bigquery.model.JobConfigurationLoad) TemporaryFolder(org.junit.rules.TemporaryFolder) InputStream(java.io.InputStream) TableSchema(com.google.api.services.bigquery.model.TableSchema) JavaFieldSchema(org.apache.beam.sdk.schemas.JavaFieldSchema) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Schema(org.apache.beam.sdk.schemas.Schema) DefaultSchema(org.apache.beam.sdk.schemas.annotations.DefaultSchema) TableSchema(com.google.api.services.bigquery.model.TableSchema) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericData(org.apache.avro.generic.GenericData) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) DatumWriter(org.apache.avro.io.DatumWriter) Encoder(org.apache.avro.io.Encoder) TableRow(com.google.api.services.bigquery.model.TableRow) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)

Example 63 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project beam by apache.

the class BigQueryIOStorageReadTest method createResponse.

private static ReadRowsResponse createResponse(Schema schema, Collection<GenericRecord> genericRecords, double progressAtResponseStart, double progressAtResponseEnd) throws Exception {
    GenericDatumWriter<GenericRecord> writer = new GenericDatumWriter<>(schema);
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    Encoder binaryEncoder = ENCODER_FACTORY.binaryEncoder(outputStream, null);
    for (GenericRecord genericRecord : genericRecords) {
        writer.write(genericRecord, binaryEncoder);
    }
    binaryEncoder.flush();
    return ReadRowsResponse.newBuilder().setAvroRows(AvroRows.newBuilder().setSerializedBinaryRows(ByteString.copyFrom(outputStream.toByteArray())).setRowCount(genericRecords.size())).setRowCount(genericRecords.size()).setStats(StreamStats.newBuilder().setProgress(Progress.newBuilder().setAtResponseStart(progressAtResponseStart).setAtResponseEnd(progressAtResponseEnd))).build();
}
Also used : Encoder(org.apache.avro.io.Encoder) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ByteArrayOutputStream(java.io.ByteArrayOutputStream) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 64 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project beam by apache.

the class BigQueryIOStorageQueryTest method createResponse.

private static ReadRowsResponse createResponse(Schema schema, Collection<GenericRecord> genericRecords, double progressAtResponseStart, double progressAtResponseEnd) throws Exception {
    GenericDatumWriter<GenericRecord> writer = new GenericDatumWriter<>(schema);
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    Encoder binaryEncoder = ENCODER_FACTORY.binaryEncoder(outputStream, null);
    for (GenericRecord genericRecord : genericRecords) {
        writer.write(genericRecord, binaryEncoder);
    }
    binaryEncoder.flush();
    return ReadRowsResponse.newBuilder().setAvroRows(AvroRows.newBuilder().setSerializedBinaryRows(ByteString.copyFrom(outputStream.toByteArray())).setRowCount(genericRecords.size())).setRowCount(genericRecords.size()).setStats(StreamStats.newBuilder().setProgress(Progress.newBuilder().setAtResponseStart(progressAtResponseStart).setAtResponseEnd(progressAtResponseEnd))).build();
}
Also used : Encoder(org.apache.avro.io.Encoder) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ByteArrayOutputStream(java.io.ByteArrayOutputStream) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 65 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project haivvreo by jghoman.

the class AvroContainerOutputFormat method getHiveRecordWriter.

@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf jobConf, Path path, Class<? extends Writable> valueClass, boolean isCompressed, Properties properties, Progressable progressable) throws IOException {
    Schema schema;
    try {
        schema = HaivvreoUtils.determineSchemaOrThrowException(jobConf, properties);
    } catch (HaivvreoException e) {
        throw new IOException(e);
    }
    GenericDatumWriter<GenericRecord> gdw = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> dfw = new DataFileWriter<GenericRecord>(gdw);
    if (isCompressed) {
        int level = jobConf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String codecName = jobConf.get(OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory factory = codecName.equals(DEFLATE_CODEC) ? CodecFactory.deflateCodec(level) : CodecFactory.fromString(codecName);
        dfw.setCodec(factory);
    }
    dfw.create(schema, path.getFileSystem(jobConf).create(path));
    return new AvroGenericRecordWriter(dfw);
}
Also used : Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) IOException(java.io.IOException) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) CodecFactory(org.apache.avro.file.CodecFactory) GenericRecord(org.apache.avro.generic.GenericRecord)

Aggregations

GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter)127 GenericRecord (org.apache.avro.generic.GenericRecord)105 Schema (org.apache.avro.Schema)69 ByteArrayOutputStream (java.io.ByteArrayOutputStream)57 DataFileWriter (org.apache.avro.file.DataFileWriter)47 File (java.io.File)40 Test (org.junit.Test)37 IOException (java.io.IOException)29 BinaryEncoder (org.apache.avro.io.BinaryEncoder)29 MockFlowFile (org.apache.nifi.util.MockFlowFile)25 Encoder (org.apache.avro.io.Encoder)23 TestRunner (org.apache.nifi.util.TestRunner)20 HashMap (java.util.HashMap)14 ByteArrayOutputStream (org.apache.nifi.stream.io.ByteArrayOutputStream)14 GenericData (org.apache.avro.generic.GenericData)12 ByteArrayInputStream (java.io.ByteArrayInputStream)11 FileOutputStream (java.io.FileOutputStream)10 InputStream (java.io.InputStream)9 ArrayList (java.util.ArrayList)8 GenericDatumReader (org.apache.avro.generic.GenericDatumReader)8