Search in sources :

Example 1 with DatumWriter

use of org.apache.avro.io.DatumWriter in project beam by apache.

From the class BigQueryIOWriteTest, the method testWriteAvroWithCustomWriter:

@Test
public void testWriteAvroWithCustomWriter() throws Exception {
    // This scenario only exercises batch file loads; skip under Storage API or streaming.
    if (useStorageApi || useStreaming) {
        return;
    }
    // Converts an InputRecord into a GenericRecord matching the requested Avro schema.
    SerializableFunction<AvroWriteRequest<InputRecord>, GenericRecord> formatFunction = request -> {
        InputRecord element = request.getElement();
        GenericRecord record = new GenericData.Record(request.getSchema());
        record.put("strVal", element.strVal());
        record.put("longVal", element.longVal());
        record.put("doubleVal", element.doubleVal());
        // Millis-to-micros conversion for the TIMESTAMP column.
        record.put("instantVal", element.instantVal().getMillis() * 1000);
        return record;
    };
    // Custom DatumWriter that suffixes every string value, so we can verify it was actually used.
    SerializableFunction<org.apache.avro.Schema, DatumWriter<GenericRecord>> customWriterFactory = avroSchema -> new GenericDatumWriter<GenericRecord>() {

        @Override
        protected void writeString(org.apache.avro.Schema schema, Object datum, Encoder out) throws IOException {
            super.writeString(schema, datum.toString() + "_custom", out);
        }
    };
    TableSchema tableSchema = new TableSchema().setFields(ImmutableList.of(
        new TableFieldSchema().setName("strVal").setType("STRING"),
        new TableFieldSchema().setName("longVal").setType("INTEGER"),
        new TableFieldSchema().setName("doubleVal").setType("FLOAT"),
        new TableFieldSchema().setName("instantVal").setType("TIMESTAMP")));
    p.apply(Create.of(
            InputRecord.create("test", 1, 1.0, Instant.parse("2019-01-01T00:00:00Z")),
            InputRecord.create("test2", 2, 2.0, Instant.parse("2019-02-01T00:00:00Z")))
        .withCoder(INPUT_RECORD_CODER))
     .apply(BigQueryIO.<InputRecord>write()
        .to("dataset-id.table-id")
        .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
        .withSchema(tableSchema)
        .withTestServices(fakeBqServices)
        .withAvroWriter(formatFunction, customWriterFactory)
        .withoutValidation());
    p.run();
    // Every string column value must carry the "_custom" suffix added by the writer.
    assertThat(
        fakeDatasetService.getAllRows("project-id", "dataset-id", "table-id"),
        containsInAnyOrder(
            new TableRow().set("strVal", "test_custom").set("longVal", "1").set("doubleVal", 1.0D).set("instantVal", "2019-01-01 00:00:00 UTC"),
            new TableRow().set("strVal", "test2_custom").set("longVal", "2").set("doubleVal", 2.0D).set("instantVal", "2019-02-01 00:00:00 UTC")));
}
Also used : ExpectedLogs(org.apache.beam.sdk.testing.ExpectedLogs) SerializableCoder(org.apache.beam.sdk.coders.SerializableCoder) ValueInSingleWindow(org.apache.beam.sdk.values.ValueInSingleWindow) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) Encoder(org.apache.avro.io.Encoder) ResultCoder(org.apache.beam.sdk.io.gcp.bigquery.WritePartition.ResultCoder) Matcher(java.util.regex.Matcher) DoFnTester(org.apache.beam.sdk.transforms.DoFnTester) Create(org.apache.beam.sdk.transforms.Create) Map(java.util.Map) Window(org.apache.beam.sdk.transforms.windowing.Window) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) FakeBigQueryServices(org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices) EnumSet(java.util.EnumSet) ValueProvider(org.apache.beam.sdk.options.ValueProvider) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) KvCoder(org.apache.beam.sdk.coders.KvCoder) Matchers.allOf(org.hamcrest.Matchers.allOf) Set(java.util.Set) WindowFn(org.apache.beam.sdk.transforms.windowing.WindowFn) FieldType(org.apache.beam.sdk.schemas.Schema.FieldType) Serializable(java.io.Serializable) IncompatibleWindowException(org.apache.beam.sdk.transforms.windowing.IncompatibleWindowException) Assert.assertFalse(org.junit.Assert.assertFalse) AutoValue(com.google.auto.value.AutoValue) TestStream(org.apache.beam.sdk.testing.TestStream) Matchers.is(org.hamcrest.Matchers.is) DisplayDataMatchers.hasDisplayItem(org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem) Write(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write) Method(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method) Preconditions.checkNotNull(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkNotNull) KV(org.apache.beam.sdk.values.KV) 
FakeDatasetService(org.apache.beam.sdk.io.gcp.testing.FakeDatasetService) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) View(org.apache.beam.sdk.transforms.View) ArrayList(java.util.ArrayList) GenericData(org.apache.avro.generic.GenericData) Distinct(org.apache.beam.sdk.transforms.Distinct) Multimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Multimap) TupleTag(org.apache.beam.sdk.values.TupleTag) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) Maps(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Maps) StreamSupport(java.util.stream.StreamSupport) JavaFieldSchema(org.apache.beam.sdk.schemas.JavaFieldSchema) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) Row(org.apache.beam.sdk.values.Row) Result(org.apache.beam.sdk.io.gcp.bigquery.WriteTables.Result) Before(org.junit.Before) TableReference(com.google.api.services.bigquery.model.TableReference) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Files(java.nio.file.Files) PAssert(org.apache.beam.sdk.testing.PAssert) NonMergingWindowFn(org.apache.beam.sdk.transforms.windowing.NonMergingWindowFn) Parameter(org.junit.runners.Parameterized.Parameter) Assert.assertTrue(org.junit.Assert.assertTrue) IOException(java.io.IOException) ShardedKeyCoder(org.apache.beam.sdk.coders.ShardedKeyCoder) Test(org.junit.Test) Schema(org.apache.beam.sdk.schemas.Schema) File(java.io.File) Assert.assertNull(org.junit.Assert.assertNull) Paths(java.nio.file.Paths) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) AtomicCoder(org.apache.beam.sdk.coders.AtomicCoder) 
DefaultSchema(org.apache.beam.sdk.schemas.annotations.DefaultSchema) FakeJobService(org.apache.beam.sdk.io.gcp.testing.FakeJobService) Assert.assertEquals(org.junit.Assert.assertEquals) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) TimePartitioning(com.google.api.services.bigquery.model.TimePartitioning) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) After(org.junit.After) TableRow(com.google.api.services.bigquery.model.TableRow) Assert.fail(org.junit.Assert.fail) TableSchema(com.google.api.services.bigquery.model.TableSchema) ArrayListMultimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ArrayListMultimap) ShardedKey(org.apache.beam.sdk.values.ShardedKey) Parameterized(org.junit.runners.Parameterized) MapElements(org.apache.beam.sdk.transforms.MapElements) DatumWriter(org.apache.avro.io.DatumWriter) Collection(java.util.Collection) GenerateSequence(org.apache.beam.sdk.io.GenerateSequence) CreateDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition) Description(org.junit.runner.Description) Collectors(java.util.stream.Collectors) List(java.util.List) Clustering(com.google.api.services.bigquery.model.Clustering) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) TableDataInsertAllResponse(com.google.api.services.bigquery.model.TableDataInsertAllResponse) Matchers.equalTo(org.hamcrest.Matchers.equalTo) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Pattern(java.util.regex.Pattern) ErrorProto(com.google.api.services.bigquery.model.ErrorProto) Statement(org.junit.runners.model.Statement) TestRule(org.junit.rules.TestRule) Parameters(org.junit.runners.Parameterized.Parameters) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) SerializableFunctions(org.apache.beam.sdk.transforms.SerializableFunctions) 
StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) SchemaUpdateOption(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption) WindowMappingFn(org.apache.beam.sdk.transforms.windowing.WindowMappingFn) SchemaCreate(org.apache.beam.sdk.schemas.annotations.SchemaCreate) Job(com.google.api.services.bigquery.model.Job) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ExpectedException(org.junit.rules.ExpectedException) Nullable(org.checkerframework.checker.nullness.qual.Nullable) Matchers.hasEntry(org.hamcrest.Matchers.hasEntry) OutputStream(java.io.OutputStream) DisplayData(org.apache.beam.sdk.transforms.display.DisplayData) GenericRecord(org.apache.avro.generic.GenericRecord) Lists(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists) Matchers(org.hamcrest.Matchers) PCollection(org.apache.beam.sdk.values.PCollection) Table(com.google.api.services.bigquery.model.Table) Rule(org.junit.Rule) Instant(org.joda.time.Instant) Collections(java.util.Collections) JobConfigurationLoad(com.google.api.services.bigquery.model.JobConfigurationLoad) TemporaryFolder(org.junit.rules.TemporaryFolder) InputStream(java.io.InputStream) TableSchema(com.google.api.services.bigquery.model.TableSchema) JavaFieldSchema(org.apache.beam.sdk.schemas.JavaFieldSchema) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Schema(org.apache.beam.sdk.schemas.Schema) DefaultSchema(org.apache.beam.sdk.schemas.annotations.DefaultSchema) TableSchema(com.google.api.services.bigquery.model.TableSchema) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericData(org.apache.avro.generic.GenericData) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) DatumWriter(org.apache.avro.io.DatumWriter) Encoder(org.apache.avro.io.Encoder) TableRow(com.google.api.services.bigquery.model.TableRow) GenericRecord(org.apache.avro.generic.GenericRecord) 
Test(org.junit.Test)

Example 2 with DatumWriter

use of org.apache.avro.io.DatumWriter in project registry by hortonworks.

From the class DefaultAvroSerDesHandler, the method handlePayloadSerialization:

/**
 * Serializes {@code input} onto {@code outputStream} according to its computed Avro schema.
 *
 * <p>BYTES payloads are written raw, STRING payloads as UTF-8 bytes, and everything else
 * goes through an Avro binary encoder using a specific or generic datum writer.
 *
 * @throws AvroRetryableException on I/O failures (considered transient)
 * @throws AvroException on any other runtime failure during serialization
 */
@Override
public void handlePayloadSerialization(OutputStream outputStream, Object input) {
    try {
        Schema schema = AvroUtils.computeSchema(input);
        Schema.Type schemaType = schema.getType();
        if (Schema.Type.BYTES.equals(schemaType)) {
            // In case of byte arrays, no need to go through avro as there is not much to
            // optimize, and avro expects the payload to be a ByteBuffer instead of a byte array.
            outputStream.write((byte[]) input);
        } else if (Schema.Type.STRING.equals(schemaType)) {
            // Write raw UTF-8 bytes directly instead of using avro. StandardCharsets avoids the
            // charset-name lookup and the checked UnsupportedEncodingException of getBytes(String).
            outputStream.write(input.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
        } else {
            BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(outputStream, null);
            // SpecificRecord payloads get the typed writer; everything else the generic one.
            DatumWriter<Object> writer = input instanceof SpecificRecord
                    ? new SpecificDatumWriter<>(schema)
                    : new GenericDatumWriter<>(schema);
            writer.write(input, encoder);
            // The encoder buffers internally; flush so all bytes reach the stream.
            encoder.flush();
        }
    } catch (IOException e) {
        throw new AvroRetryableException(e);
    } catch (RuntimeException e) {
        throw new AvroException(e);
    }
}
Also used : GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) DatumWriter(org.apache.avro.io.DatumWriter) SpecificDatumWriter(org.apache.avro.specific.SpecificDatumWriter) BinaryEncoder(org.apache.avro.io.BinaryEncoder) AvroException(com.hortonworks.registries.schemaregistry.serdes.avro.exceptions.AvroException) SpecificRecord(org.apache.avro.specific.SpecificRecord) Schema(org.apache.avro.Schema) AvroRetryableException(com.hortonworks.registries.schemaregistry.serdes.avro.exceptions.AvroRetryableException) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) IOException(java.io.IOException) SpecificDatumWriter(org.apache.avro.specific.SpecificDatumWriter)

Example 3 with DatumWriter

use of org.apache.avro.io.DatumWriter in project hive by apache.

From the class QTestMiniClusters, the method getAvroRows:

/**
 * Builds a list of Avro-binary-encoded Wikipedia records for test ingestion.
 *
 * <p>Each record is populated with deterministic values derived from its index and
 * serialized with a {@link SpecificDatumWriter} into a standalone byte array.
 *
 * @return the serialized records, one byte array per record
 */
private static List<byte[]> getAvroRows() {
    int numRows = 10;
    // NOTE(review): rangeClosed(0, numRows) emits numRows + 1 (11) records even though the
    // variable is named numRows; kept as-is to preserve existing behavior — confirm intent.
    final DatumWriter<GenericRecord> writer = new SpecificDatumWriter<>(Wikipedia.getClassSchema());
    return IntStream.rangeClosed(0, numRows)
            .mapToObj(i -> Wikipedia.newBuilder()
                    // Base epoch plus one hour per row.
                    .setTimestamp(formatter.format(new Timestamp(1534736225090L + 1000 * 3600 * i)))
                    .setAdded(i * 300)
                    .setDeleted(-i)
                    .setIsrobot(i % 2 == 0)
                    .setChannel("chanel number " + i)
                    .setComment("comment number " + i)
                    .setCommentlength(i)
                    .setDiffurl(String.format("url %s", i))
                    .setFlags("flag")
                    .setIsminor(i % 2 > 0)
                    .setIsanonymous(i % 3 != 0)
                    .setNamespace("namespace")
                    // Autoboxing replaces the deprecated new Boolean(...) constructors.
                    .setIsunpatrolled(i % 3 == 0)
                    .setIsnew(i % 2 > 0)
                    .setPage(String.format("page is %s", i * 100))
                    .setDelta(i)
                    .setDeltabucket(i * 100.4)
                    .setUser("test-user-" + i)
                    .build())
            .map(record -> {
                java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
                BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
                try {
                    writer.write(record, encoder);
                    // Flush the encoder's internal buffer before snapshotting the bytes.
                    encoder.flush();
                    out.close();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
                return out.toByteArray();
            })
            .collect(Collectors.toList());
}
Also used : URL(java.net.URL) FileSystem(org.apache.hadoop.fs.FileSystem) ConfVars(org.apache.hadoop.hive.conf.HiveConf.ConfVars) LoggerFactory(org.slf4j.LoggerFactory) StringUtils(org.apache.commons.lang3.StringUtils) CuratorFrameworkSingleton(org.apache.hadoop.hive.ql.lockmgr.zookeeper.CuratorFrameworkSingleton) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) SingleNodeKafkaCluster(org.apache.hive.kafka.SingleNodeKafkaCluster) EnumSet(java.util.EnumSet) ZooKeeperHiveLockManager(org.apache.hadoop.hive.ql.lockmgr.zookeeper.ZooKeeperHiveLockManager) EncoderFactory(org.apache.avro.io.EncoderFactory) ZooKeeper(org.apache.zookeeper.ZooKeeper) HadoopShims(org.apache.hadoop.hive.shims.HadoopShims) DatumWriter(org.apache.avro.io.DatumWriter) CommonConfigurationKeysPublic(org.apache.hadoop.fs.CommonConfigurationKeysPublic) Timestamp(java.sql.Timestamp) Collectors(java.util.stream.Collectors) SessionState(org.apache.hadoop.hive.ql.session.SessionState) AbstractCliConfig(org.apache.hadoop.hive.cli.control.AbstractCliConfig) BinaryEncoder(org.apache.avro.io.BinaryEncoder) SparkSessionManagerImpl(org.apache.hadoop.hive.ql.exec.spark.session.SparkSessionManagerImpl) List(java.util.List) MiniDruidCluster(org.apache.hive.druid.MiniDruidCluster) IntStream(java.util.stream.IntStream) Wikipedia(org.apache.hive.kafka.Wikipedia) TezSessionState(org.apache.hadoop.hive.ql.exec.tez.TezSessionState) SparkSession(org.apache.hadoop.hive.ql.exec.spark.session.SparkSession) LlapItUtils(org.apache.hadoop.hive.llap.LlapItUtils) SimpleDateFormat(java.text.SimpleDateFormat) LlapProxy(org.apache.hadoop.hive.llap.io.api.LlapProxy) Utilities(org.apache.hadoop.hive.ql.exec.Utilities) SpecificDatumWriter(org.apache.avro.specific.SpecificDatumWriter) MiniLlapCluster(org.apache.hadoop.hive.llap.daemon.MiniLlapCluster) HdfsErasureCodingShim(org.apache.hadoop.hive.shims.HadoopShims.HdfsErasureCodingShim) 
GenericRecord(org.apache.avro.generic.GenericRecord) MiniZooKeeperCluster(org.apache.hive.testutils.MiniZooKeeperCluster) Logger(org.slf4j.Logger) Files(java.nio.file.Files) Watcher(org.apache.zookeeper.Watcher) HiveConf(org.apache.hadoop.hive.conf.HiveConf) IOException(java.io.IOException) WatchedEvent(org.apache.zookeeper.WatchedEvent) CliSessionState(org.apache.hadoop.hive.cli.CliSessionState) File(java.io.File) TimeUnit(java.util.concurrent.TimeUnit) Paths(java.nio.file.Paths) ShimLoader(org.apache.hadoop.hive.shims.ShimLoader) Preconditions(com.google.common.base.Preconditions) IOException(java.io.IOException) Timestamp(java.sql.Timestamp) SpecificDatumWriter(org.apache.avro.specific.SpecificDatumWriter) BinaryEncoder(org.apache.avro.io.BinaryEncoder) GenericRecord(org.apache.avro.generic.GenericRecord)

Aggregations

IOException (java.io.IOException)3 DatumWriter (org.apache.avro.io.DatumWriter)3 File (java.io.File)2 Files (java.nio.file.Files)2 Paths (java.nio.file.Paths)2 EnumSet (java.util.EnumSet)2 List (java.util.List)2 Map (java.util.Map)2 Collectors (java.util.stream.Collectors)2 GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter)2 GenericRecord (org.apache.avro.generic.GenericRecord)2 BinaryEncoder (org.apache.avro.io.BinaryEncoder)2 SpecificDatumWriter (org.apache.avro.specific.SpecificDatumWriter)2 Clustering (com.google.api.services.bigquery.model.Clustering)1 ErrorProto (com.google.api.services.bigquery.model.ErrorProto)1 Job (com.google.api.services.bigquery.model.Job)1 JobConfigurationLoad (com.google.api.services.bigquery.model.JobConfigurationLoad)1 Table (com.google.api.services.bigquery.model.Table)1 TableDataInsertAllResponse (com.google.api.services.bigquery.model.TableDataInsertAllResponse)1 TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema)1