Search in sources :

Example 71 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project incubator-gobblin by apache.

the class EmbeddedGobblinDistcpTest method testCheckSchema.

/**
 * Runs an embedded distcp job with schema checking enabled against a single Avro file and verifies that
 * the file is copied only when the configured expected schema is accepted.
 *
 * <p>Three runs are made against the same source file: two with expected schemas that differ from the
 * written one (a different field name, then a different field type union), which must fail and copy
 * nothing, and one with the matching schema, which must succeed and copy the file.
 *
 * @throws Exception if the schema resource cannot be loaded, the test file cannot be written, or a job
 *     run throws
 */
@Test
public void testCheckSchema() throws Exception {
    // Let a failure to read the schema resource fail the test here rather than continuing with a null schema.
    Schema schema;
    try (InputStream is = GobblinMetricsPinotFlattenerConverter.class.getClassLoader().getResourceAsStream("avroSchemaManagerTest/expectedSchema.avsc")) {
        schema = new Schema.Parser().parse(is);
    }
    String fileName = "file.avro";
    File tmpSource = Files.createTempDir();
    tmpSource.deleteOnExit();
    File tmpTarget = Files.createTempDir();
    tmpTarget.deleteOnExit();
    File tmpFile = new File(tmpSource, fileName);
    tmpFile.createNewFile();
    GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    // Close the writer before any job runs so that every appended record is flushed to the file
    // and the file handle is released.
    try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(schema, tmpFile);
        for (int i = 0; i < 100; i++) {
            GenericRecord record = new GenericData.Record(schema);
            record.put("foo", i);
            dataFileWriter.append(record);
        }
    }
    Assert.assertTrue(new File(tmpSource, fileName).exists());
    Assert.assertFalse(new File(tmpTarget, fileName).exists());
    EmbeddedGobblinDistcp embedded = new EmbeddedGobblinDistcp(new Path(tmpSource.getAbsolutePath()), new Path(tmpTarget.getAbsolutePath()));
    embedded.setConfiguration(CopySource.SCHEMA_CHECK_ENABLED, "true");
    embedded.setLaunchTimeout(30, TimeUnit.SECONDS);
    embedded.setConfiguration(ConfigurationKeys.SOURCE_CLASS_KEY, SchemaCheckedCopySource.class.getName());
    embedded.setConfiguration(ConfigurationKeys.AVRO_SCHEMA_CHECK_STRATEGY, "org.apache.gobblin.util.schema_check.AvroSchemaCheckDefaultStrategy");
    // test when schema is not the expected one, the job will be aborted.
    // First mismatch: the expected field is named "foo1" instead of "foo".
    embedded.setConfiguration(ConfigurationKeys.COPY_EXPECTED_SCHEMA, "{\"type\":\"record\",\"name\":\"baseRecord\",\"fields\":[{\"name\":\"foo1\",\"type\":[\"null\",\"int\"],\"doc\":\"this is for test\",\"default\":null}]}");
    JobExecutionResult result = embedded.run();
    Assert.assertTrue(new File(tmpSource, fileName).exists());
    Assert.assertFalse(result.isSuccessful());
    Assert.assertFalse(new File(tmpTarget, fileName).exists());
    // Second mismatch: the field name matches but its type union is ["string","int"].
    embedded.setConfiguration(ConfigurationKeys.COPY_EXPECTED_SCHEMA, "{\"type\":\"record\",\"name\":\"baseRecord\",\"fields\":[{\"name\":\"foo\",\"type\":[\"string\",\"int\"],\"doc\":\"this is for test\",\"default\":null}]}");
    result = embedded.run();
    Assert.assertTrue(new File(tmpSource, fileName).exists());
    Assert.assertFalse(result.isSuccessful());
    Assert.assertFalse(new File(tmpTarget, fileName).exists());
    // test when schema is the expected one, the job will succeed.
    embedded.setConfiguration(ConfigurationKeys.COPY_EXPECTED_SCHEMA, "{\"type\":\"record\",\"name\":\"baseRecord\",\"fields\":[{\"name\":\"foo\",\"type\":[\"null\",\"int\"],\"doc\":\"this is for test\",\"default\":null}]}");
    result = embedded.run();
    Assert.assertTrue(result.isSuccessful());
    Assert.assertTrue(new File(tmpSource, fileName).exists());
    Assert.assertTrue(new File(tmpTarget, fileName).exists());
}
Also used : Path(org.apache.hadoop.fs.Path) InputStream(java.io.InputStream) Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) IOException(java.io.IOException) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) JobExecutionResult(org.apache.gobblin.runtime.api.JobExecutionResult) GobblinMetricsPinotFlattenerConverter(org.apache.gobblin.converter.GobblinMetricsPinotFlattenerConverter) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) SchemaCheckedCopySource(org.apache.gobblin.data.management.copy.SchemaCheckedCopySource) Test(org.testng.annotations.Test)

Example 72 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project incubator-gobblin by apache.

the class LiAvroSerializerBase method serialize.

/**
 * Serializes a record into a byte array laid out as
 * {@code MAGIC_BYTE | schemaId-bytes | avro_payload}.
 *
 * <p>The record's own schema is registered with the schema registry under {@code topic}, and the
 * returned id is written ahead of the Avro binary encoding of the record.
 *
 * @param topic the topic under which the record's schema is registered
 * @param data the record to serialize
 * @return the framed bytes
 * @throws SerializationException wrapping any {@link IOException} or {@code SchemaRegistryException}
 *     raised while registering the schema or writing the bytes
 */
public byte[] serialize(String topic, GenericRecord data) throws SerializationException {
    final Schema recordSchema = data.getSchema();
    try {
        final MD5Digest registeredId = schemaRegistry.register(topic, recordSchema);
        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        // Header first: magic byte, then the id returned by the registry.
        buffer.write(LiAvroSerDeHelper.MAGIC_BYTE);
        buffer.write(registeredId.asBytes());
        // Payload: the record encoded with a binary encoder that writes straight into the buffer.
        final BinaryEncoder binaryEncoder = encoderFactory.directBinaryEncoder(buffer, null);
        final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(recordSchema);
        datumWriter.write(data, binaryEncoder);
        binaryEncoder.flush();
        final byte[] framed = buffer.toByteArray();
        buffer.close();
        return framed;
    } catch (IOException | SchemaRegistryException e) {
        throw new SerializationException(e);
    }
}
Also used : BinaryEncoder(org.apache.avro.io.BinaryEncoder) Schema(org.apache.avro.Schema) SchemaRegistryException(org.apache.gobblin.kafka.schemareg.SchemaRegistryException) ByteArrayOutputStream(java.io.ByteArrayOutputStream) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) IOException(java.io.IOException) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 73 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project crunch by cloudera.

the class AvroFileReaderFactoryTest method populateGenericFile.

/**
 * Writes the given records to {@code this.avroFile} as an Avro data file using {@code outputSchema}.
 *
 * <p>Both the data file writer and the underlying file stream are closed even when creating the
 * file or appending a record fails, so no file handle is leaked on error.
 *
 * @param genericRecords the records to append, in iteration order
 * @param outputSchema the schema used both for the datum writer and for the file header
 * @throws IOException if the file cannot be opened, written, or closed
 */
private void populateGenericFile(List<GenericRecord> genericRecords, Schema outputSchema) throws IOException {
    FileOutputStream outputStream = new FileOutputStream(this.avroFile);
    try {
        GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<GenericRecord>(outputSchema);
        DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(genericDatumWriter);
        // Open the writer before entering the inner try so close() is only ever called on an opened writer.
        dataFileWriter.create(outputSchema, outputStream);
        try {
            for (GenericRecord record : genericRecords) {
                dataFileWriter.append(record);
            }
        } finally {
            dataFileWriter.close();
        }
    } finally {
        // Safe to call again if the writer already closed the stream; needed when create() fails.
        outputStream.close();
    }
}
Also used : FileOutputStream(java.io.FileOutputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 74 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project druid by druid-io.

the class AvroStreamInputRowParserTest method testParse.

/**
 * Round-trips an {@code AvroStreamInputRowParser} through JSON, then feeds the restored parser a
 * hand-assembled message (a subject/id header followed by an Avro-encoded datum) and checks the
 * resulting row with {@code assertInputRowCorrect}.
 */
@Test
public void testParse() throws SchemaValidationException, IOException {
    // Serde check: write the parser out as JSON and read it back.
    final Repository seedRepository = new InMemoryRepository(null);
    final AvroStreamInputRowParser originalParser = new AvroStreamInputRowParser(PARSE_SPEC, new SchemaRepoBasedAvroBytesDecoder<String, Integer>(new Avro1124SubjectAndIdConverter(TOPIC), seedRepository));
    final String parserJson = jsonMapper.writeValueAsString(originalParser);
    final ByteBufferInputRowParser restoredParser = jsonMapper.readValue(parserJson, ByteBufferInputRowParser.class);
    // From here on, use the repository held by the restored parser's decoder.
    final SchemaRepoBasedAvroBytesDecoder restoredDecoder = (SchemaRepoBasedAvroBytesDecoder) ((AvroStreamInputRowParser) restoredParser).getAvroBytesDecoder();
    final Repository restoredRepository = restoredDecoder.getSchemaRepository();
    // Datum that will be encoded into the message.
    final GenericRecord datum = buildSomeAvroDatum();
    // Register the schema and write the subject/id header into a 4-byte buffer.
    final Avro1124SubjectAndIdConverter subjectAndIdConverter = new Avro1124SubjectAndIdConverter(TOPIC);
    final TypedSchemaRepository<Integer, Schema, String> typedRepository = new TypedSchemaRepository<Integer, Schema, String>(restoredRepository, new IntegerConverter(), new AvroSchemaConverter(), new IdentityConverter());
    final Integer schemaId = typedRepository.registerSchema(TOPIC, SomeAvroDatum.getClassSchema());
    final ByteBuffer header = ByteBuffer.allocate(4);
    subjectAndIdConverter.putSubjectAndId(TOPIC, schemaId, header);
    final ByteArrayOutputStream message = new ByteArrayOutputStream();
    message.write(header.array());
    // Append the Avro binary encoding of the datum right after the header.
    final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(datum.getSchema());
    datumWriter.write(datum, EncoderFactory.get().directBinaryEncoder(message, null));
    // Parse the assembled bytes with the restored parser and verify the row.
    final InputRow parsedRow = restoredParser.parse(ByteBuffer.wrap(message.toByteArray()));
    assertInputRowCorrect(parsedRow);
}
Also used : Avro1124SubjectAndIdConverter(io.druid.data.input.schemarepo.Avro1124SubjectAndIdConverter) AvroSchemaConverter(org.schemarepo.api.converter.AvroSchemaConverter) InMemoryRepository(org.schemarepo.InMemoryRepository) TypedSchemaRepository(org.schemarepo.api.TypedSchemaRepository) Schema(org.apache.avro.Schema) ByteArrayOutputStream(java.io.ByteArrayOutputStream) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ByteBuffer(java.nio.ByteBuffer) IntegerConverter(org.schemarepo.api.converter.IntegerConverter) Repository(org.schemarepo.Repository) IdentityConverter(org.schemarepo.api.converter.IdentityConverter) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)

Example 75 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project druid by druid-io.

the class InlineSchemasAvroBytesDecoderTest method testParse.

/**
 * Builds a message made of a single byte {@code 1}, the 4-byte int {@code 10}, and the Avro binary
 * encoding of a sample datum, then checks that an {@code InlineSchemasAvroBytesDecoder} constructed
 * with a map from {@code 10} to the datum schema parses a record whose "id" field matches the
 * original.
 */
@Test
public void testParse() throws Exception {
    final int schemaId = 10;
    final GenericRecord expected = AvroStreamInputRowParserTest.buildSomeAvroDatum();
    final Schema datumSchema = SomeAvroDatum.getClassSchema();
    final ByteArrayOutputStream message = new ByteArrayOutputStream();
    // Prefix: one byte with value 1, then the schema id as a 4-byte int.
    // NOTE(review): the leading byte is presumably a format/version marker — confirm against the decoder.
    message.write(new byte[] { 1 });
    final ByteBuffer idBytes = ByteBuffer.allocate(4).putInt(schemaId);
    message.write(idBytes.array());
    // Body: the datum encoded with the class schema.
    final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(datumSchema);
    datumWriter.write(expected, EncoderFactory.get().directBinaryEncoder(message, null));
    final InlineSchemasAvroBytesDecoder decoder = new InlineSchemasAvroBytesDecoder(ImmutableMap.of(schemaId, datumSchema));
    final GenericRecord actual = decoder.parse(ByteBuffer.wrap(message.toByteArray()));
    Assert.assertEquals(expected.get("id"), actual.get("id"));
}
Also used : Schema(org.apache.avro.Schema) ByteArrayOutputStream(java.io.ByteArrayOutputStream) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test) AvroStreamInputRowParserTest(io.druid.data.input.AvroStreamInputRowParserTest)

Aggregations

GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter) 127, GenericRecord (org.apache.avro.generic.GenericRecord) 105, Schema (org.apache.avro.Schema) 69, ByteArrayOutputStream (java.io.ByteArrayOutputStream) 57, DataFileWriter (org.apache.avro.file.DataFileWriter) 47, File (java.io.File) 40, Test (org.junit.Test) 37, IOException (java.io.IOException) 29, BinaryEncoder (org.apache.avro.io.BinaryEncoder) 29, MockFlowFile (org.apache.nifi.util.MockFlowFile) 25, Encoder (org.apache.avro.io.Encoder) 23, TestRunner (org.apache.nifi.util.TestRunner) 20, HashMap (java.util.HashMap) 14, ByteArrayOutputStream (org.apache.nifi.stream.io.ByteArrayOutputStream) 14, GenericData (org.apache.avro.generic.GenericData) 12, ByteArrayInputStream (java.io.ByteArrayInputStream) 11, FileOutputStream (java.io.FileOutputStream) 10, InputStream (java.io.InputStream) 9, ArrayList (java.util.ArrayList) 8, GenericDatumReader (org.apache.avro.generic.GenericDatumReader) 8