
Example 1 with GenericData

use of org.apache.avro.generic.GenericData in project nifi by apache.

the class ConvertAvroToJSON method onTrigger.

@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final String containerOption = context.getProperty(CONTAINER_OPTIONS).getValue();
    final boolean useContainer = containerOption.equals(CONTAINER_ARRAY);
    // Wrap a single record (including the zero-record case) only when a container is being used
    final boolean wrapSingleRecord = context.getProperty(WRAP_SINGLE_RECORD).asBoolean() && useContainer;
    final String stringSchema = context.getProperty(SCHEMA).getValue();
    // a user-supplied schema means the content is raw binary Avro with no embedded schema
    final boolean schemaLess = stringSchema != null;
    try {
        flowFile = session.write(flowFile, new StreamCallback() {

            @Override
            public void process(final InputStream rawIn, final OutputStream rawOut) throws IOException {
                final GenericData genericData = GenericData.get();
                if (schemaLess) {
                    // 'schema' is a member field of the processor, cached after the first parse
                    if (schema == null) {
                        schema = new Schema.Parser().parse(stringSchema);
                    }
                    try (final InputStream in = new BufferedInputStream(rawIn);
                        final OutputStream out = new BufferedOutputStream(rawOut)) {
                        final DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
                        final BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(in, null);
                        final GenericRecord record = reader.read(null, decoder);
                        // both options must be enabled before we wrap the record in a JSON array
                        if (useContainer && wrapSingleRecord) {
                            out.write('[');
                        }
                        final byte[] outputBytes = (record == null) ? EMPTY_JSON_OBJECT : genericData.toString(record).getBytes(StandardCharsets.UTF_8);
                        out.write(outputBytes);
                        if (useContainer && wrapSingleRecord) {
                            out.write(']');
                        }
                    }
                } else {
                    try (final InputStream in = new BufferedInputStream(rawIn);
                        final OutputStream out = new BufferedOutputStream(rawOut);
                        final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
                        int recordCount = 0;
                        GenericRecord currRecord = null;
                        if (reader.hasNext()) {
                            currRecord = reader.next();
                            recordCount++;
                        }
                        // open a JSON array when more records follow in a container, or when wrapping a single record
                        if ((reader.hasNext() && useContainer) || wrapSingleRecord) {
                            out.write('[');
                        }
                        // Write the first record, or an empty JSON object if the input held no records
                        final byte[] outputBytes = (currRecord == null) ? EMPTY_JSON_OBJECT : genericData.toString(currRecord).getBytes(StandardCharsets.UTF_8);
                        out.write(outputBytes);
                        while (reader.hasNext()) {
                            if (useContainer) {
                                out.write(',');
                            } else {
                                out.write('\n');
                            }
                            currRecord = reader.next(currRecord);
                            out.write(genericData.toString(currRecord).getBytes(StandardCharsets.UTF_8));
                            recordCount++;
                        }
                        // close the array when multiple records were written in a container, or when a single record was wrapped
                        if ((recordCount > 1 && useContainer) || wrapSingleRecord) {
                            out.write(']');
                        }
                    }
                }
            }
        });
    } catch (final ProcessException pe) {
        getLogger().error("Failed to convert {} from Avro to JSON due to {}; transferring to failure", new Object[] { flowFile, pe });
        session.transfer(flowFile, REL_FAILURE);
        return;
    }
    flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/json");
    session.transfer(flowFile, REL_SUCCESS);
}
Also used : FlowFile(org.apache.nifi.flowfile.FlowFile) BufferedInputStream(java.io.BufferedInputStream) InputStream(java.io.InputStream) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) BufferedOutputStream(java.io.BufferedOutputStream) OutputStream(java.io.OutputStream) DataFileStream(org.apache.avro.file.DataFileStream) GenericData(org.apache.avro.generic.GenericData) StreamCallback(org.apache.nifi.processor.io.StreamCallback) BinaryDecoder(org.apache.avro.io.BinaryDecoder) ProcessException(org.apache.nifi.processor.exception.ProcessException) GenericRecord(org.apache.avro.generic.GenericRecord)
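
A minimal standalone sketch of the same container-path conversion outside NiFi: read an Avro data file and emit one JSON array, using GenericData.toString() to render each record. The input and output file names are assumptions for illustration.

// Standalone sketch (not NiFi code); "records.avro" and "records.json" are hypothetical.
try (final DataFileStream<GenericRecord> reader = new DataFileStream<>(
        new BufferedInputStream(new FileInputStream("records.avro")),
        new GenericDatumReader<GenericRecord>());
    final OutputStream out = new BufferedOutputStream(new FileOutputStream("records.json"))) {
    final GenericData genericData = GenericData.get();
    out.write('[');
    boolean first = true;
    while (reader.hasNext()) {
        if (!first) {
            out.write(',');
        }
        // GenericData.toString() renders a record as JSON text
        out.write(genericData.toString(reader.next()).getBytes(StandardCharsets.UTF_8));
        first = false;
    }
    out.write(']');
}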

Example 2 with GenericData

use of org.apache.avro.generic.GenericData in project incubator-gobblin by apache.

the class AvroGenericRecordAccessor method set.

/*
   * Recurse down record types to set the right value
   */
private void set(String fieldName, Object value) {
    try {
        String subField;
        Iterator<String> levels = Splitter.on(".").split(fieldName).iterator();
        GenericRecord toInsert = record;
        subField = levels.next();
        Object subRecord = toInsert;
        while (levels.hasNext()) {
            if (subRecord instanceof GenericRecord) {
                subRecord = ((GenericRecord) subRecord).get(subField);
            } else if (subRecord instanceof List) {
                subRecord = ((List) subRecord).get(Integer.parseInt(subField));
            } else if (subRecord instanceof Map) {
                subRecord = ((Map) subRecord).get(subField);
            }
            if (subRecord == null) {
                throw new FieldDoesNotExistException("Field " + subField + " not found when trying to set " + fieldName);
            }
            subField = levels.next();
        }
        if (!(subRecord instanceof GenericRecord)) {
            throw new IllegalArgumentException("Field " + fieldName + " does not refer to a record type.");
        }
        toInsert = (GenericRecord) subRecord;
        Object oldValue = toInsert.get(subField);
        toInsert.put(subField, value);
        Schema.Field changedField = toInsert.getSchema().getField(subField);
        GenericData genericData = GenericData.get();
        boolean valid = genericData.validate(changedField.schema(), genericData.getField(toInsert, changedField.name(), changedField.pos()));
        if (!valid) {
            toInsert.put(subField, oldValue);
            throw new IncorrectTypeException("Incorrect type - can't insert a " + value.getClass().getCanonicalName() + " into an Avro record of type " + changedField.schema().getType().toString());
        }
    } catch (AvroRuntimeException e) {
        throw new FieldDoesNotExistException("Field not found setting name " + fieldName, e);
    }
}
Also used : Schema(org.apache.avro.Schema) AvroRuntimeException(org.apache.avro.AvroRuntimeException) GenericData(org.apache.avro.generic.GenericData) List(java.util.List) GenericRecord(org.apache.avro.generic.GenericRecord) Map(java.util.Map) HashMap(java.util.HashMap)
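
The rollback in set() hinges on GenericData.validate() returning a boolean rather than throwing. A hedged sketch of that check in isolation; the "User" schema and sample values are assumptions.

// Illustrative only; the "User" schema is hypothetical.
Schema recordSchema = new Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"User\",\"fields\":[{\"name\":\"age\",\"type\":\"int\"}]}");
GenericRecord user = new GenericData.Record(recordSchema);
user.put("age", 30);
GenericData genericData = GenericData.get();
Schema ageSchema = recordSchema.getField("age").schema();
boolean ok = genericData.validate(ageSchema, user.get("age"));  // true: an int matches the field schema
boolean bad = genericData.validate(ageSchema, "thirty");        // false: wrong type, but no exception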

Example 3 with GenericData

use of org.apache.avro.generic.GenericData in project parquet-mr by apache.

the class AvroReadSupport method prepareForRead.

@Override
public RecordMaterializer<T> prepareForRead(Configuration configuration, Map<String, String> keyValueMetaData, MessageType fileSchema, ReadContext readContext) {
    Map<String, String> metadata = readContext.getReadSupportMetadata();
    MessageType parquetSchema = readContext.getRequestedSchema();
    Schema avroSchema;
    if (metadata.get(AVRO_READ_SCHEMA_METADATA_KEY) != null) {
        // use the Avro read schema provided by the user
        avroSchema = new Schema.Parser().parse(metadata.get(AVRO_READ_SCHEMA_METADATA_KEY));
    } else if (keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY) != null) {
        // use the Avro schema from the file metadata if present
        avroSchema = new Schema.Parser().parse(keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY));
    } else if (keyValueMetaData.get(OLD_AVRO_SCHEMA_METADATA_KEY) != null) {
        // fall back to the Avro schema stored under the legacy metadata key
        avroSchema = new Schema.Parser().parse(keyValueMetaData.get(OLD_AVRO_SCHEMA_METADATA_KEY));
    } else {
        // default to converting the Parquet schema into an Avro schema
        avroSchema = new AvroSchemaConverter(configuration).convert(parquetSchema);
    }
    GenericData model = getDataModel(configuration);
    String compatEnabled = metadata.get(AvroReadSupport.AVRO_COMPATIBILITY);
    if (compatEnabled != null && Boolean.valueOf(compatEnabled)) {
        return newCompatMaterializer(parquetSchema, avroSchema, model);
    }
    return new AvroRecordMaterializer<T>(parquetSchema, avroSchema, model);
}
Also used : Schema(org.apache.avro.Schema) GenericData(org.apache.avro.generic.GenericData) MessageType(org.apache.parquet.schema.MessageType)
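
To exercise the first branch above, a caller supplies the Avro read schema through the configuration before opening the reader; AvroReadSupport exposes a static setter for that key. A minimal sketch, assuming a hypothetical projection schema and file path:

// Sketch: supply the read schema that prepareForRead() finds under AVRO_READ_SCHEMA_METADATA_KEY.
Configuration conf = new Configuration();
Schema projection = new Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"myrecord\",\"fields\":[{\"name\":\"dec\",\"type\":\"bytes\"}]}");
AvroReadSupport.setAvroReadSchema(conf, projection);
try (ParquetReader<GenericRecord> reader =
        AvroParquetReader.<GenericRecord>builder(new Path("data.parquet")).withConf(conf).build()) {
    GenericRecord rec;
    while ((rec = reader.read()) != null) {
        // records are materialized against the projected read schema
        System.out.println(rec);
    }
}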

Example 4 with GenericData

use of org.apache.avro.generic.GenericData in project parquet-mr by apache.

the class TestReadWrite method testDecimalValues.

@Test
public void testDecimalValues() throws Exception {
    Schema decimalSchema = Schema.createRecord("myrecord", null, null, false);
    Schema decimal = LogicalTypes.decimal(9, 2).addToSchema(Schema.create(Schema.Type.BYTES));
    decimalSchema.setFields(Collections.singletonList(new Schema.Field("dec", decimal, null, null)));
    // add the decimal conversion to a generic data model
    GenericData decimalSupport = new GenericData();
    decimalSupport.addLogicalTypeConversion(new Conversions.DecimalConversion());
    File file = temp.newFile("decimal.parquet");
    file.delete();
    Path path = new Path(file.toString());
    ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(path).withDataModel(decimalSupport).withSchema(decimalSchema).build();
    Random random = new Random(34L);
    GenericRecordBuilder builder = new GenericRecordBuilder(decimalSchema);
    List<GenericRecord> expected = Lists.newArrayList();
    for (int i = 0; i < 1000; i += 1) {
        BigDecimal dec = new BigDecimal(new BigInteger(31, random), 2);
        builder.set("dec", dec);
        GenericRecord rec = builder.build();
        expected.add(rec);
        writer.write(rec);
    }
    writer.close();
    ParquetReader<GenericRecord> reader = AvroParquetReader.<GenericRecord>builder(path).withDataModel(decimalSupport).disableCompatibility().build();
    List<GenericRecord> records = Lists.newArrayList();
    GenericRecord rec;
    while ((rec = reader.read()) != null) {
        records.add(rec);
    }
    reader.close();
    Assert.assertTrue("dec field should be a BigDecimal instance", records.get(0).get("dec") instanceof BigDecimal);
    Assert.assertEquals("Content should match", expected, records);
}
Also used : Path(org.apache.hadoop.fs.Path) Schema(org.apache.avro.Schema) GenericData(org.apache.avro.generic.GenericData) BigDecimal(java.math.BigDecimal) Conversions(org.apache.avro.Conversions) Random(java.util.Random) GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder) BigInteger(java.math.BigInteger) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) Test(org.junit.Test)
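
Under the hood, the registered DecimalConversion maps each BigDecimal to and from the bytes representation the decimal logical type dictates. A small round-trip sketch of that conversion in isolation; the sample value is an assumption.

// Sketch of the conversion the test registers on its data model.
Schema decimalType = LogicalTypes.decimal(9, 2).addToSchema(Schema.create(Schema.Type.BYTES));
Conversions.DecimalConversion conversion = new Conversions.DecimalConversion();
BigDecimal value = new BigDecimal("1234567.89");  // fits precision 9, scale 2
ByteBuffer encoded = conversion.toBytes(value, decimalType, decimalType.getLogicalType());
BigDecimal decoded = conversion.fromBytes(encoded, decimalType, decimalType.getLogicalType());
// decoded.equals(value) holds, since the scale is fixed by the logical type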

Example 5 with GenericData

use of org.apache.avro.generic.GenericData in project flink by apache.

the class AvroSerializationSchema method checkAvroInitialized.

protected void checkAvroInitialized() {
    if (datumWriter != null) {
        return;
    }
    ClassLoader cl = Thread.currentThread().getContextClassLoader();
    if (SpecificRecord.class.isAssignableFrom(recordClazz)) {
        Schema schema = SpecificData.get().getSchema(recordClazz);
        this.datumWriter = new SpecificDatumWriter<>(schema);
        this.schema = schema;
    } else {
        this.schema = new Schema.Parser().parse(this.schemaString);
        GenericData genericData = new GenericData(cl);
        this.datumWriter = new GenericDatumWriter<>(schema, genericData);
    }
    this.arrayOutputStream = new ByteArrayOutputStream();
    this.encoder = EncoderFactory.get().directBinaryEncoder(arrayOutputStream, null);
}
Also used : Schema(org.apache.avro.Schema) SerializationSchema(org.apache.flink.api.common.serialization.SerializationSchema) ByteArrayOutputStream(java.io.ByteArrayOutputStream) GenericData(org.apache.avro.generic.GenericData)
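
For context, a minimal sketch of the write path this method prepares, assuming a hypothetical Event schema: one GenericRecord serialized to schema-less Avro binary with a GenericDatumWriter and a direct binary encoder.

// Illustrative only; the "Event" schema and record contents are assumptions.
Schema schema = new Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"Event\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}");
GenericRecord event = new GenericData.Record(schema);
event.put("id", 42L);
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
BinaryEncoder encoder = EncoderFactory.get().directBinaryEncoder(bytes, null);
GenericDatumWriter<GenericRecord> writer = new GenericDatumWriter<>(schema, GenericData.get());
writer.write(event, encoder);
encoder.flush();
byte[] payload = bytes.toByteArray();  // Avro binary, no embedded schema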

Aggregations

GenericData (org.apache.avro.generic.GenericData) 11
Schema (org.apache.avro.Schema) 7
GenericRecord (org.apache.avro.generic.GenericRecord) 5
Conversions (org.apache.avro.Conversions) 4
File (java.io.File) 3
BigDecimal (java.math.BigDecimal) 3
DataFileStream (org.apache.avro.file.DataFileStream) 3
GenericDatumReader (org.apache.avro.generic.GenericDatumReader) 3
Test (org.junit.Test) 3
ByteArrayOutputStream (java.io.ByteArrayOutputStream) 2
InputStream (java.io.InputStream) 2
BigInteger (java.math.BigInteger) 2
Random (java.util.Random) 2
GenericRecordBuilder (org.apache.avro.generic.GenericRecordBuilder) 2
Path (org.apache.hadoop.fs.Path) 2
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper) 1
BufferedInputStream (java.io.BufferedInputStream) 1
BufferedOutputStream (java.io.BufferedOutputStream) 1
ByteArrayInputStream (java.io.ByteArrayInputStream) 1
IOException (java.io.IOException) 1