Example 71 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project nifi by apache.

From the class JdbcCommon, the method convertToAvroStream:

public static long convertToAvroStream(final ResultSet rs, final OutputStream outStream, final AvroConversionOptions options, final ResultSetRowCallback callback) throws SQLException, IOException {
    final Schema schema = createSchema(rs, options);
    final GenericRecord rec = new GenericData.Record(schema);
    final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    try (final DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(schema, outStream);
        final ResultSetMetaData meta = rs.getMetaData();
        final int nrOfColumns = meta.getColumnCount();
        long nrOfRows = 0;
        while (rs.next()) {
            if (callback != null) {
                callback.processRow(rs);
            }
            for (int i = 1; i <= nrOfColumns; i++) {
                final int javaSqlType = meta.getColumnType(i);
                final Schema fieldSchema = schema.getFields().get(i - 1).schema();
                // Handle CLOB and BLOB before getObject() is called: for maximum portability,
                // JDBC result set columns should be read only once, in left-to-right order.
                if (javaSqlType == CLOB) {
                    Clob clob = rs.getClob(i);
                    if (clob != null) {
                        long numChars = clob.length();
                        char[] buffer = new char[(int) numChars];
                        InputStream is = clob.getAsciiStream();
                        int index = 0;
                        int c = is.read();
                        while (c >= 0) {
                            buffer[index++] = (char) c;
                            c = is.read();
                        }
                        rec.put(i - 1, new String(buffer));
                        clob.free();
                    } else {
                        rec.put(i - 1, null);
                    }
                    continue;
                }
                if (javaSqlType == NCLOB) {
                    NClob nClob = rs.getNClob(i);
                    if (nClob != null) {
                        final Reader characterStream = nClob.getCharacterStream();
                        final long numChars = nClob.length();
                        final CharBuffer buffer = CharBuffer.allocate((int) numChars);
                        // Reader.read(CharBuffer) may return before the buffer is full; read until EOF
                        while (buffer.hasRemaining() && characterStream.read(buffer) >= 0) {
                            // keep filling the buffer
                        }
                        buffer.flip();
                        rec.put(i - 1, buffer.toString());
                        nClob.free();
                    } else {
                        rec.put(i - 1, null);
                    }
                    continue;
                }
                if (javaSqlType == BLOB) {
                    Blob blob = rs.getBlob(i);
                    if (blob != null) {
                        final long numBytes = blob.length();
                        final byte[] buffer = new byte[(int) numBytes];
                        InputStream is = blob.getBinaryStream();
                        int index = 0;
                        int c = is.read();
                        while (c >= 0) {
                            buffer[index++] = (byte) c;
                            c = is.read();
                        }
                        ByteBuffer bb = ByteBuffer.wrap(buffer);
                        rec.put(i - 1, bb);
                        blob.free();
                    } else {
                        rec.put(i - 1, null);
                    }
                    continue;
                }
                final Object value = rs.getObject(i);
                if (value == null) {
                    rec.put(i - 1, null);
                } else if (javaSqlType == BINARY || javaSqlType == VARBINARY || javaSqlType == LONGVARBINARY || javaSqlType == ARRAY) {
                    // byte arrays require slightly different handling
                    byte[] bytes = rs.getBytes(i);
                    ByteBuffer bb = ByteBuffer.wrap(bytes);
                    rec.put(i - 1, bb);
                } else if (value instanceof Byte) {
                    // tinyint(1) is reported by the JDBC driver as java.sql.Types.TINYINT,
                    // but the value is returned as a java.lang.Byte (at least with the H2 driver).
                    // A direct put into the Avro record would fail with:
                    // org.apache.avro.AvroRuntimeException: Unknown datum type java.lang.Byte
                    rec.put(i - 1, ((Byte) value).intValue());
                } else if (value instanceof Short) {
                    // MS SQL returns TINYINT as a Java Short, which Avro doesn't understand.
                    rec.put(i - 1, ((Short) value).intValue());
                } else if (value instanceof BigDecimal) {
                    if (options.useLogicalTypes) {
                        // Delegate mapping to AvroTypeUtil in order to utilize logical types.
                        rec.put(i - 1, AvroTypeUtil.convertToAvroObject(value, fieldSchema));
                    } else {
                        // As string for backward compatibility.
                        rec.put(i - 1, value.toString());
                    }
                } else if (value instanceof BigInteger) {
                    // Avro can't handle BigInteger directly; a direct put fails with an error
                    // such as: "Unknown datum type: java.math.BigInteger: 38". In this case the schema is expecting a string.
                    if (javaSqlType == BIGINT) {
                        int precision = meta.getPrecision(i);
                        if (precision < 0 || precision > MAX_DIGITS_IN_BIGINT) {
                            rec.put(i - 1, value.toString());
                        } else {
                            try {
                                rec.put(i - 1, ((BigInteger) value).longValueExact());
                            } catch (ArithmeticException ae) {
                                // Since the value won't fit in a long, convert it to a string
                                rec.put(i - 1, value.toString());
                            }
                        }
                    } else {
                        rec.put(i - 1, value.toString());
                    }
                } else if (value instanceof Number || value instanceof Boolean) {
                    if (javaSqlType == BIGINT) {
                        int precision = meta.getPrecision(i);
                        if (precision < 0 || precision > MAX_DIGITS_IN_BIGINT) {
                            rec.put(i - 1, value.toString());
                        } else {
                            rec.put(i - 1, value);
                        }
                    } else {
                        rec.put(i - 1, value);
                    }
                } else if (value instanceof Date) {
                    if (options.useLogicalTypes) {
                        // Delegate mapping to AvroTypeUtil in order to utilize logical types.
                        rec.put(i - 1, AvroTypeUtil.convertToAvroObject(value, fieldSchema));
                    } else {
                        // As string for backward compatibility.
                        rec.put(i - 1, value.toString());
                    }
                } else {
                    // The supported types are numbers (int, long, double, float), booleans, and Strings.
                    // Plain Avro (without logical types) has no timestamp type, so anything other than
                    // a number or boolean is converted to a String via toString().
                    rec.put(i - 1, value.toString());
                }
            }
            dataFileWriter.append(rec);
            nrOfRows += 1;
            if (options.maxRows > 0 && nrOfRows == options.maxRows) {
                break;
            }
        }
        return nrOfRows;
    }
}
Also used: NClob(java.sql.NClob) Schema(org.apache.avro.Schema) CharBuffer(java.nio.CharBuffer) Reader(java.io.Reader) StringReader(java.io.StringReader) ResultSetMetaData(java.sql.ResultSetMetaData) GenericRecord(org.apache.avro.generic.GenericRecord) Blob(java.sql.Blob) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ByteBuffer(java.nio.ByteBuffer) BigDecimal(java.math.BigDecimal) Date(java.util.Date) LocalDate(java.time.LocalDate) BigInteger(java.math.BigInteger) Clob(java.sql.Clob)
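
The stream written by convertToAvroStream embeds its schema in the Avro container header, so it can be read back with no external schema. A minimal read-back sketch (not part of NiFi; the method name and byte array are illustrative, and it uses org.apache.avro.file.DataFileStream with org.apache.avro.generic.GenericDatumReader):

static void dumpAvroStream(final byte[] avroBytes) throws IOException {
    // DataFileStream reads the container header, recovers the embedded schema,
    // and then iterates over the records that DataFileWriter appended.
    try (final DataFileStream<GenericRecord> reader =
            new DataFileStream<>(new ByteArrayInputStream(avroBytes), new GenericDatumReader<>())) {
        System.out.println("Schema: " + reader.getSchema());
        for (final GenericRecord record : reader) {
            System.out.println(record);
        }
    }
}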

Example 72 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project nifi by apache.

From the class JdbcCommon, the method createEmptyAvroStream:

public static void createEmptyAvroStream(final OutputStream outStream) throws IOException {
    final FieldAssembler<Schema> builder = SchemaBuilder.record("NiFi_ExecuteSQL_Record").namespace("any.data").fields();
    final Schema schema = builder.endRecord();
    final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    try (final DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(schema, outStream);
    }
}
Also used: Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord)
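
Even with no records appended, the result is a valid Avro container: create() writes the header containing the embedded, field-less schema, and closing the writer finalizes the stream. A quick verification sketch (assuming JdbcCommon is on the classpath, and using DataFileStream as in the read-back sketch above):

final ByteArrayOutputStream out = new ByteArrayOutputStream();
JdbcCommon.createEmptyAvroStream(out);
try (final DataFileStream<GenericRecord> in =
        new DataFileStream<>(new ByteArrayInputStream(out.toByteArray()), new GenericDatumReader<>())) {
    System.out.println(in.getSchema());  // NiFi_ExecuteSQL_Record with an empty field list
    System.out.println(in.hasNext());    // false: a header but no data blocks
}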

Example 73 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project nifi by apache.

From the class TestAvroReaderWithEmbeddedSchema, the method testDataTypes:

@Test
public void testDataTypes() throws IOException, MalformedRecordException, SchemaNotFoundException {
    final List<Field> accountFields = new ArrayList<>();
    accountFields.add(new Field("accountId", Schema.create(Type.LONG), null, (Object) null));
    accountFields.add(new Field("accountName", Schema.create(Type.STRING), null, (Object) null));
    final Schema accountSchema = Schema.createRecord("account", null, null, false);
    accountSchema.setFields(accountFields);
    final List<Field> catFields = new ArrayList<>();
    catFields.add(new Field("catTailLength", Schema.create(Type.INT), null, (Object) null));
    catFields.add(new Field("catName", Schema.create(Type.STRING), null, (Object) null));
    final Schema catSchema = Schema.createRecord("cat", null, null, false);
    catSchema.setFields(catFields);
    final List<Field> dogFields = new ArrayList<>();
    dogFields.add(new Field("dogTailLength", Schema.create(Type.INT), null, (Object) null));
    dogFields.add(new Field("dogName", Schema.create(Type.STRING), null, (Object) null));
    final Schema dogSchema = Schema.createRecord("dog", null, null, false);
    dogSchema.setFields(dogFields);
    final List<Field> fields = new ArrayList<>();
    fields.add(new Field("name", Schema.create(Type.STRING), null, (Object) null));
    fields.add(new Field("age", Schema.create(Type.INT), null, (Object) null));
    fields.add(new Field("balance", Schema.create(Type.DOUBLE), null, (Object) null));
    fields.add(new Field("rate", Schema.create(Type.FLOAT), null, (Object) null));
    fields.add(new Field("debt", Schema.create(Type.BOOLEAN), null, (Object) null));
    fields.add(new Field("nickname", Schema.create(Type.NULL), null, (Object) null));
    fields.add(new Field("binary", Schema.create(Type.BYTES), null, (Object) null));
    fields.add(new Field("fixed", Schema.createFixed("fixed", null, null, 5), null, (Object) null));
    fields.add(new Field("map", Schema.createMap(Schema.create(Type.STRING)), null, (Object) null));
    fields.add(new Field("array", Schema.createArray(Schema.create(Type.LONG)), null, (Object) null));
    fields.add(new Field("account", accountSchema, null, (Object) null));
    fields.add(new Field("desiredbalance", // test union of NULL and other type with no value
    Schema.createUnion(Arrays.asList(Schema.create(Type.NULL), Schema.create(Type.DOUBLE))), null, (Object) null));
    fields.add(new Field("dreambalance", // test union of NULL and other type with a value
    Schema.createUnion(Arrays.asList(Schema.create(Type.NULL), Schema.create(Type.DOUBLE))), null, (Object) null));
    fields.add(new Field("favAnimal", Schema.createUnion(Arrays.asList(catSchema, dogSchema)), null, (Object) null));
    fields.add(new Field("otherFavAnimal", Schema.createUnion(Arrays.asList(catSchema, dogSchema)), null, (Object) null));
    final Schema schema = Schema.createRecord("record", null, null, false);
    schema.setFields(fields);
    final byte[] source;
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    final Map<String, String> map = new HashMap<>();
    map.put("greeting", "hello");
    map.put("salutation", "good-bye");
    final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    try (final DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter);
        final DataFileWriter<GenericRecord> writer = dataFileWriter.create(schema, baos)) {
        final GenericRecord record = new GenericData.Record(schema);
        record.put("name", "John");
        record.put("age", 33);
        record.put("balance", 1234.56D);
        record.put("rate", 0.045F);
        record.put("debt", false);
        record.put("binary", ByteBuffer.wrap("binary".getBytes(StandardCharsets.UTF_8)));
        record.put("fixed", new GenericData.Fixed(Schema.create(Type.BYTES), "fixed".getBytes(StandardCharsets.UTF_8)));
        record.put("map", map);
        record.put("array", Arrays.asList(1L, 2L));
        record.put("dreambalance", 10_000_000.00D);
        final GenericRecord accountRecord = new GenericData.Record(accountSchema);
        accountRecord.put("accountId", 83L);
        accountRecord.put("accountName", "Checking");
        record.put("account", accountRecord);
        final GenericRecord catRecord = new GenericData.Record(catSchema);
        catRecord.put("catTailLength", 1);
        catRecord.put("catName", "Meow");
        record.put("otherFavAnimal", catRecord);
        final GenericRecord dogRecord = new GenericData.Record(dogSchema);
        dogRecord.put("dogTailLength", 14);
        dogRecord.put("dogName", "Fido");
        record.put("favAnimal", dogRecord);
        writer.append(record);
    }
    source = baos.toByteArray();
    try (final InputStream in = new ByteArrayInputStream(source)) {
        final AvroRecordReader reader = new AvroReaderWithEmbeddedSchema(in);
        final RecordSchema recordSchema = reader.getSchema();
        assertEquals(15, recordSchema.getFieldCount());
        assertEquals(RecordFieldType.STRING, recordSchema.getDataType("name").get().getFieldType());
        assertEquals(RecordFieldType.INT, recordSchema.getDataType("age").get().getFieldType());
        assertEquals(RecordFieldType.DOUBLE, recordSchema.getDataType("balance").get().getFieldType());
        assertEquals(RecordFieldType.FLOAT, recordSchema.getDataType("rate").get().getFieldType());
        assertEquals(RecordFieldType.BOOLEAN, recordSchema.getDataType("debt").get().getFieldType());
        assertEquals(RecordFieldType.STRING, recordSchema.getDataType("nickname").get().getFieldType());
        assertEquals(RecordFieldType.ARRAY, recordSchema.getDataType("binary").get().getFieldType());
        assertEquals(RecordFieldType.ARRAY, recordSchema.getDataType("fixed").get().getFieldType());
        assertEquals(RecordFieldType.MAP, recordSchema.getDataType("map").get().getFieldType());
        assertEquals(RecordFieldType.ARRAY, recordSchema.getDataType("array").get().getFieldType());
        assertEquals(RecordFieldType.RECORD, recordSchema.getDataType("account").get().getFieldType());
        assertEquals(RecordFieldType.DOUBLE, recordSchema.getDataType("desiredbalance").get().getFieldType());
        assertEquals(RecordFieldType.DOUBLE, recordSchema.getDataType("dreambalance").get().getFieldType());
        assertEquals(RecordFieldType.CHOICE, recordSchema.getDataType("favAnimal").get().getFieldType());
        assertEquals(RecordFieldType.CHOICE, recordSchema.getDataType("otherFavAnimal").get().getFieldType());
        final Object[] values = reader.nextRecord().getValues();
        assertEquals(15, values.length);
        assertEquals("John", values[0]);
        assertEquals(33, values[1]);
        assertEquals(1234.56D, values[2]);
        assertEquals(0.045F, values[3]);
        assertEquals(false, values[4]);
        assertEquals(null, values[5]);
        assertArrayEquals(toObjectArray("binary".getBytes(StandardCharsets.UTF_8)), (Object[]) values[6]);
        assertArrayEquals(toObjectArray("fixed".getBytes(StandardCharsets.UTF_8)), (Object[]) values[7]);
        assertEquals(map, values[8]);
        assertArrayEquals(new Object[] { 1L, 2L }, (Object[]) values[9]);
        final Map<String, Object> accountValues = new HashMap<>();
        accountValues.put("accountName", "Checking");
        accountValues.put("accountId", 83L);
        final List<RecordField> accountRecordFields = new ArrayList<>();
        accountRecordFields.add(new RecordField("accountId", RecordFieldType.LONG.getDataType(), false));
        accountRecordFields.add(new RecordField("accountName", RecordFieldType.STRING.getDataType(), false));
        final RecordSchema accountRecordSchema = new SimpleRecordSchema(accountRecordFields);
        final Record mapRecord = new MapRecord(accountRecordSchema, accountValues);
        assertEquals(mapRecord, values[10]);
        assertNull(values[11]);
        assertEquals(10_000_000.0D, values[12]);
        final Map<String, Object> dogMap = new HashMap<>();
        dogMap.put("dogName", "Fido");
        dogMap.put("dogTailLength", 14);
        final List<RecordField> dogRecordFields = new ArrayList<>();
        dogRecordFields.add(new RecordField("dogTailLength", RecordFieldType.INT.getDataType(), false));
        dogRecordFields.add(new RecordField("dogName", RecordFieldType.STRING.getDataType(), false));
        final RecordSchema dogRecordSchema = new SimpleRecordSchema(dogRecordFields);
        final Record dogRecord = new MapRecord(dogRecordSchema, dogMap);
        assertEquals(dogRecord, values[13]);
        final Map<String, Object> catMap = new HashMap<>();
        catMap.put("catName", "Meow");
        catMap.put("catTailLength", 1);
        final List<RecordField> catRecordFields = new ArrayList<>();
        catRecordFields.add(new RecordField("catTailLength", RecordFieldType.INT.getDataType(), false));
        catRecordFields.add(new RecordField("catName", RecordFieldType.STRING.getDataType(), false));
        final RecordSchema catRecordSchema = new SimpleRecordSchema(catRecordFields);
        final Record catRecord = new MapRecord(catRecordSchema, catMap);
        assertEquals(catRecord, values[14]);
    }
}
Also used: SimpleRecordSchema(org.apache.nifi.serialization.SimpleRecordSchema) RecordField(org.apache.nifi.serialization.record.RecordField) HashMap(java.util.HashMap) RecordSchema(org.apache.nifi.serialization.record.RecordSchema) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) Field(org.apache.avro.Schema.Field) Record(org.apache.nifi.serialization.record.Record) MapRecord(org.apache.nifi.serialization.record.MapRecord) GenericRecord(org.apache.avro.generic.GenericRecord) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) ByteArrayOutputStream(java.io.ByteArrayOutputStream) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericData(org.apache.avro.generic.GenericData) Test(org.junit.Test)
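
The schema construction in this test is deliberately explicit. The same record schemas can be built more compactly with the fluent SchemaBuilder API seen in Example 72; an equivalent sketch for the account schema (not taken from the test):

final Schema accountSchema = SchemaBuilder.record("account")
    .fields()
    .requiredLong("accountId")      // non-null long field
    .requiredString("accountName")  // non-null string field
    .endRecord();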

Example 74 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project kafka-connect-storage-cloud by confluentinc.

From the class S3SinkTaskTest, the method calcByteSize:

private int calcByteSize(List<SinkRecord> sinkRecords) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataFileWriter<Object> writer = new DataFileWriter<>(new GenericDatumWriter<>());
    AvroData avroData = new AvroData(1);
    boolean writerInit = false;
    for (SinkRecord sinkRecord : sinkRecords) {
        if (!writerInit) {
            writer.create(avroData.fromConnectSchema(sinkRecord.valueSchema()), baos);
            writerInit = true;
        }
        writer.append(avroData.fromConnectData(sinkRecord.valueSchema(), sinkRecord.value()));
    }
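    // Note: DataFileWriter buffers appended records until a sync boundary or an explicit
    // flush; without a writer.flush() before reading baos.size() (compare putRecords in
    // Example 75), the size returned here can under-count still-buffered records.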
    return baos.size();
}
Also used: DataFileWriter(org.apache.avro.file.DataFileWriter) ByteArrayOutputStream(java.io.ByteArrayOutputStream) AvroData(io.confluent.connect.avro.AvroData) SinkRecord(org.apache.kafka.connect.sink.SinkRecord)

Example 75 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project kafka-connect-storage-cloud by confluentinc.

From the class AvroUtils, the method putRecords:

public static byte[] putRecords(Collection<SinkRecord> records, AvroData avroData) throws IOException {
    final DataFileWriter<Object> writer = new DataFileWriter<>(new GenericDatumWriter<>());
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    Schema schema = null;
    for (SinkRecord record : records) {
        if (schema == null) {
            schema = record.valueSchema();
            org.apache.avro.Schema avroSchema = avroData.fromConnectSchema(schema);
            writer.create(avroSchema, out);
        }
        Object value = avroData.fromConnectData(schema, record.value());
        // AvroData wraps primitive types in NonRecordContainers so that their schema can be
        // included; unwrap them to just their value so these types are handled properly.
        if (value instanceof NonRecordContainer) {
            value = ((NonRecordContainer) value).getValue();
        }
        writer.append(value);
    }
    writer.flush();
    return out.toByteArray();
}
Also used: DataFileWriter(org.apache.avro.file.DataFileWriter) Schema(org.apache.kafka.connect.data.Schema) ByteArrayOutputStream(java.io.ByteArrayOutputStream) SinkRecord(org.apache.kafka.connect.sink.SinkRecord) NonRecordContainer(io.confluent.kafka.serializers.NonRecordContainer)
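
Because putRecords returns the finished container bytes, they can be re-read with a DataFileReader over a SeekableByteArrayInput (both from org.apache.avro.file). A minimal sketch with an illustrative method name:

static void printRecords(final byte[] avroBytes) throws IOException {
    // DataFileReader requires a seekable input; SeekableByteArrayInput wraps the byte array.
    try (final DataFileReader<Object> reader =
            new DataFileReader<>(new SeekableByteArrayInput(avroBytes), new GenericDatumReader<>())) {
        for (final Object datum : reader) {
            System.out.println(datum);
        }
    }
}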

Aggregations

DataFileWriter (org.apache.avro.file.DataFileWriter): 102
GenericRecord (org.apache.avro.generic.GenericRecord): 58
Schema (org.apache.avro.Schema): 50
GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter): 47
File (java.io.File): 38
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 22
IOException (java.io.IOException): 22
GenericData (org.apache.avro.generic.GenericData): 17
FileOutputStream (java.io.FileOutputStream): 15
Test (org.junit.Test): 14
HashMap (java.util.HashMap): 11
InputStream (java.io.InputStream): 10
SpecificDatumWriter (org.apache.avro.specific.SpecificDatumWriter): 10
ArrayList (java.util.ArrayList): 9
Path (org.apache.hadoop.fs.Path): 9
ByteArrayInputStream (java.io.ByteArrayInputStream): 8
OutputStream (java.io.OutputStream): 8
ByteBuffer (java.nio.ByteBuffer): 7
GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 7
MockFlowFile (org.apache.nifi.util.MockFlowFile): 7