
Example 76 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project h2o-3 by h2oai.

the class AvroFileGenerator method generateUnionTypes.

public static File generateUnionTypes(String filename, int nrows) throws IOException {
    File parentDir = Files.createTempDir();
    File f = new File(parentDir, filename);
    DatumWriter<GenericRecord> w = new GenericDatumWriter<GenericRecord>();
    DataFileWriter<GenericRecord> dw = new DataFileWriter<GenericRecord>(w);
    // Based on SchemaBuilder javadoc:
    // * The below two field declarations are equivalent:
    // * <pre>
    // *  .name("f").type().unionOf().nullType().and().longType().endUnion().nullDefault()
    // *  .name("f").type().optional().longType()
    // * </pre>
    Schema schema = SchemaBuilder.builder().record("test_union_types").fields()
            .name("CUString").type().optional().stringType()
            .name("CUBytes").type().optional().bytesType()
            .name("CUInt").type().optional().intType()
            .name("CULong").type().optional().longType()
            .name("CUFloat").type().optional().floatType()
            .name("CUDouble").type().optional().doubleType()
            .name("CUBoolean").type().optional().booleanType()
            .endRecord();
    try {
        dw.create(schema, f);
        for (int i = 0; i < nrows; i++) {
            GenericRecord gr = new GenericData.Record(schema);
            gr.put("CUString", i == 0 ? null : String.valueOf(i));
            gr.put("CUBytes", i == 0 ? null : ByteBuffer.wrap(StringUtils.toBytes(i)));
            gr.put("CUInt", i == 0 ? null : i);
            gr.put("CULong", i == 0 ? null : Long.valueOf(i));
            gr.put("CUFloat", i == 0 ? null : Float.valueOf(i));
            gr.put("CUDouble", i == 0 ? null : Double.valueOf(i));
            gr.put("CUBoolean", i == 0 ? null : (i & 1) == 1);
            dw.append(gr);
        }
        return f;
    } finally {
        dw.close();
    }
}
Also used : DataFileWriter(org.apache.avro.file.DataFileWriter) Schema(org.apache.avro.Schema) GenericRecord(org.apache.avro.generic.GenericRecord) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) File(java.io.File)
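
For reference, a read-back sketch of my own (not part of the h2o-3 source; it assumes the AvroFileGenerator class above is on the classpath): a DataFileReader paired with a GenericDatumReader iterates the generated records, and the row-0 union values come back as plain Java nulls.

import java.io.File;
import java.io.IOException;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class UnionTypesReadBack {
    public static void main(String[] args) throws IOException {
        File f = AvroFileGenerator.generateUnionTypes("union_types.avro", 3);
        try (DataFileReader<GenericRecord> reader =
                new DataFileReader<>(f, new GenericDatumReader<GenericRecord>())) {
            for (GenericRecord gr : reader) {
                // Row 0 exercises the null branch of every [null, T] union.
                System.out.println(gr.get("CUString") + " / " + gr.get("CULong"));
            }
        }
    }
}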

Example 77 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project h2o-3 by h2oai.

the class AvroFileGenerator method generatePrimitiveTypes.

public static File generatePrimitiveTypes(String filename, int nrows) throws IOException {
    File parentDir = Files.createTempDir();
    File f = new File(parentDir, filename);
    // Write output records
    DatumWriter<GenericRecord> w = new GenericDatumWriter<GenericRecord>();
    DataFileWriter<GenericRecord> dw = new DataFileWriter<GenericRecord>(w);
    Schema schema = SchemaBuilder.builder().record("test_primitive_types").fields()
            .name("CString").type("string").noDefault()
            .name("CBytes").type("bytes").noDefault()
            .name("CInt").type("int").noDefault()
            .name("CLong").type("long").noDefault()
            .name("CFloat").type("float").noDefault()
            .name("CDouble").type("double").noDefault()
            .name("CBoolean").type("boolean").noDefault()
            .name("CNull").type("null").noDefault()
            .endRecord();
    try {
        dw.create(schema, f);
        for (int i = 0; i < nrows; i++) {
            GenericRecord gr = new GenericData.Record(schema);
            gr.put("CString", String.valueOf(i));
            gr.put("CBytes", ByteBuffer.wrap(StringUtils.toBytes(i)));
            gr.put("CInt", i);
            gr.put("CLong", Long.valueOf(i));
            gr.put("CFloat", Float.valueOf(i));
            gr.put("CDouble", Double.valueOf(i));
            gr.put("CBoolean", (i & 1) == 1);
            gr.put("CNull", null);
            dw.append(gr);
        }
        return f;
    } finally {
        dw.close();
    }
}
Also used : DataFileWriter(org.apache.avro.file.DataFileWriter) Schema(org.apache.avro.Schema) GenericRecord(org.apache.avro.generic.GenericRecord) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) File(java.io.File)
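
As a cross-check, a hedged sketch of my own showing the same record declared as JSON and parsed with Schema.Parser, which the SchemaBuilder chain above is shorthand for (abbreviated here to three of the eight fields):

import org.apache.avro.Schema;

public class PrimitiveSchemaAsJson {
    public static void main(String[] args) {
        // Fields carry no defaults, matching .type("string").noDefault() etc. above.
        String json = "{\"type\":\"record\",\"name\":\"test_primitive_types\",\"fields\":["
                + "{\"name\":\"CString\",\"type\":\"string\"},"
                + "{\"name\":\"CInt\",\"type\":\"int\"},"
                + "{\"name\":\"CNull\",\"type\":\"null\"}]}";
        Schema schema = new Schema.Parser().parse(json);
        System.out.println(schema.toString(true)); // pretty-print the parsed schema
    }
}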

Example 78 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project h2o-3 by h2oai.

the class AvroFileGenerator method generateEnumTypes.

public static File generateEnumTypes(String filename, int nrows, String[][] categories) throws IOException {
    assert categories.length == 2 : "Expected exactly 2 category columns";
    File parentDir = Files.createTempDir();
    File f = new File(parentDir, filename);
    DatumWriter<GenericRecord> w = new GenericDatumWriter<GenericRecord>();
    DataFileWriter<GenericRecord> dw = new DataFileWriter<GenericRecord>(w);
    Schema enumSchema1 = SchemaBuilder.enumeration("CEnum1").symbols(categories[0]);
    Schema enumSchema2 = SchemaBuilder.enumeration("CEnum2").symbols(categories[1]);
    Schema schema = SchemaBuilder.builder().record("test_enum_types").fields()
            .name("CEnum").type(enumSchema1).noDefault()
            .name("CUEnum").type().optional().type(enumSchema2)
            .endRecord();
    System.out.println(schema);
    int numOfCategories1 = categories[0].length;
    int numOfCategories2 = categories[1].length;
    try {
        dw.create(schema, f);
        for (int i = 0; i < nrows; i++) {
            GenericRecord gr = new GenericData.Record(schema);
            gr.put("CEnum", new GenericData.EnumSymbol(enumSchema1, categories[0][i % numOfCategories1]));
            gr.put("CUEnum", i % (numOfCategories2 + 1) == 0 ? null : new GenericData.EnumSymbol(enumSchema2, categories[1][i % numOfCategories2]));
            dw.append(gr);
        }
        return f;
    } finally {
        dw.close();
    }
}
Also used : DataFileWriter(org.apache.avro.file.DataFileWriter) Schema(org.apache.avro.Schema) GenericRecord(org.apache.avro.generic.GenericRecord) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) File(java.io.File) GenericData(org.apache.avro.generic.GenericData)
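
A usage sketch with made-up symbol sets (mine, not from the h2o-3 tests): the first array populates the required CEnum column, the second the nullable CUEnum column.

import java.io.File;
import java.io.IOException;

public class EnumTypesDemo {
    public static void main(String[] args) throws IOException {
        // CEnum cycles through the first symbol set; CUEnum cycles through the
        // second, with a null written every (numOfCategories2 + 1) rows.
        File f = AvroFileGenerator.generateEnumTypes(
                "enum_types.avro", 10,
                new String[][] { { "RED", "GREEN", "BLUE" }, { "YES", "NO" } });
        System.out.println("Wrote " + f.getAbsolutePath());
    }
}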

Example 79 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project pinot by linkedin.

the class BaseClusterIntegrationTest method pushRandomAvroIntoKafka.

public static void pushRandomAvroIntoKafka(File avroFile, String kafkaBroker, String kafkaTopic, int rowCount, Random random) {
    Properties properties = new Properties();
    properties.put("metadata.broker.list", kafkaBroker);
    properties.put("serializer.class", "kafka.serializer.DefaultEncoder");
    properties.put("request.required.acks", "1");
    ProducerConfig producerConfig = new ProducerConfig(properties);
    Producer<String, byte[]> producer = new Producer<String, byte[]>(producerConfig);
    try {
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream(65536);
        DataFileStream<GenericRecord> reader = AvroUtils.getAvroReader(avroFile);
        BinaryEncoder binaryEncoder = new EncoderFactory().directBinaryEncoder(outputStream, null);
        Schema avroSchema = reader.getSchema();
        GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(avroSchema);
        int recordCount = 0;
        int rowsRemaining = rowCount;
        int messagesInThisBatch = 0;
        while (rowsRemaining > 0) {
            int rowsInThisBatch = Math.min(rowsRemaining, MAX_MESSAGES_PER_BATCH);
            List<KeyedMessage<String, byte[]>> messagesToWrite = new ArrayList<KeyedMessage<String, byte[]>>(rowsInThisBatch);
            GenericRecord genericRecord = new GenericData.Record(avroSchema);
            for (int i = 0; i < rowsInThisBatch; ++i) {
                generateRandomRecord(genericRecord, avroSchema, random);
                outputStream.reset();
                datumWriter.write(genericRecord, binaryEncoder);
                binaryEncoder.flush();
                byte[] bytes = outputStream.toByteArray();
                KeyedMessage<String, byte[]> data = new KeyedMessage<String, byte[]>(kafkaTopic, bytes);
                if (BATCH_KAFKA_MESSAGES) {
                    messagesToWrite.add(data);
                    messagesInThisBatch++;
                    if (MAX_MESSAGES_PER_BATCH <= messagesInThisBatch) {
                        messagesInThisBatch = 0;
                        producer.send(messagesToWrite);
                        messagesToWrite.clear();
                        Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
                    }
                } else {
                    producer.send(data);
                }
                recordCount += 1;
            }
            if (BATCH_KAFKA_MESSAGES) {
                producer.send(messagesToWrite);
            }
            //        System.out.println("rowsRemaining = " + rowsRemaining);
            rowsRemaining -= rowsInThisBatch;
        }
        outputStream.close();
        reader.close();
        LOGGER.info("Finished writing " + recordCount + " records from " + avroFile.getName() + " into Kafka topic " + kafkaTopic);
        int totalRecordCount = totalAvroRecordWrittenCount.addAndGet(recordCount);
        LOGGER.info("Total records written so far " + totalRecordCount);
    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}
Also used : EncoderFactory(org.apache.avro.io.EncoderFactory) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) ByteArrayOutputStream(java.io.ByteArrayOutputStream) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) Properties(java.util.Properties) JSONException(org.json.JSONException) ArchiveException(org.apache.commons.compress.archivers.ArchiveException) SQLException(java.sql.SQLException) IOException(java.io.IOException) Producer(kafka.javaapi.producer.Producer) BinaryEncoder(org.apache.avro.io.BinaryEncoder) ProducerConfig(kafka.producer.ProducerConfig) GenericRecord(org.apache.avro.generic.GenericRecord) KeyedMessage(kafka.producer.KeyedMessage)
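
Because each Kafka message body here is a single schemaless binary-encoded record (no container-file header), a consumer must already hold the writer's schema. A minimal decode sketch of my own, assuming the same Schema object is available on the consumer side:

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DecoderFactory;

public class RandomAvroMessageDecoder {
    // Mirrors datumWriter.write(...) above: one record per message payload.
    static GenericRecord decode(byte[] payload, Schema avroSchema) throws IOException {
        BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(payload, null);
        return new GenericDatumReader<GenericRecord>(avroSchema).read(null, decoder);
    }
}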

Example 80 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project nifi by apache.

the class TestConvertAvroToORC method test_onTrigger_complex_record.

@Test
public void test_onTrigger_complex_record() throws Exception {
    Map<String, Double> mapData1 = new TreeMap<String, Double>() {

        {
            put("key1", 1.0);
            put("key2", 2.0);
        }
    };
    GenericData.Record record = TestNiFiOrcUtils.buildComplexAvroRecord(10, mapData1, "DEF", 3.0f, Arrays.asList(10, 20));
    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    fileWriter.create(record.getSchema(), out);
    fileWriter.append(record);
    // Put another record in
    Map<String, Double> mapData2 = new TreeMap<String, Double>() {

        {
            put("key1", 3.0);
            put("key2", 4.0);
        }
    };
    record = TestNiFiOrcUtils.buildComplexAvroRecord(null, mapData2, "XYZ", 4L, Arrays.asList(100, 200));
    fileWriter.append(record);
    fileWriter.flush();
    fileWriter.close();
    out.close();
    Map<String, String> attributes = new HashMap<String, String>() {

        {
            put(CoreAttributes.FILENAME.key(), "test");
        }
    };
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();
    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
    // Write the flow file out to disk, since the ORC Reader needs a path
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS complex_record " + "(myInt INT, myMap MAP<STRING, DOUBLE>, myEnum STRING, myLongOrFloat UNIONTYPE<BIGINT, FLOAT>, myIntList ARRAY<INT>)" + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
    assertEquals("2", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream("target/test1.orc");
    fos.write(resultContents);
    fos.flush();
    fos.close();
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    Object o = rows.next(null);
    assertNotNull(o);
    assertTrue(o instanceof OrcStruct);
    TypeInfo resultSchema = TestNiFiOrcUtils.buildComplexOrcSchema();
    StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);
    // Check some fields in the first row
    Object intFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myInt"));
    assertTrue(intFieldObject instanceof IntWritable);
    assertEquals(10, ((IntWritable) intFieldObject).get());
    Object mapFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myMap"));
    assertTrue(mapFieldObject instanceof Map);
    Map map = (Map) mapFieldObject;
    Object mapValue = map.get(new Text("key1"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof DoubleWritable);
    assertEquals(1.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);
    mapValue = map.get(new Text("key2"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof DoubleWritable);
    assertEquals(2.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) RecordReader(org.apache.hadoop.hive.ql.io.orc.RecordReader) Reader(org.apache.hadoop.hive.ql.io.orc.Reader) DoubleWritable(org.apache.hadoop.io.DoubleWritable) OrcStruct(org.apache.hadoop.hive.ql.io.orc.OrcStruct) FileSystem(org.apache.hadoop.fs.FileSystem) GenericRecord(org.apache.avro.generic.GenericRecord) IntWritable(org.apache.hadoop.io.IntWritable) Path(org.apache.hadoop.fs.Path) DataFileWriter(org.apache.avro.file.DataFileWriter) Text(org.apache.hadoop.io.Text) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ByteArrayOutputStream(java.io.ByteArrayOutputStream) TreeMap(java.util.TreeMap) GenericData(org.apache.avro.generic.GenericData) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) MockFlowFile(org.apache.nifi.util.MockFlowFile) FileOutputStream(java.io.FileOutputStream) Map(java.util.Map) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
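
The enqueue setup above follows a pattern worth factoring out; a sketch of my own (the helper name is hypothetical, not a NiFi API) that serializes records into an in-memory Avro container file suitable for runner.enqueue(...):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public final class AvroTestBytes {
    // Builds the same byte layout the test writes by hand: header, records, sync markers.
    static byte[] toAvroBytes(Schema schema, Iterable<GenericRecord> records) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (DataFileWriter<GenericRecord> fileWriter =
                new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
            fileWriter.create(schema, out);
            for (GenericRecord record : records) {
                fileWriter.append(record);
            }
        }
        return out.toByteArray();
    }
}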

Aggregations

GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter)127
GenericRecord (org.apache.avro.generic.GenericRecord)105
Schema (org.apache.avro.Schema)69
ByteArrayOutputStream (java.io.ByteArrayOutputStream)57
DataFileWriter (org.apache.avro.file.DataFileWriter)47
File (java.io.File)40
Test (org.junit.Test)37
IOException (java.io.IOException)29
BinaryEncoder (org.apache.avro.io.BinaryEncoder)29
MockFlowFile (org.apache.nifi.util.MockFlowFile)25
Encoder (org.apache.avro.io.Encoder)23
TestRunner (org.apache.nifi.util.TestRunner)20
HashMap (java.util.HashMap)14
ByteArrayOutputStream (org.apache.nifi.stream.io.ByteArrayOutputStream)14
GenericData (org.apache.avro.generic.GenericData)12
ByteArrayInputStream (java.io.ByteArrayInputStream)11
FileOutputStream (java.io.FileOutputStream)10
InputStream (java.io.InputStream)9
ArrayList (java.util.ArrayList)8
GenericDatumReader (org.apache.avro.generic.GenericDatumReader)8