Example 11 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project pinot by linkedin.

the class BaseClusterIntegrationTest method pushAvroIntoKafka.

public static void pushAvroIntoKafka(List<File> avroFiles, String kafkaBroker, String kafkaTopic, final byte[] header) {
    Properties properties = new Properties();
    properties.put("metadata.broker.list", kafkaBroker);
    properties.put("serializer.class", "kafka.serializer.DefaultEncoder");
    properties.put("request.required.acks", "1");
    ProducerConfig producerConfig = new ProducerConfig(properties);
    Producer<byte[], byte[]> producer = new Producer<byte[], byte[]>(producerConfig);
    for (File avroFile : avroFiles) {
        try {
            ByteArrayOutputStream outputStream = new ByteArrayOutputStream(65536);
            DataFileStream<GenericRecord> reader = AvroUtils.getAvroReader(avroFile);
            BinaryEncoder binaryEncoder = EncoderFactory.get().directBinaryEncoder(outputStream, null);
            GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(reader.getSchema());
            int recordCount = 0;
            List<KeyedMessage<byte[], byte[]>> messagesToWrite = new ArrayList<KeyedMessage<byte[], byte[]>>(10000);
            int messagesInThisBatch = 0;
            for (GenericRecord genericRecord : reader) {
                outputStream.reset();
                if (header != null && 0 < header.length) {
                    outputStream.write(header);
                }
                datumWriter.write(genericRecord, binaryEncoder);
                binaryEncoder.flush();
                byte[] bytes = outputStream.toByteArray();
                KeyedMessage<byte[], byte[]> data = new KeyedMessage<byte[], byte[]>(kafkaTopic, Longs.toByteArray(System.currentTimeMillis()), bytes);
                if (BATCH_KAFKA_MESSAGES) {
                    messagesToWrite.add(data);
                    messagesInThisBatch++;
                    if (MAX_MESSAGES_PER_BATCH <= messagesInThisBatch) {
                        LOGGER.debug("Sending a batch of {} records to Kafka", messagesInThisBatch);
                        messagesInThisBatch = 0;
                        producer.send(messagesToWrite);
                        messagesToWrite.clear();
                    }
                } else {
                    producer.send(data);
                }
                recordCount += 1;
            }
            if (BATCH_KAFKA_MESSAGES) {
                LOGGER.info("Sending last match of {} records to Kafka", messagesToWrite.size());
                producer.send(messagesToWrite);
            }
            outputStream.close();
            reader.close();
            LOGGER.info("Finished writing " + recordCount + " records from " + avroFile.getName() + " into Kafka topic " + kafkaTopic + " from file " + avroFile.getName());
            int totalRecordCount = totalAvroRecordWrittenCount.addAndGet(recordCount);
            LOGGER.info("Total records written so far " + totalRecordCount);
        } catch (Exception e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
    }
}
Also used : EncoderFactory(org.apache.avro.io.EncoderFactory) ArrayList(java.util.ArrayList) ByteArrayOutputStream(java.io.ByteArrayOutputStream) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) Properties(java.util.Properties) JSONException(org.json.JSONException) ArchiveException(org.apache.commons.compress.archivers.ArchiveException) SQLException(java.sql.SQLException) IOException(java.io.IOException) Producer(kafka.javaapi.producer.Producer) BinaryEncoder(org.apache.avro.io.BinaryEncoder) ProducerConfig(kafka.producer.ProducerConfig) GenericRecord(org.apache.avro.generic.GenericRecord) KeyedMessage(kafka.producer.KeyedMessage) File(java.io.File)
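
Note that the payload produced here is raw binary Avro (optionally prefixed with a custom header), not an Avro container file, so a consumer needs the writer's schema out of band. A minimal consumer-side sketch, assuming the consumer is given the same schema and header length (decodeMessage is a hypothetical helper, not part of the Pinot code):

private static GenericRecord decodeMessage(byte[] payload, Schema schema, int headerLength) throws IOException {
    // Requires GenericDatumReader (org.apache.avro.generic.GenericDatumReader),
    // DecoderFactory (org.apache.avro.io.DecoderFactory), BinaryDecoder (org.apache.avro.io.BinaryDecoder)
    GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
    // Skip the optional header prefix, then decode the datum with the writer's schema
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(payload, headerLength, payload.length - headerLength, null);
    return datumReader.read(null, decoder);
}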

Example 12 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project pinot by linkedin.

the class UploadRefreshDeleteIntegrationTest method generateAndUploadRandomSegment1.

protected void generateAndUploadRandomSegment1(final String segmentName, int rowCount) throws Exception {
    ThreadLocalRandom random = ThreadLocalRandom.current();
    Schema schema = new Schema.Parser().parse(new File(TestUtils.getFileFromResourceUrl(getClass().getClassLoader().getResource("dummy.avsc"))));
    GenericRecord record = new GenericData.Record(schema);
    GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<GenericRecord>(datumWriter);
    final File avroFile = new File(_tmpDir, segmentName + ".avro");
    fileWriter.create(schema, avroFile);
    for (int i = 0; i < rowCount; i++) {
        record.put(0, random.nextInt());
        fileWriter.append(record);
    }
    fileWriter.close();
    final int segmentIndex = Integer.parseInt(segmentName.split("_")[1]);
    final String TAR_GZ_FILE_EXTENSION = ".tar.gz";
    File segmentTarDir = new File(_tarsDir, segmentName);
    buildSegment(segmentTarDir, avroFile, segmentIndex, segmentName, 0);
    String segmentFileName = segmentName;
    for (String name : segmentTarDir.list()) {
        if (name.endsWith(TAR_GZ_FILE_EXTENSION)) {
            segmentFileName = name;
        }
    }
    File file = new File(segmentTarDir, segmentFileName);
    long segmentLength = file.length();
    final File segmentTarDir1 = new File(_tarsDir, segmentName);
    FileUtils.deleteQuietly(segmentTarDir);
    new Thread(new Runnable() {

        @Override
        public void run() {
            try {
                buildSegment(segmentTarDir1, avroFile, segmentIndex, segmentName, 5);
            } catch (Exception e) {
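                // Errors ignored: this background rebuild overlaps the upload below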
            }
        }
    }).start();
    FileUploadUtils.sendSegmentFile("localhost", "8998", segmentFileName, file, segmentLength, 5, 5);
    avroFile.delete();
    FileUtils.deleteQuietly(segmentTarDir);
}
Also used : Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)
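
Because DataFileWriter produces a self-describing container file, the generated input can be checked without re-supplying the schema. A minimal read-back sketch (verifyRowCount is a hypothetical helper, not part of the test):

private static void verifyRowCount(File avroFile, int expectedRows) throws IOException {
    // Requires DataFileStream (org.apache.avro.file.DataFileStream),
    // GenericDatumReader (org.apache.avro.generic.GenericDatumReader), FileInputStream (java.io.FileInputStream)
    // Container files embed their schema, so the reader needs no schema argument
    try (DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>())) {
        int rows = 0;
        for (GenericRecord record : reader) {
            rows++;
        }
        if (rows != expectedRows) {
            throw new IllegalStateException("Expected " + expectedRows + " rows but found " + rows);
        }
    }
}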

Example 13 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project pinot by linkedin.

the class UploadRefreshDeleteIntegrationTest method generateAndUploadRandomSegment.

protected void generateAndUploadRandomSegment(String segmentName, int rowCount) throws Exception {
    ThreadLocalRandom random = ThreadLocalRandom.current();
    Schema schema = new Schema.Parser().parse(new File(TestUtils.getFileFromResourceUrl(getClass().getClassLoader().getResource("dummy.avsc"))));
    GenericRecord record = new GenericData.Record(schema);
    GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<GenericRecord>(datumWriter);
    File avroFile = new File(_tmpDir, segmentName + ".avro");
    fileWriter.create(schema, avroFile);
    for (int i = 0; i < rowCount; i++) {
        record.put(0, random.nextInt());
        fileWriter.append(record);
    }
    fileWriter.close();
    int segmentIndex = Integer.parseInt(segmentName.split("_")[1]);
    File segmentTarDir = new File(_tarsDir, segmentName);
    ensureDirectoryExistsAndIsEmpty(segmentTarDir);
    ExecutorService executor = MoreExecutors.sameThreadExecutor();
    buildSegmentsFromAvro(Collections.singletonList(avroFile), executor, segmentIndex, new File(_segmentsDir, segmentName), segmentTarDir, this.tableName, false, null);
    executor.shutdown();
    executor.awaitTermination(1L, TimeUnit.MINUTES);
    for (String segmentFileName : segmentTarDir.list()) {
        File file = new File(segmentTarDir, segmentFileName);
        FileUploadUtils.sendSegmentFile("localhost", "8998", segmentFileName, file, file.length());
    }
    avroFile.delete();
    FileUtils.deleteQuietly(segmentTarDir);
}
Also used : Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ExecutorService(java.util.concurrent.ExecutorService) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)
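
Both test methods parse dummy.avsc from test resources and fill field 0 with record.put(0, random.nextInt()), so the schema's first field must be an int. A hypothetical stand-in consistent with that usage (the actual resource in Pinot may define more fields):

Schema schema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Dummy\",\"fields\":[{\"name\":\"value\",\"type\":\"int\"}]}");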

Example 14 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project beam by apache.

the class FakeJobService method writeRowsHelper.

private void writeRowsHelper(List<TableRow> rows, Schema avroSchema, String destinationPattern, int shard) throws IOException {
    String filename = destinationPattern.replace("*", String.format("%012d", shard));
    try (WritableByteChannel channel = FileSystems.create(FileSystems.matchNewResource(filename, false), MimeTypes.BINARY);
        DataFileWriter<GenericRecord> tableRowWriter = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(avroSchema)).create(avroSchema, Channels.newOutputStream(channel))) {
        for (Map<String, Object> record : rows) {
            GenericRecordBuilder genericRecordBuilder = new GenericRecordBuilder(avroSchema);
            for (Map.Entry<String, Object> field : record.entrySet()) {
                genericRecordBuilder.set(field.getKey(), field.getValue());
            }
            tableRowWriter.append(genericRecordBuilder.build());
        }
    } catch (IOException e) {
        throw new IllegalStateException(String.format("Could not create destination for extract job %s", filename), e);
    }
}
Also used : WritableByteChannel(java.nio.channels.WritableByteChannel) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) IOException(java.io.IOException) GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder) GenericRecord(org.apache.avro.generic.GenericRecord) Map(java.util.Map)
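
Unlike the plain GenericData.Record used in the Pinot examples, the GenericRecordBuilder used here fills in schema defaults for unset fields and validates on build(). A small illustration with an assumed two-field schema (not Beam code):

Schema schema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Row\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"},{\"name\":\"note\",\"type\":\"string\",\"default\":\"\"}]}");
GenericRecord row = new GenericRecordBuilder(schema).set("id", 42L).build();
// "note" takes its default ""; building without setting "id" (which has no default) would throw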

Example 15 with GenericDatumWriter

use of org.apache.avro.generic.GenericDatumWriter in project beam by apache.

the class AvroPipelineTest method populateGenericFile.

private void populateGenericFile(List<GenericRecord> genericRecords, Schema schema) throws IOException {
    FileOutputStream outputStream = new FileOutputStream(this.inputFile);
    GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<>(schema);
    try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(genericDatumWriter)) {
        dataFileWriter.create(schema, outputStream);
        for (GenericRecord record : genericRecords) {
            dataFileWriter.append(record);
        }
    }
    outputStream.close();
}
Also used : FileOutputStream(java.io.FileOutputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord)
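
A hypothetical caller for populateGenericFile, with an illustrative single-field schema (the actual test constructs its records elsewhere):

private void writeSampleInput() throws IOException {
    // Requires Schema (org.apache.avro.Schema), GenericData (org.apache.avro.generic.GenericData), Collections (java.util.Collections)
    Schema schema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Line\",\"fields\":[{\"name\":\"text\",\"type\":\"string\"}]}");
    GenericRecord record = new GenericData.Record(schema);
    record.put("text", "hello");
    populateGenericFile(Collections.singletonList(record), schema);
}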

Aggregations

GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter): 49
GenericRecord (org.apache.avro.generic.GenericRecord): 46
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 24
Schema (org.apache.avro.Schema): 23
DataFileWriter (org.apache.avro.file.DataFileWriter): 17
BinaryEncoder (org.apache.avro.io.BinaryEncoder): 17
IOException (java.io.IOException): 13
Encoder (org.apache.avro.io.Encoder): 12
File (java.io.File): 9
Test (org.junit.Test): 6
FileOutputStream (java.io.FileOutputStream): 4
GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 4
ArrayList (java.util.ArrayList): 3
Properties (java.util.Properties): 3
Producer (kafka.javaapi.producer.Producer): 3
ProducerConfig (kafka.producer.ProducerConfig): 3
GenericRecordBuilder (org.apache.avro.generic.GenericRecordBuilder): 3
JsonEncoder (org.apache.avro.io.JsonEncoder): 3
AvroAdapter (com.linkedin.data.avro.AvroAdapter): 2
DbusEventInfo (com.linkedin.databus.core.DbusEventInfo): 2
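
The counts above show the most common pairing: GenericDatumWriter with a ByteArrayOutputStream and BinaryEncoder for raw, container-less serialization. A minimal self-contained sketch of that pattern, with an illustrative schema:

private static byte[] encodeToBytes() throws IOException {
    // Requires Schema (org.apache.avro.Schema), GenericData (org.apache.avro.generic.GenericData),
    // EncoderFactory (org.apache.avro.io.EncoderFactory), BinaryEncoder (org.apache.avro.io.BinaryEncoder)
    Schema schema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Point\",\"fields\":[{\"name\":\"x\",\"type\":\"int\"},{\"name\":\"y\",\"type\":\"int\"}]}");
    GenericRecord record = new GenericData.Record(schema);
    record.put("x", 1);
    record.put("y", 2);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    // binaryEncoder buffers internally, so flush before reading the stream
    BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
    new GenericDatumWriter<GenericRecord>(schema).write(record, encoder);
    encoder.flush();
    return out.toByteArray();
}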