
Example 71 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project flink by apache.

the class RollingSinkITCase method testNonRollingAvroKeyValueWithCompressionWriter.

/**
 * This tests {@link AvroKeyValueSinkWriter}
 * with non-rolling output and with compression.
 */
@Test
public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
    final int NUM_ELEMENTS = 20;
    final int PARALLELISM = 2;
    final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);
    DataStream<Tuple2<Integer, String>> source = env
            .addSource(new TestSourceFunction(NUM_ELEMENTS))
            .broadcast()
            .filter(new OddEvenFilter());
    Map<String, String> properties = new HashMap<>();
    Schema keySchema = Schema.create(Type.INT);
    Schema valueSchema = Schema.create(Type.STRING);
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);
    RollingSink<Tuple2<Integer, String>> sink = new RollingSink<Tuple2<Integer, String>>(outPath)
            .setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties))
            .setBucketer(new NonRollingBucketer())
            .setPartPrefix("part")
            .setPendingPrefix("")
            .setPendingSuffix("");
    source.addSink(sink);
    env.execute("RollingSink Avro KeyValue Writer Test");
    // Have Avro hand back java.lang.String values instead of Utf8 when reading.
    GenericData.setStringType(valueSchema, StringType.String);
    Schema elementSchema = AvroKeyValue.getSchema(keySchema, valueSchema);
    FSDataInputStream inStream = dfs.open(new Path(outPath + "/part-0-0"));
    SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<GenericRecord>(elementSchema);
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(inStream, elementReader);
    for (int i = 0; i < NUM_ELEMENTS; i += 2) {
        AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValue<Integer, String>(dataFileStream.next());
        int key = wrappedEntry.getKey().intValue();
        Assert.assertEquals(i, key);
        String value = wrappedEntry.getValue();
        Assert.assertEquals("message #" + i, value);
    }
    dataFileStream.close();
    inStream.close();
    inStream = dfs.open(new Path(outPath + "/part-1-0"));
    dataFileStream = new DataFileStream<GenericRecord>(inStream, elementReader);
    for (int i = 1; i < NUM_ELEMENTS; i += 2) {
        AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValue<Integer, String>(dataFileStream.next());
        int key = wrappedEntry.getKey().intValue();
        Assert.assertEquals(i, key);
        String value = wrappedEntry.getValue();
        Assert.assertEquals("message #" + i, value);
    }
    dataFileStream.close();
    inStream.close();
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) Schema(org.apache.avro.Schema) DataFileStream(org.apache.avro.file.DataFileStream) AvroKeyValue(org.apache.flink.streaming.connectors.fs.AvroKeyValueSinkWriter.AvroKeyValue) Tuple2(org.apache.flink.api.java.tuple.Tuple2) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) SpecificDatumReader(org.apache.avro.specific.SpecificDatumReader) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)
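
The writer/reader pair above leans on Flink test utilities. As a point of reference, here is a minimal standalone Avro sketch (no Flink involved) that writes and then reads back a Snappy-compressed key/value container file with GenericRecord. The record name, field names, and file path are illustrative assumptions, not taken from AvroKeyValueSinkWriter itself.

import java.io.File;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class KeyValueAvroSketch {
    public static void main(String[] args) throws Exception {
        // A simple int/string pair schema, analogous to the key and value schemas in the test above.
        Schema schema = SchemaBuilder.record("KeyValuePair").fields()
                .requiredInt("key")
                .requiredString("value")
                .endRecord();

        File file = new File("/tmp/kv-sketch.avro");

        // Write a Snappy-compressed container file of GenericRecords
        // (Snappy requires the snappy-java dependency on the classpath).
        DataFileWriter<GenericRecord> writer =
                new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(schema));
        writer.setCodec(CodecFactory.snappyCodec());
        writer.create(schema, file);
        for (int i = 0; i < 20; i++) {
            GenericRecord pair = new GenericData.Record(schema);
            pair.put("key", i);
            pair.put("value", "message #" + i);
            writer.append(pair);
        }
        writer.close();

        // Read the records back; the codec is discovered from the file header automatically.
        DataFileReader<GenericRecord> reader =
                new DataFileReader<GenericRecord>(file, new GenericDatumReader<GenericRecord>());
        for (GenericRecord pair : reader) {
            System.out.println(pair.get("key") + " -> " + pair.get("value"));
        }
        reader.close();
    }
}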

Example 72 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project flink by apache.

the class AvroRecordInputFormatTest method testDeserialisationGenericRecordReuseAvroValueFalse.

/**
 * Tests whether the AvroInputFormat is able to properly read data from an Avro
 * file as a GenericRecord.
 *
 * @throws IOException if there is an error
 */
@Test
public void testDeserialisationGenericRecordReuseAvroValueFalse() throws IOException {
    Configuration parameters = new Configuration();
    AvroInputFormat<GenericRecord> format = new AvroInputFormat<GenericRecord>(new Path(testFile.getAbsolutePath()), GenericRecord.class);
    format.configure(parameters);
    format.setReuseAvroValue(false);
    doTestDeserializationGenericRecord(format, parameters);
}
Also used : Path(org.apache.flink.core.fs.Path) Configuration(org.apache.flink.configuration.Configuration) AvroInputFormat(org.apache.flink.api.java.io.AvroInputFormat) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)
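
The setReuseAvroValue(false) call is the point of this test: with reuse enabled, the format hands back the same GenericRecord instance on every call and merely refills it, while disabling reuse yields a fresh record per call. A small plain-Avro sketch of the same distinction, using a hypothetical file path:

import java.io.File;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class ReuseSketch {
    public static void main(String[] args) throws Exception {
        DataFileReader<GenericRecord> reader = new DataFileReader<GenericRecord>(
                new File("/tmp/users.avro"), new GenericDatumReader<GenericRecord>());

        GenericRecord reuse = null;
        while (reader.hasNext()) {
            // With reuse: the same record instance is refilled on each call,
            // so callers must copy any data they want to keep across iterations.
            reuse = reader.next(reuse);

            // Without reuse: reader.next() allocates a new GenericRecord per call,
            // which is the behavior setReuseAvroValue(false) selects for the input format.
            // GenericRecord fresh = reader.next();
        }
        reader.close();
    }
}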

Example 73 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project databus by linkedin.

the class BootstrapAvroFileEventReader method readEventsFromHadoopFiles.

private EventReaderSummary readEventsFromHadoopFiles(OracleTriggerMonitoredSourceInfo sourceInfo, File avroSeedDir, Long windowSCN) {
    DataFileReader<GenericRecord> reader = null;
    File[] files = avroSeedDir.listFiles();
    List<File> fileList = Arrays.asList(files);
    Collections.sort(fileList);
    long numRead = 0;
    long prevNumRead = 0;
    long numBytes = 0;
    long timestamp = System.currentTimeMillis();
    long timeStart = timestamp;
    long lastTime = timestamp;
    long commitInterval = _config.getCommitInterval();
    long totLatency = 0;
    GenericRecord record = null;
    RateMonitor seedingRate = new RateMonitor("Seeding Rate");
    seedingRate.start();
    seedingRate.suspend();
    long startRowId = _lastRows.get(sourceInfo.getEventView());
    LOG.info("Last Known Row Id is :" + startRowId);
    boolean resumeSeedingRate = true;
    for (File avroSeedFile : files) {
        if (!avroSeedFile.isFile())
            continue;
        LOG.info("Seeding from File : " + avroSeedFile);
        try {
            reader = new DataFileReader<GenericRecord>(avroSeedFile, new GenericDatumReader<GenericRecord>());
        } catch (IOException e) {
            LOG.fatal("Failed to bootstrap from file " + avroSeedFile.getAbsolutePath(), e);
            throw new RuntimeException("Failed to bootstrap from file " + avroSeedFile.getAbsolutePath(), e);
        }
        try {
            boolean committed = false;
            for (GenericRecord hdfsRecord : reader) {
                record = hdfsRecord;
                committed = false;
                numRead++;
                if (numRead < startRowId)
                    continue;
                if (resumeSeedingRate) {
                    seedingRate.resume();
                    resumeSeedingRate = false;
                }
                seedingRate.tick();
                // LOG.info("Read record :" + record);
                long start = System.nanoTime();
                long eventSize = sourceInfo.getFactory().createAndAppendEvent(windowSCN, timestamp, hdfsRecord, _bootstrapEventBuffer, false, null);
                numBytes += eventSize;
                long latency = System.nanoTime() - start;
                totLatency += latency;
                if (numRead % commitInterval == 0) {
                    _bootstrapEventBuffer.endEvents(numRead, timestamp, null);
                    _bootstrapEventBuffer.startEvents();
                    long procTime = totLatency / 1000000000;
                    long currTime = System.currentTimeMillis();
                    long diff = (currTime - lastTime) / 1000;
                    long timeSinceStart = (currTime - timeStart) / 1000;
                    LOG.info("Processed " + commitInterval + " rows in " + diff + " seconds, Avro Processing Time (seconds) so far :" + (procTime) + ",Seconds elapsed since start :" + (timeSinceStart) + ",Overall Row Rate:" + seedingRate.getRate() + ", NumRows Fetched so far:" + numRead + ". TotalEventSize :" + numBytes);
                    lastTime = currTime;
                    seedingRate.resume();
                    committed = true;
                }
            }
            if (!committed) {
                _bootstrapEventBuffer.endEvents(numRead, timestamp, null);
                _bootstrapEventBuffer.startEvents();
                long procTime = totLatency / 1000000000;
                long currTime = System.currentTimeMillis();
                long diff = (currTime - lastTime) / 1000;
                long timeSinceStart = (currTime - timeStart) / 1000;
                LOG.info("Completed Seeding from : " + avroSeedFile + ", Processed " + commitInterval + " rows in " + diff + " seconds, Avro Processing Time (seconds) so far :" + (procTime) + ",Seconds elapsed since start :" + (timeSinceStart) + ",Overall Row Rate:" + seedingRate.getRate() + ", NumRows Fetched so far:" + numRead + ". TotalEventSize :" + numBytes);
                lastTime = currTime;
                seedingRate.resume();
            }
        } catch (Exception e) {
            LOG.fatal("NumRead :" + numRead + ", Got Exception while processing generic record :" + record, e);
            throw new RuntimeException(e);
        }
        LOG.info("Processed " + (numRead - prevNumRead) + " rows of Source: " + sourceInfo.getSourceName() + " from file " + avroSeedFile);
        prevNumRead = numRead;
    }
    long timeEnd = System.currentTimeMillis();
    long elapsedMin = (timeEnd - timeStart) / (MILLISEC_TO_MIN);
    LOG.info("Processed " + numRead + " rows of Source: " + sourceInfo.getSourceName() + " in " + elapsedMin + " minutes");
    return new EventReaderSummary(sourceInfo.getSourceId(), sourceInfo.getSourceName(), -1, (int) numRead, numBytes, (timeEnd - timeStart), (timeEnd - timeStart) / numRead, 0, 0, 0);
}
Also used : GenericDatumReader(org.apache.avro.generic.GenericDatumReader) IOException(java.io.IOException) RateMonitor(com.linkedin.databus.core.util.RateMonitor) EventCreationException(com.linkedin.databus2.producers.EventCreationException) DatabusException(com.linkedin.databus2.core.DatabusException) InvalidConfigException(com.linkedin.databus.core.util.InvalidConfigException) IOException(java.io.IOException) UnsupportedKeyException(com.linkedin.databus.core.UnsupportedKeyException) EventReaderSummary(com.linkedin.databus2.producers.db.EventReaderSummary) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)
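
The reader above only consumes pre-existing seed files. For completeness, here is a minimal sketch of producing one such container file with GenericDatumWriter; the SeedRow schema, its field names, and the output path are hypothetical stand-ins for the actual Databus source schema. DataFileReader then iterates such a file exactly as the for (GenericRecord hdfsRecord : reader) loop above does.

import java.io.File;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class SeedFileSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical row schema; a real seed file would use the source's own Avro schema.
        Schema schema = SchemaBuilder.record("SeedRow").fields()
                .requiredLong("id")
                .requiredString("payload")
                .endRecord();

        DataFileWriter<GenericRecord> writer =
                new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(schema));
        writer.create(schema, new File("/tmp/seed-0001.avro"));
        for (long id = 1; id <= 1000; id++) {
            GenericRecord row = new GenericData.Record(schema);
            row.put("id", id);
            row.put("payload", "row #" + id);
            writer.append(row);
        }
        writer.close();
    }
}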

Example 74 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project databus by linkedin.

the class BootstrapTableReaderV2 method execute.

public void execute() throws SQLException {
    ResultSet rs = null;
    boolean hasMore = true;
    long curId = -1;
    try {
        _log.info("Executing query : " + _queryString);
        ByteBuffer buffer = ByteBuffer.allocateDirect(MAX_EVENT_SIZE);
        int count = 0;
        DbusEventInternalReadable event = _eventFactory.createReadOnlyDbusEventFromBuffer(buffer, 0);
        _eventHandler.onStart(_queryString);
        while (hasMore) {
            _log.debug("currentId=" + curId);
            _query.setLong(1, curId);
            rs = _query.executeQuery();
            hasMore = false;
            while (rs.next()) {
                hasMore = true;
                buffer.clear();
                buffer.put(rs.getBytes("val"));
                curId = rs.getLong("id");
                event = event.reset(buffer, 0);
                GenericRecord record = _decoder.getGenericRecord(event);
                if (checkFilters(event, record)) {
                    _eventHandler.onRecord(event, record);
                }
                count++;
            }
            rs.close();
        }
        _eventHandler.onEnd(count);
    } finally {
        DBHelper.close(rs, _query, _jdbcConn);
    }
}
Also used : DbusEventInternalReadable(com.linkedin.databus.core.DbusEventInternalReadable) ResultSet(java.sql.ResultSet) GenericRecord(org.apache.avro.generic.GenericRecord) ByteBuffer(java.nio.ByteBuffer)
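
The checkFilters(event, record) call is what keeps this loop from handing every decoded row to the event handler; its real implementation is not part of this snippet. Purely as an illustration, a record-level predicate over a GenericRecord can look like the following, where the "memberId" field name and the even-partition rule are made up:

import org.apache.avro.generic.GenericRecord;

// Illustrative only: not the actual BootstrapTableReaderV2 filter logic.
public final class RecordFilterSketch {
    public static boolean accept(GenericRecord record) {
        // Hypothetical field; returns null if the record has no such field.
        Object memberId = record.get("memberId");
        if (memberId == null) {
            return false;
        }
        // Keep only rows whose key falls in an even-numbered partition, say.
        return ((Number) memberId).longValue() % 2 == 0;
    }
}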

Example 75 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project databus by linkedin.

the class DbusEventAvroDecoder method getGenericRecord.

/**
   * Creates a generic record from a byte array.
   *
   * @param valueBytes  byte[] to be converted to generic record
   * @param schema      schema of the input record
   * @return GenericRecord for the given byte array + schema combo
   *
   * TODO:  Add a   getGenericRecord(InputStream data, Schema schema, GenericRecord reuse)
   *        variant; it can use DecoderFactory.createBinaryDecoder(InputStream, BinaryDecoder)
   *        and will allow us to use something like org.apache.avro.ipc.ByteBufferInputStream
   *        to avoid the data copy to a temp array.  (https://rb.corp.linkedin.com/r/172879/)
   */
public GenericRecord getGenericRecord(byte[] valueBytes, Schema schema, GenericRecord reuse) {
    GenericRecord result = null;
    try {
        binDecoder.set(DecoderFactory.defaultFactory().createBinaryDecoder(valueBytes, binDecoder.get()));
        GenericDatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
        result = reader.read(reuse, binDecoder.get());
        return result;
    } catch (Exception ex) {
        // IOException, ArrayIndexOutOfBoundsException, ...
        LOG.error("getGenericRecord Avro error: " + ex.getMessage(), ex);
    }
    return result;
}
Also used : GenericDatumReader(org.apache.avro.generic.GenericDatumReader) GenericRecord(org.apache.avro.generic.GenericRecord) DatabusRuntimeException(com.linkedin.databus.core.DatabusRuntimeException) IOException(java.io.IOException) JsonGenerationException(org.codehaus.jackson.JsonGenerationException) BufferUnderflowException(java.nio.BufferUnderflowException) UnsupportedEncodingException(java.io.UnsupportedEncodingException)
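
To exercise this decoder path end to end, the input bytes must be Avro binary written with a compatible schema. Below is a round-trip sketch using the current EncoderFactory/DecoderFactory entry points (the method above uses the older DecoderFactory.defaultFactory() API); the Event schema and its fields are hypothetical.

import java.io.ByteArrayOutputStream;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.EncoderFactory;

public class BinaryRoundTripSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical schema standing in for the Databus payload schema.
        Schema schema = SchemaBuilder.record("Event").fields()
                .requiredLong("id")
                .requiredString("name")
                .endRecord();

        GenericRecord original = new GenericData.Record(schema);
        original.put("id", 42L);
        original.put("name", "example");

        // Encode the record to the raw byte[] form that getGenericRecord(...) expects.
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
        new GenericDatumWriter<GenericRecord>(schema).write(original, encoder);
        encoder.flush();
        byte[] valueBytes = out.toByteArray();

        // Decode the bytes back, mirroring what the method above does with binDecoder.
        BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(valueBytes, null);
        GenericRecord decoded = new GenericDatumReader<GenericRecord>(schema).read(null, decoder);
        System.out.println(decoded);
    }
}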

Aggregations

GenericRecord (org.apache.avro.generic.GenericRecord)262 Schema (org.apache.avro.Schema)101 Test (org.junit.Test)80 GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter)46 File (java.io.File)35 IOException (java.io.IOException)34 GenericData (org.apache.avro.generic.GenericData)30 GenericDatumReader (org.apache.avro.generic.GenericDatumReader)30 ArrayList (java.util.ArrayList)29 ByteArrayOutputStream (java.io.ByteArrayOutputStream)27 DataFileWriter (org.apache.avro.file.DataFileWriter)20 HashMap (java.util.HashMap)19 ByteBuffer (java.nio.ByteBuffer)18 BinaryEncoder (org.apache.avro.io.BinaryEncoder)17 Field (org.apache.avro.Schema.Field)14 DataFileStream (org.apache.avro.file.DataFileStream)14 GenericRecordBuilder (org.apache.avro.generic.GenericRecordBuilder)14 Utf8 (org.apache.avro.util.Utf8)14 Encoder (org.apache.avro.io.Encoder)12 DatasetRepository (com.cloudera.cdk.data.DatasetRepository)11