use of org.apache.avro.generic.GenericRecord in project flink by apache.
the class RollingSinkITCase method testNonRollingAvroKeyValueWithCompressionWriter.
/**
* This tests {@link AvroKeyValueSinkWriter}
* with non-rolling output and with compression.
*/
@Test
public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
    final int NUM_ELEMENTS = 20;
    final int PARALLELISM = 2;
    final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);
    DataStream<Tuple2<Integer, String>> source = env.addSource(new TestSourceFunction(NUM_ELEMENTS))
            .broadcast()
            .filter(new OddEvenFilter());
    Map<String, String> properties = new HashMap<>();
    Schema keySchema = Schema.create(Type.INT);
    Schema valueSchema = Schema.create(Type.STRING);
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);
    RollingSink<Tuple2<Integer, String>> sink = new RollingSink<Tuple2<Integer, String>>(outPath)
            .setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties))
            .setBucketer(new NonRollingBucketer())
            .setPartPrefix("part")
            .setPendingPrefix("")
            .setPendingSuffix("");
    source.addSink(sink);
    env.execute("RollingSink Avro KeyValue Writer Test");
    GenericData.setStringType(valueSchema, StringType.String);
    Schema elementSchema = AvroKeyValue.getSchema(keySchema, valueSchema);
    FSDataInputStream inStream = dfs.open(new Path(outPath + "/part-0-0"));
    SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<GenericRecord>(elementSchema);
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(inStream, elementReader);
    for (int i = 0; i < NUM_ELEMENTS; i += 2) {
        AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValue<Integer, String>(dataFileStream.next());
        int key = wrappedEntry.getKey().intValue();
        Assert.assertEquals(i, key);
        String value = wrappedEntry.getValue();
        Assert.assertEquals("message #" + i, value);
    }
    dataFileStream.close();
    inStream.close();
    inStream = dfs.open(new Path(outPath + "/part-1-0"));
    dataFileStream = new DataFileStream<GenericRecord>(inStream, elementReader);
    for (int i = 1; i < NUM_ELEMENTS; i += 2) {
        AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValue<Integer, String>(dataFileStream.next());
        int key = wrappedEntry.getKey().intValue();
        Assert.assertEquals(i, key);
        String value = wrappedEntry.getValue();
        Assert.assertEquals("message #" + i, value);
    }
    dataFileStream.close();
    inStream.close();
}
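The test references two helpers, TestSourceFunction and OddEvenFilter, that are defined elsewhere in RollingSinkITCase and not shown in this excerpt. A minimal, hypothetical sketch of what they might look like, assuming the source emits (i, "message #" + i) pairs and the filter routes records by parity of the key:

import org.apache.flink.api.common.functions.RichFilterFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

// Hypothetical stand-ins for the helpers referenced by the test above.
class TestSourceFunction implements SourceFunction<Tuple2<Integer, String>> {
    private final int numElements;
    private volatile boolean running = true;

    TestSourceFunction(int numElements) {
        this.numElements = numElements;
    }

    @Override
    public void run(SourceContext<Tuple2<Integer, String>> ctx) throws Exception {
        for (int i = 0; i < numElements && running; i++) {
            ctx.collect(Tuple2.of(i, "message #" + i));
        }
    }

    @Override
    public void cancel() {
        running = false;
    }
}

// Each of the two parallel subtasks keeps only "its" parity, so part-0-0 ends up
// with the even keys and part-1-0 with the odd keys, matching the assertions above.
class OddEvenFilter extends RichFilterFunction<Tuple2<Integer, String>> {
    @Override
    public boolean filter(Tuple2<Integer, String> value) {
        int subtask = getRuntimeContext().getIndexOfThisSubtask();
        return (subtask == 0) ? (value.f0 % 2 == 0) : (value.f0 % 2 == 1);
    }
}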
use of org.apache.avro.generic.GenericRecord in project flink by apache.
the class AvroRecordInputFormatTest method testDeserialisationGenericRecordReuseAvroValueFalse.
/**
* Tests that the AvroInputFormat is able to properly read data from an Avro
* file as a GenericRecord.
*
* @throws IOException if there is an error
*/
@Test
public void testDeserialisationGenericRecordReuseAvroValueFalse() throws IOException {
    Configuration parameters = new Configuration();
    AvroInputFormat<GenericRecord> format = new AvroInputFormat<GenericRecord>(new Path(testFile.getAbsolutePath()), GenericRecord.class);
    format.configure(parameters);
    format.setReuseAvroValue(false);
    doTestDeserializationGenericRecord(format, parameters);
}
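Setting setReuseAvroValue(false) matters when records are kept around rather than consumed immediately: with reuse enabled, the format hands back the same mutable instance on every call. A minimal sketch of driving the format by hand through the standard Flink InputFormat lifecycle; the file path and the null reuse argument are assumptions of this sketch, not part of the test above:

// Assumed: an Avro container file at /tmp/users.avro.
AvroInputFormat<GenericRecord> format =
        new AvroInputFormat<GenericRecord>(new Path("/tmp/users.avro"), GenericRecord.class);
format.setReuseAvroValue(false);
format.configure(new Configuration());

List<GenericRecord> collected = new ArrayList<>();
for (FileInputSplit split : format.createInputSplits(1)) {
    format.open(split);
    while (!format.reachedEnd()) {
        // With reuse disabled, each call yields a distinct GenericRecord,
        // so keeping references in the list is safe.
        GenericRecord record = format.nextRecord(null);
        if (record != null) {
            collected.add(record);
        }
    }
    format.close();
}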
use of org.apache.avro.generic.GenericRecord in project databus by linkedin.
the class BootstrapAvroFileEventReader method readEventsFromHadoopFiles.
private EventReaderSummary readEventsFromHadoopFiles(OracleTriggerMonitoredSourceInfo sourceInfo, File avroSeedDir, Long windowSCN) {
    DataFileReader<GenericRecord> reader = null;
    File[] files = avroSeedDir.listFiles();
    List<File> fileList = Arrays.asList(files);
    Collections.sort(fileList);
    long numRead = 0;
    long prevNumRead = 0;
    long numBytes = 0;
    long timestamp = System.currentTimeMillis();
    long timeStart = timestamp;
    long lastTime = timestamp;
    long commitInterval = _config.getCommitInterval();
    long totLatency = 0;
    GenericRecord record = null;
    RateMonitor seedingRate = new RateMonitor("Seeding Rate");
    seedingRate.start();
    seedingRate.suspend();
    long startRowId = _lastRows.get(sourceInfo.getEventView());
    LOG.info("Last Known Row Id is :" + startRowId);
    boolean resumeSeedingRate = true;
    for (File avroSeedFile : files) {
        if (!avroSeedFile.isFile())
            continue;
        LOG.info("Seeding from File : " + avroSeedFile);
        try {
            reader = new DataFileReader<GenericRecord>(avroSeedFile, new GenericDatumReader<GenericRecord>());
        } catch (IOException e) {
            LOG.fatal("Failed to bootstrap from file " + avroSeedFile.getAbsolutePath(), e);
            throw new RuntimeException("Failed to bootstrap from file " + avroSeedFile.getAbsolutePath(), e);
        }
        try {
            boolean committed = false;
            for (GenericRecord hdfsRecord : reader) {
                record = hdfsRecord;
                committed = false;
                numRead++;
                if (numRead < startRowId)
                    continue;
                if (resumeSeedingRate) {
                    seedingRate.resume();
                    resumeSeedingRate = false;
                }
                seedingRate.tick();
                //LOG.info("Read record :" + record);
                long start = System.nanoTime();
                long eventSize = sourceInfo.getFactory().createAndAppendEvent(windowSCN, timestamp, hdfsRecord, _bootstrapEventBuffer, false, null);
                numBytes += eventSize;
                long latency = System.nanoTime() - start;
                totLatency += latency;
                if (numRead % commitInterval == 0) {
                    _bootstrapEventBuffer.endEvents(numRead, timestamp, null);
                    _bootstrapEventBuffer.startEvents();
                    long procTime = totLatency / 1000000000;
                    long currTime = System.currentTimeMillis();
                    long diff = (currTime - lastTime) / 1000;
                    long timeSinceStart = (currTime - timeStart) / 1000;
                    LOG.info("Processed " + commitInterval + " rows in " + diff + " seconds, Avro Processing Time (seconds) so far :" + (procTime) + ",Seconds elapsed since start :" + (timeSinceStart) + ",Overall Row Rate:" + seedingRate.getRate() + ", NumRows Fetched so far:" + numRead + ". TotalEventSize :" + numBytes);
                    lastTime = currTime;
                    seedingRate.resume();
                    committed = true;
                }
            }
            if (!committed) {
                _bootstrapEventBuffer.endEvents(numRead, timestamp, null);
                _bootstrapEventBuffer.startEvents();
                long procTime = totLatency / 1000000000;
                long currTime = System.currentTimeMillis();
                long diff = (currTime - lastTime) / 1000;
                long timeSinceStart = (currTime - timeStart) / 1000;
                LOG.info("Completed Seeding from : " + avroSeedFile + ", Processed " + commitInterval + " rows in " + diff + " seconds, Avro Processing Time (seconds) so far :" + (procTime) + ",Seconds elapsed since start :" + (timeSinceStart) + ",Overall Row Rate:" + seedingRate.getRate() + ", NumRows Fetched so far:" + numRead + ". TotalEventSize :" + numBytes);
                lastTime = currTime;
                seedingRate.resume();
            }
        } catch (Exception e) {
            LOG.fatal("NumRead :" + numRead + ", Got Exception while processing generic record :" + record, e);
            throw new RuntimeException(e);
        }
        LOG.info("Processed " + (numRead - prevNumRead) + " rows of Source: " + sourceInfo.getSourceName() + " from file " + avroSeedFile);
        prevNumRead = numRead;
    }
    long timeEnd = System.currentTimeMillis();
    long elapsedMin = (timeEnd - timeStart) / (MILLISEC_TO_MIN);
    LOG.info("Processed " + numRead + " rows of Source: " + sourceInfo.getSourceName() + " in " + elapsedMin + " minutes");
    return new EventReaderSummary(sourceInfo.getSourceId(), sourceInfo.getSourceName(), -1, (int) numRead, numBytes, (timeEnd - timeStart), (timeEnd - timeStart) / numRead, 0, 0, 0);
}
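The heart of the loop above is Avro's DataFileReader, which is Iterable over GenericRecords and picks up the writer schema from the container file's header. A self-contained sketch of just that read loop, with an assumed local file name and without the databus event buffer and rate monitoring:

import java.io.File;
import java.io.IOException;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class AvroSeedFileDump {
    public static void main(String[] args) throws IOException {
        // Assumed input: an Avro container file named users.avro in the working directory.
        File avroFile = new File("users.avro");
        try (DataFileReader<GenericRecord> reader =
                new DataFileReader<>(avroFile, new GenericDatumReader<GenericRecord>())) {
            long numRead = 0;
            for (GenericRecord record : reader) {
                numRead++;
                System.out.println(numRead + ": " + record);
            }
        }
    }
}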
use of org.apache.avro.generic.GenericRecord in project databus by linkedin.
the class BootstrapTableReaderV2 method execute.
public void execute() throws SQLException {
    ResultSet rs = null;
    boolean hasMore = true;
    long curId = -1;
    try {
        _log.info("Executing query : " + _queryString);
        ByteBuffer buffer = ByteBuffer.allocateDirect(MAX_EVENT_SIZE);
        int count = 0;
        DbusEventInternalReadable event = _eventFactory.createReadOnlyDbusEventFromBuffer(buffer, 0);
        _eventHandler.onStart(_queryString);
        while (hasMore) {
            _log.debug("currentId=" + curId);
            _query.setLong(1, curId);
            rs = _query.executeQuery();
            hasMore = false;
            while (rs.next()) {
                hasMore = true;
                buffer.clear();
                buffer.put(rs.getBytes("val"));
                curId = rs.getLong("id");
                event = event.reset(buffer, 0);
                GenericRecord record = _decoder.getGenericRecord(event);
                if (checkFilters(event, record)) {
                    _eventHandler.onRecord(event, record);
                }
                count++;
            }
            rs.close();
        }
        _eventHandler.onEnd(count);
    } finally {
        DBHelper.close(rs, _query, _jdbcConn);
    }
}
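The outer while/inner while structure is a keyset-pagination loop: each pass re-executes the prepared query with the last id seen, and the loop stops once a pass returns no rows. A stripped-down sketch of the same pattern in plain JDBC; the snapshot table name, its id/val columns, and the LIMIT clause are assumptions for illustration, not the actual databus query:

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

// Hypothetical sketch of the pagination loop used above.
static void readSnapshot(Connection conn) throws SQLException {
    String sql = "SELECT id, val FROM snapshot WHERE id > ? ORDER BY id LIMIT 1000";
    long lastId = -1;
    boolean hasMore = true;
    try (PreparedStatement stmt = conn.prepareStatement(sql)) {
        while (hasMore) {
            stmt.setLong(1, lastId);
            hasMore = false;
            try (ResultSet rs = stmt.executeQuery()) {
                while (rs.next()) {
                    hasMore = true;            // got at least one row, so fetch another page afterwards
                    lastId = rs.getLong("id"); // resume point for the next query
                    byte[] valueBytes = rs.getBytes("val");
                    // valueBytes would then be handed to an Avro decoder such as the
                    // getGenericRecord(byte[], Schema, GenericRecord) method shown in the next snippet.
                }
            }
        }
    }
}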
use of org.apache.avro.generic.GenericRecord in project databus by linkedin.
the class DbusEventAvroDecoder method getGenericRecord.
/**
* Creates a generic record from a byte array.
*
* @param valueBytes byte[] to be converted to generic record
* @param schema schema of the input record
* @param reuse an existing GenericRecord to populate and return, or null to create a new one
* @return GenericRecord for the given byte array + schema combo
*
* TODO: Add a getGenericRecord(InputStream data, Schema schema, GenericRecord reuse)
* variant; it can use DecoderFactory.createBinaryDecoder(InputStream, BinaryDecorder)
* and will allow us to use something like org.apache.avro.ipc.ByteBufferInputStream
* to avoid the data copy to a temp array. (https://rb.corp.linkedin.com/r/172879/)
*/
public GenericRecord getGenericRecord(byte[] valueBytes, Schema schema, GenericRecord reuse) {
    GenericRecord result = null;
    try {
        binDecoder.set(DecoderFactory.defaultFactory().createBinaryDecoder(valueBytes, binDecoder.get()));
        GenericDatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
        result = reader.read(reuse, binDecoder.get());
        return result;
    } catch (Exception ex) {
        // IOException, ArrayIndexOutOfBoundsException, ...
        LOG.error("getGenericRecord Avro error: " + ex.getMessage(), ex);
    }
    return result;
}
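Outside of the databus plumbing (the ThreadLocal binDecoder and the event buffer), the decode step needs only a schema, a BinaryDecoder, and a GenericDatumReader. A self-contained round-trip sketch with a made-up two-field schema, using the current DecoderFactory.get()/EncoderFactory.get() entry points rather than the deprecated defaultFactory() seen above:

import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.EncoderFactory;

public class GenericRecordRoundTrip {
    // Assumed schema for illustration only.
    private static final Schema SCHEMA = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"KV\",\"fields\":["
            + "{\"name\":\"key\",\"type\":\"int\"},"
            + "{\"name\":\"value\",\"type\":\"string\"}]}");

    public static void main(String[] args) throws IOException {
        // Encode a record into the same kind of byte[] the decoder above consumes.
        GenericRecord record = new GenericData.Record(SCHEMA);
        record.put("key", 42);
        record.put("value", "message #42");
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
        new GenericDatumWriter<GenericRecord>(SCHEMA).write(record, encoder);
        encoder.flush();

        // Decode it back; passing a non-null 'reuse' record would avoid the allocation,
        // which is exactly what the getGenericRecord method above supports.
        BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(out.toByteArray(), null);
        GenericRecord decoded = new GenericDatumReader<GenericRecord>(SCHEMA).read(null, decoder);
        System.out.println(decoded);
    }
}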