use of org.apache.avro.generic.GenericDatumReader in project druid by druid-io.
the class AvroHadoopInputRowParserTest method buildPigAvro.
private static GenericRecord buildPigAvro(GenericRecord datum, String inputStorage, String outputStorage) throws IOException {
  final File tmpDir = Files.createTempDir();
  FileReader<GenericRecord> reader = null;
  PigServer pigServer = null;
  try {
    // 0. write avro object into temp file.
    File someAvroDatumFile = new File(tmpDir, "someAvroDatum.avro");
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>());
    dataFileWriter.create(SomeAvroDatum.getClassSchema(), someAvroDatumFile);
    dataFileWriter.append(datum);
    dataFileWriter.close();
    // 1. read avro files into Pig
    pigServer = new PigServer(ExecType.LOCAL);
    pigServer.registerQuery(String.format("A = LOAD '%s' USING %s;", someAvroDatumFile, inputStorage));
    // 2. write new avro file using AvroStorage
    File outputDir = new File(tmpDir, "output");
    pigServer.store("A", String.valueOf(outputDir), outputStorage);
    // 3. read avro object from AvroStorage
    reader = DataFileReader.openReader(new File(outputDir, "part-m-00000.avro"), new GenericDatumReader<GenericRecord>());
    return reader.next();
  } finally {
    if (pigServer != null) {
      pigServer.shutdown();
    }
    Closeables.close(reader, true);
    FileUtils.deleteDirectory(tmpDir);
  }
}
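For reference, a minimal sketch of how this helper might be invoked from a test; the fixture builder and the Pig storage function names below are assumptions for illustration, not values taken from the original test.
// Hypothetical call: round-trip a datum through Pig using piggybank's AvroStorage.
GenericRecord datum = buildSomeAvroDatum(); // assumed fixture builder
GenericRecord roundTripped = buildPigAvro(
    datum,
    "org.apache.pig.piggybank.storage.avro.AvroStorage()",
    "org.apache.pig.piggybank.storage.avro.AvroStorage()");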
use of org.apache.avro.generic.GenericDatumReader in project pinot by linkedin.
the class BaseClusterIntegrationTest method createH2SchemaAndInsertAvroFiles.
public static void createH2SchemaAndInsertAvroFiles(List<File> avroFiles, Connection connection) {
  try {
    connection.prepareCall("DROP TABLE IF EXISTS mytable").execute();
    File schemaAvroFile = avroFiles.get(0);
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
    DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(schemaAvroFile, datumReader);
    Schema schema = dataFileReader.getSchema();
    List<Schema.Field> fields = schema.getFields();
    List<String> columnNamesAndTypes = new ArrayList<String>(fields.size());
    int columnCount = 0;
    for (Schema.Field field : fields) {
      String fieldName = field.name();
      Schema.Type fieldType = field.schema().getType();
      switch (fieldType) {
        case UNION:
          List<Schema> types = field.schema().getTypes();
          String columnNameAndType;
          String typeName = types.get(0).getName();
          if (typeName.equalsIgnoreCase("int")) {
            typeName = "bigint";
          }
          if (types.size() == 1) {
            columnNameAndType = fieldName + " " + typeName + " not null";
          } else {
            columnNameAndType = fieldName + " " + typeName;
          }
          columnNamesAndTypes.add(columnNameAndType.replace("string", "varchar(128)"));
          ++columnCount;
          break;
        case ARRAY:
          String elementTypeName = field.schema().getElementType().getName();
          if (elementTypeName.equalsIgnoreCase("int")) {
            elementTypeName = "bigint";
          }
          elementTypeName = elementTypeName.replace("string", "varchar(128)");
          for (int i = 0; i < MAX_ELEMENTS_IN_MULTI_VALUE; i++) {
            columnNamesAndTypes.add(fieldName + "__MV" + i + " " + elementTypeName);
          }
          ++columnCount;
          break;
        case BOOLEAN:
        case INT:
        case LONG:
        case FLOAT:
        case DOUBLE:
        case STRING:
          String fieldTypeName = fieldType.getName();
          if (fieldTypeName.equalsIgnoreCase("int")) {
            fieldTypeName = "bigint";
          }
          columnNameAndType = fieldName + " " + fieldTypeName + " not null";
          columnNamesAndTypes.add(columnNameAndType.replace("string", "varchar(128)"));
          ++columnCount;
          break;
        case RECORD:
          // Ignore records
          continue;
        default:
          // Ignore other avro types
          LOGGER.warn("Ignoring field {} of type {}", fieldName, field.schema());
      }
    }
    connection.prepareCall("create table mytable(" + StringUtil.join(",", columnNamesAndTypes.toArray(new String[columnNamesAndTypes.size()])) + ")").execute();
    long start = System.currentTimeMillis();
    StringBuilder params = new StringBuilder("?");
    for (int i = 0; i < columnNamesAndTypes.size() - 1; i++) {
      params.append(",?");
    }
    PreparedStatement statement = connection.prepareStatement("INSERT INTO mytable VALUES (" + params.toString() + ")");
    dataFileReader.close();
    for (File avroFile : avroFiles) {
      datumReader = new GenericDatumReader<GenericRecord>();
      dataFileReader = new DataFileReader<GenericRecord>(avroFile, datumReader);
      GenericRecord record = null;
      while (dataFileReader.hasNext()) {
        record = dataFileReader.next(record);
        int jdbcIndex = 1;
        for (int avroIndex = 0; avroIndex < columnCount; ++avroIndex) {
          Object value = record.get(avroIndex);
          if (value instanceof GenericData.Array) {
            GenericData.Array array = (GenericData.Array) value;
            for (int i = 0; i < MAX_ELEMENTS_IN_MULTI_VALUE; i++) {
              if (i < array.size()) {
                value = array.get(i);
                if (value instanceof Utf8) {
                  value = value.toString();
                }
              } else {
                value = null;
              }
              statement.setObject(jdbcIndex, value);
              ++jdbcIndex;
            }
          } else {
            if (value instanceof Utf8) {
              value = value.toString();
            }
            statement.setObject(jdbcIndex, value);
            ++jdbcIndex;
          }
        }
        statement.execute();
      }
      dataFileReader.close();
    }
LOGGER.info("Insertion took " + (System.currentTimeMillis() - start));
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
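A minimal sketch of how this helper might be driven, assuming an in-memory H2 connection; the JDBC URL and Avro file paths are placeholders, not values from the original test suite.
// Hypothetical driver code; URL and file locations are assumptions.
Connection connection = DriverManager.getConnection("jdbc:h2:mem:ClusterIntegrationTest");
List<File> avroFiles = Arrays.asList(
    new File("/tmp/segments/part-0.avro"),
    new File("/tmp/segments/part-1.avro"));
createH2SchemaAndInsertAvroFiles(avroFiles, connection);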
use of org.apache.avro.generic.GenericDatumReader in project pinot by linkedin.
the class QueryGenerator method addAvroData.
/**
* Helper method to read in an Avro file and add data to the storage.
*
* @param avroFile Avro file.
*/
private void addAvroData(File avroFile) {
  // Read in records and update the values stored.
  GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  try (DataFileReader<GenericRecord> fileReader = new DataFileReader<>(avroFile, datumReader)) {
    for (GenericRecord genericRecord : fileReader) {
      for (String columnName : _columnNames) {
        Set<String> values = _columnToValueSet.get(columnName);
        // Turn the Avro value into a valid SQL String token.
        Object avroValue = genericRecord.get(columnName);
        if (avroValue != null) {
          Integer storedMaxNumElements = _multiValueColumnMaxNumElements.get(columnName);
          if (storedMaxNumElements != null) {
            // Multi-value column
            GenericData.Array array = (GenericData.Array) avroValue;
            int numElements = array.size();
            if (storedMaxNumElements < numElements) {
              _multiValueColumnMaxNumElements.put(columnName, numElements);
            }
            for (Object element : array) {
              storeAvroValueIntoValueSet(values, element);
            }
          } else {
            // Single-value column
            storeAvroValueIntoValueSet(values, avroValue);
          }
        }
      }
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
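The storeAvroValueIntoValueSet helper is referenced above but not shown. A minimal sketch of what such a helper might look like, assuming its job is the one described in the inline comment (turning an Avro value into a valid SQL string token); this is an illustration, not the project's actual implementation.
// Hypothetical helper: convert an Avro value into a SQL literal token and record it.
private static void storeAvroValueIntoValueSet(Set<String> valueSet, Object avroValue) {
  if (avroValue instanceof Utf8 || avroValue instanceof String) {
    // Quote string-like values and escape embedded single quotes.
    valueSet.add("'" + avroValue.toString().replace("'", "''") + "'");
  } else {
    // Numbers and booleans can serve as literal tokens directly.
    valueSet.add(avroValue.toString());
  }
}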
use of org.apache.avro.generic.GenericDatumReader in project databus by linkedin.
the class BootstrapAvroFileEventReader method readEventsFromHadoopFiles.
private EventReaderSummary readEventsFromHadoopFiles(OracleTriggerMonitoredSourceInfo sourceInfo, File avroSeedDir, Long windowSCN) {
  DataFileReader<GenericRecord> reader = null;
  File[] files = avroSeedDir.listFiles();
  List<File> fileList = Arrays.asList(files);
  Collections.sort(fileList);
  long numRead = 0;
  long prevNumRead = 0;
  long numBytes = 0;
  long timestamp = System.currentTimeMillis();
  long timeStart = timestamp;
  long lastTime = timestamp;
  long commitInterval = _config.getCommitInterval();
  long totLatency = 0;
  GenericRecord record = null;
  RateMonitor seedingRate = new RateMonitor("Seeding Rate");
  seedingRate.start();
  seedingRate.suspend();
  long startRowId = _lastRows.get(sourceInfo.getEventView());
  LOG.info("Last Known Row Id is :" + startRowId);
  boolean resumeSeedingRate = true;
  for (File avroSeedFile : files) {
    if (!avroSeedFile.isFile())
      continue;
    LOG.info("Seeding from File : " + avroSeedFile);
    try {
      reader = new DataFileReader<GenericRecord>(avroSeedFile, new GenericDatumReader<GenericRecord>());
    } catch (IOException e) {
      LOG.fatal("Failed to bootstrap from file " + avroSeedFile.getAbsolutePath(), e);
      throw new RuntimeException("Failed to bootstrap from file " + avroSeedFile.getAbsolutePath(), e);
    }
    try {
      boolean committed = false;
      for (GenericRecord hdfsRecord : reader) {
        record = hdfsRecord;
        committed = false;
        numRead++;
        if (numRead < startRowId)
          continue;
        if (resumeSeedingRate) {
          seedingRate.resume();
          resumeSeedingRate = false;
        }
        seedingRate.tick();
        //LOG.info("Read record :" + record);
        long start = System.nanoTime();
        long eventSize = sourceInfo.getFactory().createAndAppendEvent(windowSCN, timestamp, hdfsRecord, _bootstrapEventBuffer, false, null);
        numBytes += eventSize;
        long latency = System.nanoTime() - start;
        totLatency += latency;
        if (numRead % commitInterval == 0) {
          _bootstrapEventBuffer.endEvents(numRead, timestamp, null);
          _bootstrapEventBuffer.startEvents();
          long procTime = totLatency / 1000000000;
          long currTime = System.currentTimeMillis();
          long diff = (currTime - lastTime) / 1000;
          long timeSinceStart = (currTime - timeStart) / 1000;
          LOG.info("Processed " + commitInterval + " rows in " + diff + " seconds, Avro Processing Time (seconds) so far :" + (procTime) + ",Seconds elapsed since start :" + (timeSinceStart) + ",Overall Row Rate:" + seedingRate.getRate() + ", NumRows Fetched so far:" + numRead + ". TotalEventSize :" + numBytes);
          lastTime = currTime;
          seedingRate.resume();
          committed = true;
        }
      }
      if (!committed) {
        _bootstrapEventBuffer.endEvents(numRead, timestamp, null);
        _bootstrapEventBuffer.startEvents();
        long procTime = totLatency / 1000000000;
        long currTime = System.currentTimeMillis();
        long diff = (currTime - lastTime) / 1000;
        long timeSinceStart = (currTime - timeStart) / 1000;
        LOG.info("Completed Seeding from : " + avroSeedFile + ", Processed " + commitInterval + " rows in " + diff + " seconds, Avro Processing Time (seconds) so far :" + (procTime) + ",Seconds elapsed since start :" + (timeSinceStart) + ",Overall Row Rate:" + seedingRate.getRate() + ", NumRows Fetched so far:" + numRead + ". TotalEventSize :" + numBytes);
        lastTime = currTime;
        seedingRate.resume();
      }
    } catch (Exception e) {
      LOG.fatal("NumRead :" + numRead + ", Got Exception while processing generic record :" + record, e);
      throw new RuntimeException(e);
    }
    LOG.info("Processed " + (numRead - prevNumRead) + " rows of Source: " + sourceInfo.getSourceName() + " from file " + avroSeedFile);
    prevNumRead = numRead;
  }
  long timeEnd = System.currentTimeMillis();
  long elapsedMin = (timeEnd - timeStart) / (MILLISEC_TO_MIN);
  LOG.info("Processed " + numRead + " rows of Source: " + sourceInfo.getSourceName() + " in " + elapsedMin + " minutes");
  return new EventReaderSummary(sourceInfo.getSourceId(), sourceInfo.getSourceName(), -1, (int) numRead, numBytes, (timeEnd - timeStart), (timeEnd - timeStart) / numRead, 0, 0, 0);
}
use of org.apache.avro.generic.GenericDatumReader in project databus by linkedin.
the class DbusEventAvroDecoder method getGenericRecord.
/**
* Creates a generic record from a byte array.
*
* @param valueBytes byte[] to be converted to generic record
* @param schema schema of the input record
* @return GenericRecord for the given byte array + schema combo
*
* TODO: Add a getGenericRecord(InputStream data, Schema schema, GenericRecord reuse)
* variant; it can use DecoderFactory.createBinaryDecoder(InputStream, BinaryDecoder)
* and will allow us to use something like org.apache.avro.ipc.ByteBufferInputStream
* to avoid the data copy to a temp array. (https://rb.corp.linkedin.com/r/172879/)
*/
public GenericRecord getGenericRecord(byte[] valueBytes, Schema schema, GenericRecord reuse) {
  GenericRecord result = null;
  try {
    binDecoder.set(DecoderFactory.defaultFactory().createBinaryDecoder(valueBytes, binDecoder.get()));
    GenericDatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
    result = reader.read(reuse, binDecoder.get());
    return result;
  } catch (Exception ex) {
    // IOException, ArrayIndexOutOfBoundsException, ...
    LOG.error("getGenericRecord Avro error: " + ex.getMessage(), ex);
  }
  return result;
}
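One possible shape for the InputStream-based variant mentioned in the TODO above; a sketch only, assuming the same binDecoder holder and LOG as the byte[] overload, not the project's actual implementation.
// Sketch: read a GenericRecord from an InputStream, reusing the cached BinaryDecoder.
public GenericRecord getGenericRecord(InputStream data, Schema schema, GenericRecord reuse) {
  try {
    binDecoder.set(DecoderFactory.defaultFactory().createBinaryDecoder(data, binDecoder.get()));
    GenericDatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
    return reader.read(reuse, binDecoder.get());
  } catch (Exception ex) {
    // IOException, ArrayIndexOutOfBoundsException, ...
    LOG.error("getGenericRecord Avro error: " + ex.getMessage(), ex);
    return null;
  }
}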