Example 6 with GenericDatumReader

use of org.apache.avro.generic.GenericDatumReader in project druid by druid-io.

the class AvroHadoopInputRowParserTest method buildPigAvro.

private static GenericRecord buildPigAvro(GenericRecord datum, String inputStorage, String outputStorage) throws IOException {
    final File tmpDir = Files.createTempDir();
    FileReader<GenericRecord> reader = null;
    PigServer pigServer = null;
    try {
        // 0. write avro object into temp file.
        File someAvroDatumFile = new File(tmpDir, "someAvroDatum.avro");
        DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>());
        dataFileWriter.create(SomeAvroDatum.getClassSchema(), someAvroDatumFile);
        dataFileWriter.append(datum);
        dataFileWriter.close();
        // 1. read avro files into Pig
        pigServer = new PigServer(ExecType.LOCAL);
        pigServer.registerQuery(String.format("A = LOAD '%s' USING %s;", someAvroDatumFile, inputStorage));
        // 2. write new avro file using AvroStorage
        File outputDir = new File(tmpDir, "output");
        pigServer.store("A", String.valueOf(outputDir), outputStorage);
        // 3. read avro object from AvroStorage
        reader = DataFileReader.openReader(new File(outputDir, "part-m-00000.avro"), new GenericDatumReader<GenericRecord>());
        return reader.next();
    } finally {
        if (pigServer != null) {
            pigServer.shutdown();
        }
        Closeables.close(reader, true);
        FileUtils.deleteDirectory(tmpDir);
    }
}
Also used: PigServer (org.apache.pig.PigServer), GenericDatumReader (org.apache.avro.generic.GenericDatumReader), DataFileWriter (org.apache.avro.file.DataFileWriter), GenericRecord (org.apache.avro.generic.GenericRecord), File (java.io.File)
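
Stripped of the Pig round trip, the Avro file I/O in this test reduces to a plain write-then-read pattern. A minimal, self-contained sketch (the schema and record here are illustrative stand-ins for SomeAvroDatum, which is specific to the Druid test):

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class AvroRoundTripSketch {
    public static void main(String[] args) throws IOException {
        // Illustrative schema; the Druid test uses SomeAvroDatum.getClassSchema() instead.
        Schema schema = SchemaBuilder.record("Example").fields()
                .requiredString("name")
                .requiredInt("value")
                .endRecord();

        GenericRecord datum = new GenericData.Record(schema);
        datum.put("name", "foo");
        datum.put("value", 42);

        File file = File.createTempFile("example", ".avro");

        // Write: DataFileWriter embeds the schema in the container file header.
        try (DataFileWriter<GenericRecord> writer =
                new DataFileWriter<>(new GenericDatumWriter<GenericRecord>())) {
            writer.create(schema, file);
            writer.append(datum);
        }

        // Read: GenericDatumReader recovers the schema from the file itself,
        // so no reader schema needs to be supplied.
        try (DataFileReader<GenericRecord> reader =
                new DataFileReader<>(file, new GenericDatumReader<GenericRecord>())) {
            for (GenericRecord rec : reader) {
                System.out.println(rec);
            }
        }
    }
}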

Example 7 with GenericDatumReader

use of org.apache.avro.generic.GenericDatumReader in project pinot by linkedin.

the class BaseClusterIntegrationTest method createH2SchemaAndInsertAvroFiles.

public static void createH2SchemaAndInsertAvroFiles(List<File> avroFiles, Connection connection) {
    try {
        connection.prepareCall("DROP TABLE IF EXISTS mytable");
        File schemaAvroFile = avroFiles.get(0);
        DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
        DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(schemaAvroFile, datumReader);
        Schema schema = dataFileReader.getSchema();
        List<Schema.Field> fields = schema.getFields();
        List<String> columnNamesAndTypes = new ArrayList<String>(fields.size());
        int columnCount = 0;
        for (Schema.Field field : fields) {
            String fieldName = field.name();
            Schema.Type fieldType = field.schema().getType();
            switch(fieldType) {
                case UNION:
                    List<Schema> types = field.schema().getTypes();
                    String columnNameAndType;
                    String typeName = types.get(0).getName();
                    if (typeName.equalsIgnoreCase("int")) {
                        typeName = "bigint";
                    }
                    if (types.size() == 1) {
                        columnNameAndType = fieldName + " " + typeName + " not null";
                    } else {
                        columnNameAndType = fieldName + " " + typeName;
                    }
                    columnNamesAndTypes.add(columnNameAndType.replace("string", "varchar(128)"));
                    ++columnCount;
                    break;
                case ARRAY:
                    String elementTypeName = field.schema().getElementType().getName();
                    if (elementTypeName.equalsIgnoreCase("int")) {
                        elementTypeName = "bigint";
                    }
                    elementTypeName = elementTypeName.replace("string", "varchar(128)");
                    for (int i = 0; i < MAX_ELEMENTS_IN_MULTI_VALUE; i++) {
                        columnNamesAndTypes.add(fieldName + "__MV" + i + " " + elementTypeName);
                    }
                    ++columnCount;
                    break;
                case BOOLEAN:
                case INT:
                case LONG:
                case FLOAT:
                case DOUBLE:
                case STRING:
                    String fieldTypeName = fieldType.getName();
                    if (fieldTypeName.equalsIgnoreCase("int")) {
                        fieldTypeName = "bigint";
                    }
                    columnNameAndType = fieldName + " " + fieldTypeName + " not null";
                    columnNamesAndTypes.add(columnNameAndType.replace("string", "varchar(128)"));
                    ++columnCount;
                    break;
                case RECORD:
                    // Ignore records
                    continue;
                default:
                    // Ignore other avro types
                    LOGGER.warn("Ignoring field {} of type {}", fieldName, field.schema());
            }
        }
        connection.prepareCall("create table mytable(" + StringUtil.join(",", columnNamesAndTypes.toArray(new String[columnNamesAndTypes.size()])) + ")").execute();
        long start = System.currentTimeMillis();
        StringBuilder params = new StringBuilder("?");
        for (int i = 0; i < columnNamesAndTypes.size() - 1; i++) {
            params.append(",?");
        }
        PreparedStatement statement = connection.prepareStatement("INSERT INTO mytable VALUES (" + params.toString() + ")");
        dataFileReader.close();
        for (File avroFile : avroFiles) {
            datumReader = new GenericDatumReader<GenericRecord>();
            dataFileReader = new DataFileReader<GenericRecord>(avroFile, datumReader);
            GenericRecord record = null;
            while (dataFileReader.hasNext()) {
                record = dataFileReader.next(record);
                int jdbcIndex = 1;
                for (int avroIndex = 0; avroIndex < columnCount; ++avroIndex) {
                    Object value = record.get(avroIndex);
                    if (value instanceof GenericData.Array) {
                        GenericData.Array array = (GenericData.Array) value;
                        for (int i = 0; i < MAX_ELEMENTS_IN_MULTI_VALUE; i++) {
                            if (i < array.size()) {
                                value = array.get(i);
                                if (value instanceof Utf8) {
                                    value = value.toString();
                                }
                            } else {
                                value = null;
                            }
                            statement.setObject(jdbcIndex, value);
                            ++jdbcIndex;
                        }
                    } else {
                        if (value instanceof Utf8) {
                            value = value.toString();
                        }
                        statement.setObject(jdbcIndex, value);
                        ++jdbcIndex;
                    }
                }
                statement.execute();
            }
            dataFileReader.close();
        }
        LOGGER.info("Insertion took " + (System.currentTimeMillis() - start));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used: GenericDatumReader (org.apache.avro.generic.GenericDatumReader), Schema (org.apache.avro.Schema), ArrayList (java.util.ArrayList), DataFileReader (org.apache.avro.file.DataFileReader), GenericRecord (org.apache.avro.generic.GenericRecord), PreparedStatement (java.sql.PreparedStatement), GenericData (org.apache.avro.generic.GenericData), JSONException (org.json.JSONException), ArchiveException (org.apache.commons.compress.archivers.ArchiveException), SQLException (java.sql.SQLException), IOException (java.io.IOException), JSONArray (org.json.JSONArray), Utf8 (org.apache.avro.util.Utf8), JSONObject (org.json.JSONObject), File (java.io.File)
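
The switch statement above is, at heart, an Avro-to-H2 type mapping. A condensed sketch of that mapping as a standalone helper (the method name and the exception on unsupported types are mine, not Pinot's):

import org.apache.avro.Schema;

// Hypothetical helper mirroring the conversions in the test above:
// INT is widened to BIGINT and STRING becomes VARCHAR(128).
static String toH2Type(Schema fieldSchema) {
    Schema.Type type = fieldSchema.getType();
    if (type == Schema.Type.UNION) {
        // Like the test, look only at the first branch of the union.
        return toH2Type(fieldSchema.getTypes().get(0));
    }
    switch (type) {
        case INT:
            return "bigint";
        case STRING:
            return "varchar(128)";
        case BOOLEAN:
        case LONG:
        case FLOAT:
        case DOUBLE:
            return type.getName();
        default:
            throw new IllegalArgumentException("Unsupported Avro type: " + type);
    }
}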

Example 8 with GenericDatumReader

use of org.apache.avro.generic.GenericDatumReader in project pinot by linkedin.

the class QueryGenerator method addAvroData.

/**
   * Helper method to read in an Avro file and add data to the storage.
   *
   * @param avroFile Avro file.
   */
private void addAvroData(File avroFile) {
    // Read in records and update the values stored.
    GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileReader<GenericRecord> fileReader = new DataFileReader<>(avroFile, datumReader)) {
        for (GenericRecord genericRecord : fileReader) {
            for (String columnName : _columnNames) {
                Set<String> values = _columnToValueSet.get(columnName);
                // Turn the Avro value into a valid SQL String token.
                Object avroValue = genericRecord.get(columnName);
                if (avroValue != null) {
                    Integer storedMaxNumElements = _multiValueColumnMaxNumElements.get(columnName);
                    if (storedMaxNumElements != null) {
                        // Multi-value column
                        GenericData.Array array = (GenericData.Array) avroValue;
                        int numElements = array.size();
                        if (storedMaxNumElements < numElements) {
                            _multiValueColumnMaxNumElements.put(columnName, numElements);
                        }
                        for (Object element : array) {
                            storeAvroValueIntoValueSet(values, element);
                        }
                    } else {
                        // Single-value column
                        storeAvroValueIntoValueSet(values, avroValue);
                    }
                }
            }
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used: GenericDatumReader (org.apache.avro.generic.GenericDatumReader), GenericData (org.apache.avro.generic.GenericData), JSONArray (org.json.JSONArray), DataFileReader (org.apache.avro.file.DataFileReader), JSONObject (org.json.JSONObject), GenericRecord (org.apache.avro.generic.GenericRecord)
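
storeAvroValueIntoValueSet is not shown on this page; judging from the comment about producing a valid SQL string token, it likely converts the Avro value into a SQL literal before recording it. A plausible, purely hypothetical sketch (the real Pinot implementation may differ):

import java.util.Set;
import org.apache.avro.util.Utf8;

// Hypothetical reconstruction of the helper referenced above.
private void storeAvroValueIntoValueSet(Set<String> valueSet, Object avroValue) {
    if (avroValue instanceof Utf8 || avroValue instanceof String) {
        // Quote (and escape) string values so they can be used as SQL literals.
        valueSet.add("'" + avroValue.toString().replace("'", "''") + "'");
    } else {
        // Numbers and booleans are valid SQL tokens as-is.
        valueSet.add(avroValue.toString());
    }
}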

Example 9 with GenericDatumReader

use of org.apache.avro.generic.GenericDatumReader in project databus by linkedin.

the class BootstrapAvroFileEventReader method readEventsFromHadoopFiles.

private EventReaderSummary readEventsFromHadoopFiles(OracleTriggerMonitoredSourceInfo sourceInfo, File avroSeedDir, Long windowSCN) {
    DataFileReader<GenericRecord> reader = null;
    File[] files = avroSeedDir.listFiles();
    List<File> fileList = Arrays.asList(files);
    Collections.sort(fileList);
    long numRead = 0;
    long prevNumRead = 0;
    long numBytes = 0;
    long timestamp = System.currentTimeMillis();
    long timeStart = timestamp;
    long lastTime = timestamp;
    long commitInterval = _config.getCommitInterval();
    long totLatency = 0;
    GenericRecord record = null;
    RateMonitor seedingRate = new RateMonitor("Seeding Rate");
    seedingRate.start();
    seedingRate.suspend();
    long startRowId = _lastRows.get(sourceInfo.getEventView());
    LOG.info("Last Known Row Id is :" + startRowId);
    boolean resumeSeedingRate = true;
    for (File avroSeedFile : files) {
        if (!avroSeedFile.isFile())
            continue;
        LOG.info("Seeding from File : " + avroSeedFile);
        try {
            reader = new DataFileReader<GenericRecord>(avroSeedFile, new GenericDatumReader<GenericRecord>());
        } catch (IOException e) {
            LOG.fatal("Failed to bootstrap from file " + avroSeedFile.getAbsolutePath(), e);
            throw new RuntimeException("Failed to bootstrap from file " + avroSeedFile.getAbsolutePath(), e);
        }
        try {
            boolean committed = false;
            for (GenericRecord hdfsRecord : reader) {
                record = hdfsRecord;
                committed = false;
                numRead++;
                if (numRead < startRowId)
                    continue;
                if (resumeSeedingRate) {
                    seedingRate.resume();
                    resumeSeedingRate = false;
                }
                seedingRate.tick();
                //LOG.info("Read record :" + record);	    			
                long start = System.nanoTime();
                long eventSize = sourceInfo.getFactory().createAndAppendEvent(windowSCN, timestamp, hdfsRecord, _bootstrapEventBuffer, false, null);
                numBytes += eventSize;
                long latency = System.nanoTime() - start;
                totLatency += latency;
                if (numRead % commitInterval == 0) {
                    _bootstrapEventBuffer.endEvents(numRead, timestamp, null);
                    _bootstrapEventBuffer.startEvents();
                    long procTime = totLatency / 1000000000;
                    long currTime = System.currentTimeMillis();
                    long diff = (currTime - lastTime) / 1000;
                    long timeSinceStart = (currTime - timeStart) / 1000;
                    LOG.info("Processed " + commitInterval + " rows in " + diff + " seconds, Avro Processing Time (seconds) so far :" + (procTime) + ",Seconds elapsed since start :" + (timeSinceStart) + ",Overall Row Rate:" + seedingRate.getRate() + ", NumRows Fetched so far:" + numRead + ". TotalEventSize :" + numBytes);
                    lastTime = currTime;
                    seedingRate.resume();
                    committed = true;
                }
            }
            if (!committed) {
                _bootstrapEventBuffer.endEvents(numRead, timestamp, null);
                _bootstrapEventBuffer.startEvents();
                long procTime = totLatency / 1000000000;
                long currTime = System.currentTimeMillis();
                long diff = (currTime - lastTime) / 1000;
                long timeSinceStart = (currTime - timeStart) / 1000;
                LOG.info("Completed Seeding from : " + avroSeedFile + ", Processed " + commitInterval + " rows in " + diff + " seconds, Avro Processing Time (seconds) so far :" + (procTime) + ",Seconds elapsed since start :" + (timeSinceStart) + ",Overall Row Rate:" + seedingRate.getRate() + ", NumRows Fetched so far:" + numRead + ". TotalEventSize :" + numBytes);
                lastTime = currTime;
                seedingRate.resume();
            }
        } catch (Exception e) {
            LOG.fatal("NumRead :" + numRead + ", Got Exception while processing generic record :" + record, e);
            throw new RuntimeException(e);
        } finally {
            // Close the reader for this seed file before moving to the next one.
            try {
                reader.close();
            } catch (IOException ioe) {
                LOG.warn("Failed to close reader for " + avroSeedFile, ioe);
            }
        }
        LOG.info("Processed " + (numRead - prevNumRead) + " rows of Source: " + sourceInfo.getSourceName() + " from file " + avroSeedFile);
        prevNumRead = numRead;
    }
    long timeEnd = System.currentTimeMillis();
    long elapsedMin = (timeEnd - timeStart) / (MILLISEC_TO_MIN);
    LOG.info("Processed " + numRead + " rows of Source: " + sourceInfo.getSourceName() + " in " + elapsedMin + " minutes");
    return new EventReaderSummary(sourceInfo.getSourceId(), sourceInfo.getSourceName(), -1, (int) numRead, numBytes, (timeEnd - timeStart), (numRead > 0 ? (timeEnd - timeStart) / numRead : 0), 0, 0, 0);
}
Also used: GenericDatumReader (org.apache.avro.generic.GenericDatumReader), IOException (java.io.IOException), RateMonitor (com.linkedin.databus.core.util.RateMonitor), EventCreationException (com.linkedin.databus2.producers.EventCreationException), DatabusException (com.linkedin.databus2.core.DatabusException), InvalidConfigException (com.linkedin.databus.core.util.InvalidConfigException), UnsupportedKeyException (com.linkedin.databus.core.UnsupportedKeyException), EventReaderSummary (com.linkedin.databus2.producers.db.EventReaderSummary), GenericRecord (org.apache.avro.generic.GenericRecord), File (java.io.File)

Example 10 with GenericDatumReader

use of org.apache.avro.generic.GenericDatumReader in project databus by linkedin.

the class DbusEventAvroDecoder method getGenericRecord.

/**
   * Creates a generic record from a byte array.
   *
   * @param valueBytes  byte[] to be converted to generic record
   * @param schema      schema of the input record
   * @return GenericRecord for the given byte array + schema combo
   *
   * TODO:  Add a getGenericRecord(InputStream data, Schema schema, GenericRecord reuse)
   *        variant; it can use DecoderFactory.createBinaryDecoder(InputStream, BinaryDecoder)
   *        and will allow us to use something like org.apache.avro.ipc.ByteBufferInputStream
   *        to avoid the data copy to a temp array.  (https://rb.corp.linkedin.com/r/172879/)
   */
public GenericRecord getGenericRecord(byte[] valueBytes, Schema schema, GenericRecord reuse) {
    GenericRecord result = null;
    try {
        binDecoder.set(DecoderFactory.defaultFactory().createBinaryDecoder(valueBytes, binDecoder.get()));
        GenericDatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
        result = reader.read(reuse, binDecoder.get());
        return result;
    } catch (Exception ex) {
        // IOException, ArrayIndexOutOfBoundsException, ...
        LOG.error("getGenericRecord Avro error: " + ex.getMessage(), ex);
    }
    return result;
}
Also used: GenericDatumReader (org.apache.avro.generic.GenericDatumReader), GenericRecord (org.apache.avro.generic.GenericRecord), DatabusRuntimeException (com.linkedin.databus.core.DatabusRuntimeException), IOException (java.io.IOException), JsonGenerationException (org.codehaus.jackson.JsonGenerationException), BufferUnderflowException (java.nio.BufferUnderflowException), UnsupportedEncodingException (java.io.UnsupportedEncodingException)
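
The TODO in the Javadoc sketches an InputStream-based variant. One way it could look, using the stream overload of the decoder factory (this method does not exist in Databus; it is a sketch of the TODO, written against the newer DecoderFactory.get().binaryDecoder API, with the reusable BinaryDecoder slot left null for brevity):

import java.io.IOException;
import java.io.InputStream;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DecoderFactory;

// Hypothetical stream-based variant: decodes directly from the InputStream,
// avoiding the copy of the payload into a temporary byte[].
public GenericRecord getGenericRecord(InputStream data, Schema schema, GenericRecord reuse)
        throws IOException {
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(data, null);
    GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>(schema);
    return reader.read(reuse, decoder);
}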

Aggregations

GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 46
GenericRecord (org.apache.avro.generic.GenericRecord): 31
Schema (org.apache.avro.Schema): 20
IOException (java.io.IOException): 15
File (java.io.File): 10
DataFileStream (org.apache.avro.file.DataFileStream): 10
Decoder (org.apache.avro.io.Decoder): 8
ByteArrayInputStream (java.io.ByteArrayInputStream): 7
GenericData (org.apache.avro.generic.GenericData): 7
DataFileReader (org.apache.avro.file.DataFileReader): 6
Test (org.junit.Test): 6
ArrayList (java.util.ArrayList): 5
JsonDecoder (org.apache.avro.io.JsonDecoder): 5
ParseException (io.druid.java.util.common.parsers.ParseException): 4
FileInputStream (java.io.FileInputStream): 4
DataFileWriter (org.apache.avro.file.DataFileWriter): 4
GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter): 4
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 3
Map (java.util.Map): 3
ChannelBufferInputStream (org.jboss.netty.buffer.ChannelBufferInputStream): 3