Search in sources :

Example 1 with DataFileReader

use of org.apache.avro.file.DataFileReader in project pinot by linkedin.

In the class BaseClusterIntegrationTest, the method createH2SchemaAndInsertAvroFiles:

/**
 * Creates an H2 table named {@code mytable} whose columns mirror the schema of the first
 * Avro file, then inserts the rows of every given Avro file into it.
 *
 * <p>Column mapping: Avro {@code int} is widened to H2 {@code bigint}, and {@code string}
 * becomes {@code varchar(128)}. Single-type unions are declared {@code not null}. Array
 * fields are flattened into {@code MAX_ELEMENTS_IN_MULTI_VALUE} columns named
 * {@code <field>__MV<i>}, padded with NULLs for missing elements. Record fields and other
 * unsupported Avro types are skipped (with a warning for the latter).
 *
 * @param avroFiles Avro files to load; the first file's schema defines the table layout
 * @param connection open JDBC connection to the H2 database
 * @throws RuntimeException wrapping any SQL or I/O failure
 */
public static void createH2SchemaAndInsertAvroFiles(List<File> avroFiles, Connection connection) {
    try {
        // BUG FIX: the DROP statement was previously prepared but never executed, so a
        // stale table from an earlier run could survive and break the CREATE below.
        connection.prepareCall("DROP TABLE IF EXISTS mytable").execute();
        List<String> columnNamesAndTypes;
        // Avro positions of the fields that were mapped to H2 columns. Tracking positions
        // (instead of assuming the mapped fields are the first N fields) keeps the insert
        // loop aligned with the schema even when a record/unsupported field is skipped
        // mid-schema — previously record.get(avroIndex) could read the wrong field.
        List<Integer> insertedFieldPositions = new ArrayList<Integer>();
        try (DataFileReader<GenericRecord> schemaReader =
                new DataFileReader<GenericRecord>(avroFiles.get(0), new GenericDatumReader<GenericRecord>())) {
            Schema schema = schemaReader.getSchema();
            List<Schema.Field> fields = schema.getFields();
            columnNamesAndTypes = new ArrayList<String>(fields.size());
            for (Schema.Field field : fields) {
                String fieldName = field.name();
                Schema.Type fieldType = field.schema().getType();
                switch (fieldType) {
                    case UNION: {
                        List<Schema> types = field.schema().getTypes();
                        String typeName = types.get(0).getName();
                        if (typeName.equalsIgnoreCase("int")) {
                            typeName = "bigint";
                        }
                        // A single-type union cannot hold null, so declare the column not null.
                        String columnNameAndType = (types.size() == 1)
                                ? fieldName + " " + typeName + " not null"
                                : fieldName + " " + typeName;
                        columnNamesAndTypes.add(columnNameAndType.replace("string", "varchar(128)"));
                        insertedFieldPositions.add(field.pos());
                        break;
                    }
                    case ARRAY: {
                        String elementTypeName = field.schema().getElementType().getName();
                        if (elementTypeName.equalsIgnoreCase("int")) {
                            elementTypeName = "bigint";
                        }
                        elementTypeName = elementTypeName.replace("string", "varchar(128)");
                        // Flatten the multi-value field into a fixed number of columns.
                        for (int i = 0; i < MAX_ELEMENTS_IN_MULTI_VALUE; i++) {
                            columnNamesAndTypes.add(fieldName + "__MV" + i + " " + elementTypeName);
                        }
                        insertedFieldPositions.add(field.pos());
                        break;
                    }
                    case BOOLEAN:
                    case INT:
                    case LONG:
                    case FLOAT:
                    case DOUBLE:
                    case STRING: {
                        String fieldTypeName = fieldType.getName();
                        if (fieldTypeName.equalsIgnoreCase("int")) {
                            fieldTypeName = "bigint";
                        }
                        columnNamesAndTypes.add(
                                (fieldName + " " + fieldTypeName + " not null").replace("string", "varchar(128)"));
                        insertedFieldPositions.add(field.pos());
                        break;
                    }
                    case RECORD:
                        // Ignore records
                        continue;
                    default:
                        // Ignore other avro types
                        LOGGER.warn("Ignoring field {} of type {}", fieldName, field.schema());
                }
            }
        }
        connection.prepareCall("create table mytable(" + StringUtil.join(",",
                columnNamesAndTypes.toArray(new String[columnNamesAndTypes.size()])) + ")").execute();
        long start = System.currentTimeMillis();
        // One '?' placeholder per H2 column (array fields contribute several columns each).
        StringBuilder params = new StringBuilder("?");
        for (int i = 0; i < columnNamesAndTypes.size() - 1; i++) {
            params.append(",?");
        }
        try (PreparedStatement statement =
                connection.prepareStatement("INSERT INTO mytable VALUES (" + params.toString() + ")")) {
            for (File avroFile : avroFiles) {
                try (DataFileReader<GenericRecord> dataFileReader =
                        new DataFileReader<GenericRecord>(avroFile, new GenericDatumReader<GenericRecord>())) {
                    GenericRecord record = null;
                    while (dataFileReader.hasNext()) {
                        // Reuse the record instance across rows to reduce allocation churn.
                        record = dataFileReader.next(record);
                        int jdbcIndex = 1;
                        for (int avroIndex : insertedFieldPositions) {
                            Object value = record.get(avroIndex);
                            if (value instanceof GenericData.Array) {
                                GenericData.Array array = (GenericData.Array) value;
                                // Pad short arrays with NULLs so every __MV column is bound.
                                for (int i = 0; i < MAX_ELEMENTS_IN_MULTI_VALUE; i++) {
                                    Object element = (i < array.size()) ? array.get(i) : null;
                                    if (element instanceof Utf8) {
                                        element = element.toString();
                                    }
                                    statement.setObject(jdbcIndex, element);
                                    ++jdbcIndex;
                                }
                            } else {
                                if (value instanceof Utf8) {
                                    // JDBC drivers do not understand Avro's Utf8 wrapper.
                                    value = value.toString();
                                }
                                statement.setObject(jdbcIndex, value);
                                ++jdbcIndex;
                            }
                        }
                        statement.execute();
                    }
                }
            }
        }
        LOGGER.info("Insertion took " + (System.currentTimeMillis() - start));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used : GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) DataFileReader(org.apache.avro.file.DataFileReader) GenericRecord(org.apache.avro.generic.GenericRecord) PreparedStatement(java.sql.PreparedStatement) GenericData(org.apache.avro.generic.GenericData) JSONException(org.json.JSONException) ArchiveException(org.apache.commons.compress.archivers.ArchiveException) SQLException(java.sql.SQLException) IOException(java.io.IOException) JSONArray(org.json.JSONArray) Utf8(org.apache.avro.util.Utf8) JSONObject(org.json.JSONObject) File(java.io.File)

Example 2 with DataFileReader

use of org.apache.avro.file.DataFileReader in project pinot by linkedin.

In the class QueryGenerator, the method addAvroData:

/**
   * Helper method to read in an Avro file and add data to the storage.
   *
   * @param avroFile Avro file.
   */
/**
 * Reads every record from the given Avro file and folds its column values into the
 * per-column value sets, tracking the largest element count seen for multi-value columns.
 *
 * @param avroFile Avro file to ingest.
 */
private void addAvroData(File avroFile) {
    GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>();
    try (DataFileReader<GenericRecord> avroReader = new DataFileReader<>(avroFile, reader)) {
        for (GenericRecord row : avroReader) {
            for (String column : _columnNames) {
                Set<String> valueSet = _columnToValueSet.get(column);
                Object rawValue = row.get(column);
                if (rawValue == null) {
                    // Nothing to record for this column in this row.
                    continue;
                }
                Integer knownMaxElements = _multiValueColumnMaxNumElements.get(column);
                if (knownMaxElements == null) {
                    // Single-value column: store the value directly as a SQL token.
                    storeAvroValueIntoValueSet(valueSet, rawValue);
                    continue;
                }
                // Multi-value column: bump the max element count if this row exceeds it,
                // then store each element individually.
                GenericData.Array elements = (GenericData.Array) rawValue;
                int numElements = elements.size();
                if (numElements > knownMaxElements) {
                    _multiValueColumnMaxNumElements.put(column, numElements);
                }
                for (Object element : elements) {
                    storeAvroValueIntoValueSet(valueSet, element);
                }
            }
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used : GenericDatumReader(org.apache.avro.generic.GenericDatumReader) GenericData(org.apache.avro.generic.GenericData) JSONArray(org.json.JSONArray) DataFileReader(org.apache.avro.file.DataFileReader) JSONObject(org.json.JSONObject) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 3 with DataFileReader

use of org.apache.avro.file.DataFileReader in project crunch by cloudera.

In the class AvroFileReaderFactory, the method read:

/**
 * Opens the Avro file at the given path and exposes its records as a lazy iterator,
 * mapping each raw datum through {@code mapFn} as it is requested.
 *
 * <p>If the file cannot be opened, the failure is logged and an empty iterator is
 * returned rather than propagating the exception.
 *
 * @param fs file system holding the Avro file (its configuration is used to open it)
 * @param path location of the Avro file to read
 * @return an iterator over the mapped records, or an empty iterator on read failure
 */
@Override
public Iterator<T> read(FileSystem fs, final Path path) {
    this.mapFn.setConfigurationForTest(conf);
    this.mapFn.initialize();
    try {
        FsInput input = new FsInput(path, fs.getConf());
        final DataFileReader<T> avroReader = new DataFileReader<T>(input, recordReader);
        // Records are pulled from the reader lazily, one per next() call.
        return new UnmodifiableIterator<T>() {

            @Override
            public boolean hasNext() {
                return avroReader.hasNext();
            }

            @Override
            public T next() {
                return mapFn.map(avroReader.next());
            }
        };
    } catch (IOException e) {
        LOG.info("Could not read avro file at path: " + path, e);
        return Iterators.emptyIterator();
    }
}
Also used : UnmodifiableIterator(com.google.common.collect.UnmodifiableIterator) DataFileReader(org.apache.avro.file.DataFileReader) FsInput(org.apache.avro.mapred.FsInput) IOException(java.io.IOException)

Example 4 with DataFileReader

use of org.apache.avro.file.DataFileReader in project beam by apache.

In the class AvroIOTest, the method assertTestOutputs:

/**
 * Asserts that every expected shard file exists and that, taken together, the shards
 * contain exactly the expected elements (order-insensitive).
 *
 * @param expectedElements elements expected across all shards
 * @param numShards number of shard files the writer should have produced
 * @param outputFilePrefix filename prefix passed to the filename policy
 * @param shardNameTemplate shard-name template used to construct each shard's filename
 * @throws IOException if a shard file cannot be read
 */
public static void assertTestOutputs(String[] expectedElements, int numShards, String outputFilePrefix, String shardNameTemplate) throws IOException {
    // Validate that the data written matches the expected elements in the expected order
    List<File> expectedFiles = new ArrayList<>();
    for (int i = 0; i < numShards; i++) {
        expectedFiles.add(new File(DefaultFilenamePolicy.constructName(outputFilePrefix, shardNameTemplate, "", /* no suffix */
        i, numShards)));
    }
    List<String> actualElements = new ArrayList<>();
    for (File outputFile : expectedFiles) {
        assertTrue("Expected output file " + outputFile.getName(), outputFile.exists());
        // Parameterized ReflectDatumReader — the original used a raw type, which produced
        // an unchecked-conversion warning at the DataFileReader<String> constructor.
        try (DataFileReader<String> reader = new DataFileReader<>(outputFile, new ReflectDatumReader<>(ReflectData.get().getSchema(String.class)))) {
            Iterators.addAll(actualElements, reader);
        }
    }
    assertThat(actualElements, containsInAnyOrder(expectedElements));
}
Also used : DataFileReader(org.apache.avro.file.DataFileReader) ArrayList(java.util.ArrayList) ReflectDatumReader(org.apache.avro.reflect.ReflectDatumReader) File(java.io.File)

Example 5 with DataFileReader

use of org.apache.avro.file.DataFileReader in project flink by apache.

In the class AvroInputFormat, the method initReader:

/**
 * Creates a {@code DataFileReader} for the given input split, choosing the datum reader
 * by {@code avroValueType}: generic records get a {@code GenericDatumReader}, subclasses
 * of {@code SpecificRecordBase} get a {@code SpecificDatumReader}, and any other type
 * falls back to a reflection-based {@code ReflectDatumReader}.
 *
 * <p>Also updates per-split state: {@code end} is set to the split's end offset and
 * {@code recordsReadSinceLastSync} is reset to zero.
 *
 * @param split the file input split to open a reader for
 * @return an open Avro reader over the split's file
 * @throws IOException if the file status or underlying stream cannot be accessed
 */
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
    DatumReader<E> datumReader;
    if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
        datumReader = new GenericDatumReader<E>();
    } else {
        // Generated (specific) record classes are read natively; everything else via reflection.
        datumReader = org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType) ? new SpecificDatumReader<E>(avroValueType) : new ReflectDatumReader<E>(avroValueType);
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("Opening split {}", split);
    }
    // Wrap the already-open input stream so Avro gets a seekable view of the whole file.
    SeekableInput in = new FSDataInputStreamWrapper(stream, split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen());
    DataFileReader<E> dataFileReader = (DataFileReader) DataFileReader.openReader(in, datumReader);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Loaded SCHEMA: {}", dataFileReader.getSchema());
    }
    // NOTE(review): presumably 'end' marks where record reading must stop for this split
    // and 'recordsReadSinceLastSync' feeds checkpoint/restore logic — confirm against the
    // rest of AvroInputFormat, which is not visible here.
    end = split.getStart() + split.getLength();
    recordsReadSinceLastSync = 0;
    return dataFileReader;
}
Also used : DataFileReader(org.apache.avro.file.DataFileReader) SeekableInput(org.apache.avro.file.SeekableInput) FSDataInputStreamWrapper(org.apache.flink.api.avro.FSDataInputStreamWrapper) SpecificDatumReader(org.apache.avro.specific.SpecificDatumReader) ReflectDatumReader(org.apache.avro.reflect.ReflectDatumReader)

Aggregations

DataFileReader (org.apache.avro.file.DataFileReader)42 File (java.io.File)24 GenericRecord (org.apache.avro.generic.GenericRecord)24 GenericDatumReader (org.apache.avro.generic.GenericDatumReader)18 Schema (org.apache.avro.Schema)17 Test (org.junit.Test)10 ArrayList (java.util.ArrayList)9 IOException (java.io.IOException)6 Test (org.testng.annotations.Test)6 SeekableInput (org.apache.avro.file.SeekableInput)5 ReflectDatumReader (org.apache.avro.reflect.ReflectDatumReader)5 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)5 Configuration (org.apache.hadoop.conf.Configuration)5 SeekableByteArrayInput (org.apache.avro.file.SeekableByteArrayInput)4 SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader)4 Utf8 (org.apache.avro.util.Utf8)3 JsonObject (com.google.gson.JsonObject)2 AvroDag (edu.snu.mist.formats.avro.AvroDag)2 Date (java.sql.Date)2 Time (java.sql.Time)2