Use of org.apache.avro.file.DataFileReader in project pinot by linkedin:
class BaseClusterIntegrationTest, method createH2SchemaAndInsertAvroFiles.
public static void createH2SchemaAndInsertAvroFiles(List<File> avroFiles, Connection connection) {
  try {
    connection.prepareCall("DROP TABLE IF EXISTS mytable").execute();
    // Derive the H2 table schema from the first Avro file.
    File schemaAvroFile = avroFiles.get(0);
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
    DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(schemaAvroFile, datumReader);
    Schema schema = dataFileReader.getSchema();
    List<Schema.Field> fields = schema.getFields();
    List<String> columnNamesAndTypes = new ArrayList<String>(fields.size());
    int columnCount = 0;
    for (Schema.Field field : fields) {
      String fieldName = field.name();
      Schema.Type fieldType = field.schema().getType();
      switch (fieldType) {
        case UNION:
          List<Schema> types = field.schema().getTypes();
          String columnNameAndType;
          String typeName = types.get(0).getName();
          if (typeName.equalsIgnoreCase("int")) {
            typeName = "bigint";
          }
          if (types.size() == 1) {
            columnNameAndType = fieldName + " " + typeName + " not null";
          } else {
            columnNameAndType = fieldName + " " + typeName;
          }
          columnNamesAndTypes.add(columnNameAndType.replace("string", "varchar(128)"));
          ++columnCount;
          break;
        case ARRAY:
          // Multi-value columns are flattened into MAX_ELEMENTS_IN_MULTI_VALUE scalar columns.
          String elementTypeName = field.schema().getElementType().getName();
          if (elementTypeName.equalsIgnoreCase("int")) {
            elementTypeName = "bigint";
          }
          elementTypeName = elementTypeName.replace("string", "varchar(128)");
          for (int i = 0; i < MAX_ELEMENTS_IN_MULTI_VALUE; i++) {
            columnNamesAndTypes.add(fieldName + "__MV" + i + " " + elementTypeName);
          }
          ++columnCount;
          break;
        case BOOLEAN:
        case INT:
        case LONG:
        case FLOAT:
        case DOUBLE:
        case STRING:
          String fieldTypeName = fieldType.getName();
          if (fieldTypeName.equalsIgnoreCase("int")) {
            fieldTypeName = "bigint";
          }
          columnNameAndType = fieldName + " " + fieldTypeName + " not null";
          columnNamesAndTypes.add(columnNameAndType.replace("string", "varchar(128)"));
          ++columnCount;
          break;
        case RECORD:
          // Ignore records
          continue;
        default:
          // Ignore other Avro types
          LOGGER.warn("Ignoring field {} of type {}", fieldName, field.schema());
      }
    }
    connection.prepareCall("create table mytable(" + StringUtil.join(",", columnNamesAndTypes.toArray(new String[columnNamesAndTypes.size()])) + ")").execute();
    long start = System.currentTimeMillis();
    StringBuilder params = new StringBuilder("?");
    for (int i = 0; i < columnNamesAndTypes.size() - 1; i++) {
      params.append(",?");
    }
    PreparedStatement statement = connection.prepareStatement("INSERT INTO mytable VALUES (" + params.toString() + ")");
    dataFileReader.close();
    // Insert the records from every Avro file into H2.
    for (File avroFile : avroFiles) {
      datumReader = new GenericDatumReader<GenericRecord>();
      dataFileReader = new DataFileReader<GenericRecord>(avroFile, datumReader);
      GenericRecord record = null;
      while (dataFileReader.hasNext()) {
        record = dataFileReader.next(record);
        int jdbcIndex = 1;
        for (int avroIndex = 0; avroIndex < columnCount; ++avroIndex) {
          Object value = record.get(avroIndex);
          if (value instanceof GenericData.Array) {
            GenericData.Array array = (GenericData.Array) value;
            for (int i = 0; i < MAX_ELEMENTS_IN_MULTI_VALUE; i++) {
              if (i < array.size()) {
                value = array.get(i);
                if (value instanceof Utf8) {
                  value = value.toString();
                }
              } else {
                value = null;
              }
              statement.setObject(jdbcIndex, value);
              ++jdbcIndex;
            }
          } else {
            if (value instanceof Utf8) {
              value = value.toString();
            }
            statement.setObject(jdbcIndex, value);
            ++jdbcIndex;
          }
        }
        statement.execute();
      }
      dataFileReader.close();
    }
    LOGGER.info("Insertion took {} ms", System.currentTimeMillis() - start);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
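The method above shows the core DataFileReader pattern: open the file with a GenericDatumReader, read the writer schema embedded in the file header, then iterate the records. A minimal standalone sketch of just that pattern (the class name and printed output are illustrative, not part of Pinot):

import java.io.File;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class AvroFileDump {
  public static void main(String[] args) throws Exception {
    File avroFile = new File(args[0]);
    try (DataFileReader<GenericRecord> reader =
        new DataFileReader<>(avroFile, new GenericDatumReader<GenericRecord>())) {
      // The writer schema is stored in the file header, as used above to build the H2 DDL.
      Schema schema = reader.getSchema();
      for (Schema.Field field : schema.getFields()) {
        System.out.println(field.name() + " : " + field.schema().getType());
      }
      // Reuse one record instance while iterating, as the insertion loop above does.
      GenericRecord record = null;
      while (reader.hasNext()) {
        record = reader.next(record);
        System.out.println(record);
      }
    }
  }
}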
Use of org.apache.avro.file.DataFileReader in project pinot by linkedin:
class QueryGenerator, method addAvroData.
/**
* Helper method to read in an Avro file and add data to the storage.
*
* @param avroFile Avro file.
*/
private void addAvroData(File avroFile) {
  // Read in records and update the values stored.
  GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  try (DataFileReader<GenericRecord> fileReader = new DataFileReader<>(avroFile, datumReader)) {
    for (GenericRecord genericRecord : fileReader) {
      for (String columnName : _columnNames) {
        Set<String> values = _columnToValueSet.get(columnName);
        // Turn the Avro value into a valid SQL String token.
        Object avroValue = genericRecord.get(columnName);
        if (avroValue != null) {
          Integer storedMaxNumElements = _multiValueColumnMaxNumElements.get(columnName);
          if (storedMaxNumElements != null) {
            // Multi-value column
            GenericData.Array array = (GenericData.Array) avroValue;
            int numElements = array.size();
            if (storedMaxNumElements < numElements) {
              _multiValueColumnMaxNumElements.put(columnName, numElements);
            }
            for (Object element : array) {
              storeAvroValueIntoValueSet(values, element);
            }
          } else {
            // Single-value column
            storeAvroValueIntoValueSet(values, avroValue);
          }
        }
      }
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
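The for-each traversal above works because DataFileReader is Iterable. A hedged sketch that isolates the same idea, collecting the distinct values per column roughly as addAvroData feeds them into _columnToValueSet (class and method names here are illustrative):

import java.io.File;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class DistinctValueCollector {
  // Collects the distinct non-null values of every column in an Avro file.
  public static Map<String, Set<Object>> collect(File avroFile) throws Exception {
    Map<String, Set<Object>> valuesByColumn = new HashMap<>();
    try (DataFileReader<GenericRecord> reader =
        new DataFileReader<>(avroFile, new GenericDatumReader<GenericRecord>())) {
      List<Schema.Field> fields = reader.getSchema().getFields();
      // DataFileReader is Iterable, so a for-each loop reads record by record.
      for (GenericRecord record : reader) {
        for (Schema.Field field : fields) {
          Object value = record.get(field.name());
          if (value != null) {
            valuesByColumn.computeIfAbsent(field.name(), k -> new HashSet<>()).add(value);
          }
        }
      }
    }
    return valuesByColumn;
  }
}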
Use of org.apache.avro.file.DataFileReader in project crunch by cloudera:
class AvroFileReaderFactory, method read.
@Override
public Iterator<T> read(FileSystem fs, final Path path) {
  this.mapFn.setConfigurationForTest(conf);
  this.mapFn.initialize();
  try {
    FsInput fsi = new FsInput(path, fs.getConf());
    final DataFileReader<T> reader = new DataFileReader<T>(fsi, recordReader);
    return new UnmodifiableIterator<T>() {

      @Override
      public boolean hasNext() {
        return reader.hasNext();
      }

      @Override
      public T next() {
        return mapFn.map(reader.next());
      }
    };
  } catch (IOException e) {
    LOG.info("Could not read avro file at path: " + path, e);
    return Iterators.emptyIterator();
  }
}
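Crunch adapts the reader to its MapFn pipeline by wrapping it in an iterator. A minimal sketch of that wrap-and-map pattern, using java.util.function.Function in place of Crunch's MapFn (the class and its names are illustrative, and as in the snippet above, closing the underlying reader is left to the caller):

import java.util.Iterator;
import java.util.function.Function;
import org.apache.avro.file.DataFileReader;

public final class MappingIterator<S, T> implements Iterator<T> {

  private final DataFileReader<S> reader;
  private final Function<S, T> mapFn;

  public MappingIterator(DataFileReader<S> reader, Function<S, T> mapFn) {
    this.reader = reader;
    this.mapFn = mapFn;
  }

  @Override
  public boolean hasNext() {
    return reader.hasNext();
  }

  @Override
  public T next() {
    // Apply the transform lazily, one record at a time.
    return mapFn.apply(reader.next());
  }
}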
Use of org.apache.avro.file.DataFileReader in project beam by apache:
class AvroIOTest, method assertTestOutputs.
public static void assertTestOutputs(String[] expectedElements, int numShards, String outputFilePrefix, String shardNameTemplate) throws IOException {
  // Validate that the data written matches the expected elements, regardless of order.
  List<File> expectedFiles = new ArrayList<>();
  for (int i = 0; i < numShards; i++) {
    expectedFiles.add(new File(DefaultFilenamePolicy.constructName(outputFilePrefix, shardNameTemplate, "", /* no suffix */
        i, numShards)));
  }
  List<String> actualElements = new ArrayList<>();
  for (File outputFile : expectedFiles) {
    assertTrue("Expected output file " + outputFile.getName(), outputFile.exists());
    try (DataFileReader<String> reader =
        new DataFileReader<>(outputFile, new ReflectDatumReader(ReflectData.get().getSchema(String.class)))) {
      Iterators.addAll(actualElements, reader);
    }
  }
  assertThat(actualElements, containsInAnyOrder(expectedElements));
}
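The try-with-resources block above pairs DataFileReader with a ReflectDatumReader built from the reflect schema of String. A minimal standalone sketch of that pairing, outside of Beam (class and method names are illustrative):

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.reflect.ReflectData;
import org.apache.avro.reflect.ReflectDatumReader;

public class ReflectAvroRead {

  // Reads every String element from a reflect-encoded Avro file.
  public static List<String> readStrings(File file) throws Exception {
    ReflectDatumReader<String> datumReader =
        new ReflectDatumReader<>(ReflectData.get().getSchema(String.class));
    List<String> elements = new ArrayList<>();
    try (DataFileReader<String> reader = new DataFileReader<>(file, datumReader)) {
      for (String element : reader) {
        elements.add(element);
      }
    }
    return elements;
  }
}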
Use of org.apache.avro.file.DataFileReader in project flink by apache:
class AvroInputFormat, method initReader.
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
  // Pick a DatumReader matching the requested value type: generic, specific, or reflect.
  DatumReader<E> datumReader;
  if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
    datumReader = new GenericDatumReader<E>();
  } else {
    datumReader = org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)
        ? new SpecificDatumReader<E>(avroValueType)
        : new ReflectDatumReader<E>(avroValueType);
  }
  if (LOG.isInfoEnabled()) {
    LOG.info("Opening split {}", split);
  }
  SeekableInput in = new FSDataInputStreamWrapper(stream, split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen());
  DataFileReader<E> dataFileReader = (DataFileReader) DataFileReader.openReader(in, datumReader);
  if (LOG.isDebugEnabled()) {
    LOG.debug("Loaded SCHEMA: {}", dataFileReader.getSchema());
  }
  end = split.getStart() + split.getLength();
  recordsReadSinceLastSync = 0;
  return dataFileReader;
}
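DataFileReader.openReader accepts any SeekableInput, which is how the snippet above plugs in Flink's FSDataInputStreamWrapper. A hedged sketch of the same call against a local file, using Avro's own SeekableFileInput (class name illustrative):

import java.io.File;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.SeekableFileInput;
import org.apache.avro.file.SeekableInput;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;

public class OpenReaderSketch {

  public static DataFileReader<GenericRecord> open(File file) throws Exception {
    // SeekableFileInput is Avro's SeekableInput over a local file; Flink's
    // FSDataInputStreamWrapper plays the same role for its own file systems.
    SeekableInput in = new SeekableFileInput(file);
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    // openReader returns a FileReader; for a regular Avro data file it is a DataFileReader.
    return (DataFileReader<GenericRecord>) DataFileReader.openReader(in, datumReader);
  }
}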