Use of org.apache.avro.file.DataFileStream in project hive by apache.
The class AvroLazyObjectInspector, method retrieveSchemaFromBytes.
/**
 * Retrieve the schema from the given bytes.
 *
 * @param data serialized Avro container-file bytes from which to read the schema
 * @return the retrieved {@link Schema schema}
 */
private Schema retrieveSchemaFromBytes(byte[] data) {
  ByteArrayInputStream bais = new ByteArrayInputStream(data);
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  Schema schema = null;
  try {
    // dfs is AutoCloseable
    @SuppressWarnings("resource")
    DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, reader);
    schema = dfs.getSchema();
  } catch (IOException ioe) {
    throw new AvroObjectInspectorException("An error occurred retrieving schema from bytes", ioe);
  }
  return schema;
}
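The snippet above suppresses the resource warning and never closes the DataFileStream. A minimal variant that releases the stream via try-with-resources (a sketch only, reusing the AvroObjectInspectorException wrapper from the Hive code above; it is not the project's implementation) could look like this:

private Schema retrieveSchemaFromBytes(byte[] data) {
  ByteArrayInputStream bais = new ByteArrayInputStream(data);
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  // DataFileStream is Closeable, so try-with-resources closes it even if getSchema() fails.
  try (DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, reader)) {
    return dfs.getSchema();
  } catch (IOException ioe) {
    throw new AvroObjectInspectorException("An error occurred retrieving schema from bytes", ioe);
  }
}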
Use of org.apache.avro.file.DataFileStream in project haivvreo by jghoman.
The class TestThatEvolvedSchemasActAsWeWant, method resolvedSchemasShouldReturnReaderSchema.
@Test
public void resolvedSchemasShouldReturnReaderSchema() throws IOException {
  // Need to verify that when reading a datum with an updated reader schema
  // that the datum then returns the reader schema as its own, since we
  // depend on this behavior in order to avoid re-encoding the datum
  // in the serde.
  String v0 = "{\n"
      + " \"namespace\": \"com.linkedin.haivvreo\",\n"
      + " \"name\": \"SomeStuff\",\n"
      + " \"type\": \"record\",\n"
      + " \"fields\": [\n"
      + " {\n"
      + " \"name\":\"v0\",\n"
      + " \"type\":\"string\"\n"
      + " }\n"
      + " ]\n"
      + "}";
  String v1 = "{\n"
      + " \"namespace\": \"com.linkedin.haivvreo\",\n"
      + " \"name\": \"SomeStuff\",\n"
      + " \"type\": \"record\",\n"
      + " \"fields\": [\n"
      + " {\n"
      + " \"name\":\"v0\",\n"
      + " \"type\":\"string\"\n"
      + " },\n"
      + " {\n"
      + " \"name\":\"v1\",\n"
      + " \"type\":\"string\",\n"
      + " \"default\":\"v1_default\""
      + " }\n"
      + " ]\n"
      + "}";
  Schema[] schemas = { Schema.parse(v0), Schema.parse(v1) };
  // Encode a record with the v0 schema.
  GenericRecord record = new GenericData.Record(schemas[0]);
  record.put("v0", "v0 value");
  assertTrue(GenericData.get().validate(schemas[0], record));
  // Write the datum out to a stream.
  GenericDatumWriter<GenericRecord> gdw = new GenericDatumWriter<GenericRecord>(schemas[0]);
  DataFileWriter<GenericRecord> dfw = new DataFileWriter<GenericRecord>(gdw);
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  dfw.create(schemas[0], baos);
  dfw.append(record);
  dfw.close();
  // Read it back with the evolved v1 schema set as the expected (reader) schema.
  ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
  GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>();
  gdr.setExpected(schemas[1]);
  DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, gdr);
  assertTrue(dfs.hasNext());
  GenericRecord next = dfs.next();
  assertEquals("v0 value", next.get("v0").toString());
  assertEquals("v1_default", next.get("v1").toString());
  // Now the most important check - when we query this record for its schema,
  // we should get back the latest, reader schema:
  assertEquals(schemas[1], next.getSchema());
}
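One side note: Schema.parse(String) is deprecated in more recent Avro releases. With a current Avro, the two schemas would typically be parsed with Schema.Parser instead, using a separate parser per schema since a single Parser instance rejects a second definition of the same record name:

// Sketch for newer Avro versions; separate Parser instances avoid the name-redefinition check.
Schema[] schemas = { new Schema.Parser().parse(v0), new Schema.Parser().parse(v1) };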
Use of org.apache.avro.file.DataFileStream in project pinot by linkedin.
The class SegmentTestUtils, method getColumnNamesFromAvro.
public static List<String> getColumnNamesFromAvro(File avro) throws FileNotFoundException, IOException {
  List<String> ret = new ArrayList<String>();
  DataFileStream<GenericRecord> dataStream =
      new DataFileStream<GenericRecord>(new FileInputStream(avro), new GenericDatumReader<GenericRecord>());
  for (final Field field : dataStream.getSchema().getFields()) {
    ret.add(field.name());
  }
  return ret;
}
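The DataFileStream (and the underlying FileInputStream) opened here is never closed. Since the input is a local File, a sketch using org.apache.avro.file.DataFileReader with try-with-resources avoids the leak; this is an alternative for illustration, not the Pinot code:

public static List<String> getColumnNamesFromAvro(File avro) throws IOException {
  List<String> columnNames = new ArrayList<String>();
  // DataFileReader reads the Avro container file directly and is Closeable.
  try (DataFileReader<GenericRecord> reader =
      new DataFileReader<GenericRecord>(avro, new GenericDatumReader<GenericRecord>())) {
    for (Field field : reader.getSchema().getFields()) {
      columnNames.add(field.name());
    }
  }
  return columnNames;
}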
Use of org.apache.avro.file.DataFileStream in project pinot by linkedin.
The class SegmentTestUtils, method extractSchemaFromAvro.
public static Schema extractSchemaFromAvro(File avroFile, Map<String, FieldType> fieldTypeMap, TimeUnit granularity) throws IOException {
  DataFileStream<GenericRecord> dataStream =
      new DataFileStream<>(new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>());
  Schema schema = new Schema();
  for (final Field field : dataStream.getSchema().getFields()) {
    final String columnName = field.name();
    FieldType fieldType = fieldTypeMap.get(columnName);
    Preconditions.checkNotNull(fieldType);
    switch (fieldType) {
      case TIME:
        final TimeGranularitySpec gSpec = new TimeGranularitySpec(getColumnType(field), granularity, columnName);
        final TimeFieldSpec fSpec = new TimeFieldSpec(gSpec);
        schema.addField(fSpec);
        continue;
      case DIMENSION:
        final FieldSpec dimensionFieldSpec = new DimensionFieldSpec(columnName, getColumnType(field), isSingleValueField(field));
        schema.addField(dimensionFieldSpec);
        continue;
      case METRIC:
        final FieldSpec metricFieldSpec = new MetricFieldSpec(columnName, getColumnType(field));
        schema.addField(metricFieldSpec);
        continue;
      default:
        throw new UnsupportedOperationException("Unsupported field type: " + fieldType);
    }
  }
  dataStream.close();
  return schema;
}
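For context, the Schema being assembled here is Pinot's schema class (built with addField), not Avro's, and fieldTypeMap tells the helper how to classify each Avro column. A hypothetical invocation might look like the following; the column names and Avro file are made up for illustration, and FieldType is the Pinot FieldSpec.FieldType referenced above:

Map<String, FieldType> fieldTypeMap = new HashMap<String, FieldType>();
fieldTypeMap.put("daysSinceEpoch", FieldType.TIME);   // hypothetical time column
fieldTypeMap.put("country", FieldType.DIMENSION);     // hypothetical dimension column
fieldTypeMap.put("clicks", FieldType.METRIC);         // hypothetical metric column
Schema pinotSchema = SegmentTestUtils.extractSchemaFromAvro(new File("data.avro"), fieldTypeMap, TimeUnit.DAYS);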
Use of org.apache.avro.file.DataFileStream in project voldemort by voldemort.
The class AvroUtils, method getSchemaFromPath.
/**
 * Pull the schema off of the given file (if it is a file). If it is a
 * directory, then pull schemas off of all subfiles, and check that they are
 * all the same schema. If so, return that schema; otherwise throw an
 * exception.
 *
 * @param fs The filesystem to use
 * @param path The path from which to get the schema
 * @return The schema of this file or all its subfiles
 * @throws IOException
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
private static Schema getSchemaFromPath(FileSystem fs, Path path) {
  try {
    if (fs.isFile(path)) {
      BufferedInputStream inStream = null;
      try {
        inStream = new BufferedInputStream(fs.open(path));
      } catch (IOException e1) {
        throw new RuntimeException("Unable to open " + path, e1);
      }
      GenericDatumReader datum = new GenericDatumReader();
      DataFileStream reader = null;
      try {
        reader = new DataFileStream(inStream, datum);
      } catch (IOException e) {
        throw new RuntimeException("Invalid avro format, path " + path, e);
      }
      return reader.getSchema();
    } else {
      FileStatus[] statuses = null;
      if (fs.isDirectory(path)) {
        // this is a directory, get schemas from all subfiles
        statuses = fs.listStatus(path);
        if (statuses == null || statuses.length == 0)
          throw new IllegalArgumentException("No files in directory " + path);
      } else {
        // this is a wildcard path, get schemas from all matched files
        statuses = fs.globStatus(path);
        if (statuses == null || statuses.length == 0)
          throw new IllegalArgumentException("No matches for path pattern " + path);
      }
      List<Schema> schemas = new ArrayList<Schema>();
      for (FileStatus status : statuses) {
        if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
          schemas.add(getSchemaFromPath(fs, status.getPath()));
        }
      }
      // now check that all the schemas are the same
      if (schemas.size() > 0) {
        Schema schema = schemas.get(0);
        for (int i = 1; i < schemas.size(); i++) {
          if (!schema.equals(schemas.get(i)))
            throw new IllegalArgumentException("The directory " + path + " contains heterogeneous schemas: found both '"
                + schema + "' and '" + schemas.get(i) + "'.");
        }
        return schema;
      } else {
        throw new IllegalArgumentException("No valid metadata file found for path " + path);
      }
    }
  } catch (Exception e) {
    throw new RuntimeException("Error getting schema for path " + path, e);
  }
}
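A short sketch of the FileSystem/Path setup such a helper expects (the HDFS location is hypothetical, and getSchemaFromPath is private to AvroUtils, so this only illustrates how it would be driven from within that class):

Configuration conf = new Configuration();
// May point at a single file, a directory of Avro files, or a glob pattern.
Path input = new Path("hdfs:///data/events");  // hypothetical location
FileSystem fs = input.getFileSystem(conf);
Schema schema = getSchemaFromPath(fs, input);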