Use of org.apache.avro.file.DataFileStream in project components by Talend.
The class AvroHdfsFileSink, method mergeOutput.
@Override
protected boolean mergeOutput(FileSystem fs, String sourceFolder, String targetFile) {
    try (DataFileWriter<GenericRecord> writer =
            new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>())) {
        FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
        Schema schema = null;
        String inputCodec = null;
        OutputStream output = new BufferedOutputStream(fs.create(new Path(targetFile)));
        for (FileStatus sourceStatus : sourceStatuses) {
            try (DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(
                    new BufferedInputStream(fs.open(sourceStatus.getPath())),
                    new GenericDatumReader<GenericRecord>())) {
                // On the first source file, copy its schema, user metadata and
                // codec onto the writer, then open the target file.
                if (schema == null) {
                    schema = reader.getSchema();
                    for (String key : reader.getMetaKeys()) {
                        if (!DataFileWriter.isReservedMeta(key)) {
                            writer.setMeta(key, reader.getMeta(key));
                        }
                    }
                    inputCodec = reader.getMetaString(DataFileConstants.CODEC);
                    if (inputCodec == null) {
                        inputCodec = DataFileConstants.NULL_CODEC;
                    }
                    writer.setCodec(CodecFactory.fromString(inputCodec));
                    writer.create(schema, output);
                }
                // Append the file's blocks as-is; false means do not recompress.
                writer.appendAllFrom(reader, false);
            }
        }
    } catch (Exception e) {
        LOG.error("Error when merging files in {}.\n{}", sourceFolder, e.getMessage());
        return false;
    }
    return true;
}
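For context, a minimal sketch (not part of the Talend code) of verifying the merge by reading the target file back with a DataFileStream. It assumes the same fs handle, targetFile path and LOG logger as above, and an enclosing method that declares IOException; DataFileStream is Iterable, so the merged records can be counted with a for-each loop.

try (DataFileStream<GenericRecord> verify = new DataFileStream<GenericRecord>(
        new BufferedInputStream(fs.open(new Path(targetFile))),
        new GenericDatumReader<GenericRecord>())) {
    // Stream the merged records straight off the container file.
    long count = 0;
    for (GenericRecord merged : verify) {
        count++;
    }
    LOG.info("Merged file {} contains {} records with schema {}", targetFile, count, verify.getSchema());
}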
Use of org.apache.avro.file.DataFileStream in project hive by apache.
The class AvroLazyObjectInspector, method retrieveSchemaFromBytes.
/**
 * Retrieve the schema from the given bytes, which are expected to be in the
 * Avro data file (container) format.
 *
 * @param data the Avro data file bytes to read the schema from
 * @return the retrieved {@link Schema schema}
 */
private Schema retrieveSchemaFromBytes(byte[] data) {
    ByteArrayInputStream bais = new ByteArrayInputStream(data);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    Schema schema = null;
    try {
        // dfs is AutoCloseable; it wraps only a ByteArrayInputStream, which
        // holds no system resources, so it is deliberately left unclosed.
        @SuppressWarnings("resource")
        DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, reader);
        schema = dfs.getSchema();
    } catch (IOException ioe) {
        throw new AvroObjectInspectorException("An error occurred retrieving schema from bytes", ioe);
    }
    return schema;
}
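For context, a minimal sketch of producing bytes that retrieveSchemaFromBytes can decode (the names schema, writer and baos are illustrative, not from the inspector, and the enclosing method is assumed to declare throws IOException). DataFileWriter.create writes the container header, which embeds the schema, so the schema can be recovered even before any records are appended.

ByteArrayOutputStream baos = new ByteArrayOutputStream();
DataFileWriter<GenericRecord> writer =
        new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(schema));
writer.create(schema, baos); // writes the header, which embeds the schema
writer.close();
// retrieveSchemaFromBytes(baos.toByteArray()) would now return a Schema equal to schema.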
Use of org.apache.avro.file.DataFileStream in project hive by apache.
The class TestThatEvolvedSchemasActAsWeWant, method resolvedSchemasShouldReturnReaderSchema.
@Test
public void resolvedSchemasShouldReturnReaderSchema() throws IOException {
    // Need to verify that when reading a datum with an updated reader schema,
    // the datum then returns the reader schema as its own, since we depend on
    // this behavior in order to avoid re-encoding the datum in the serde.
    String v0 = "{\n"
            + " \"namespace\": \"org.apache.hadoop.hive\",\n"
            + " \"name\": \"SomeStuff\",\n"
            + " \"type\": \"record\",\n"
            + " \"fields\": [\n"
            + " {\n"
            + " \"name\":\"v0\",\n"
            + " \"type\":\"string\"\n"
            + " }\n"
            + " ]\n"
            + "}";
    String v1 = "{\n"
            + " \"namespace\": \"org.apache.hadoop.hive\",\n"
            + " \"name\": \"SomeStuff\",\n"
            + " \"type\": \"record\",\n"
            + " \"fields\": [\n"
            + " {\n"
            + " \"name\":\"v0\",\n"
            + " \"type\":\"string\"\n"
            + " },\n"
            + " {\n"
            + " \"name\":\"v1\",\n"
            + " \"type\":\"string\",\n"
            + " \"default\":\"v1_default\""
            + " }\n"
            + " ]\n"
            + "}";
    Schema[] schemas = { AvroSerdeUtils.getSchemaFor(v0), AvroSerdeUtils.getSchemaFor(v1) };
    // Encode a record with the v0 schema.
    GenericRecord record = new GenericData.Record(schemas[0]);
    record.put("v0", "v0 value");
    assertTrue(GenericData.get().validate(schemas[0], record));
    // Write the datum out to a stream.
    GenericDatumWriter<GenericRecord> gdw = new GenericDatumWriter<GenericRecord>(schemas[0]);
    DataFileWriter<GenericRecord> dfw = new DataFileWriter<GenericRecord>(gdw);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    dfw.create(schemas[0], baos);
    dfw.append(record);
    dfw.close();
    // Read it back with the evolved v1 schema as the expected (reader) schema.
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>();
    gdr.setExpected(schemas[1]);
    DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, gdr);
    assertTrue(dfs.hasNext());
    GenericRecord next = dfs.next();
    assertEquals("v0 value", next.get("v0").toString());
    assertEquals("v1_default", next.get("v1").toString());
    // Now the most important check: when we query this record for its schema,
    // we should get back the latest, reader schema.
    assertEquals(schemas[1], next.getSchema());
}
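For contrast, a brief sketch (not part of the test) of reading the same bytes without calling setExpected: the DataFileStream then falls back to the writer schema stored in the file header, so the record reports schemas[0] as its own and has no v1 field.

DataFileStream<GenericRecord> plain = new DataFileStream<GenericRecord>(
        new ByteArrayInputStream(baos.toByteArray()),
        new GenericDatumReader<GenericRecord>());
GenericRecord asWritten = plain.next();
assertEquals(schemas[0], asWritten.getSchema());
assertNull(asWritten.get("v1")); // the field only exists in the reader schema
plain.close();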
Use of org.apache.avro.file.DataFileStream in project haivvreo by jghoman.
The class TestThatEvolvedSchemasActAsWeWant, method resolvedSchemasShouldReturnReaderSchema.
@Test
public void resolvedSchemasShouldReturnReaderSchema() throws IOException {
    // Need to verify that when reading a datum with an updated reader schema,
    // the datum then returns the reader schema as its own, since we depend on
    // this behavior in order to avoid re-encoding the datum in the serde.
    String v0 = "{\n"
            + " \"namespace\": \"com.linkedin.haivvreo\",\n"
            + " \"name\": \"SomeStuff\",\n"
            + " \"type\": \"record\",\n"
            + " \"fields\": [\n"
            + " {\n"
            + " \"name\":\"v0\",\n"
            + " \"type\":\"string\"\n"
            + " }\n"
            + " ]\n"
            + "}";
    String v1 = "{\n"
            + " \"namespace\": \"com.linkedin.haivvreo\",\n"
            + " \"name\": \"SomeStuff\",\n"
            + " \"type\": \"record\",\n"
            + " \"fields\": [\n"
            + " {\n"
            + " \"name\":\"v0\",\n"
            + " \"type\":\"string\"\n"
            + " },\n"
            + " {\n"
            + " \"name\":\"v1\",\n"
            + " \"type\":\"string\",\n"
            + " \"default\":\"v1_default\""
            + " }\n"
            + " ]\n"
            + "}";
    Schema[] schemas = { Schema.parse(v0), Schema.parse(v1) };
    // Encode a record with the v0 schema.
    GenericRecord record = new GenericData.Record(schemas[0]);
    record.put("v0", "v0 value");
    assertTrue(GenericData.get().validate(schemas[0], record));
    // Write the datum out to a stream.
    GenericDatumWriter<GenericRecord> gdw = new GenericDatumWriter<GenericRecord>(schemas[0]);
    DataFileWriter<GenericRecord> dfw = new DataFileWriter<GenericRecord>(gdw);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    dfw.create(schemas[0], baos);
    dfw.append(record);
    dfw.close();
    // Read it back with the evolved v1 schema as the expected (reader) schema.
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>();
    gdr.setExpected(schemas[1]);
    DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, gdr);
    assertTrue(dfs.hasNext());
    GenericRecord next = dfs.next();
    assertEquals("v0 value", next.get("v0").toString());
    assertEquals("v1_default", next.get("v1").toString());
    // Now the most important check: when we query this record for its schema,
    // we should get back the latest, reader schema.
    assertEquals(schemas[1], next.getSchema());
}
Use of org.apache.avro.file.DataFileStream in project voldemort by voldemort.
The class AvroUtils, method getSchemaFromPath.
/**
 * Pull the schema off of the given file (if it is a file). If it is a
 * directory, pull the schemas off of all subfiles and check that they are
 * all the same schema. If so, return that schema; otherwise throw an
 * exception.
 *
 * @param fs The filesystem to use
 * @param path The path from which to get the schema
 * @return The schema of this file or all its subfiles
 * @throws RuntimeException if the schema cannot be read or the subfiles disagree
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
private static Schema getSchemaFromPath(FileSystem fs, Path path) {
    try {
        if (fs.isFile(path)) {
            // This is a plain file: read the schema from the container header.
            BufferedInputStream inStream = null;
            try {
                inStream = new BufferedInputStream(fs.open(path));
            } catch (IOException e1) {
                throw new RuntimeException("Unable to open " + path, e1);
            }
            GenericDatumReader datum = new GenericDatumReader();
            DataFileStream reader = null;
            try {
                reader = new DataFileStream(inStream, datum);
            } catch (IOException e) {
                throw new RuntimeException("Invalid avro format, path " + path, e);
            }
            return reader.getSchema();
        } else {
            FileStatus[] statuses = null;
            if (fs.isDirectory(path)) {
                // This is a directory: get schemas from all subfiles.
                statuses = fs.listStatus(path);
                if (statuses == null || statuses.length == 0)
                    throw new IllegalArgumentException("No files in directory " + path);
            } else {
                // This is a wildcard path: get schemas from all matched files.
                statuses = fs.globStatus(path);
                if (statuses == null || statuses.length == 0)
                    throw new IllegalArgumentException("No matches for path pattern " + path);
            }
            List<Schema> schemas = new ArrayList<Schema>();
            for (FileStatus status : statuses) {
                if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    schemas.add(getSchemaFromPath(fs, status.getPath()));
                }
            }
            // Now check that all the schemas are the same.
            if (schemas.size() > 0) {
                Schema schema = schemas.get(0);
                for (int i = 1; i < schemas.size(); i++) {
                    if (!schema.equals(schemas.get(i)))
                        throw new IllegalArgumentException("The directory " + path
                                + " contains heterogeneous schemas: found both '" + schema
                                + "' and '" + schemas.get(i) + "'.");
                }
                return schema;
            } else {
                throw new IllegalArgumentException("No valid metadata file found for path " + path);
            }
        }
    } catch (Exception e) {
        throw new RuntimeException("Error getting schema for path " + path, e);
    }
}
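Note that the single-file branch above leaves inStream and the DataFileStream open. A minimal try-with-resources variant of that branch (the helper name readSchema is illustrative, not from the Voldemort code) releases the stream even when reading fails:

private static Schema readSchema(FileSystem fs, Path file) {
    try (DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(
            new BufferedInputStream(fs.open(file)),
            new GenericDatumReader<GenericRecord>())) {
        // Closing the DataFileStream also closes the wrapped input stream.
        return reader.getSchema();
    } catch (IOException e) {
        throw new RuntimeException("Invalid avro format, path " + file, e);
    }
}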