Use of org.apache.avro.file.DataFileWriter in project pinot by linkedin.
From class PinotSegmentToAvroConverter, method convert:
@Override
public void convert() throws Exception {
  PinotSegmentRecordReader recordReader = new PinotSegmentRecordReader(new File(_segmentDir));
  try {
    recordReader.init();
    Schema avroSchema = buildAvroSchemaFromPinotSchema(recordReader.getSchema());
    try (DataFileWriter<Record> recordWriter = new DataFileWriter<>(new GenericDatumWriter<Record>(avroSchema))) {
      recordWriter.create(avroSchema, new File(_outputFile));
      while (recordReader.hasNext()) {
        GenericRow row = recordReader.next();
        Record record = new Record(avroSchema);
        for (String field : row.getFieldNames()) {
          Object value = row.getValue(field);
          if (value instanceof Object[]) {
            // Multi-valued columns arrive as Object[]; Avro array fields expect a Collection.
            record.put(field, Arrays.asList((Object[]) value));
          } else {
            record.put(field, value);
          }
        }
        recordWriter.append(record);
      }
    }
  } finally {
    recordReader.close();
  }
}
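The converter writes through the standard DataFileWriter sequence: construct it over a GenericDatumWriter, call create with the schema and destination file, then append one record per Pinot row. As a rough read-back sketch (the class name and path argument below are illustrative, not part of the Pinot code), the produced file could be spot-checked with DataFileReader and a GenericDatumReader:

import java.io.File;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class AvroReadBackCheck {
  public static void main(String[] args) throws Exception {
    // Illustrative path argument; stands in for the converter's _outputFile.
    File avroFile = new File(args[0]);
    try (DataFileReader<GenericRecord> reader =
        new DataFileReader<>(avroFile, new GenericDatumReader<GenericRecord>())) {
      while (reader.hasNext()) {
        // Each Avro record corresponds to one GenericRow appended by the converter above.
        System.out.println(reader.next());
      }
    }
  }
}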
Use of org.apache.avro.file.DataFileWriter in project pinot by linkedin.
From class PregeneratedHllTest, method createAvroWithHll:
public File createAvroWithHll(File newAvroFile, String inputAvro, String column, int log2m) throws IOException {
  String filePath = TestUtils.getFileFromResourceUrl(getClass().getClassLoader().getResource(inputAvro));
  try (DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(new File(filePath))) {
    Schema currentSchema = avroReader.getSchema();
    List<Schema.Field> fields = currentSchema.getFields();
    // Schema.Field instances cannot be reused in a second schema, so copy each one.
    List<Schema.Field> newFieldList = new ArrayList<>(fields.size());
    for (Schema.Field field : fields) {
      newFieldList.add(new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultValue()));
    }
    final String hllColumnName = column + "_hll";
    newFieldList.add(new Schema.Field(hllColumnName, Schema.create(Schema.Type.STRING), null, null));
    Schema updatedSchema = Schema.createRecord("hllschema", "doc", this.getClass().getName(), false);
    updatedSchema.setFields(newFieldList);
    try (DataFileWriter<GenericData.Record> writer = new DataFileWriter<GenericData.Record>(new GenericDatumWriter<GenericData.Record>(updatedSchema))) {
      writer.create(updatedSchema, newAvroFile);
      while (avroReader.hasNext()) {
        GenericRecord record = avroReader.next();
        GenericData.Record newRecord = new GenericData.Record(updatedSchema);
        for (Schema.Field field : fields) {
          newRecord.put(field.name(), record.get(field.name()));
        }
        // Add the pre-generated HLL value for the requested column as a string field.
        newRecord.put(hllColumnName, HllUtil.singleValueHllAsString(log2m, record.get(column)));
        writer.append(newRecord);
      }
    }
  }
  return newAvroFile;
}
Use of org.apache.avro.file.DataFileWriter in project crunch by cloudera.
From class MultiAvroSchemaJoinTest, method setUp:
@Before
public void setUp() throws Exception {
  this.personFile = File.createTempFile("person", ".avro");
  this.employeeFile = File.createTempFile("employee", ".avro");
  DatumWriter<Person> pdw = new SpecificDatumWriter<Person>();
  DataFileWriter<Person> pfw = new DataFileWriter<Person>(pdw);
  pfw.create(Person.SCHEMA$, personFile);
  Person p1 = new Person();
  p1.setName("Josh");
  p1.setAge(19);
  p1.setSiblingnames(ImmutableList.<CharSequence>of("Kate", "Mike"));
  pfw.append(p1);
  Person p2 = new Person();
  p2.setName("Kate");
  p2.setAge(17);
  p2.setSiblingnames(ImmutableList.<CharSequence>of("Josh", "Mike"));
  pfw.append(p2);
  Person p3 = new Person();
  p3.setName("Mike");
  p3.setAge(12);
  p3.setSiblingnames(ImmutableList.<CharSequence>of("Josh", "Kate"));
  pfw.append(p3);
  pfw.close();
  DatumWriter<Employee> edw = new SpecificDatumWriter<Employee>();
  DataFileWriter<Employee> efw = new DataFileWriter<Employee>(edw);
  efw.create(Employee.SCHEMA$, employeeFile);
  Employee e1 = new Employee();
  e1.setName("Kate");
  e1.setSalary(100000);
  e1.setDepartment("Marketing");
  efw.append(e1);
  efw.close();
}
Use of org.apache.avro.file.DataFileWriter in project sling by apache.
From class AvroContentSerializer, method exportToStream:
@Override
public void exportToStream(ResourceResolver resourceResolver, DistributionExportOptions options, OutputStream outputStream) throws DistributionException {
  DatumWriter<AvroShallowResource> datumWriter = new SpecificDatumWriter<AvroShallowResource>(AvroShallowResource.class);
  DataFileWriter<AvroShallowResource> writer = new DataFileWriter<AvroShallowResource>(datumWriter);
  try {
    writer.create(schema, outputStream);
  } catch (IOException e) {
    throw new DistributionException(e);
  }
  try {
    DistributionExportFilter filter = options.getFilter();
    for (DistributionExportFilter.TreeFilter treeFilter : filter.getNodeFilters()) {
      String path = treeFilter.getPath();
      Resource resource = resourceResolver.getResource(path);
      AvroShallowResource avroShallowResource = getAvroShallowResource(treeFilter, filter.getPropertyFilter(), resource);
      writer.append(avroShallowResource);
    }
    outputStream.flush();
  } catch (Exception e) {
    throw new DistributionException(e);
  } finally {
    try {
      writer.close();
    } catch (IOException e) {
      // do nothing
    }
  }
}
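Unlike the file-based examples, this serializer targets an arbitrary OutputStream, so the natural read-side counterpart is DataFileStream over an InputStream rather than DataFileReader over a File. A minimal sketch, where the helper method and the byte-array source are illustrative and not part of the Sling code:

import java.io.ByteArrayInputStream;
import java.io.IOException;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class StreamReadBack {
  // Prints every record from an Avro container previously written to a byte stream,
  // e.g. the bytes that exportToStream wrote to its OutputStream.
  static void dump(byte[] exported) throws IOException {
    try (DataFileStream<GenericRecord> stream = new DataFileStream<>(
        new ByteArrayInputStream(exported), new GenericDatumReader<GenericRecord>())) {
      while (stream.hasNext()) {
        System.out.println(stream.next());
      }
    }
  }
}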
Use of org.apache.avro.file.DataFileWriter in project nifi by apache.
From class TestConvertAvroToORC, method test_onTrigger_complex_record:
@Test
public void test_onTrigger_complex_record() throws Exception {
  Map<String, Double> mapData1 = new TreeMap<String, Double>() {
    {
      put("key1", 1.0);
      put("key2", 2.0);
    }
  };
  GenericData.Record record = TestNiFiOrcUtils.buildComplexAvroRecord(10, mapData1, "DEF", 3.0f, Arrays.asList(10, 20));
  DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
  DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  fileWriter.create(record.getSchema(), out);
  fileWriter.append(record);
  // Put another record in
  Map<String, Double> mapData2 = new TreeMap<String, Double>() {
    {
      put("key1", 3.0);
      put("key2", 4.0);
    }
  };
  record = TestNiFiOrcUtils.buildComplexAvroRecord(null, mapData2, "XYZ", 4L, Arrays.asList(100, 200));
  fileWriter.append(record);
  fileWriter.flush();
  fileWriter.close();
  out.close();
  Map<String, String> attributes = new HashMap<String, String>() {
    {
      put(CoreAttributes.FILENAME.key(), "test");
    }
  };
  runner.enqueue(out.toByteArray(), attributes);
  runner.run();
  runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
  // Write the flow file out to disk, since the ORC Reader needs a path
  MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
  assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS complex_record "
      + "(myInt INT, myMap MAP<STRING, DOUBLE>, myEnum STRING, myLongOrFloat UNIONTYPE<BIGINT, FLOAT>, myIntList ARRAY<INT>)"
      + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
  assertEquals("2", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
  assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
  byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
  FileOutputStream fos = new FileOutputStream("target/test1.orc");
  fos.write(resultContents);
  fos.flush();
  fos.close();
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);
  Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
  RecordReader rows = reader.rows();
  Object o = rows.next(null);
  assertNotNull(o);
  assertTrue(o instanceof OrcStruct);
  TypeInfo resultSchema = TestNiFiOrcUtils.buildComplexOrcSchema();
  StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);
  // Check some fields in the first row
  Object intFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myInt"));
  assertTrue(intFieldObject instanceof IntWritable);
  assertEquals(10, ((IntWritable) intFieldObject).get());
  Object mapFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myMap"));
  assertTrue(mapFieldObject instanceof Map);
  Map map = (Map) mapFieldObject;
  Object mapValue = map.get(new Text("key1"));
  assertNotNull(mapValue);
  assertTrue(mapValue instanceof DoubleWritable);
  assertEquals(1.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);
  mapValue = map.get(new Text("key2"));
  assertNotNull(mapValue);
  assertTrue(mapValue instanceof DoubleWritable);
  assertEquals(2.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);
}
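Across all five examples the write path is the same: wrap a DatumWriter in a DataFileWriter, call create with the schema and a File or OutputStream, append each record, and close the writer (directly or via try-with-resources). A minimal self-contained sketch of that shared pattern, using a hypothetical one-field schema:

import java.io.File;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class DataFileWriterPattern {
  public static void main(String[] args) throws Exception {
    // Hypothetical schema, only to keep the sketch self-contained.
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"Example\",\"fields\":"
            + "[{\"name\":\"name\",\"type\":\"string\"}]}");
    try (DataFileWriter<GenericRecord> writer =
        new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
      // create() writes the container header; append() adds one datum at a time.
      writer.create(schema, new File("example.avro"));
      GenericRecord record = new GenericData.Record(schema);
      record.put("name", "value");
      writer.append(record);
    } // close() via try-with-resources flushes the last block and finalizes the file.
  }
}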