
Example 61 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project pinot by linkedin.

The class PinotSegmentToAvroConverter, method convert: reads every row of a Pinot segment and writes it out as an Avro data file, converting multi-value columns to Avro lists.

@Override
public void convert() throws Exception {
    PinotSegmentRecordReader recordReader = new PinotSegmentRecordReader(new File(_segmentDir));
    try {
        recordReader.init();
        Schema avroSchema = buildAvroSchemaFromPinotSchema(recordReader.getSchema());
        try (DataFileWriter<Record> recordWriter = new DataFileWriter<>(new GenericDatumWriter<Record>(avroSchema))) {
            // create() writes the Avro file header, including the schema, to the output file
            recordWriter.create(avroSchema, new File(_outputFile));
            while (recordReader.hasNext()) {
                GenericRow row = recordReader.next();
                Record record = new Record(avroSchema);
                for (String field : row.getFieldNames()) {
                    Object value = row.getValue(field);
                    if (value instanceof Object[]) {
                        // Multi-value Pinot columns arrive as Object[]; Avro array fields expect a java.util.List
                        record.put(field, Arrays.asList((Object[]) value));
                    } else {
                        record.put(field, value);
                    }
                }
                recordWriter.append(record);
            }
        }
    } finally {
        recordReader.close();
    }
}
Also used: GenericRow (com.linkedin.pinot.core.data.GenericRow), Schema (org.apache.avro.Schema), DataFileWriter (org.apache.avro.file.DataFileWriter), Record (org.apache.avro.generic.GenericData.Record), File (java.io.File), PinotSegmentRecordReader (com.linkedin.pinot.core.data.readers.PinotSegmentRecordReader)
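To sanity-check the conversion, the file written above can be read back with the generic API; a minimal sketch (not part of the Pinot code, reusing _outputFile from above):

try (DataFileReader<GenericRecord> reader = new DataFileReader<>(new File(_outputFile), new GenericDatumReader<GenericRecord>())) {
    // The schema travels in the file header, so none is passed to the reader
    long count = 0;
    while (reader.hasNext()) {
        reader.next();
        count++;
    }
    System.out.println("converted " + count + " records");
}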

Example 62 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project pinot by linkedin.

The class PregeneratedHllTest, method createAvroWithHll: copies an existing Avro file into a new one whose schema carries an extra string column holding a pre-computed HLL value per record.

public File createAvroWithHll(File newAvroFile, String inputAvro, String column, int log2m) throws IOException {
    String filePath = TestUtils.getFileFromResourceUrl(getClass().getClassLoader().getResource(inputAvro));
    try (DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(new File(filePath))) {
        Schema currentSchema = avroReader.getSchema();
        List<Schema.Field> fields = currentSchema.getFields();
        List<Schema.Field> newFieldList = new ArrayList<>(fields.size());
        for (Schema.Field field : fields) {
            // Clone each field; a Schema.Field may only be attached to one record schema
            newFieldList.add(new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultValue()));
        }
        final String hllColumnName = column + "_hll";
        // The serialized HLL is carried as a plain string column
        newFieldList.add(new Schema.Field(hllColumnName, Schema.create(Schema.Type.STRING), null, null));
        Schema updatedSchema = Schema.createRecord("hllschema", "doc", this.getClass().getName(), false);
        updatedSchema.setFields(newFieldList);
        try (DataFileWriter<GenericData.Record> writer = new DataFileWriter<GenericData.Record>(new GenericDatumWriter<GenericData.Record>(updatedSchema))) {
            writer.create(updatedSchema, newAvroFile);
            while (avroReader.hasNext()) {
                GenericRecord record = avroReader.next();
                GenericData.Record newRecord = new GenericData.Record(updatedSchema);
                // Copy the original columns, then append the pre-aggregated HLL value
                for (Schema.Field field : fields) {
                    newRecord.put(field.name(), record.get(field.name()));
                }
                newRecord.put(hllColumnName, HllUtil.singleValueHllAsString(log2m, record.get(column)));
                writer.append(newRecord);
            }
        }
    }
    return newAvroFile;
}
Also used: Schema (org.apache.avro.Schema), DataFileWriter (org.apache.avro.file.DataFileWriter), ArrayList (java.util.ArrayList), GenericData (org.apache.avro.generic.GenericData), GenericRecord (org.apache.avro.generic.GenericRecord), File (java.io.File)
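A note on the field-copying loop above: an Avro Schema.Field records its position when it is attached to a record schema, so reusing the original Field objects in updatedSchema would fail with a "Field already used" error. That is why each field is cloned before being added to newFieldList.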

Example 63 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project crunch by cloudera.

The class MultiAvroSchemaJoinTest, method setUp: writes two small Avro files of specific records (Person and Employee) as test fixtures.

@Before
public void setUp() throws Exception {
    this.personFile = File.createTempFile("person", ".avro");
    this.employeeFile = File.createTempFile("employee", ".avro");
    // Write three Person records using the specific (generated-class) API
    DatumWriter<Person> pdw = new SpecificDatumWriter<Person>();
    DataFileWriter<Person> pfw = new DataFileWriter<Person>(pdw);
    pfw.create(Person.SCHEMA$, personFile);
    Person p1 = new Person();
    p1.setName("Josh");
    p1.setAge(19);
    p1.setSiblingnames(ImmutableList.<CharSequence>of("Kate", "Mike"));
    pfw.append(p1);
    Person p2 = new Person();
    p2.setName("Kate");
    p2.setAge(17);
    p2.setSiblingnames(ImmutableList.<CharSequence>of("Josh", "Mike"));
    pfw.append(p2);
    Person p3 = new Person();
    p3.setName("Mike");
    p3.setAge(12);
    p3.setSiblingnames(ImmutableList.<CharSequence>of("Josh", "Kate"));
    pfw.append(p3);
    pfw.close();
    // Write a single Employee record, with a different schema, to a second file
    DatumWriter<Employee> edw = new SpecificDatumWriter<Employee>();
    DataFileWriter<Employee> efw = new DataFileWriter<Employee>(edw);
    efw.create(Employee.SCHEMA$, employeeFile);
    Employee e1 = new Employee();
    e1.setName("Kate");
    e1.setSalary(100000);
    e1.setDepartment("Marketing");
    efw.append(e1);
    efw.close();
}
Also used: Employee (org.apache.crunch.test.Employee), DataFileWriter (org.apache.avro.file.DataFileWriter), Person (org.apache.crunch.test.Person), SpecificDatumWriter (org.apache.avro.specific.SpecificDatumWriter), Before (org.junit.Before)
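For completeness, the fixture files written above can be read back with the specific API; a minimal sketch (not part of the test, assuming the same generated Person class):

DatumReader<Person> pdr = new SpecificDatumReader<Person>(Person.class);
try (DataFileReader<Person> reader = new DataFileReader<Person>(personFile, pdr)) {
    // DataFileReader is iterable, so the records can be walked directly
    for (Person p : reader) {
        System.out.println(p.getName() + " (" + p.getAge() + ")");
    }
}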

Example 64 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project sling by apache.

The class AvroContentSerializer, method exportToStream: serializes filtered Sling resources to an output stream as Avro records.

@Override
public void exportToStream(ResourceResolver resourceResolver, DistributionExportOptions options, OutputStream outputStream) throws DistributionException {
    DatumWriter<AvroShallowResource> datumWriter = new SpecificDatumWriter<AvroShallowResource>(AvroShallowResource.class);
    DataFileWriter<AvroShallowResource> writer = new DataFileWriter<AvroShallowResource>(datumWriter);
    try {
        writer.create(schema, outputStream);
    } catch (IOException e) {
        throw new DistributionException(e);
    }
    try {
        DistributionExportFilter filter = options.getFilter();
        for (DistributionExportFilter.TreeFilter treeFilter : filter.getNodeFilters()) {
            String path = treeFilter.getPath();
            Resource resource = resourceResolver.getResource(path);
            AvroShallowResource avroShallowResource = getAvroShallowResource(treeFilter, filter.getPropertyFilter(), resource);
            writer.append(avroShallowResource);
        }
        outputStream.flush();
    } catch (Exception e) {
        throw new DistributionException(e);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            // ignore: the writer is being closed in a finally block and nothing more can be done
        }
    }
}
Also used: DataFileWriter (org.apache.avro.file.DataFileWriter), Resource (org.apache.sling.api.resource.Resource), DistributionException (org.apache.sling.distribution.common.DistributionException), DistributionExportFilter (org.apache.sling.distribution.serialization.DistributionExportFilter), IOException (java.io.IOException), PersistenceException (org.apache.sling.api.resource.PersistenceException), SpecificDatumWriter (org.apache.avro.specific.SpecificDatumWriter)
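Since DataFileWriter implements Closeable, the manual close() in the finally block can also be written with try-with-resources, as Examples 61 and 62 do; a sketch of the same export in that style (identifiers as above):

try (DataFileWriter<AvroShallowResource> writer = new DataFileWriter<AvroShallowResource>(new SpecificDatumWriter<AvroShallowResource>(AvroShallowResource.class))) {
    writer.create(schema, outputStream);
    DistributionExportFilter filter = options.getFilter();
    for (DistributionExportFilter.TreeFilter treeFilter : filter.getNodeFilters()) {
        Resource resource = resourceResolver.getResource(treeFilter.getPath());
        writer.append(getAvroShallowResource(treeFilter, filter.getPropertyFilter(), resource));
    }
    outputStream.flush();
} catch (Exception e) {
    throw new DistributionException(e);
}

The behavior is the same: closing the DataFileWriter also closes the underlying outputStream, which the original code's finally block does as well.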

Example 65 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project nifi by apache.

The class TestConvertAvroToORC, method test_onTrigger_complex_record: builds a two-record Avro file in memory, runs it through the ConvertAvroToORC processor, and verifies the resulting ORC file's Hive DDL, record count, and field values.

@Test
public void test_onTrigger_complex_record() throws Exception {
    Map<String, Double> mapData1 = new TreeMap<String, Double>() {

        {
            put("key1", 1.0);
            put("key2", 2.0);
        }
    };
    GenericData.Record record = TestNiFiOrcUtils.buildComplexAvroRecord(10, mapData1, "DEF", 3.0f, Arrays.asList(10, 20));
    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    fileWriter.create(record.getSchema(), out);
    fileWriter.append(record);
    // Put another record in
    Map<String, Double> mapData2 = new TreeMap<String, Double>() {

        {
            put("key1", 3.0);
            put("key2", 4.0);
        }
    };
    record = TestNiFiOrcUtils.buildComplexAvroRecord(null, mapData2, "XYZ", 4L, Arrays.asList(100, 200));
    fileWriter.append(record);
    fileWriter.flush();
    fileWriter.close();
    out.close();
    Map<String, String> attributes = new HashMap<String, String>() {

        {
            put(CoreAttributes.FILENAME.key(), "test");
        }
    };
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();
    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
    // Write the flow file out to disk, since the ORC Reader needs a path
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS complex_record " + "(myInt INT, myMap MAP<STRING, DOUBLE>, myEnum STRING, myLongOrFloat UNIONTYPE<BIGINT, FLOAT>, myIntList ARRAY<INT>)" + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
    assertEquals("2", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream("target/test1.orc");
    fos.write(resultContents);
    fos.flush();
    fos.close();
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    Object o = rows.next(null);
    assertNotNull(o);
    assertTrue(o instanceof OrcStruct);
    TypeInfo resultSchema = TestNiFiOrcUtils.buildComplexOrcSchema();
    StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);
    // Check some fields in the first row
    Object intFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myInt"));
    assertTrue(intFieldObject instanceof IntWritable);
    assertEquals(10, ((IntWritable) intFieldObject).get());
    Object mapFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myMap"));
    assertTrue(mapFieldObject instanceof Map);
    Map map = (Map) mapFieldObject;
    Object mapValue = map.get(new Text("key1"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof DoubleWritable);
    assertEquals(1.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);
    mapValue = map.get(new Text("key2"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof DoubleWritable);
    assertEquals(2.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), HashMap (java.util.HashMap), RecordReader (org.apache.hadoop.hive.ql.io.orc.RecordReader), Reader (org.apache.hadoop.hive.ql.io.orc.Reader), DoubleWritable (org.apache.hadoop.io.DoubleWritable), OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct), FileSystem (org.apache.hadoop.fs.FileSystem), GenericRecord (org.apache.avro.generic.GenericRecord), IntWritable (org.apache.hadoop.io.IntWritable), Path (org.apache.hadoop.fs.Path), DataFileWriter (org.apache.avro.file.DataFileWriter), Text (org.apache.hadoop.io.Text), GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter), ByteArrayOutputStream (java.io.ByteArrayOutputStream), TreeMap (java.util.TreeMap), GenericData (org.apache.avro.generic.GenericData), TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo), MockFlowFile (org.apache.nifi.util.MockFlowFile), FileOutputStream (java.io.FileOutputStream), Map (java.util.Map), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector), Test (org.junit.Test)

Aggregations

DataFileWriter (org.apache.avro.file.DataFileWriter): 102 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 58 usages
Schema (org.apache.avro.Schema): 50 usages
GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter): 47 usages
File (java.io.File): 38 usages
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 22 usages
IOException (java.io.IOException): 22 usages
GenericData (org.apache.avro.generic.GenericData): 17 usages
FileOutputStream (java.io.FileOutputStream): 15 usages
Test (org.junit.Test): 14 usages
HashMap (java.util.HashMap): 11 usages
InputStream (java.io.InputStream): 10 usages
SpecificDatumWriter (org.apache.avro.specific.SpecificDatumWriter): 10 usages
ArrayList (java.util.ArrayList): 9 usages
Path (org.apache.hadoop.fs.Path): 9 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 8 usages
OutputStream (java.io.OutputStream): 8 usages
ByteBuffer (java.nio.ByteBuffer): 7 usages
GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 7 usages
MockFlowFile (org.apache.nifi.util.MockFlowFile): 7 usages
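Taken together, the examples share one pattern: wrap a DatumWriter in a DataFileWriter, call create() with a schema and a sink, append() records, and close. A minimal, self-contained round trip with the generic API (the schema and file name below are illustrative, not taken from any project above):

import java.io.File;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class AvroRoundTrip {
    public static void main(String[] args) throws Exception {
        Schema schema = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"User\",\"fields\":[" +
            "{\"name\":\"name\",\"type\":\"string\"}," +
            "{\"name\":\"age\",\"type\":\"int\"}]}");
        File file = new File("users.avro");
        // Write: create() embeds the schema in the file header
        try (DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
            writer.create(schema, file);
            GenericRecord user = new GenericData.Record(schema);
            user.put("name", "Ann");
            user.put("age", 30);
            writer.append(user);
        }
        // Read: no schema needed up front; the reader takes it from the header
        try (DataFileReader<GenericRecord> reader = new DataFileReader<>(file, new GenericDatumReader<GenericRecord>())) {
            for (GenericRecord record : reader) {
                System.out.println(record.get("name") + ", " + record.get("age"));
            }
        }
    }
}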