Search in sources :

Example 26 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project h2o-3 by h2oai.

In the class AvroFileGenerator, see the method generateUnionTypes:

/**
 * Generates a temporary Avro file whose columns are all optional (union-with-null)
 * primitive types. Row 0 holds null in every column so the null branch of each union
 * is exercised; subsequent rows hold values derived from the row index.
 *
 * @param filename name of the file created inside a fresh temp directory
 * @param nrows    number of records to write
 * @return the generated Avro file
 * @throws IOException if the file cannot be created or written
 */
public static File generateUnionTypes(String filename, int nrows) throws IOException {
    File parentDir = Files.createTempDir();
    File f = new File(parentDir, filename);
    DatumWriter<GenericRecord> w = new GenericDatumWriter<GenericRecord>();
    // Based on SchemaBuilder javadoc:
    // * The below two field declarations are equivalent:
    // * <pre>
    // *  .name("f").type().unionOf().nullType().and().longType().endUnion().nullDefault()
    // *  .name("f").type().optional().longType()
    // * </pre>
    Schema schema = SchemaBuilder.builder().record("test_union_types").fields().name("CUString").type().optional().stringType().name("CUBytes").type().optional().bytesType().name("CUInt").type().optional().intType().name("CULong").type().optional().longType().name("CUFloat").type().optional().floatType().name("CUDouble").type().optional().doubleType().name("CUBoolean").type().optional().booleanType().endRecord();
    // try-with-resources guarantees the writer is closed (flushing the file) even on failure
    try (DataFileWriter<GenericRecord> dw = new DataFileWriter<GenericRecord>(w)) {
        dw.create(schema, f);
        for (int i = 0; i < nrows; i++) {
            GenericRecord gr = new GenericData.Record(schema);
            // i == 0 selects the null branch of every union
            gr.put("CUString", i == 0 ? null : String.valueOf(i));
            gr.put("CUBytes", i == 0 ? null : ByteBuffer.wrap(StringUtils.toBytes(i)));
            gr.put("CUInt", i == 0 ? null : i);
            gr.put("CULong", i == 0 ? null : Long.valueOf(i));
            gr.put("CUFloat", i == 0 ? null : Float.valueOf(i));
            gr.put("CUDouble", i == 0 ? null : Double.valueOf(i));
            gr.put("CUBoolean", i == 0 ? null : (i & 1) == 1);
            dw.append(gr);
        }
        return f;
    }
}
Also used : DataFileWriter(org.apache.avro.file.DataFileWriter) Schema(org.apache.avro.Schema) GenericRecord(org.apache.avro.generic.GenericRecord) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)

Example 27 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project h2o-3 by h2oai.

In the class AvroFileGenerator, see the method generatePrimitiveTypes:

/**
 * Generates a temporary Avro file containing one non-nullable column per Avro primitive
 * type (string, bytes, int, long, float, double, boolean, null), with values derived
 * from the row index.
 *
 * @param filename name of the file created inside a fresh temp directory
 * @param nrows    number of records to write
 * @return the generated Avro file
 * @throws IOException if the file cannot be created or written
 */
public static File generatePrimitiveTypes(String filename, int nrows) throws IOException {
    File parentDir = Files.createTempDir();
    File f = new File(parentDir, filename);
    // Write output records
    DatumWriter<GenericRecord> w = new GenericDatumWriter<GenericRecord>();
    Schema schema = SchemaBuilder.builder().record("test_primitive_types").fields().name("CString").type("string").noDefault().name("CBytes").type("bytes").noDefault().name("CInt").type("int").noDefault().name("CLong").type("long").noDefault().name("CFloat").type("float").noDefault().name("CDouble").type("double").noDefault().name("CBoolean").type("boolean").noDefault().name("CNull").type("null").noDefault().endRecord();
    // try-with-resources guarantees the writer is closed (flushing the file) even on failure
    try (DataFileWriter<GenericRecord> dw = new DataFileWriter<GenericRecord>(w)) {
        dw.create(schema, f);
        for (int i = 0; i < nrows; i++) {
            GenericRecord gr = new GenericData.Record(schema);
            gr.put("CString", String.valueOf(i));
            gr.put("CBytes", ByteBuffer.wrap(StringUtils.toBytes(i)));
            gr.put("CInt", i);
            gr.put("CLong", Long.valueOf(i));
            gr.put("CFloat", Float.valueOf(i));
            gr.put("CDouble", Double.valueOf(i));
            gr.put("CBoolean", (i & 1) == 1);
            // the "null" typed column always carries null
            gr.put("CNull", null);
            dw.append(gr);
        }
        return f;
    }
}
Also used : DataFileWriter(org.apache.avro.file.DataFileWriter) Schema(org.apache.avro.Schema) GenericRecord(org.apache.avro.generic.GenericRecord) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)

Example 28 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project h2o-3 by h2oai.

In the class AvroFileGenerator, see the method generateEnumTypes:

/**
 * Generates a temporary Avro file with two enum columns: a required enum ("CEnum") and
 * an optional enum ("CUEnum") that is periodically null. Symbols cycle through the
 * supplied categories by row index.
 *
 * @param filename   name of the file created inside a fresh temp directory
 * @param nrows      number of records to write
 * @param categories exactly two symbol lists, one per enum column
 * @return the generated Avro file
 * @throws IOException if the file cannot be created or written
 */
public static File generateEnumTypes(String filename, int nrows, String[][] categories) throws IOException {
    assert categories.length == 2 : "Needs only 2 columns";
    File parentDir = Files.createTempDir();
    File f = new File(parentDir, filename);
    DatumWriter<GenericRecord> w = new GenericDatumWriter<GenericRecord>();
    Schema enumSchema1 = SchemaBuilder.enumeration("CEnum1").symbols(categories[0]);
    Schema enumSchema2 = SchemaBuilder.enumeration("CEnum2").symbols(categories[1]);
    Schema schema = SchemaBuilder.builder().record("test_enum_types").fields().name("CEnum").type(enumSchema1).noDefault().name("CUEnum").type().optional().type(enumSchema2).endRecord();
    int numOfCategories1 = categories[0].length;
    int numOfCategories2 = categories[1].length;
    // try-with-resources guarantees the writer is closed (flushing the file) even on failure
    try (DataFileWriter<GenericRecord> dw = new DataFileWriter<GenericRecord>(w)) {
        dw.create(schema, f);
        for (int i = 0; i < nrows; i++) {
            GenericRecord gr = new GenericData.Record(schema);
            gr.put("CEnum", new GenericData.EnumSymbol(enumSchema1, categories[0][i % numOfCategories1]));
            // every (numOfCategories2 + 1)-th row takes the null branch of the optional enum
            gr.put("CUEnum", i % (numOfCategories2 + 1) == 0 ? null : new GenericData.EnumSymbol(enumSchema2, categories[1][i % numOfCategories2]));
            dw.append(gr);
        }
        return f;
    }
}
Also used : DataFileWriter(org.apache.avro.file.DataFileWriter) Schema(org.apache.avro.Schema) GenericRecord(org.apache.avro.generic.GenericRecord) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) GenericData(org.apache.avro.generic.GenericData)

Example 29 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project haivvreo by jghoman.

In the class TestThatEvolvedSchemasActAsWeWant, see the method resolvedSchemasShouldReturnReaderSchema:

@Test
public void resolvedSchemasShouldReturnReaderSchema() throws IOException {
    // Need to verify that when reading a datum with an updated reader schema
    // that the datum then returns the reader schema as its own, since we
    // depend on this behavior in order to avoid re-encoding the datum
    // in the serde.
    String v0 = "{\n" + "    \"namespace\": \"com.linkedin.haivvreo\",\n" + "    \"name\": \"SomeStuff\",\n" + "    \"type\": \"record\",\n" + "    \"fields\": [\n" + "        {\n" + "            \"name\":\"v0\",\n" + "            \"type\":\"string\"\n" + "        }\n" + "    ]\n" + "}";
    String v1 = "{\n" + "    \"namespace\": \"com.linkedin.haivvreo\",\n" + "    \"name\": \"SomeStuff\",\n" + "    \"type\": \"record\",\n" + "    \"fields\": [\n" + "        {\n" + "            \"name\":\"v0\",\n" + "            \"type\":\"string\"\n" + "        },\n" + "        {\n" + "            \"name\":\"v1\",\n" + "            \"type\":\"string\",\n" + "            \"default\":\"v1_default\"" + "        }\n" + "    ]\n" + "}";
    // schemas[0] is the (older) writer schema, schemas[1] the evolved reader schema
    Schema[] schemas = { Schema.parse(v0), Schema.parse(v1) };
    // Encode a schema with v0, write out.
    GenericRecord record = new GenericData.Record(schemas[0]);
    record.put("v0", "v0 value");
    assertTrue(GenericData.get().validate(schemas[0], record));
    // Write datum out to a stream; close the writer even if append fails
    GenericDatumWriter<GenericRecord> gdw = new GenericDatumWriter<GenericRecord>(schemas[0]);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    try (DataFileWriter<GenericRecord> dfw = new DataFileWriter<GenericRecord>(gdw)) {
        dfw.create(schemas[0], baos);
        dfw.append(record);
    }
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>();
    gdr.setExpected(schemas[1]);
    // Original never closed the stream; try-with-resources fixes the leak
    try (DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, gdr)) {
        assertTrue(dfs.hasNext());
        GenericRecord next = dfs.next();
        assertEquals("v0 value", next.get("v0").toString());
        // v1 is absent from the written data, so the reader fills in its default
        assertEquals("v1_default", next.get("v1").toString());
        // Now the most important check - when we query this record for its schema,
        // we should get back the latest, reader schema:
        assertEquals(schemas[1], next.getSchema());
    }
}
Also used : GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ByteArrayOutputStream(java.io.ByteArrayOutputStream) DataFileStream(org.apache.avro.file.DataFileStream) ByteArrayInputStream(java.io.ByteArrayInputStream) GenericRecord(org.apache.avro.generic.GenericRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)

Example 30 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project pinot by linkedin.

In the class PinotSegmentToAvroConverter, see the method convert:

/**
 * Reads every row of the Pinot segment and writes it out as an Avro data file,
 * converting multi-valued columns (Object[]) into Lists so Avro can encode them.
 */
@Override
public void convert() throws Exception {
    PinotSegmentRecordReader reader = new PinotSegmentRecordReader(new File(_segmentDir));
    try {
        reader.init();
        Schema avroSchema = buildAvroSchemaFromPinotSchema(reader.getSchema());
        try (DataFileWriter<Record> writer = new DataFileWriter<>(new GenericDatumWriter<Record>(avroSchema))) {
            writer.create(avroSchema, new File(_outputFile));
            while (reader.hasNext()) {
                GenericRow row = reader.next();
                Record avroRecord = new Record(avroSchema);
                for (String column : row.getFieldNames()) {
                    Object cell = row.getValue(column);
                    // multi-valued columns arrive as Object[]; Avro arrays expect a List
                    avroRecord.put(column, cell instanceof Object[] ? Arrays.asList((Object[]) cell) : cell);
                }
                writer.append(avroRecord);
            }
        }
    } finally {
        // the segment reader is not AutoCloseable-managed above, close it explicitly
        reader.close();
    }
}
Also used : GenericRow(com.linkedin.pinot.core.data.GenericRow) Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) Record(org.apache.avro.generic.GenericData.Record) File(java.io.File) PinotSegmentRecordReader(com.linkedin.pinot.core.data.readers.PinotSegmentRecordReader)

Aggregations

DataFileWriter (org.apache.avro.file.DataFileWriter)34 Schema (org.apache.avro.Schema)21 GenericRecord (org.apache.avro.generic.GenericRecord)21 GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter)17 File (java.io.File)14 FileOutputStream (java.io.FileOutputStream)7 SpecificDatumWriter (org.apache.avro.specific.SpecificDatumWriter)7 ByteArrayOutputStream (java.io.ByteArrayOutputStream)5 IOException (java.io.IOException)4 GenericData (org.apache.avro.generic.GenericData)4 GenericDatumReader (org.apache.avro.generic.GenericDatumReader)4 ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 DataFileStream (org.apache.avro.file.DataFileStream)3 Person (org.apache.crunch.test.Person)3 Test (org.junit.Test)3 ByteArrayInputStream (java.io.ByteArrayInputStream)2 Random (java.util.Random)2 ThreadLocalRandom (java.util.concurrent.ThreadLocalRandom)2 CodecFactory (org.apache.avro.file.CodecFactory)2