Search in sources :

Example 41 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project pinot by linkedin.

the class DerivedColumnNoTransformationTest method generateTestData.

/**
 * Builds the two Avro records used as test input.
 *
 * <p>The record schema is parsed from the classpath resource named by {@code AVRO_SCHEMA}.
 * Record {@code n} (1 or 2) carries the dimensions {@code "abc" + n}, {@code "pqr" + n} and
 * {@code "xyz" + n}, an {@code hoursSinceEpoch} value obtained from
 * {@code generateRandomHoursSinceEpoch()}, and the metrics {@code m1 = 10} and {@code m2 = 20}.
 *
 * @return the generated records, in creation order
 * @throws Exception if the schema resource cannot be read, parsed or closed
 */
private List<GenericRecord> generateTestData() throws Exception {
    final Schema schema;
    // The parser does not close the stream it is handed, so release it here
    // (also on a parse failure). A null resource is tolerated by try-with-resources.
    try (java.io.InputStream schemaStream = ClassLoader.getSystemResourceAsStream(AVRO_SCHEMA)) {
        schema = new Schema.Parser().parse(schemaStream);
    }
    List<GenericRecord> inputRecords = new ArrayList<GenericRecord>();
    for (int n = 1; n <= 2; n++) {
        GenericRecord input = new GenericData.Record(schema);
        input.put("d1", "abc" + n);
        input.put("d2", "pqr" + n);
        input.put("d3", "xyz" + n);
        input.put("hoursSinceEpoch", generateRandomHoursSinceEpoch());
        input.put("m1", 10);
        input.put("m2", 20);
        inputRecords.add(input);
    }
    return inputRecords;
}
Also used : Record(org.apache.avro.generic.GenericData.Record) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) GenericRecord(org.apache.avro.generic.GenericRecord) GenericData(org.apache.avro.generic.GenericData)

Example 42 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project rest.li by linkedin.

the class DataTranslator method dataMapToGenericRecord.

/**
 * Translates a {@link DataMap} into an Avro {@link GenericRecord}.
 *
 * <p>
 * The {@link DataMap} is expected to conform to the supplied {@link RecordDataSchema}, and the
 * supplied Avro {@link Schema} should have been generated by {@link SchemaTranslator} from a
 * record schema compatible with that {@link RecordDataSchema}. When the two schemas do not
 * line up this way, translation is likely to fail.
 *
 * @param map the {@link DataMap} to translate.
 * @param dataSchema the {@link RecordDataSchema} describing the {@link DataMap}.
 * @param avroSchema the Avro {@link Schema} of the {@link GenericRecord} to produce.
 * @return the {@link GenericRecord} produced by the translation.
 * @throws DataTranslationException if there are errors that prevent translation.
 */
public static GenericRecord dataMapToGenericRecord(DataMap map, RecordDataSchema dataSchema, Schema avroSchema) throws DataTranslationException {
    final DataMapToGenericRecordTranslator recordTranslator = new DataMapToGenericRecordTranslator();
    final GenericRecord translated;
    try {
        translated = (GenericRecord) recordTranslator.translate(map, dataSchema, avroSchema);
        // Ask the translator to raise any errors it collected while translating.
        recordTranslator.checkMessageListForErrorsAndThrowDataTranslationException();
    } catch (RuntimeException failure) {
        // Runtime failures are converted by the translator into a DataTranslationException.
        throw recordTranslator.dataTranslationException(failure);
    }
    return translated;
}
Also used : GenericRecord(org.apache.avro.generic.GenericRecord)

Example 43 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project rest.li by linkedin.

the class TestFilteredSchemaDataTranslation method testFilteredAvroSchemaDataTranslation.

/**
 * Translates data to Avro after a derived field has been filtered out of the schema that the
 * Avro schema is generated from.
 *
 * <p>Each fixture row holds: the data schema text, the predicate selecting what to remove,
 * the expected Avro schema text, and then any number of (source JSON, expected Avro JSON)
 * pairs.
 */
@Test
public void testFilteredAvroSchemaDataTranslation() throws IOException {
    Object[][] inputs = {
        {
            // data schema text
            "{ " +
            "  \"type\" : \"record\", " +
            "  \"name\" : \"Foo\", " +
            "  \"fields\" : [ " +
            "    { \"name\" : \"a\", \"type\" : \"int\" }, " +
            "    { \"name\" : \"b\", \"type\" : \"int\", \"optional\" : true }, " +
            "    { \"name\" : \"c\", \"type\" : \"int\", \"optional\" : true, \"derived\" : true } " +
            "  ] " +
            "}",
            // what gets removed from the schema
            Predicates.hasChildWithNameValue("derived", true),
            // expected Avro schema text
            "{ " +
            "  \"type\" : \"record\", " +
            "  \"name\" : \"Foo\", " +
            "  \"fields\" : [ " +
            "    { \"name\" : \"a\", \"type\" : \"int\" }, " +
            "    { \"name\" : \"b\", \"type\" : [ \"null\", \"int\" ], \"default\" : null } " +
            "  ] " +
            "}",
            // "c" is dropped from output because it is not in the output schema
            "{ \"a\" : 1, \"b\" : 2, \"c\" : 3 }",
            "{ \"a\" : 1, \"b\" : { \"int\" : 2 } }",
            // "b" is translated to null and "c" is dropped from output because it is not in the output schema
            "{ \"a\" : 1, \"c\" : 3 }",
            "{ \"a\" : 1, \"b\" : null }"
        }
    };
    for (Object[] row : inputs) {
        String schemaText = (String) row[0];
        Predicate predicate = (Predicate) row[1];
        String avroSchemaText = (String) row[2];

        RecordDataSchema schema = (RecordDataSchema) TestUtil.dataSchemaFromString(schemaText);
        NamedDataSchema filteredSchema = Filters.removeByPredicate(schema, predicate, new SchemaParser());
        Schema filteredAvroSchema = SchemaTranslator.dataToAvroSchema(filteredSchema);
        Schema expectedAvroSchema = Schema.parse(avroSchemaText);
        assertEquals(filteredAvroSchema, expectedAvroSchema);

        // The remaining cells are (source JSON, expected Avro JSON) pairs.
        for (int cell = 3; cell < row.length; cell += 2) {
            String sourceJson = (String) row[cell];
            String expectedJson = (String) row[cell + 1];
            // Data is translated against the unfiltered data schema and the filtered Avro schema.
            DataMap sourceData = TestUtil.dataMapFromString(sourceJson);
            GenericRecord translated = DataTranslator.dataMapToGenericRecord(sourceData, schema, filteredAvroSchema);
            String translatedJson = AvroUtil.jsonFromGenericRecord(translated);
            DataMap translatedData = TestUtil.dataMapFromString(translatedJson);
            assertEquals(translatedData, TestUtil.dataMapFromString(expectedJson));
        }
    }
}
Also used : NamedDataSchema(com.linkedin.data.schema.NamedDataSchema) RecordDataSchema(com.linkedin.data.schema.RecordDataSchema) Schema(org.apache.avro.Schema) DataSchema(com.linkedin.data.schema.DataSchema) SchemaParser(com.linkedin.data.schema.SchemaParser) GenericRecord(org.apache.avro.generic.GenericRecord) Predicate(com.linkedin.data.it.Predicate) DataMap(com.linkedin.data.DataMap) Test(org.testng.annotations.Test)

Example 44 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project rest.li by linkedin.

the class TestFilteredSchemaDataTranslation method testFilteredDataSchemaDataTranslation.

/**
 * Translates Avro data to a {@link DataMap} after a field has been filtered out of the data
 * schema derived from the Avro schema.
 *
 * <p>Each fixture row holds: the Avro schema text, the predicate selecting what to remove,
 * the expected filtered data schema text, and then any number of
 * (Avro source JSON, expected data JSON) pairs.
 */
@Test
public void testFilteredDataSchemaDataTranslation() throws IOException {
    Object[][] inputs = {
        {
            // Avro schema text
            "{ " +
            "  \"type\" : \"record\", " +
            "  \"name\" : \"Foo\", " +
            "  \"fields\" : [ " +
            "    { \"name\" : \"a\", \"type\" : \"int\" }, " +
            "    { \"name\" : \"b\", \"type\" : [ \"null\", \"int\" ], \"default\" : null }, " +
            "    { \"name\" : \"removeMe\", \"type\" : \"int\" } " +
            "  ] " +
            "}",
            // what gets removed from the schema
            Predicates.hasChildWithNameValue("name", "removeMe"),
            // expected filtered data schema text
            "{ " +
            "  \"type\" : \"record\", " +
            "  \"name\" : \"Foo\", " +
            "  \"fields\" : [ " +
            "    { \"name\" : \"a\", \"type\" : \"int\" }, " +
            "    { \"name\" : \"b\", \"type\" : \"int\", \"optional\" : true } " +
            "  ] " +
            "}",
            // "removeMe" is dropped from output because it is not in output schema
            "{ \"a\" : 1, \"b\" : { \"int\" : 2 }, \"removeMe\" : 3 }",
            "{ \"a\" : 1, \"b\" : 2 }",
            // "b" has null value is dropped from output, "removeMe" is dropped from output because it is not in output schema
            "{ \"a\" : 1, \"b\" : null, \"removeMe\" : 3 }",
            "{ \"a\" : 1 }"
        }
    };
    for (Object[] row : inputs) {
        String avroSchemaText = (String) row[0];
        Predicate predicate = (Predicate) row[1];
        String schemaText = (String) row[2];

        Schema avroSchema = Schema.parse(avroSchemaText);
        System.out.println(avroSchema);
        RecordDataSchema schema = (RecordDataSchema) SchemaTranslator.avroToDataSchema(avroSchema);
        RecordDataSchema filteredSchema = (RecordDataSchema) Filters.removeByPredicate(schema, predicate, new SchemaParser());
        DataSchema expectedSchema = TestUtil.dataSchemaFromString(schemaText);
        System.out.println(filteredSchema);
        assertEquals(filteredSchema, expectedSchema);

        // The remaining cells are (Avro source JSON, expected data JSON) pairs.
        for (int cell = 3; cell < row.length; cell += 2) {
            String sourceJson = (String) row[cell];
            String expectedJson = (String) row[cell + 1];
            // Records are read with the full Avro schema and translated against the filtered data schema.
            GenericRecord sourceRecord = AvroUtil.genericRecordFromJson(sourceJson, avroSchema);
            DataMap translated = DataTranslator.genericRecordToDataMap(sourceRecord, filteredSchema, avroSchema);
            assertEquals(translated, TestUtil.dataMapFromString(expectedJson));
        }
    }
}
Also used : DataSchema(com.linkedin.data.schema.DataSchema) RecordDataSchema(com.linkedin.data.schema.RecordDataSchema) NamedDataSchema(com.linkedin.data.schema.NamedDataSchema) Schema(org.apache.avro.Schema) SchemaParser(com.linkedin.data.schema.SchemaParser) GenericRecord(org.apache.avro.generic.GenericRecord) Predicate(com.linkedin.data.it.Predicate) DataMap(com.linkedin.data.DataMap) Test(org.testng.annotations.Test)

Example 45 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project rest.li by linkedin.

the class TestSchemaTranslator method testUnionDefaultValues.

/**
 * Checks that record schemas whose union-typed fields carry default values are accepted:
 * each schema is parsed by Avro, handed to {@code genericRecordFromString} together with an
 * empty record, and must then parse without error through a {@link SchemaParser} running in
 * Avro union mode.
 */
@Test
public void testUnionDefaultValues() throws IOException {
    // Flip to false to silence the diagnostic output below.
    final boolean debug = true;

    final String emptySchemaText =
        "{ " +
        "  \"type\" : \"record\", " +
        "  \"name\" : \"foo\", " +
        "  \"fields\" : [] " +
        "}";
    final Schema emptySchema = Schema.parse(emptySchemaText);
    final String emptyRecord = "{}";

    final String[] input = {
        // default supplied on the enclosing record field "f2"
        "{ " +
        "  \"type\" : \"record\", " +
        "  \"name\" : \"foo\", " +
        "  \"fields\" : [ " +
        "    { " +
        "      \"name\" : \"f1\", " +
        "      \"type\" : [ \"int\", \"null\" ], " +
        "      \"default\" : 42 " +
        "    }, " +
        "    { " +
        "      \"name\" : \"f2\", " +
        "      \"type\" : { " +
        "        \"type\" : \"record\", " +
        "        \"name\" : \"bar\", " +
        "        \"fields\" : [ " +
        "          { " +
        "            \"name\" : \"b1\", \"type\" : [ \"string\", \"null\" ] " +
        "          } " +
        "        ] " +
        "      }, " +
        "      \"default\" : { \"b1\" : \"abc\" } " +
        "    } " +
        "  ] " +
        "}",
        // default supplied on the nested field "b1", with an empty default on "f2"
        "{ " +
        "  \"type\" : \"record\", " +
        "  \"name\" : \"foo\", " +
        "  \"fields\" : [ " +
        "    { " +
        "      \"name\" : \"f1\", " +
        "      \"type\" : [ \"int\", \"null\" ], " +
        "      \"default\" : 42 " +
        "    }, " +
        "    { " +
        "      \"name\" : \"f2\", " +
        "      \"type\" : { " +
        "        \"type\" : \"record\", " +
        "        \"name\" : \"bar\", " +
        "        \"fields\" : [ " +
        "          { " +
        "            \"name\" : \"b1\", \"type\" : [ \"string\", \"null\" ], \"default\" : \"abc\" " +
        "          } " +
        "        ] " +
        "      }, " +
        "      \"default\" : { } " +
        "    } " +
        "  ] " +
        "}"
    };

    for (String readerSchemaText : input) {
        final Schema readerSchema = Schema.parse(readerSchemaText);
        // NOTE(review): presumably reads the empty record, written with the empty schema,
        // using readerSchema so that field defaults get applied; confirm against the helper.
        GenericRecord record = genericRecordFromString(emptyRecord, emptySchema, readerSchema);
        if (debug) {
            System.out.println(record);
        }

        SchemaParser unionModeParser = new SchemaParser();
        unionModeParser.getValidationOptions().setAvroUnionMode(true);
        unionModeParser.parse(readerSchemaText);
        if (debug) {
            System.out.println(unionModeParser.errorMessage());
        }
        assertFalse(unionModeParser.hasError());
    }
}
Also used : DataSchema(com.linkedin.data.schema.DataSchema) Schema(org.apache.avro.Schema) GenericRecord(org.apache.avro.generic.GenericRecord) SchemaParser(com.linkedin.data.schema.SchemaParser) PegasusSchemaParser(com.linkedin.data.schema.PegasusSchemaParser) Test(org.testng.annotations.Test)

Aggregations

GenericRecord (org.apache.avro.generic.GenericRecord): 262, Schema (org.apache.avro.Schema): 101, Test (org.junit.Test): 80, GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter): 46, File (java.io.File): 35, IOException (java.io.IOException): 34, GenericData (org.apache.avro.generic.GenericData): 30, GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 30, ArrayList (java.util.ArrayList): 29, ByteArrayOutputStream (java.io.ByteArrayOutputStream): 27, DataFileWriter (org.apache.avro.file.DataFileWriter): 20, HashMap (java.util.HashMap): 19, ByteBuffer (java.nio.ByteBuffer): 18, BinaryEncoder (org.apache.avro.io.BinaryEncoder): 17, Field (org.apache.avro.Schema.Field): 14, DataFileStream (org.apache.avro.file.DataFileStream): 14, GenericRecordBuilder (org.apache.avro.generic.GenericRecordBuilder): 14, Utf8 (org.apache.avro.util.Utf8): 14, Encoder (org.apache.avro.io.Encoder): 12, DatasetRepository (com.cloudera.cdk.data.DatasetRepository): 11