Search in sources :

Example 86 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project drill by apache.

the class AvroTestUtil method generateLinkedList.

/**
 * Writes a temporary Avro data file of self-referencing {@code LongList} records and returns
 * its location.
 *
 * <p>Every record has an optional long {@code value} and an optional {@code next} field of the
 * same record type. A record is appended one iteration after it was created, i.e. once the
 * record stored in its {@code next} field has received its own {@code value}; at that moment the
 * {@code next} record's own {@code next} field is still unset. The record built in the last
 * iteration is appended after the loop, with a {@code next} record that has no fields set.
 *
 * @return the absolute path of the generated file; the file is registered for deletion on JVM exit
 * @throws Exception if the temporary file cannot be created or written
 */
public static String generateLinkedList() throws Exception {
    final File file = File.createTempFile("avro-linkedlist", ".avro");
    file.deleteOnExit();
    final Schema schema = SchemaBuilder.record("LongList")
            .namespace("org.apache.drill.exec.store.avro")
            .aliases("LinkedLongs")
            .fields()
            .name("value").type().optional().longType()
            .name("next").type().optional().type("LongList")
            .endRecord();
    // try-with-resources: the writer is closed even when create() itself fails,
    // which the previous "create outside try/finally" layout did not guarantee.
    try (DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(schema))) {
        writer.create(schema, file);
        GenericRecord previousRecord = null;
        for (int i = 0; i < RECORD_COUNT; i++) {
            // The first record is created fresh; every later one is the "next" of its predecessor.
            GenericRecord record = (GenericRecord) (previousRecord == null ? new GenericData.Record(schema) : previousRecord.get("next"));
            record.put("value", (long) i);
            if (previousRecord != null) {
                // The predecessor is written only now, after its "next" record got a value.
                writer.append(previousRecord);
            }
            GenericRecord nextRecord = new GenericData.Record(record.getSchema());
            record.put("next", nextRecord);
            previousRecord = record;
        }
        // The record from the final iteration has not been written inside the loop yet.
        writer.append(previousRecord);
    }
    return file.getAbsolutePath();
}
Also used : Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericRecord(org.apache.avro.generic.GenericRecord) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) GenericData(org.apache.avro.generic.GenericData)

Example 87 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project drill by apache.

the class AvroTestUtil method generateNestedArraySchema.

/**
 * Generates a temporary Avro file whose records hold an int field {@code a_int} and an array
 * field {@code b_array} of nested {@code my_record_1} records.
 *
 * <p>For each of the {@code numRecords} records, {@code a_int} is the record index and
 * {@code b_array} receives {@code numArrayItems} nested records whose {@code nested_1_int}
 * is the item index.
 *
 * @param numRecords    number of top-level records to write
 * @param numArrayItems number of nested records added to each record's {@code b_array}
 * @return the record writer, already closed when it is returned
 * @throws IOException if the temporary file cannot be created (or, presumably, written)
 */
public static AvroTestRecordWriter generateNestedArraySchema(int numRecords, int numArrayItems) throws IOException {
    final File avroFile = File.createTempFile("avro-nested-test", ".avro");
    avroFile.deleteOnExit();

    final Schema recordSchema = SchemaBuilder.record("AvroRecordReaderTest")
            .namespace("org.apache.drill.exec.store.avro")
            .fields()
            .name("a_int").type().intType().noDefault()
            .name("b_array").type().array().items()
            .record("my_record_1").namespace("foo.blah.org")
            .fields()
            .name("nested_1_int").type().optional().intType()
            .endRecord()
            .arrayDefault(Collections.emptyList())
            .endRecord();
    final Schema listSchema = recordSchema.getField("b_array").schema();
    final Schema elementSchema = listSchema.getElementType();

    final AvroTestRecordWriter writer = new AvroTestRecordWriter(recordSchema, avroFile);
    try {
        for (int recordIndex = 0; recordIndex < numRecords; recordIndex++) {
            writer.startRecord();
            writer.put("a_int", recordIndex);

            // ARRAY_SIZE is passed as the first constructor argument; the number of
            // elements actually added is governed by numArrayItems.
            final GenericArray<GenericRecord> items = new GenericData.Array<>(ARRAY_SIZE, listSchema);
            for (int itemIndex = 0; itemIndex < numArrayItems; itemIndex++) {
                final GenericRecord item = new GenericData.Record(elementSchema);
                item.put("nested_1_int", itemIndex);
                items.add(item);
            }
            writer.put("b_array", items);

            writer.endRecord();
        }
    } finally {
        writer.close();
    }
    return writer;
}
Also used : GenericArray(org.apache.avro.generic.GenericArray) Schema(org.apache.avro.Schema) GenericRecord(org.apache.avro.generic.GenericRecord) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)

Example 88 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project drill by apache.

the class AvroTestUtil method generateSimpleNestedSchema_NoNullValues.

/**
 * Generates a temporary Avro file of {@code RECORD_COUNT} records, each with a string field,
 * an int field and a nested {@code my_record_1} record; every field is declared without a
 * default and is populated for every record.
 *
 * <p>For record index {@code n}: {@code a_string} is {@code "a_" + n}, {@code b_int} is
 * {@code n}, and the nested record carries {@code "nested_1_string_" + n} and {@code n * n}.
 *
 * @return the record writer, already closed when it is returned
 * @throws Exception if the temporary file cannot be created (or, presumably, written)
 */
public static AvroTestRecordWriter generateSimpleNestedSchema_NoNullValues() throws Exception {
    final File avroFile = File.createTempFile("avro-nested-test", ".avro");
    avroFile.deleteOnExit();

    final Schema recordSchema = SchemaBuilder.record("AvroRecordReaderTest")
            .namespace("org.apache.drill.exec.store.avro")
            .fields()
            .name("a_string").type().stringType().noDefault()
            .name("b_int").type().intType().noDefault()
            .name("c_record").type()
            .record("my_record_1").namespace("foo.blah.org")
            .fields()
            .name("nested_1_string").type().stringType().noDefault()
            .name("nested_1_int").type().intType().noDefault()
            .endRecord()
            .noDefault()
            .endRecord();
    final Schema innerSchema = recordSchema.getField("c_record").schema();

    final AvroTestRecordWriter writer = new AvroTestRecordWriter(recordSchema, avroFile);
    try {
        for (int n = 0; n < RECORD_COUNT; n++) {
            final GenericRecord inner = new GenericData.Record(innerSchema);
            inner.put("nested_1_string", "nested_1_string_" + n);
            inner.put("nested_1_int", n * n);

            writer.startRecord();
            writer.put("a_string", "a_" + n);
            writer.put("b_int", n);
            writer.put("c_record", inner);
            writer.endRecord();
        }
    } finally {
        writer.close();
    }
    return writer;
}
Also used : Schema(org.apache.avro.Schema) GenericRecord(org.apache.avro.generic.GenericRecord) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)

Example 89 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project drill by apache.

the class AvroTestUtil method generateUnionNestedArraySchema_withNullValues.

/**
 * Generates a temporary Avro file of {@code RECORD_COUNT} records whose {@code c_array} field
 * is an optional array of nested {@code my_record_1} records.
 *
 * <p>{@code c_array} is populated with a single nested record for even record indexes and is
 * not set at all for odd ones.
 *
 * @return the record writer, already closed when it is returned
 * @throws Exception if the temporary file cannot be created (or, presumably, written)
 */
public static AvroTestRecordWriter generateUnionNestedArraySchema_withNullValues() throws Exception {
    final File avroFile = File.createTempFile("avro-nested-test", ".avro");
    avroFile.deleteOnExit();

    final Schema recordSchema = SchemaBuilder.record("AvroRecordReaderTest")
            .namespace("org.apache.drill.exec.store.avro")
            .fields()
            .name("a_string").type().stringType().noDefault()
            .name("b_int").type().intType().noDefault()
            .name("c_array").type().optional().array().items()
            .record("my_record_1").namespace("foo.blah.org")
            .fields()
            .name("nested_1_string").type().optional().stringType()
            .name("nested_1_int").type().optional().intType()
            .endRecord()
            .endRecord();
    // The field schema is a union; the array branch is taken from index 1.
    final Schema unionSchema = recordSchema.getField("c_array").schema();
    final Schema listSchema = unionSchema.getTypes().get(1);
    final Schema elementSchema = listSchema.getElementType();

    final AvroTestRecordWriter writer = new AvroTestRecordWriter(recordSchema, avroFile);
    try {
        for (int n = 0; n < RECORD_COUNT; n++) {
            writer.startRecord();
            writer.put("a_string", "a_" + n);
            writer.put("b_int", n);

            final boolean populateArray = (n % 2 == 0);
            if (populateArray) {
                final GenericArray<GenericRecord> items = new GenericData.Array<>(1, listSchema);
                final GenericRecord item = new GenericData.Record(elementSchema);
                item.put("nested_1_string", "nested_1_string_" + n);
                item.put("nested_1_int", n * n);
                items.add(item);
                writer.put("c_array", items);
            }

            writer.endRecord();
        }
    } finally {
        writer.close();
    }
    return writer;
}
Also used : GenericArray(org.apache.avro.generic.GenericArray) Schema(org.apache.avro.Schema) GenericRecord(org.apache.avro.generic.GenericRecord) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)

Example 90 with GenericRecord

use of org.apache.avro.generic.GenericRecord in project samza by apache.

the class TestHdfsSystemConsumer method testHdfsSystemConsumerE2E.

/*
 * A simple end-to-end test that covers the workflow from system admin to
 * partitioner to system consumer, making sure the basic functionality
 * works as expected.
 *
 * Steps: generate the Avro data files, check that the admin reports NUM_FILES
 * partitions, register every partition with the consumer at its oldest offset,
 * poll until all events (NUM_EVENTS data events plus one end-of-stream event per
 * file) have arrived or the retry budget is exhausted, then verify the contents.
 */
@Test
public void testHdfsSystemConsumerE2E() throws Exception {
    Config config = generateDefaultConfig();
    HdfsSystemFactory systemFactory = new HdfsSystemFactory();

    // create admin and do partitioning
    HdfsSystemAdmin systemAdmin = systemFactory.getAdmin(SYSTEM_NAME, config);
    String streamName = WORKING_DIRECTORY;
    Set<String> streamNames = new HashSet<>();
    streamNames.add(streamName);
    generateAvroDataFiles();
    Map<String, SystemStreamMetadata> streamMetadataMap = systemAdmin.getSystemStreamMetadata(streamNames);
    SystemStreamMetadata systemStreamMetadata = streamMetadataMap.get(streamName);
    Assert.assertEquals(NUM_FILES, systemStreamMetadata.getSystemStreamPartitionMetadata().size());

    // create consumer and read from files
    HdfsSystemConsumer systemConsumer = systemFactory.getConsumer(SYSTEM_NAME, config, new NoOpMetricsRegistry());
    Map<Partition, SystemStreamMetadata.SystemStreamPartitionMetadata> metadataMap = systemStreamMetadata.getSystemStreamPartitionMetadata();
    Set<SystemStreamPartition> systemStreamPartitionSet = new HashSet<>();
    metadataMap.forEach((partition, metadata) -> {
        SystemStreamPartition ssp = new SystemStreamPartition(SYSTEM_NAME, streamName, partition);
        systemStreamPartitionSet.add(ssp);
        String offset = metadata.getOldestOffset();
        systemConsumer.register(ssp, offset);
    });
    systemConsumer.start();

    // verify events read from consumer
    int eventsReceived = 0;
    // one "End of Stream" event in the end
    int totalEvents = (NUM_EVENTS + 1) * NUM_FILES;
    int remainingRetries = 100;
    Map<SystemStreamPartition, List<IncomingMessageEnvelope>> overallResults = new HashMap<>();
    while (eventsReceived < totalEvents && remainingRetries > 0) {
        remainingRetries--;
        Map<SystemStreamPartition, List<IncomingMessageEnvelope>> result = systemConsumer.poll(systemStreamPartitionSet, 200);
        for (Map.Entry<SystemStreamPartition, List<IncomingMessageEnvelope>> entry : result.entrySet()) {
            SystemStreamPartition ssp = entry.getKey();
            List<IncomingMessageEnvelope> messageEnvelopeList = entry.getValue();
            List<IncomingMessageEnvelope> collected = overallResults.computeIfAbsent(ssp, key -> new ArrayList<>());
            collected.addAll(messageEnvelopeList);
            // Stop polling a partition once all of its events, including end-of-stream, are in.
            if (collected.size() >= NUM_EVENTS + 1) {
                systemStreamPartitionSet.remove(ssp);
            }
            eventsReceived += messageEnvelopeList.size();
        }
    }

    // JUnit's assertEquals takes (expected, actual); keeping that order makes failure messages accurate.
    Assert.assertEquals(totalEvents, eventsReceived);
    Assert.assertEquals(NUM_FILES, overallResults.size());
    overallResults.values().forEach(messages -> {
        Assert.assertEquals(NUM_EVENTS + 1, messages.size());
        for (int index = 0; index < NUM_EVENTS; index++) {
            GenericRecord record = (GenericRecord) messages.get(index).getMessage();
            Assert.assertEquals(index, record.get(FIELD_1));
            Assert.assertEquals("string_" + index, record.get(FIELD_2).toString());
        }
        // The last envelope of every partition must be the end-of-stream marker.
        Assert.assertEquals(IncomingMessageEnvelope.END_OF_STREAM_OFFSET, messages.get(NUM_EVENTS).getOffset());
    });
}
Also used : Partition(org.apache.samza.Partition) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) HashMap(java.util.HashMap) Config(org.apache.samza.config.Config) MapConfig(org.apache.samza.config.MapConfig) IncomingMessageEnvelope(org.apache.samza.system.IncomingMessageEnvelope) SystemStreamMetadata(org.apache.samza.system.SystemStreamMetadata) NoOpMetricsRegistry(org.apache.samza.util.NoOpMetricsRegistry) ArrayList(java.util.ArrayList) List(java.util.List) GenericRecord(org.apache.avro.generic.GenericRecord) HashSet(java.util.HashSet) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) Test(org.junit.Test)

Aggregations

GenericRecord (org.apache.avro.generic.GenericRecord): 262, Schema (org.apache.avro.Schema): 101, Test (org.junit.Test): 80, GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter): 46, File (java.io.File): 35, IOException (java.io.IOException): 34, GenericData (org.apache.avro.generic.GenericData): 30, GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 30, ArrayList (java.util.ArrayList): 29, ByteArrayOutputStream (java.io.ByteArrayOutputStream): 27, DataFileWriter (org.apache.avro.file.DataFileWriter): 20, HashMap (java.util.HashMap): 19, ByteBuffer (java.nio.ByteBuffer): 18, BinaryEncoder (org.apache.avro.io.BinaryEncoder): 17, Field (org.apache.avro.Schema.Field): 14, DataFileStream (org.apache.avro.file.DataFileStream): 14, GenericRecordBuilder (org.apache.avro.generic.GenericRecordBuilder): 14, Utf8 (org.apache.avro.util.Utf8): 14, Encoder (org.apache.avro.io.Encoder): 12, DatasetRepository (com.cloudera.cdk.data.DatasetRepository): 11