Example 56 with GenericRecordBuilder

Use of org.apache.avro.generic.GenericRecordBuilder in project hydrator-plugins by cdapio.

From the class ETLTPFSTestRun, method testAvroSourceConversionToAvroSink:

@Test
public void testAvroSourceConversionToAvroSink() throws Exception {
    Schema eventSchema = Schema.recordOf("record", Schema.Field.of("intVar", Schema.of(Schema.Type.INT)));
    org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(eventSchema.toString());
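    // Build a single-field Avro record matching the schema parsed above.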
    GenericRecord record = new GenericRecordBuilder(avroSchema).set("intVar", Integer.MAX_VALUE).build();
    String filesetName = "tpfs";
    addDatasetInstance(TimePartitionedFileSet.class.getName(), filesetName,
        FileSetProperties.builder()
            .setInputFormat(AvroKeyInputFormat.class)
            .setOutputFormat(AvroKeyOutputFormat.class)
            .setInputProperty("schema", avroSchema.toString())
            .setOutputProperty("schema", avroSchema.toString())
            .setEnableExploreOnCreate(true)
            .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
            .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
            .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
            .setTableProperty("avro.schema.literal", avroSchema.toString())
            .build());
    DataSetManager<TimePartitionedFileSet> fileSetManager = getDataset(filesetName);
    TimePartitionedFileSet tpfs = fileSetManager.get();
    long timeInMillis = System.currentTimeMillis();
    tpfs.addPartition(timeInMillis, "directory", ImmutableMap.of("key1", "value1"));
    Location location = tpfs.getPartitionByTime(timeInMillis).getLocation();
    fileSetManager.flush();
    location = location.append("file.avro");
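    // Write the record as an Avro container file into the newly added partition.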
    FSDataOutputStream outputStream = new FSDataOutputStream(location.getOutputStream(), null);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(avroSchema));
    dataFileWriter.create(avroSchema, outputStream);
    dataFileWriter.append(record);
    // close() flushes and releases the underlying output stream.
    dataFileWriter.close();
    String newFilesetName = filesetName + "_op";
    ETLBatchConfig etlBatchConfig = constructTPFSETLConfig(filesetName, newFilesetName, eventSchema, "Snappy");
    ApplicationManager appManager = deployETL(etlBatchConfig, "sconversion1");
    // add a minute to the end time to make sure the newly added partition is included in the run.
    runETLOnce(appManager, ImmutableMap.of("logical.start.time", String.valueOf(timeInMillis + 60 * 1000)));
    DataSetManager<TimePartitionedFileSet> newFileSetManager = getDataset(newFilesetName);
    TimePartitionedFileSet newFileSet = newFileSetManager.get();
    List<GenericRecord> newRecords = readOutput(newFileSet, eventSchema);
    Assert.assertEquals(1, newRecords.size());
    Assert.assertEquals(Integer.MAX_VALUE, newRecords.get(0).get("intVar"));
}
Also used: ApplicationManager(io.cdap.cdap.test.ApplicationManager), Schema(io.cdap.cdap.api.data.schema.Schema), DataFileWriter(org.apache.avro.file.DataFileWriter), AvroKeyOutputFormat(org.apache.avro.mapreduce.AvroKeyOutputFormat), ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig), GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder), FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream), GenericRecord(org.apache.avro.generic.GenericRecord), TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet), Location(org.apache.twill.filesystem.Location), Test(org.junit.Test)

Example 57 with GenericRecordBuilder

Use of org.apache.avro.generic.GenericRecordBuilder in project hydrator-plugins by cdapio.

From the class FileBatchSourceTest, method testFileBatchInputFormatParquetNullSchema:

@Test
public void testFileBatchInputFormatParquetNullSchema() throws Exception {
    File fileParquet = new File(temporaryFolder.newFolder(), "test.parquet");
    String outputDatasetName = "test-filesource-parquet-null-schema";
    String appName = "FileSourceParquetNullSchema";
    Schema recordSchemaWithMissingField = Schema.recordOf("record", Schema.Field.of("i", Schema.of(Schema.Type.INT)), Schema.Field.of("l", Schema.of(Schema.Type.LONG)), Schema.Field.of("file", Schema.of(Schema.Type.STRING)));
    org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(recordSchemaWithMissingField.toString());
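    // Build the test record; the "file" field is a string, so pass the path itself rather than a java.io.File.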
    GenericRecord record = new GenericRecordBuilder(avroSchema).set("i", Integer.MAX_VALUE).set("l", Long.MAX_VALUE).set("file", fileParquet.getAbsolutePath()).build();
    DataSetManager<TimePartitionedFileSet> inputManager = getDataset("TestFile");
    ParquetWriter<GenericRecord> parquetWriter = new AvroParquetWriter<>(new Path(fileParquet.getAbsolutePath()), avroSchema);
    parquetWriter.write(record);
    parquetWriter.close();
    inputManager.flush();
    ApplicationManager appManager = createSourceAndDeployApp(appName, fileParquet, "parquet", outputDatasetName, null);
    appManager.getWorkflowManager(SmartWorkflow.NAME).startAndWaitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    List<StructuredRecord> expected = ImmutableList.of(StructuredRecord.builder(RECORD_SCHEMA).set("i", Integer.MAX_VALUE).set("l", Long.MAX_VALUE).set("file", fileParquet.toURI().toString()).build());
    DataSetManager<Table> outputManager = getDataset(outputDatasetName);
    List<StructuredRecord> output = MockSink.readOutput(outputManager);
    Assert.assertEquals(expected, output);
}
Also used: Path(org.apache.hadoop.fs.Path), ApplicationManager(io.cdap.cdap.test.ApplicationManager), Table(io.cdap.cdap.api.dataset.table.Table), Schema(io.cdap.cdap.api.data.schema.Schema), AvroParquetWriter(org.apache.parquet.avro.AvroParquetWriter), StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord), GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder), GenericRecord(org.apache.avro.generic.GenericRecord), TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet), File(java.io.File), Test(org.junit.Test)

Example 58 with GenericRecordBuilder

Use of org.apache.avro.generic.GenericRecordBuilder in project hydrator-plugins by cdapio.

From the class FileBatchSourceTest, method testFileBatchInputFormatAvro:

@Test
public void testFileBatchInputFormatAvro() throws Exception {
    File fileAvro = new File(temporaryFolder.newFolder(), "test.avro");
    String outputDatasetName = "test-filesource-avro";
    String appName = "FileSourceAvro";
    ApplicationManager appManager = createSourceAndDeployApp(appName, fileAvro, "avro", outputDatasetName, RECORD_SCHEMA);
    org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(RECORD_SCHEMA.toString());
    GenericRecord record = new GenericRecordBuilder(avroSchema).set("i", Integer.MAX_VALUE).set("l", Long.MAX_VALUE).set("file", fileAvro.getAbsolutePath()).build();
    DataSetManager<TimePartitionedFileSet> inputManager = getDataset("TestFile");
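    // Write one record into an Avro container file for the deployed pipeline to read.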
    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(avroSchema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter);
    dataFileWriter.create(avroSchema, fileAvro);
    dataFileWriter.append(record);
    dataFileWriter.close();
    inputManager.flush();
    appManager.getWorkflowManager(SmartWorkflow.NAME).startAndWaitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    List<StructuredRecord> expected = ImmutableList.of(StructuredRecord.builder(RECORD_SCHEMA).set("i", Integer.MAX_VALUE).set("l", Long.MAX_VALUE).set("file", fileAvro.toURI().toString()).build());
    DataSetManager<Table> outputManager = getDataset(outputDatasetName);
    List<StructuredRecord> output = MockSink.readOutput(outputManager);
    Assert.assertEquals(expected, output);
}
Also used: ApplicationManager(io.cdap.cdap.test.ApplicationManager), Table(io.cdap.cdap.api.dataset.table.Table), Schema(io.cdap.cdap.api.data.schema.Schema), DataFileWriter(org.apache.avro.file.DataFileWriter), GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter), StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord), GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder), GenericRecord(org.apache.avro.generic.GenericRecord), TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet), File(java.io.File), Test(org.junit.Test)

Example 59 with GenericRecordBuilder

Use of org.apache.avro.generic.GenericRecordBuilder in project hydrator-plugins by cdapio.

From the class FileBatchSourceTest, method testFileBatchInputFormatParquet:

@Test
public void testFileBatchInputFormatParquet() throws Exception {
    File fileParquet = new File(temporaryFolder.newFolder(), "test.parquet");
    String outputDatasetName = "test-filesource-parquet";
    String appName = "FileSourceParquet";
    ApplicationManager appManager = createSourceAndDeployApp(appName, fileParquet, "parquet", outputDatasetName, RECORD_SCHEMA);
    org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(RECORD_SCHEMA.toString());
    GenericRecord record = new GenericRecordBuilder(avroSchema).set("i", Integer.MAX_VALUE).set("l", Long.MAX_VALUE).set("file", fileParquet.getAbsolutePath()).build();
    DataSetManager<TimePartitionedFileSet> inputManager = getDataset("TestFile");
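    // AvroParquetWriter converts the GenericRecord to Parquet using the Avro schema.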
    ParquetWriter<GenericRecord> parquetWriter = new AvroParquetWriter<>(new Path(fileParquet.getAbsolutePath()), avroSchema);
    parquetWriter.write(record);
    parquetWriter.close();
    inputManager.flush();
    appManager.getWorkflowManager(SmartWorkflow.NAME).startAndWaitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    List<StructuredRecord> expected = ImmutableList.of(StructuredRecord.builder(RECORD_SCHEMA).set("i", Integer.MAX_VALUE).set("l", Long.MAX_VALUE).set("file", fileParquet.toURI().toString()).build());
    DataSetManager<Table> outputManager = getDataset(outputDatasetName);
    List<StructuredRecord> output = MockSink.readOutput(outputManager);
    Assert.assertEquals(expected, output);
    // verify that the external dataset has the given schema
    verifyDatasetSchema(appName + "TestFile", RECORD_SCHEMA);
}
Also used: Path(org.apache.hadoop.fs.Path), ApplicationManager(io.cdap.cdap.test.ApplicationManager), Table(io.cdap.cdap.api.dataset.table.Table), Schema(io.cdap.cdap.api.data.schema.Schema), AvroParquetWriter(org.apache.parquet.avro.AvroParquetWriter), StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord), GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder), GenericRecord(org.apache.avro.generic.GenericRecord), TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet), File(java.io.File), Test(org.junit.Test)

Example 60 with GenericRecordBuilder

Use of org.apache.avro.generic.GenericRecordBuilder in project hydrator-plugins by cdapio.

From the class FileBatchSourceTest, method testFileBatchInputFormatParquetMissingField:

@Test
public void testFileBatchInputFormatParquetMissingField() throws Exception {
    File fileParquet = new File(temporaryFolder.newFolder(), "test.parquet");
    String outputDatasetName = "test-filesource-parquet-missing-field";
    Schema recordSchemaWithMissingField = Schema.recordOf("record", Schema.Field.of("i", Schema.of(Schema.Type.INT)), Schema.Field.of("file", Schema.of(Schema.Type.STRING)));
    String appName = "FileSourceParquetMissingField";
    ApplicationManager appManager = createSourceAndDeployApp(appName, fileParquet, "parquet", outputDatasetName, recordSchemaWithMissingField);
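    // The file is written with the full RECORD_SCHEMA, but the source was deployed with
    // recordSchemaWithMissingField, so the "l" column is dropped from the output.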
    org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(RECORD_SCHEMA.toString());
    GenericRecord record = new GenericRecordBuilder(avroSchema).set("i", Integer.MAX_VALUE).set("l", Long.MAX_VALUE).set("file", fileParquet.getAbsolutePath()).build();
    DataSetManager<TimePartitionedFileSet> inputManager = getDataset("TestFile");
    ParquetWriter<GenericRecord> parquetWriter = new AvroParquetWriter<>(new Path(fileParquet.getAbsolutePath()), avroSchema);
    parquetWriter.write(record);
    parquetWriter.close();
    inputManager.flush();
    appManager.getWorkflowManager(SmartWorkflow.NAME).startAndWaitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    List<StructuredRecord> expected = ImmutableList.of(StructuredRecord.builder(recordSchemaWithMissingField).set("i", Integer.MAX_VALUE).set("file", fileParquet.toURI().toString()).build());
    DataSetManager<Table> outputManager = getDataset(outputDatasetName);
    List<StructuredRecord> output = MockSink.readOutput(outputManager);
    Assert.assertEquals(expected, output);
}
Also used: Path(org.apache.hadoop.fs.Path), ApplicationManager(io.cdap.cdap.test.ApplicationManager), Table(io.cdap.cdap.api.dataset.table.Table), Schema(io.cdap.cdap.api.data.schema.Schema), AvroParquetWriter(org.apache.parquet.avro.AvroParquetWriter), StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord), GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder), GenericRecord(org.apache.avro.generic.GenericRecord), TimePartitionedFileSet(io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet), File(java.io.File), Test(org.junit.Test)
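
Every example above follows the same core pattern: parse an Avro Schema, populate a GenericRecord through GenericRecordBuilder, then hand the record to a format-specific writer. A minimal standalone sketch of just the builder step (the schema and field name below are illustrative, not taken from hydrator-plugins):

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;

public class GenericRecordBuilderSketch {
    public static void main(String[] args) {
        // Illustrative single-field record schema.
        Schema schema = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"record\",\"fields\":"
            + "[{\"name\":\"intVar\",\"type\":\"int\"}]}");
        // Unset fields fall back to their schema defaults; build() fails if a
        // field was never set and has no default value.
        GenericRecord record = new GenericRecordBuilder(schema)
            .set("intVar", Integer.MAX_VALUE)
            .build();
        System.out.println(record); // {"intVar": 2147483647}
    }
}

The writer side then varies by format: DataFileWriter for Avro container files (Examples 56 and 58) and AvroParquetWriter for Parquet files (Examples 57, 59, and 60).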

Aggregations

GenericRecordBuilder (org.apache.avro.generic.GenericRecordBuilder): 399
GenericRecord (org.apache.avro.generic.GenericRecord): 263
Test (org.junit.Test): 263
Schema (org.apache.avro.Schema): 216
GenericData (org.apache.avro.generic.GenericData): 69
ArrayList (java.util.ArrayList): 45
EnumTest (foo.bar.EnumTest): 41
File (java.io.File): 41
IndexedRecord (org.apache.avro.generic.IndexedRecord): 39
Schema (org.apache.kafka.connect.data.Schema): 39
SchemaAndValue (org.apache.kafka.connect.data.SchemaAndValue): 35
Path (org.apache.hadoop.fs.Path): 33
List (java.util.List): 30
ByteBuffer (java.nio.ByteBuffer): 29
HashMap (java.util.HashMap): 29
AvroSchema (io.confluent.kafka.schemaregistry.avro.AvroSchema): 28
Struct (org.apache.kafka.connect.data.Struct): 28
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 27
Record (org.apache.avro.generic.GenericData.Record): 25
SchemaBuilder (org.apache.avro.SchemaBuilder): 22