Use of org.apache.avro.generic.GenericRecordBuilder in project hydrator-plugins by cdapio.
In class ETLTPFSTestRun, the method testAvroSourceConversionToAvroSink:
@Test
public void testAvroSourceConversionToAvroSink() throws Exception {
Schema eventSchema = Schema.recordOf("record", Schema.Field.of("intVar", Schema.of(Schema.Type.INT)));
org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(eventSchema.toString());
GenericRecord record = new GenericRecordBuilder(avroSchema).set("intVar", Integer.MAX_VALUE).build();
String filesetName = "tpfs";
addDatasetInstance(TimePartitionedFileSet.class.getName(), filesetName, FileSetProperties.builder()
    .setInputFormat(AvroKeyInputFormat.class)
    .setOutputFormat(AvroKeyOutputFormat.class)
    .setInputProperty("schema", avroSchema.toString())
    .setOutputProperty("schema", avroSchema.toString())
    .setEnableExploreOnCreate(true)
    .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
    .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
    .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
    .setTableProperty("avro.schema.literal", avroSchema.toString())
    .build());
DataSetManager<TimePartitionedFileSet> fileSetManager = getDataset(filesetName);
TimePartitionedFileSet tpfs = fileSetManager.get();
long timeInMillis = System.currentTimeMillis();
fileSetManager.get().addPartition(timeInMillis, "directory", ImmutableMap.of("key1", "value1"));
Location location = fileSetManager.get().getPartitionByTime(timeInMillis).getLocation();
fileSetManager.flush();
location = location.append("file.avro");
FSDataOutputStream outputStream = new FSDataOutputStream(location.getOutputStream(), null);
DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(avroSchema));
dataFileWriter.create(avroSchema, outputStream);
dataFileWriter.append(record);
dataFileWriter.flush();
String newFilesetName = filesetName + "_op";
ETLBatchConfig etlBatchConfig = constructTPFSETLConfig(filesetName, newFilesetName, eventSchema, "Snappy");
ApplicationManager appManager = deployETL(etlBatchConfig, "sconversion1");
// add a minute to the end time to make sure the newly added partition is included in the run.
runETLOnce(appManager, ImmutableMap.of("logical.start.time", String.valueOf(timeInMillis + 60 * 1000)));
DataSetManager<TimePartitionedFileSet> newFileSetManager = getDataset(newFilesetName);
TimePartitionedFileSet newFileSet = newFileSetManager.get();
List<GenericRecord> newRecords = readOutput(newFileSet, eventSchema);
Assert.assertEquals(1, newRecords.size());
Assert.assertEquals(Integer.MAX_VALUE, newRecords.get(0).get("intVar"));
}
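The test above flushes the writer rather than closing it, which is enough for the pipeline run that follows. In standalone code the writer is normally managed with try-with-resources; a minimal sketch, assuming the same avroSchema, record, and outputStream as in the test:
try (DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(avroSchema))) {
  // create(Schema, OutputStream) writes the Avro container header to the stream
  writer.create(avroSchema, outputStream);
  writer.append(record);
}
// close() (implicit at the end of the try block) flushes the final block and closes the stream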
Use of org.apache.avro.generic.GenericRecordBuilder in project hydrator-plugins by cdapio.
In class FileBatchSourceTest, the method testFileBatchInputFormatParquetNullSchema:
@Test
public void testFileBatchInputFormatParquetNullSchema() throws Exception {
File fileParquet = new File(temporaryFolder.newFolder(), "test.parquet");
String outputDatasetName = "test-filesource-parquet-null-schema";
String appName = "FileSourceParquetNullSchema";
Schema recordSchemaWithMissingField = Schema.recordOf("record", Schema.Field.of("i", Schema.of(Schema.Type.INT)), Schema.Field.of("l", Schema.of(Schema.Type.LONG)), Schema.Field.of("file", Schema.of(Schema.Type.STRING)));
org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(recordSchemaWithMissingField.toString());
GenericRecord record = new GenericRecordBuilder(avroSchema).set("i", Integer.MAX_VALUE).set("l", Long.MAX_VALUE).set("file", fileParquet.getAbsolutePath()).build();
DataSetManager<TimePartitionedFileSet> inputManager = getDataset("TestFile");
ParquetWriter<GenericRecord> parquetWriter = new AvroParquetWriter<>(new Path(fileParquet.getAbsolutePath()), avroSchema);
parquetWriter.write(record);
parquetWriter.close();
inputManager.flush();
ApplicationManager appManager = createSourceAndDeployApp(appName, fileParquet, "parquet", outputDatasetName, null);
appManager.getWorkflowManager(SmartWorkflow.NAME).startAndWaitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
List<StructuredRecord> expected = ImmutableList.of(StructuredRecord.builder(RECORD_SCHEMA).set("i", Integer.MAX_VALUE).set("l", Long.MAX_VALUE).set("file", fileParquet.toURI().toString()).build());
DataSetManager<Table> outputManager = getDataset(outputDatasetName);
List<StructuredRecord> output = MockSink.readOutput(outputManager);
Assert.assertEquals(expected, output);
}
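The two-argument AvroParquetWriter constructor used above is the older parquet-avro API; more recent versions favor the builder. A hedged sketch of the same write, assuming the avroSchema and record built above:
try (ParquetWriter<GenericRecord> writer =
         AvroParquetWriter.<GenericRecord>builder(new Path(fileParquet.getAbsolutePath()))
             .withSchema(avroSchema)
             .build()) {
  // the record is buffered and written out to the file when the writer is closed
  writer.write(record);
}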
Use of org.apache.avro.generic.GenericRecordBuilder in project hydrator-plugins by cdapio.
In class FileBatchSourceTest, the method testFileBatchInputFormatAvro:
@Test
public void testFileBatchInputFormatAvro() throws Exception {
File fileAvro = new File(temporaryFolder.newFolder(), "test.avro");
String outputDatasetName = "test-filesource-avro";
String appName = "FileSourceAvro";
ApplicationManager appManager = createSourceAndDeployApp(appName, fileAvro, "avro", outputDatasetName, RECORD_SCHEMA);
org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(RECORD_SCHEMA.toString());
GenericRecord record = new GenericRecordBuilder(avroSchema).set("i", Integer.MAX_VALUE).set("l", Long.MAX_VALUE).set("file", fileAvro.getAbsolutePath()).build();
DataSetManager<TimePartitionedFileSet> inputManager = getDataset("TestFile");
DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(avroSchema);
DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter);
dataFileWriter.create(avroSchema, fileAvro);
dataFileWriter.append(record);
dataFileWriter.close();
inputManager.flush();
appManager.getWorkflowManager(SmartWorkflow.NAME).startAndWaitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
List<StructuredRecord> expected = ImmutableList.of(StructuredRecord.builder(RECORD_SCHEMA).set("i", Integer.MAX_VALUE).set("l", Long.MAX_VALUE).set("file", fileAvro.toURI().toString()).build());
DataSetManager<Table> outputManager = getDataset(outputDatasetName);
List<StructuredRecord> output = MockSink.readOutput(outputManager);
Assert.assertEquals(expected, output);
}
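For debugging outside the pipeline, the Avro container file can be read back with the generic reader; a small sketch assuming the fileAvro written above:
try (DataFileReader<GenericRecord> reader =
         new DataFileReader<>(fileAvro, new GenericDatumReader<GenericRecord>())) {
  // DataFileReader is Iterable, so the records can be walked directly
  for (GenericRecord r : reader) {
    System.out.println(r.get("i") + " / " + r.get("l") + " / " + r.get("file"));
  }
}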
Use of org.apache.avro.generic.GenericRecordBuilder in project hydrator-plugins by cdapio.
In class FileBatchSourceTest, the method testFileBatchInputFormatParquet:
@Test
public void testFileBatchInputFormatParquet() throws Exception {
File fileParquet = new File(temporaryFolder.newFolder(), "test.parquet");
String outputDatasetName = "test-filesource-parquet";
String appName = "FileSourceParquet";
ApplicationManager appManager = createSourceAndDeployApp(appName, fileParquet, "parquet", outputDatasetName, RECORD_SCHEMA);
org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(RECORD_SCHEMA.toString());
GenericRecord record = new GenericRecordBuilder(avroSchema).set("i", Integer.MAX_VALUE).set("l", Long.MAX_VALUE).set("file", fileParquet.getAbsolutePath()).build();
DataSetManager<TimePartitionedFileSet> inputManager = getDataset("TestFile");
ParquetWriter<GenericRecord> parquetWriter = new AvroParquetWriter<>(new Path(fileParquet.getAbsolutePath()), avroSchema);
parquetWriter.write(record);
parquetWriter.close();
inputManager.flush();
appManager.getWorkflowManager(SmartWorkflow.NAME).startAndWaitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
List<StructuredRecord> expected = ImmutableList.of(StructuredRecord.builder(RECORD_SCHEMA).set("i", Integer.MAX_VALUE).set("l", Long.MAX_VALUE).set("file", fileParquet.toURI().toString()).build());
DataSetManager<Table> outputManager = getDataset(outputDatasetName);
List<StructuredRecord> output = MockSink.readOutput(outputManager);
Assert.assertEquals(expected, output);
// verify that the external dataset has the given schema
verifyDatasetSchema(appName + "TestFile", RECORD_SCHEMA);
}
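The Parquet counterpart of that read-back uses AvroParquetReader; a sketch assuming the fileParquet written above (the Path-based builder is deprecated in newer parquet-avro but still available):
try (ParquetReader<GenericRecord> reader =
         AvroParquetReader.<GenericRecord>builder(new Path(fileParquet.getAbsolutePath())).build()) {
  GenericRecord r;
  // read() returns null once the file is exhausted
  while ((r = reader.read()) != null) {
    System.out.println(r);
  }
}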
Use of org.apache.avro.generic.GenericRecordBuilder in project hydrator-plugins by cdapio.
In class FileBatchSourceTest, the method testFileBatchInputFormatParquetMissingField:
@Test
public void testFileBatchInputFormatParquetMissingField() throws Exception {
File fileParquet = new File(temporaryFolder.newFolder(), "test.parquet");
String outputDatasetName = "test-filesource-parquet-missing-field";
Schema recordSchemaWithMissingField = Schema.recordOf("record", Schema.Field.of("i", Schema.of(Schema.Type.INT)), Schema.Field.of("file", Schema.of(Schema.Type.STRING)));
String appName = "FileSourceParquetMissingField";
ApplicationManager appManager = createSourceAndDeployApp(appName, fileParquet, "parquet", outputDatasetName, recordSchemaWithMissingField);
org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(RECORD_SCHEMA.toString());
GenericRecord record = new GenericRecordBuilder(avroSchema).set("i", Integer.MAX_VALUE).set("l", Long.MAX_VALUE).set("file", fileParquet.getAbsolutePath()).build();
DataSetManager<TimePartitionedFileSet> inputManager = getDataset("TestFile");
ParquetWriter<GenericRecord> parquetWriter = new AvroParquetWriter<>(new Path(fileParquet.getAbsolutePath()), avroSchema);
parquetWriter.write(record);
parquetWriter.close();
inputManager.flush();
appManager.getWorkflowManager(SmartWorkflow.NAME).startAndWaitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
List<StructuredRecord> expected = ImmutableList.of(StructuredRecord.builder(recordSchemaWithMissingField).set("i", Integer.MAX_VALUE).set("file", fileParquet.toURI().toString()).build());
DataSetManager<Table> outputManager = getDataset(outputDatasetName);
List<StructuredRecord> output = MockSink.readOutput(outputManager);
Assert.assertEquals(expected, output);
}
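Note that the missing field here is dropped by the source's configured output schema, not by GenericRecordBuilder. The builder itself requires every unset field to have a schema default; build() throws an AvroRuntimeException otherwise. A minimal sketch with an assumed two-field schema:
org.apache.avro.Schema withDefault = new org.apache.avro.Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"r\",\"fields\":["
    + "{\"name\":\"i\",\"type\":\"int\"},"
    + "{\"name\":\"l\",\"type\":\"long\",\"default\":0}]}");
GenericRecord ok = new GenericRecordBuilder(withDefault).set("i", 1).build();  // "l" falls back to its default, 0
// new GenericRecordBuilder(withDefault).set("l", 2L).build();                 // throws: "i" has no default value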