Search in sources :

Example 46 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project incubator-gobblin by apache.

the class AvroTestTools method writeAsAvroBinary.

/**
 * Writes every record produced by {@code input} to an Avro container file at {@code outputPath}.
 *
 * <p>The destination is opened with {@code fs.create(outputPath, true)}. The writer is managed by
 * try-with-resources so it is closed even when writing a record fails.
 *
 * @param input iterator over the records to write; consumed until exhausted
 * @param schema Avro schema passed to the writer when the file is created
 * @param fs file system on which the output file is created
 * @param outputPath location of the Avro file to write
 * @throws IOException if the file cannot be created or a record cannot be written
 */
private void writeAsAvroBinary(Iterator<GenericRecord> input, Schema schema, FileSystem fs, Path outputPath) throws IOException {
    // Parameterized types replace the raw DataFileWriter/GenericDatumWriter to avoid unchecked calls.
    try (DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>())) {
        writer.create(schema, fs.create(outputPath, true));
        while (input.hasNext()) {
            writer.append(input.next());
        }
    }
    // Only reached when the writer was closed without an exception.
    log.info("Successfully wrote avro file to path " + outputPath);
}
Also used : DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter)

Example 47 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project incubator-gobblin by apache.

the class CompactionGMCEPublishingActionTest method createAvroFileWithRepeatingRecords.

/**
 * Writes the record {@code r} to {@code file} {@code count} times as an Avro container file.
 *
 * <p>Both the file stream and the Avro writer are managed by try-with-resources, so neither is
 * leaked when creating the file or appending a record fails.
 *
 * @param file destination file
 * @param r record appended on every iteration
 * @param count number of times {@code r} is appended; nothing is appended when not positive
 * @param schema schema to write with; when absent, {@code getSchema()} is used instead
 * @throws IOException if the file cannot be opened or written
 */
public void createAvroFileWithRepeatingRecords(File file, GenericRecord r, int count, Optional<Schema> schema) throws IOException {
    // Resolve the schema before opening the file so a failure here does not leave an empty file behind.
    Schema writerSchema = schema.isPresent() ? schema.get() : getSchema();
    try (FileOutputStream out = new FileOutputStream(file);
        DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>())) {
        writer.create(writerSchema, out);
        for (int i = 0; i < count; ++i) {
            writer.append(r);
        }
    }
}
Also used : DataFileWriter(org.apache.avro.file.DataFileWriter) FileOutputStream(java.io.FileOutputStream) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 48 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project incubator-gobblin by apache.

the class AvroCompactionTaskTest method createAvroFileWithRepeatingRecords.

/**
 * Writes the record {@code r} to {@code file} {@code count} times as an Avro container file.
 *
 * <p>Both the file stream and the Avro writer are managed by try-with-resources, so neither is
 * leaked when creating the file or appending a record fails.
 *
 * @param file destination file
 * @param r record appended on every iteration
 * @param count number of times {@code r} is appended; nothing is appended when not positive
 * @param schema schema to write with; when absent, {@code getSchema()} is used instead
 * @throws IOException if the file cannot be opened or written
 */
private void createAvroFileWithRepeatingRecords(File file, GenericRecord r, int count, Optional<Schema> schema) throws IOException {
    // Resolve the schema before opening the file so a failure here does not leave an empty file behind.
    Schema writerSchema = schema.isPresent() ? schema.get() : getSchema();
    try (FileOutputStream out = new FileOutputStream(file);
        DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>())) {
        writer.create(writerSchema, out);
        for (int i = 0; i < count; ++i) {
            writer.append(r);
        }
    }
}
Also used : DataFileWriter(org.apache.avro.file.DataFileWriter) FileOutputStream(java.io.FileOutputStream) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 49 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project incubator-gobblin by apache.

the class AvroRecursionEliminatingConverterTest method generateRecord.

/**
 * Builds a single record against the schema loaded from {@code /converter/recursive.avsc} and
 * writes it to a temporary Avro file.
 *
 * <p>The record carries an {@code address} sub-record which in turn holds a
 * {@code previous_address} built from the same address schema.
 *
 * @return the temporary file containing the one written record; it is registered for deletion on
 *     JVM exit
 * @throws IOException if the temporary file cannot be created or written
 */
public File generateRecord() throws IOException {
    Schema inputSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/recursive.avsc"));
    GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(inputSchema);
    GenericRecord record = new GenericData.Record(inputSchema);
    record.put("name", "John");
    record.put("date_of_birth", 1234L);
    record.put("last_modified", 4567L);
    record.put("created", 6789L);
    GenericRecord addressRecord = new GenericData.Record(inputSchema.getField("address").schema());
    addressRecord.put("city", "Los Angeles");
    addressRecord.put("street_number", 1234);
    GenericRecord innerAddressRecord = new GenericData.Record(inputSchema.getField("address").schema());
    innerAddressRecord.put("city", "San Francisco");
    innerAddressRecord.put("street_number", 3456);
    addressRecord.put("previous_address", innerAddressRecord);
    record.put("address", addressRecord);
    File recordFile = File.createTempFile(this.getClass().getSimpleName(), "avsc");
    // Register cleanup right away so the temp file is removed even if writing below fails.
    recordFile.deleteOnExit();
    // try-with-resources closes the writer on both the success and the failure path.
    try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter)) {
        dataFileWriter.create(inputSchema, recordFile);
        dataFileWriter.append(record);
    }
    return recordFile;
}
Also used : Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericRecord(org.apache.avro.generic.GenericRecord) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) File(java.io.File)

Example 50 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project incubator-gobblin by apache.

the class EmbeddedGobblinDistcpTest method testCheckSchema.

/**
 * Verifies that a distcp job with schema checking enabled fails when the configured expected
 * schema does not match the source Avro file, and succeeds (copying the file) when it does.
 *
 * <p>The source file is written with 100 records and closed before the job runs, so the job reads
 * a complete Avro file.
 */
@Test
public void testCheckSchema() throws Exception {
    // Let a schema-loading failure fail the test immediately instead of continuing with a null schema.
    Schema schema;
    try (InputStream is = GobblinMetricsPinotFlattenerConverter.class.getClassLoader().getResourceAsStream("avroSchemaManagerTest/expectedSchema.avsc")) {
        schema = new Schema.Parser().parse(is);
    }
    String fileName = "file.avro";
    File tmpSource = Files.createTempDir();
    tmpSource.deleteOnExit();
    File tmpTarget = Files.createTempDir();
    tmpTarget.deleteOnExit();
    File tmpFile = new File(tmpSource, fileName);
    tmpFile.createNewFile();
    GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    // Close the writer before the copy job runs so all buffered records are flushed to disk.
    try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(schema, tmpFile);
        for (int i = 0; i < 100; i++) {
            GenericRecord record = new GenericData.Record(schema);
            record.put("foo", i);
            dataFileWriter.append(record);
        }
    }
    Assert.assertTrue(new File(tmpSource, fileName).exists());
    Assert.assertFalse(new File(tmpTarget, fileName).exists());
    EmbeddedGobblinDistcp embedded = new EmbeddedGobblinDistcp(new Path(tmpSource.getAbsolutePath()), new Path(tmpTarget.getAbsolutePath()));
    embedded.setConfiguration(CopySource.SCHEMA_CHECK_ENABLED, "true");
    embedded.setLaunchTimeout(30, TimeUnit.SECONDS);
    embedded.setConfiguration(ConfigurationKeys.SOURCE_CLASS_KEY, SchemaCheckedCopySource.class.getName());
    embedded.setConfiguration(ConfigurationKeys.AVRO_SCHEMA_CHECK_STRATEGY, "org.apache.gobblin.util.schema_check.AvroSchemaCheckDefaultStrategy");
    // test when schema is not the expected one, the job will be aborted.
    // First mismatch: the expected schema names the field "foo1" instead of "foo".
    embedded.setConfiguration(ConfigurationKeys.COPY_EXPECTED_SCHEMA, "{\"type\":\"record\",\"name\":\"baseRecord\",\"fields\":[{\"name\":\"foo1\",\"type\":[\"null\",\"int\"],\"doc\":\"this is for test\",\"default\":null}]}");
    JobExecutionResult result = embedded.run();
    Assert.assertTrue(new File(tmpSource, fileName).exists());
    Assert.assertFalse(result.isSuccessful());
    Assert.assertFalse(new File(tmpTarget, fileName).exists());
    // Second mismatch: the field name matches but its type union is ["string","int"].
    embedded.setConfiguration(ConfigurationKeys.COPY_EXPECTED_SCHEMA, "{\"type\":\"record\",\"name\":\"baseRecord\",\"fields\":[{\"name\":\"foo\",\"type\":[\"string\",\"int\"],\"doc\":\"this is for test\",\"default\":null}]}");
    result = embedded.run();
    Assert.assertTrue(new File(tmpSource, fileName).exists());
    Assert.assertFalse(result.isSuccessful());
    Assert.assertFalse(new File(tmpTarget, fileName).exists());
    // test when schema is the expected one, the job will succeed.
    embedded.setConfiguration(ConfigurationKeys.COPY_EXPECTED_SCHEMA, "{\"type\":\"record\",\"name\":\"baseRecord\",\"fields\":[{\"name\":\"foo\",\"type\":[\"null\",\"int\"],\"doc\":\"this is for test\",\"default\":null}]}");
    result = embedded.run();
    Assert.assertTrue(result.isSuccessful());
    Assert.assertTrue(new File(tmpSource, fileName).exists());
    Assert.assertTrue(new File(tmpTarget, fileName).exists());
}
Also used : Path(org.apache.hadoop.fs.Path) InputStream(java.io.InputStream) Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) IOException(java.io.IOException) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) JobExecutionResult(org.apache.gobblin.runtime.api.JobExecutionResult) GobblinMetricsPinotFlattenerConverter(org.apache.gobblin.converter.GobblinMetricsPinotFlattenerConverter) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) SchemaCheckedCopySource(org.apache.gobblin.data.management.copy.SchemaCheckedCopySource) Test(org.testng.annotations.Test)

Aggregations

DataFileWriter (org.apache.avro.file.DataFileWriter)102 GenericRecord (org.apache.avro.generic.GenericRecord)58 Schema (org.apache.avro.Schema)50 GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter)47 File (java.io.File)38 ByteArrayOutputStream (java.io.ByteArrayOutputStream)22 IOException (java.io.IOException)22 GenericData (org.apache.avro.generic.GenericData)17 FileOutputStream (java.io.FileOutputStream)15 Test (org.junit.Test)14 HashMap (java.util.HashMap)11 InputStream (java.io.InputStream)10 SpecificDatumWriter (org.apache.avro.specific.SpecificDatumWriter)10 ArrayList (java.util.ArrayList)9 Path (org.apache.hadoop.fs.Path)9 ByteArrayInputStream (java.io.ByteArrayInputStream)8 OutputStream (java.io.OutputStream)8 ByteBuffer (java.nio.ByteBuffer)7 GenericDatumReader (org.apache.avro.generic.GenericDatumReader)7 MockFlowFile (org.apache.nifi.util.MockFlowFile)7