Search in sources :

Example 16 with DataFileReader

use of org.apache.avro.file.DataFileReader in project incubator-gobblin by apache.

the class AvroExternalTable method getSchemaFromAvroDataFile.

/**
 * Extracts the Avro schema from the first data file found under this table's HDFS data location.
 *
 * @return the schema reported by the {@link DataFileReader} opened on that file
 * @throws IOException propagated from locating the data file or opening/closing the reader
 */
private Schema getSchemaFromAvroDataFile() throws IOException {
    final String avroFilePath = HdfsReader.getFirstDataFilePathInDir(this.dataLocationInHdfs);
    LOG.info("Extracting schema for table " + this.name + " from avro data file " + avroFilePath);
    final SeekableInput input = new HdfsReader(avroFilePath).getFsInput();
    // Only the file header is needed; the reader is closed as soon as the schema has been fetched.
    try (DataFileReader<Void> reader = new DataFileReader<>(input, new GenericDatumReader<Void>())) {
        return reader.getSchema();
    }
}
Also used : DataFileReader(org.apache.avro.file.DataFileReader) Schema(org.apache.avro.Schema) SeekableInput(org.apache.avro.file.SeekableInput)

Example 17 with DataFileReader

use of org.apache.avro.file.DataFileReader in project incubator-gobblin by apache.

the class AvroToRestJsonEntryConverterTest method testConversion.

/**
 * Converts the first record of the {@code /converter/nested.avro} test resource with
 * {@link AvroToRestJsonEntryConverter} and asserts that the resulting entry matches {@code expected}.
 *
 * @param expected the REST entry whose resource path and JSON value the conversion must produce
 * @param actualWorkUnitState the work unit state handed to the converter
 * @throws DataConversionException if the converter fails on the record
 * @throws IOException if the Avro resource cannot be copied or read
 * @throws JSONException if the JSON comparison fails to parse either side
 */
private void testConversion(RestEntry<JsonObject> expected, WorkUnitState actualWorkUnitState) throws DataConversionException, IOException, JSONException {
    Schema schema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/nested.avsc"));
    GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
    File tmp = File.createTempFile(this.getClass().getSimpleName(), null);
    tmp.deleteOnExit();
    try {
        FileUtils.copyInputStreamToFile(getClass().getResourceAsStream("/converter/nested.avro"), tmp);
        AvroToRestJsonEntryConverter converter = new AvroToRestJsonEntryConverter();
        // The reader is closed by try-with-resources and the converter in the finally block, so a
        // failing assertion no longer leaks either of them or leaves the temp file open while it
        // is being deleted.
        try (DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(tmp, datumReader)) {
            GenericRecord avroRecord = dataFileReader.next();
            RestEntry<JsonObject> actual = converter.convertRecord(null, avroRecord, actualWorkUnitState).iterator().next();
            Assert.assertEquals(actual.getResourcePath(), expected.getResourcePath());
            JSONAssert.assertEquals(expected.getRestEntryVal().toString(), actual.getRestEntryVal().toString(), false);
        } finally {
            converter.close();
        }
    } finally {
        // createTempFile never returns null, so no null check is needed here.
        tmp.delete();
    }
}
Also used : DataFileReader(org.apache.avro.file.DataFileReader) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) JsonObject(com.google.gson.JsonObject) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)

Example 18 with DataFileReader

use of org.apache.avro.file.DataFileReader in project incubator-gobblin by apache.

the class AvroGenericRecordAccessorTest method updateRecordFromTestResource.

/**
 * Loads the schema {@code <resourceName>.avsc} and the first record of the given Avro file from the
 * classpath, then rebuilds {@code recordSchema}, {@code record} and {@code accessor} from them.
 *
 * @param resourceName classpath resource name without extension; used for the {@code .avsc} schema
 * @param avroFileName classpath name of the Avro data file, or {@code null} to use
 *                     {@code <resourceName>.avro}
 * @throws IOException if the schema or the data file cannot be read
 */
private void updateRecordFromTestResource(String resourceName, String avroFileName) throws IOException {
    if (avroFileName == null) {
        avroFileName = resourceName + ".avro";
    }
    recordSchema = new Schema.Parser().parse(getClass().getClassLoader().getResourceAsStream(resourceName + ".avsc"));
    DatumReader<GenericRecord> reader = new GenericDatumReader<>(recordSchema);
    File avroFile = new File(getClass().getClassLoader().getResource(avroFileName).getPath());
    // Close the reader once the first record has been read; previously the file handle was leaked.
    try (DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(avroFile, reader)) {
        Assert.assertTrue(dataFileReader.hasNext());
        record = dataFileReader.next(record);
    }
    accessor = new AvroGenericRecordAccessor(record);
}
Also used : DataFileReader(org.apache.avro.file.DataFileReader) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)

Example 19 with DataFileReader

use of org.apache.avro.file.DataFileReader in project incubator-gobblin by apache.

the class FileAwareInputStreamExtractorWithCheckSchema method schemaChecking.

/**
 * Use {@link AvroSchemaCheckStrategy} to compare the schema read from the Avro file being
 * extracted with the expected schema configured under
 * {@link ConfigurationKeys#COPY_EXPECTED_SCHEMA}.
 *
 * @param fsFromFile not read by this method; the file is opened through a fresh
 *                   {@link Configuration} instead
 * @return {@code true} if schema checking is disabled, otherwise the result of
 *         {@code strategy.compare(expectedSchema, schema)}
 * @throws IOException if the file cannot be read, the expected schema is not configured, or the
 *                     check strategy cannot be created
 */
protected boolean schemaChecking(FileSystem fsFromFile) throws IOException {
    if (!this.state.getPropAsBoolean(CopySource.SCHEMA_CHECK_ENABLED, CopySource.DEFAULT_SCHEMA_CHECK_ENABLED)) {
        return true;
    }
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    Schema schema;
    // Only the header is needed: read the schema and close the reader right away so the
    // underlying filesystem stream is not leaked.
    try (DataFileReader<GenericRecord> dataFileReader =
        new DataFileReader<>(new FsInput(this.file.getFileStatus().getPath(), new Configuration()), datumReader)) {
        schema = dataFileReader.getSchema();
    }
    if (this.state.getProp(ConfigurationKeys.COPY_EXPECTED_SCHEMA) == null) {
        throw new IOException("Expected schema is not set properly");
    }
    Schema expectedSchema = new Schema.Parser().parse(this.state.getProp(ConfigurationKeys.COPY_EXPECTED_SCHEMA));
    AvroSchemaCheckStrategy strategy = AvroSchemaCheckStrategy.AvroSchemaCheckStrategyFactory.create(this.state);
    if (strategy == null) {
        throw new IOException("schema check strategy cannot be initialized");
    }
    return strategy.compare(expectedSchema, schema);
}
Also used : DataFileReader(org.apache.avro.file.DataFileReader) FsInput(org.apache.avro.mapred.FsInput) Configuration(org.apache.hadoop.conf.Configuration) AvroSchemaCheckStrategy(org.apache.gobblin.util.schema_check.AvroSchemaCheckStrategy) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) IOException(java.io.IOException) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 20 with DataFileReader

use of org.apache.avro.file.DataFileReader in project incubator-gobblin by apache.

the class AvroStringFieldDecryptorConverterTest method getRecordFromFile.

/**
 * Reads the first record of the Avro data file at {@code path}.
 *
 * @param path filesystem path of the Avro data file
 * @return the first record in the file, or {@code null} if the file contains no records
 * @throws IOException if the file cannot be opened or read
 */
private GenericRecord getRecordFromFile(String path) throws IOException {
    DatumReader<GenericRecord> reader = new GenericDatumReader<>();
    // try-with-resources closes the reader; previously the file handle was never released.
    try (DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(new File(path), reader)) {
        return dataFileReader.hasNext() ? dataFileReader.next() : null;
    }
}
Also used : DataFileReader(org.apache.avro.file.DataFileReader) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)

Aggregations

DataFileReader (org.apache.avro.file.DataFileReader)46 GenericRecord (org.apache.avro.generic.GenericRecord)28 File (java.io.File)26 GenericDatumReader (org.apache.avro.generic.GenericDatumReader)21 Schema (org.apache.avro.Schema)20 Test (org.junit.Test)10 ArrayList (java.util.ArrayList)9 IOException (java.io.IOException)8 Test (org.testng.annotations.Test)7 SeekableInput (org.apache.avro.file.SeekableInput)6 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)6 Configuration (org.apache.hadoop.conf.Configuration)6 ReflectDatumReader (org.apache.avro.reflect.ReflectDatumReader)5 SeekableByteArrayInput (org.apache.avro.file.SeekableByteArrayInput)4 FsInput (org.apache.avro.mapred.FsInput)4 SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader)4 Utf8 (org.apache.avro.util.Utf8)4 JsonObject (com.google.gson.JsonObject)2 AvroDag (edu.snu.mist.formats.avro.AvroDag)2 Date (java.sql.Date)2