
Example 36 with DataFileReader

Use of org.apache.avro.file.DataFileReader in project flink by apache.

From the class AvroOutputFormatITCase, method postSubmit:

@Override
protected void postSubmit() throws Exception {
    // compare result for specific user type
    File[] output1;
    File file1 = asFile(outputPath1);
    if (file1.isDirectory()) {
        output1 = file1.listFiles();
        // check for avro ext in dir.
        for (File avroOutput : Objects.requireNonNull(output1)) {
            Assert.assertTrue("Expect extension '.avro'", avroOutput.toString().endsWith(".avro"));
        }
    } else {
        output1 = new File[] { file1 };
    }
    List<String> result1 = new ArrayList<>();
    DatumReader<User> userDatumReader1 = new SpecificDatumReader<>(User.class);
    for (File avroOutput : output1) {
        DataFileReader<User> dataFileReader1 = new DataFileReader<>(avroOutput, userDatumReader1);
        while (dataFileReader1.hasNext()) {
            User user = dataFileReader1.next();
            result1.add(user.getName() + "|" + user.getFavoriteNumber() + "|" + user.getFavoriteColor());
        }
    }
    for (String expectedResult : userData.split("\n")) {
        Assert.assertTrue("expected user " + expectedResult + " not found.", result1.contains(expectedResult));
    }
    // compare result for reflect user type
    File[] output2;
    File file2 = asFile(outputPath2);
    if (file2.isDirectory()) {
        output2 = file2.listFiles();
    } else {
        output2 = new File[] { file2 };
    }
    List<String> result2 = new ArrayList<>();
    DatumReader<ReflectiveUser> userDatumReader2 = new ReflectDatumReader<>(ReflectiveUser.class);
    for (File avroOutput : Objects.requireNonNull(output2)) {
        DataFileReader<ReflectiveUser> dataFileReader2 = new DataFileReader<>(avroOutput, userDatumReader2);
        while (dataFileReader2.hasNext()) {
            ReflectiveUser user = dataFileReader2.next();
            result2.add(user.getName() + "|" + user.getFavoriteNumber() + "|" + user.getFavoriteColor());
        }
    }
    for (String expectedResult : userData.split("\n")) {
        Assert.assertTrue("expected user " + expectedResult + " not found.", result2.contains(expectedResult));
    }
}
Also used: User (org.apache.flink.formats.avro.generated.User), ArrayList (java.util.ArrayList), DataFileReader (org.apache.avro.file.DataFileReader), SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader), ReflectDatumReader (org.apache.avro.reflect.ReflectDatumReader), File (java.io.File)
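
Note that the readers in this test are never closed. DataFileReader implements Closeable and Iterable, so the same check can be written with try-with-resources; a minimal sketch, reusing asFile, outputPath1 and User from the test above (the output is assumed to be a directory of .avro part files, as the test checks):

// Sketch: collect "name|favoriteNumber|favoriteColor" strings from every .avro
// part file, closing each reader even if an assertion later fails.
List<String> results = new ArrayList<>();
DatumReader<User> datumReader = new SpecificDatumReader<>(User.class);
for (File avroFile : Objects.requireNonNull(asFile(outputPath1).listFiles())) {
    try (DataFileReader<User> reader = new DataFileReader<>(avroFile, datumReader)) {
        for (User user : reader) {
            results.add(user.getName() + "|" + user.getFavoriteNumber() + "|" + user.getFavoriteColor());
        }
    }
}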

Example 37 with DataFileReader

Use of org.apache.avro.file.DataFileReader in project flink by apache.

From the class AvroInputFormat, method initReader:

private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
    DatumReader<E> datumReader;
    if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
        datumReader = new GenericDatumReader<E>();
    } else {
        datumReader = org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType) ? new SpecificDatumReader<E>(avroValueType) : new ReflectDatumReader<E>(avroValueType);
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("Opening split {}", split);
    }
    SeekableInput in = new FSDataInputStreamWrapper(stream, split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen());
    DataFileReader<E> dataFileReader = (DataFileReader) DataFileReader.openReader(in, datumReader);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Loaded SCHEMA: {}", dataFileReader.getSchema());
    }
    end = split.getStart() + split.getLength();
    recordsReadSinceLastSync = 0;
    return dataFileReader;
}
Also used: DataFileReader (org.apache.avro.file.DataFileReader), SeekableInput (org.apache.avro.file.SeekableInput), FSDataInputStreamWrapper (org.apache.flink.formats.avro.utils.FSDataInputStreamWrapper), SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader), ReflectDatumReader (org.apache.avro.reflect.ReflectDatumReader)
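
DataFileReader.openReader accepts any SeekableInput, which is why the format above can wrap Flink's stream in FSDataInputStreamWrapper. A minimal stand-alone sketch using Avro's SeekableByteArrayInput instead; avroBytes is a placeholder assumed to hold a complete Avro container file:

// Sketch: open a generic reader over an in-memory Avro container file.
// Uses org.apache.avro.file.{SeekableByteArrayInput, FileReader, DataFileReader}
// and org.apache.avro.generic.{GenericDatumReader, GenericRecord}.
SeekableInput in = new SeekableByteArrayInput(avroBytes);
try (FileReader<GenericRecord> reader = DataFileReader.openReader(in, new GenericDatumReader<GenericRecord>())) {
    Schema schema = reader.getSchema();  // the writer schema is read from the file header
    for (GenericRecord record : reader) {
        // process record
    }
}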

Example 38 with DataFileReader

Use of org.apache.avro.file.DataFileReader in project incubator-gobblin by apache.

From the class AvroHdfsDataWriterTest, method testWrite:

@Test
public void testWrite() throws IOException {
    // Write all test records
    for (String record : TestConstants.JSON_RECORDS) {
        this.writer.write(convertRecord(record));
    }
    Assert.assertEquals(this.writer.recordsWritten(), 3);
    this.writer.close();
    this.writer.commit();
    File outputFile = new File(TestConstants.TEST_OUTPUT_DIR + Path.SEPARATOR + this.filePath, TestConstants.TEST_FILE_NAME);
    DataFileReader<GenericRecord> reader = new DataFileReader<>(outputFile, new GenericDatumReader<GenericRecord>());
    Schema fileSchema = reader.getSchema();
    Assert.assertEquals(fileSchema.getProp(TEST_PROPERTY_KEY), TEST_PROPERTY_VALUE);
    // Read the records back and assert they are identical to the ones written
    GenericRecord user1 = reader.next();
    // Strings are in UTF8, so we have to call toString() here and below
    Assert.assertEquals(user1.get("name").toString(), "Alyssa");
    Assert.assertEquals(user1.get("favorite_number"), 256);
    Assert.assertEquals(user1.get("favorite_color").toString(), "yellow");
    GenericRecord user2 = reader.next();
    Assert.assertEquals(user2.get("name").toString(), "Ben");
    Assert.assertEquals(user2.get("favorite_number"), 7);
    Assert.assertEquals(user2.get("favorite_color").toString(), "red");
    GenericRecord user3 = reader.next();
    Assert.assertEquals(user3.get("name").toString(), "Charlie");
    Assert.assertEquals(user3.get("favorite_number"), 68);
    Assert.assertEquals(user3.get("favorite_color").toString(), "blue");
    reader.close();
    FsWriterMetrics metrics = FsWriterMetrics.fromJson(properties.getProp(FsDataWriter.FS_WRITER_METRICS_KEY));
    Assert.assertEquals(metrics.fileInfos.size(), 1);
    FsWriterMetrics.FileInfo fileInfo = metrics.fileInfos.iterator().next();
    Assert.assertEquals(fileInfo.fileName, TestConstants.TEST_FILE_NAME);
    Assert.assertEquals(fileInfo.numRecords, 3);
    Assert.assertNull(metrics.partitionInfo.partitionKey);
    Assert.assertEquals(metrics.partitionInfo.branchId, 0);
}
Also used: DataFileReader (org.apache.avro.file.DataFileReader), Schema (org.apache.avro.Schema), GenericRecord (org.apache.avro.generic.GenericRecord), File (java.io.File), Test (org.testng.annotations.Test)
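
The test only exercises the read side. As a rough sketch of how a matching container file could be produced with Avro's own DataFileWriter (schemaJson and outputFile are placeholders, and setting the property with addProp is an assumption about how TEST_PROPERTY_KEY ends up on the schema; the actual Gobblin writer goes through FsDataWriter):

// Sketch: write a file that a DataFileReader-based test like the one above can read back.
// Uses org.apache.avro.file.DataFileWriter and org.apache.avro.generic.{GenericDatumWriter, GenericRecordBuilder}.
Schema schema = new Schema.Parser().parse(schemaJson);      // record with name/favorite_number/favorite_color
schema.addProp(TEST_PROPERTY_KEY, TEST_PROPERTY_VALUE);     // schema-level property, visible via getProp() on read
GenericRecord alyssa = new GenericRecordBuilder(schema)
        .set("name", "Alyssa")
        .set("favorite_number", 256)
        .set("favorite_color", "yellow")
        .build();
try (DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
    writer.create(schema, outputFile);                       // writes the header, including the schema JSON
    writer.append(alyssa);
}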

Example 39 with DataFileReader

Use of org.apache.avro.file.DataFileReader in project incubator-gobblin by apache.

From the class AvroStringFieldEncryptorConverterTest, method getRecordFromFile:

private GenericRecord getRecordFromFile(String path) throws IOException {
    DatumReader<GenericRecord> reader = new GenericDatumReader<>();
    DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(new File(path), reader);
    if (dataFileReader.hasNext()) {
        return dataFileReader.next();
    }
    return null;
}
Also used: DataFileReader (org.apache.avro.file.DataFileReader), GenericDatumReader (org.apache.avro.generic.GenericDatumReader), GenericRecord (org.apache.avro.generic.GenericRecord), File (java.io.File)
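
getRecordFromFile returns only the first record and leaves the reader open. A small variant that reads every record and closes the file, using only the classes already imported by the test plus java.util.List/ArrayList:

private List<GenericRecord> getRecordsFromFile(String path) throws IOException {
    List<GenericRecord> records = new ArrayList<>();
    DatumReader<GenericRecord> reader = new GenericDatumReader<>();
    // try-with-resources closes the underlying file once iteration finishes
    try (DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(new File(path), reader)) {
        for (GenericRecord record : dataFileReader) {
            records.add(record);
        }
    }
    return records;
}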

Example 40 with DataFileReader

Use of org.apache.avro.file.DataFileReader in project incubator-gobblin by apache.

From the class AvroRecursionEliminatingConverterTest, method testConversion:

/**
 * Test schema and record conversion using a recursive schema
 */
@Test
public void testConversion() throws IOException {
    File inputFile = generateRecord();
    WorkUnitState workUnitState = new WorkUnitState();
    Schema inputSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/recursive.avsc"));
    GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(inputSchema);
    DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(inputFile, datumReader);
    GenericRecord inputRecord = dataFileReader.next();
    AvroRecursionEliminatingConverter converter = new AvroRecursionEliminatingConverter();
    Schema outputSchema = null;
    String recursiveFieldPath = "address.previous_address";
    // test that the inner recursive field is present in input schema
    Assert.assertTrue(AvroUtils.getFieldSchema(inputSchema, recursiveFieldPath).isPresent());
    try {
        outputSchema = converter.convertSchema(inputSchema, workUnitState);
        // test that the inner recursive field is no longer in the schema
        Assert.assertTrue(!AvroUtils.getFieldSchema(outputSchema, recursiveFieldPath).isPresent(), "Inner recursive field " + recursiveFieldPath + " should not be in output schema");
    } catch (SchemaConversionException e) {
        Assert.fail(e.getMessage());
    }
    GenericRecord outputRecord = null;
    try {
        outputRecord = converter.convertRecord(outputSchema, inputRecord, workUnitState).iterator().next();
    } catch (DataConversionException e) {
        Assert.fail(e.getMessage());
    }
    checkEquality("address.street_number", inputRecord, 1234, "Different value in input");
    checkEquality("address.street_number", outputRecord, 1234, "Different value in output");
    checkEquality("name", inputRecord, new Utf8("John"), "Different value in input");
    checkEquality("name", outputRecord, new Utf8("John"), "Different value in output");
    // check that inner address record exists in input record
    checkEquality("address.previous_address.city", inputRecord, new Utf8("San Francisco"), "Different value in input");
    checkEquality("address.previous_address", outputRecord, null, "Failed to remove recursive field");
}
Also used: SchemaConversionException (org.apache.gobblin.converter.SchemaConversionException), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), GenericDatumReader (org.apache.avro.generic.GenericDatumReader), Schema (org.apache.avro.Schema), DataFileReader (org.apache.avro.file.DataFileReader), Utf8 (org.apache.avro.util.Utf8), GenericRecord (org.apache.avro.generic.GenericRecord), DataConversionException (org.apache.gobblin.converter.DataConversionException), File (java.io.File), Test (org.testng.annotations.Test)
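
The test relies on /converter/recursive.avsc, which is not reproduced on this page. A hypothetical sketch of what such a recursive schema could look like, using the field names from the assertions above (the nullable union and the Person/Address type names are assumptions):

// Hypothetical recursive schema: previous_address refers back to the Address
// record type itself, which is exactly the field the converter removes.
String recursiveSchemaJson =
      "{\"type\": \"record\", \"name\": \"Person\", \"fields\": ["
    + "  {\"name\": \"name\", \"type\": \"string\"},"
    + "  {\"name\": \"address\", \"type\": {\"type\": \"record\", \"name\": \"Address\", \"fields\": ["
    + "    {\"name\": \"street_number\", \"type\": \"int\"},"
    + "    {\"name\": \"city\", \"type\": \"string\"},"
    + "    {\"name\": \"previous_address\", \"type\": [\"null\", \"Address\"], \"default\": null}"
    + "  ]}}"
    + "]}";
Schema recursiveSchema = new Schema.Parser().parse(recursiveSchemaJson);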

Aggregations

DataFileReader (org.apache.avro.file.DataFileReader) 46
GenericRecord (org.apache.avro.generic.GenericRecord) 28
File (java.io.File) 26
GenericDatumReader (org.apache.avro.generic.GenericDatumReader) 21
Schema (org.apache.avro.Schema) 20
Test (org.junit.Test) 10
ArrayList (java.util.ArrayList) 9
IOException (java.io.IOException) 8
Test (org.testng.annotations.Test) 7
SeekableInput (org.apache.avro.file.SeekableInput) 6
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState) 6
Configuration (org.apache.hadoop.conf.Configuration) 6
ReflectDatumReader (org.apache.avro.reflect.ReflectDatumReader) 5
SeekableByteArrayInput (org.apache.avro.file.SeekableByteArrayInput) 4
FsInput (org.apache.avro.mapred.FsInput) 4
SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader) 4
Utf8 (org.apache.avro.util.Utf8) 4
JsonObject (com.google.gson.JsonObject) 2
AvroDag (edu.snu.mist.formats.avro.AvroDag) 2
Date (java.sql.Date) 2
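
Several of the aggregated classes (FsInput, Configuration, SeekableInput) point at the common pattern of reading Avro container files straight from HDFS: FsInput is Avro's SeekableInput over a Hadoop path. A minimal sketch, with a placeholder path:

// Sketch: read an Avro container file directly from HDFS.
// Uses org.apache.avro.mapred.FsInput, org.apache.hadoop.conf.Configuration,
// org.apache.hadoop.fs.Path, and the generic Avro reader classes listed above.
Configuration conf = new Configuration();
Path path = new Path("hdfs:///tmp/example.avro");  // placeholder path
try (FileReader<GenericRecord> reader =
        DataFileReader.openReader(new FsInput(path, conf), new GenericDatumReader<GenericRecord>())) {
    for (GenericRecord record : reader) {
        // process record
    }
}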