Search in sources :

Example 11 with DataFileReader

Use of org.apache.avro.file.DataFileReader in the Apache Beam project.

From class AvroIOTest, method testWindowedAvroIOWrite.

/**
 * Exercises a windowed {@code AvroIO} write driven by a {@code TestStream}.
 *
 * <p>Two batches of 100 {@code GenericClass} elements are emitted: the first with timestamps
 * picked at random from 0/10/20/30 seconds, the second from 60/70/80/90 seconds. The stream is
 * windowed into one-minute fixed windows and written with two shards, so four output files
 * (2 shards x 2 windows) are expected. The test asserts that every expected file exists and that
 * the union of their records equals the emitted elements, irrespective of order.
 *
 * @throws Throwable if pipeline construction, execution, or file verification fails
 */
@Test
@Category({ ValidatesRunner.class, UsesTestStream.class })
public void testWindowedAvroIOWrite() throws Throwable {
    Path baseDir = Files.createTempDirectory(tmpFolder.getRoot().toPath(), "testwrite");
    String baseFilename = baseDir.resolve("prefix").toString();
    Instant epoch = new Instant(0);
    List<GenericClass> expected = new ArrayList<>();
    Random rng = new Random();

    // Elements 0..99: timestamps fall inside the first one-minute window.
    ArrayList<Instant> earlyStamps = Lists.newArrayList(
            epoch.plus(Duration.standardSeconds(0)),
            epoch.plus(Duration.standardSeconds(10)),
            epoch.plus(Duration.standardSeconds(20)),
            epoch.plus(Duration.standardSeconds(30)));
    ArrayList<TimestampedValue<GenericClass>> earlyValues = new ArrayList<>();
    for (int id = 0; id < 100; ++id) {
        GenericClass element = new GenericClass(id, String.valueOf(id));
        expected.add(element);
        Instant stamp = earlyStamps.get(rng.nextInt(earlyStamps.size()));
        earlyValues.add(TimestampedValue.of(element, stamp));
    }

    // Elements 100..199: timestamps fall inside the second one-minute window.
    ArrayList<Instant> lateStamps = Lists.newArrayList(
            epoch.plus(Duration.standardSeconds(60)),
            epoch.plus(Duration.standardSeconds(70)),
            epoch.plus(Duration.standardSeconds(80)),
            epoch.plus(Duration.standardSeconds(90)));
    ArrayList<TimestampedValue<GenericClass>> lateValues = new ArrayList<>();
    for (int id = 100; id < 200; ++id) {
        GenericClass element = new GenericClass(id, String.valueOf(id));
        // The expected list receives its own instance built from the same arguments.
        expected.add(new GenericClass(id, String.valueOf(id)));
        Instant stamp = lateStamps.get(rng.nextInt(lateStamps.size()));
        lateValues.add(TimestampedValue.of(element, stamp));
    }

    // addElements takes (first, rest...), hence the head/tail split of each array below.
    TimestampedValue<GenericClass>[] earlyArray = earlyValues.toArray(new TimestampedValue[100]);
    TimestampedValue<GenericClass>[] lateArray = lateValues.toArray(new TimestampedValue[100]);
    TestStream<GenericClass> values = TestStream.create(AvroCoder.of(GenericClass.class))
            .advanceWatermarkTo(new Instant(0))
            .addElements(earlyArray[0], Arrays.copyOfRange(earlyArray, 1, earlyArray.length))
            .advanceWatermarkTo(new Instant(0).plus(Duration.standardMinutes(1)))
            .addElements(lateArray[0], Arrays.copyOfRange(lateArray, 1, lateArray.length))
            .advanceWatermarkToInfinity();

    FilenamePolicy policy = new WindowedFilenamePolicy(baseFilename);
    windowedAvroWritePipeline
            .apply(values)
            .apply(Window.<GenericClass>into(FixedWindows.of(Duration.standardMinutes(1))))
            .apply(AvroIO.write(GenericClass.class)
                    .to(baseFilename)
                    .withFilenamePolicy(policy)
                    .withWindowedWrites()
                    .withNumShards(2));
    windowedAvroWritePipeline.run();

    // Build the file names expected for every (shard, window) combination.
    List<File> expectedFiles = new ArrayList<>();
    for (int shard = 0; shard < 2; shard++) {
        for (int window = 0; window < 2; window++) {
            Instant windowStart = new Instant(0).plus(Duration.standardMinutes(window));
            IntervalWindow intervalWindow = new IntervalWindow(windowStart, Duration.standardMinutes(1));
            String fileName = baseFilename + "-" + intervalWindow.toString() + "-" + shard + "-of-1" + "-pane-0-final";
            expectedFiles.add(new File(fileName));
        }
    }

    // Read back every file, collecting all records, then remove the file.
    List<GenericClass> actual = new ArrayList<>();
    for (File outputFile : expectedFiles) {
        assertTrue("Expected output file " + outputFile.getAbsolutePath(), outputFile.exists());
        try (DataFileReader<GenericClass> reader = new DataFileReader<>(outputFile, new ReflectDatumReader<GenericClass>(ReflectData.get().getSchema(GenericClass.class)))) {
            for (GenericClass record : reader) {
                actual.add(record);
            }
        }
        outputFile.delete();
    }

    // The written records must match the emitted elements; ordering is not checked.
    assertThat(actual, containsInAnyOrder(expected.toArray()));
}
Also used : Path(java.nio.file.Path) Instant(org.joda.time.Instant) ArrayList(java.util.ArrayList) FilenamePolicy(org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy) DataFileReader(org.apache.avro.file.DataFileReader) TimestampedValue(org.apache.beam.sdk.values.TimestampedValue) Random(java.util.Random) File(java.io.File) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)

Example 12 with DataFileReader

Use of org.apache.avro.file.DataFileReader in the Apache Sling project.

From class AvroContentSerializer, method readAvroResources.

/**
 * Reads every {@code AvroShallowResource} record from an Avro data file held in memory.
 *
 * @param bytes the content handed to the Avro {@code DataFileReader}
 * @return the records in the order the reader yields them
 * @throws IOException if the reader cannot be opened or closed
 */
private Collection<AvroShallowResource> readAvroResources(byte[] bytes) throws IOException {
    Collection<AvroShallowResource> result = new LinkedList<AvroShallowResource>();
    SeekableByteArrayInput input = new SeekableByteArrayInput(bytes);
    DataFileReader<AvroShallowResource> reader = new DataFileReader<AvroShallowResource>(input, new SpecificDatumReader<AvroShallowResource>(AvroShallowResource.class));
    try {
        // Drain the reader record by record.
        while (reader.hasNext()) {
            result.add(reader.next());
        }
    } finally {
        // Close the reader whether or not reading succeeded.
        reader.close();
    }
    return result;
}
Also used : DataFileReader(org.apache.avro.file.DataFileReader) SpecificDatumReader(org.apache.avro.specific.SpecificDatumReader) SeekableByteArrayInput(org.apache.avro.file.SeekableByteArrayInput) LinkedList(java.util.LinkedList)

Example 13 with DataFileReader

Use of org.apache.avro.file.DataFileReader in the Cloudera spark-dataflow project.

From class AvroPipelineTest, method readGenericFile.

/**
 * Loads all generic Avro records from the file named {@code outputDir + "-00000-of-00001"}.
 *
 * @return the records in the order the reader yields them
 * @throws IOException if the file cannot be opened or the reader cannot be closed
 */
private List<GenericRecord> readGenericFile() throws IOException {
    File shardFile = new File(outputDir + "-00000-of-00001");
    List<GenericRecord> result = Lists.newArrayList();
    try (DataFileReader<GenericRecord> reader = new DataFileReader<>(shardFile, new GenericDatumReader<GenericRecord>())) {
        // Drain the reader record by record; it is closed automatically on exit.
        while (reader.hasNext()) {
            result.add(reader.next());
        }
    }
    return result;
}
Also used : DataFileReader(org.apache.avro.file.DataFileReader) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)

Aggregations

DataFileReader (org.apache.avro.file.DataFileReader): 13, File (java.io.File): 9, GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 6, GenericRecord (org.apache.avro.generic.GenericRecord): 6, ArrayList (java.util.ArrayList): 4, Schema (org.apache.avro.Schema): 3, ReflectDatumReader (org.apache.avro.reflect.ReflectDatumReader): 3, SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader): 3, Test (org.junit.Test): 3, IOException (java.io.IOException): 2, DataFileWriter (org.apache.avro.file.DataFileWriter): 2, GenericData (org.apache.avro.generic.GenericData): 2, JSONArray (org.json.JSONArray): 2, JSONObject (org.json.JSONObject): 2, UnmodifiableIterator (com.google.common.collect.UnmodifiableIterator): 1, BufferedInputStream (java.io.BufferedInputStream): 1, FileInputStream (java.io.FileInputStream): 1, FileOutputStream (java.io.FileOutputStream): 1, Path (java.nio.file.Path): 1, PreparedStatement (java.sql.PreparedStatement): 1