
Example 6 with DataFileStream

Use of org.apache.avro.file.DataFileStream in project beam by apache.

From class AvroIOTest, method testAvroIOCompressedWriteAndReadASingleFile.

@Test
@SuppressWarnings("unchecked")
@Category(NeedsRunner.class)
public void testAvroIOCompressedWriteAndReadASingleFile() throws Throwable {
    List<GenericClass> values = ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar"));
    File outputFile = tmpFolder.newFile("output.avro");
    // Write the values to a single deflate-compressed Avro file.
    p.apply(Create.of(values))
        .apply(AvroIO.write(GenericClass.class)
            .to(outputFile.getAbsolutePath())
            .withoutSharding()
            .withCodec(CodecFactory.deflateCodec(9)));
    p.run();
    // Read the file back and check that the original values round-trip.
    PCollection<GenericClass> input = p.apply(AvroIO.read(GenericClass.class).from(outputFile.getAbsolutePath()));
    PAssert.that(input).containsInAnyOrder(values);
    p.run();
    // Verify the codec recorded in the Avro container-file metadata; close the stream when done.
    try (DataFileStream dataFileStream = new DataFileStream(new FileInputStream(outputFile), new GenericDatumReader())) {
        assertEquals("deflate", dataFileStream.getMetaString("avro.codec"));
    }
}
Also used: GenericDatumReader (org.apache.avro.generic.GenericDatumReader), DataFileStream (org.apache.avro.file.DataFileStream), File (java.io.File), FileInputStream (java.io.FileInputStream), Category (org.junit.experimental.categories.Category), Test (org.junit.Test)
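
The final assertion leans on a plain Avro behavior: the container-file writer records the codec name in the file metadata under the key "avro.codec" (also available as DataFileConstants.CODEC). Below is a minimal, Beam-free sketch of the same round trip; the schema, field names, and file path are illustrative, and it additionally assumes SchemaBuilder, DataFileWriter, GenericDatumWriter, GenericData, and GenericRecord from org.apache.avro are imported.

Schema schema = SchemaBuilder.record("Rec").fields()
        .requiredInt("intField")
        .requiredString("stringField")
        .endRecord();
File file = new File("sketch-output.avro"); // illustrative path
// Write one record with deflate level 9, mirroring withCodec(CodecFactory.deflateCodec(9)).
try (DataFileWriter<GenericRecord> writer =
        new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
    writer.setCodec(CodecFactory.deflateCodec(9));
    writer.create(schema, file);
    GenericRecord rec = new GenericData.Record(schema);
    rec.put("intField", 3);
    rec.put("stringField", "hi");
    writer.append(rec);
}
// The codec name lands in the file header's metadata.
try (DataFileStream<GenericRecord> in = new DataFileStream<>(
        new FileInputStream(file), new GenericDatumReader<GenericRecord>())) {
    assertEquals("deflate", in.getMetaString("avro.codec"));
}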

Example 7 with DataFileStream

Use of org.apache.avro.file.DataFileStream in project flink by apache.

From class RollingSinkITCase, method testNonRollingAvroKeyValueWithoutCompressionWriter.

/**
 * Tests {@link AvroKeyValueSinkWriter} with non-rolling output and without compression.
 */
@Test
public void testNonRollingAvroKeyValueWithoutCompressionWriter() throws Exception {
    final int NUM_ELEMENTS = 20;
    final int PARALLELISM = 2;
    final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);
    // Broadcast the source to both subtasks, then keep even keys on one and odd keys on the other.
    DataStream<Tuple2<Integer, String>> source = env
        .addSource(new TestSourceFunction(NUM_ELEMENTS))
        .broadcast()
        .filter(new OddEvenFilter());
    Map<String, String> properties = new HashMap<>();
    Schema keySchema = Schema.create(Type.INT);
    Schema valueSchema = Schema.create(Type.STRING);
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
    RollingSink<Tuple2<Integer, String>> sink = new RollingSink<Tuple2<Integer, String>>(outPath)
        .setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties))
        .setBucketer(new NonRollingBucketer())
        .setPartPrefix("part")
        .setPendingPrefix("")
        .setPendingSuffix("");
    source.addSink(sink);
    env.execute("RollingSink Avro KeyValue Writer Test");
    GenericData.setStringType(valueSchema, StringType.String);
    Schema elementSchema = AvroKeyValue.getSchema(keySchema, valueSchema);
    // Subtask 0 received the even keys; read its part file back.
    FSDataInputStream inStream = dfs.open(new Path(outPath + "/part-0-0"));
    SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<GenericRecord>(elementSchema);
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(inStream, elementReader);
    for (int i = 0; i < NUM_ELEMENTS; i += 2) {
        AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValue<Integer, String>(dataFileStream.next());
        int key = wrappedEntry.getKey().intValue();
        Assert.assertEquals(i, key);
        String value = wrappedEntry.getValue();
        Assert.assertEquals("message #" + i, value);
    }
    dataFileStream.close();
    inStream.close();
    // Subtask 1 received the odd keys; read the second part file.
    inStream = dfs.open(new Path(outPath + "/part-1-0"));
    dataFileStream = new DataFileStream<GenericRecord>(inStream, elementReader);
    for (int i = 1; i < NUM_ELEMENTS; i += 2) {
        AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValue<Integer, String>(dataFileStream.next());
        int key = wrappedEntry.getKey().intValue();
        Assert.assertEquals(i, key);
        String value = wrappedEntry.getValue();
        Assert.assertEquals("message #" + i, value);
    }
    dataFileStream.close();
    inStream.close();
}
Also used: Path (org.apache.hadoop.fs.Path), HashMap (java.util.HashMap), Schema (org.apache.avro.Schema), DataFileStream (org.apache.avro.file.DataFileStream), AvroKeyValue (org.apache.flink.streaming.connectors.fs.AvroKeyValueSinkWriter.AvroKeyValue), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment), SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader), GenericRecord (org.apache.avro.generic.GenericRecord), Test (org.junit.Test)
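
The AvroKeyValue wrapper used above is a thin view over a generic record with two fields named "key" and "value" (following the Hadoop AvroKeyValue convention that Flink's copy mirrors). A sketch of the equivalent hand-built pair, assuming SchemaBuilder is available; the record name below is illustrative, not necessarily the one the writer emits:

// AvroKeyValue.getSchema(keySchema, valueSchema) yields a record schema
// equivalent to this hand-built pair schema.
Schema pairSchema = SchemaBuilder.record("KeyValuePair").fields()
        .name("key").type(Schema.create(Type.INT)).noDefault()
        .name("value").type(Schema.create(Type.STRING)).noDefault()
        .endRecord();
GenericRecord pair = new GenericData.Record(pairSchema);
pair.put("key", 4);
pair.put("value", "message #4");
// wrappedEntry.getKey() and getValue() in the test read exactly these two fields.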

Example 8 with DataFileStream

Use of org.apache.avro.file.DataFileStream in project flink by apache.

From class BucketingSinkTest, method testUserDefinedConfiguration.

/**
 * Tests that a user-defined HDFS configuration is forwarded to the writer.
 */
@Test
public void testUserDefinedConfiguration() throws Exception {
    final String outPath = hdfsURI + "/string-non-rolling-with-config";
    final int numElements = 20;
    Map<String, String> properties = new HashMap<>();
    Schema keySchema = Schema.create(Schema.Type.INT);
    Schema valueSchema = Schema.create(Schema.Type.STRING);
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);
    Configuration conf = new Configuration();
    conf.set("io.file.buffer.size", "40960");
    BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
        .setFSConfig(conf)
        .setWriter(new StreamWriterWithConfigCheck<Integer, String>(properties, "io.file.buffer.size", "40960"))
        .setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
        .setPartPrefix(PART_PREFIX)
        .setPendingPrefix("")
        .setPendingSuffix("");
    OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness = createTestSink(sink, 1, 0);
    testHarness.setProcessingTime(0L);
    testHarness.setup();
    testHarness.open();
    for (int i = 0; i < numElements; i++) {
        testHarness.processElement(new StreamRecord<>(Tuple2.of(i, "message #" + i)));
    }
    testHarness.close();
    GenericData.setStringType(valueSchema, GenericData.StringType.String);
    Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);
    FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));
    SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
    for (int i = 0; i < numElements; i++) {
        AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
        int key = wrappedEntry.getKey();
        Assert.assertEquals(i, key);
        String value = wrappedEntry.getValue();
        Assert.assertEquals("message #" + i, value);
    }
    dataFileStream.close();
    inStream.close();
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), HashMap (java.util.HashMap), Schema (org.apache.avro.Schema), SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader), GenericRecord (org.apache.avro.generic.GenericRecord), AvroKeyValueSinkWriter (org.apache.flink.streaming.connectors.fs.AvroKeyValueSinkWriter), Path (org.apache.hadoop.fs.Path), DataFileStream (org.apache.avro.file.DataFileStream), TypeHint (org.apache.flink.api.common.typeinfo.TypeHint), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), Test (org.junit.Test)
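
The StreamWriterWithConfigCheck used above is a test helper; what it verifies reduces to standard Hadoop Configuration behavior, sketched minimally here:

// The sink forwards the user-provided Configuration to HDFS; a writer handed
// this Configuration can read the override back (4096 is only the getter's fallback).
Configuration conf = new Configuration();
conf.set("io.file.buffer.size", "40960");
Assert.assertEquals(40960, conf.getInt("io.file.buffer.size", 4096));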

Example 9 with DataFileStream

Use of org.apache.avro.file.DataFileStream in project flink by apache.

From class BucketingSinkTest, method testNonRollingAvroKeyValueWithCompressionWriter.

/**
 * Tests {@link AvroKeyValueSinkWriter} with non-rolling output and with compression.
 */
@Test
public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
    final String outPath = hdfsURI + "/avro-kv-comp-non-rolling-out";
    final int numElements = 20;
    Map<String, String> properties = new HashMap<>();
    Schema keySchema = Schema.create(Schema.Type.INT);
    Schema valueSchema = Schema.create(Schema.Type.STRING);
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);
    BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
        .setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties))
        .setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
        .setPartPrefix(PART_PREFIX)
        .setPendingPrefix("")
        .setPendingSuffix("");
    OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness = createTestSink(sink, 1, 0);
    testHarness.setProcessingTime(0L);
    testHarness.setup();
    testHarness.open();
    for (int i = 0; i < numElements; i++) {
        testHarness.processElement(new StreamRecord<>(Tuple2.of(i, "message #" + i)));
    }
    testHarness.close();
    GenericData.setStringType(valueSchema, GenericData.StringType.String);
    Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);
    FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));
    SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
    for (int i = 0; i < numElements; i++) {
        AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
        int key = wrappedEntry.getKey();
        Assert.assertEquals(i, key);
        String value = wrappedEntry.getValue();
        Assert.assertEquals("message #" + i, value);
    }
    dataFileStream.close();
    inStream.close();
}
Also used: Path (org.apache.hadoop.fs.Path), HashMap (java.util.HashMap), Schema (org.apache.avro.Schema), DataFileStream (org.apache.avro.file.DataFileStream), TypeHint (org.apache.flink.api.common.typeinfo.TypeHint), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader), GenericRecord (org.apache.avro.generic.GenericRecord), AvroKeyValueSinkWriter (org.apache.flink.streaming.connectors.fs.AvroKeyValueSinkWriter), Test (org.junit.Test)
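
In plain Avro terms, setting CONF_COMPRESS and CONF_COMPRESS_CODEC amounts to picking a codec on the container-file writer. A sketch of the equivalent direct usage, assuming CodecFactory, DataFileConstants, DataFileWriter, and GenericDatumWriter are imported; this is not the AvroKeyValueSinkWriter source:

// DataFileConstants.SNAPPY_CODEC is the string "snappy".
DataFileWriter<GenericRecord> writer =
        new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(elementSchema));
writer.setCodec(CodecFactory.fromString(DataFileConstants.SNAPPY_CODEC));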

Example 10 with DataFileStream

Use of org.apache.avro.file.DataFileStream in project hive by apache.

From class TestThatEvolvedSchemasActAsWeWant, method resolvedSchemasShouldReturnReaderSchema.

@Test
public void resolvedSchemasShouldReturnReaderSchema() throws IOException {
    // Verify that when a datum is read with an updated reader schema, the
    // datum reports the reader schema as its own; the serde depends on this
    // behavior to avoid re-encoding the datum.
    String v0 = "{\n"
        + "    \"namespace\": \"org.apache.hadoop.hive\",\n"
        + "    \"name\": \"SomeStuff\",\n"
        + "    \"type\": \"record\",\n"
        + "    \"fields\": [\n"
        + "        {\n"
        + "            \"name\":\"v0\",\n"
        + "            \"type\":\"string\"\n"
        + "        }\n"
        + "    ]\n"
        + "}";
    String v1 = "{\n"
        + "    \"namespace\": \"org.apache.hadoop.hive\",\n"
        + "    \"name\": \"SomeStuff\",\n"
        + "    \"type\": \"record\",\n"
        + "    \"fields\": [\n"
        + "        {\n"
        + "            \"name\":\"v0\",\n"
        + "            \"type\":\"string\"\n"
        + "        },\n"
        + "        {\n"
        + "            \"name\":\"v1\",\n"
        + "            \"type\":\"string\",\n"
        + "            \"default\":\"v1_default\""
        + "        }\n"
        + "    ]\n"
        + "}";
    Schema[] schemas = { AvroSerdeUtils.getSchemaFor(v0), AvroSerdeUtils.getSchemaFor(v1) };
    // Encode a schema with v0, write out.
    GenericRecord record = new GenericData.Record(schemas[0]);
    record.put("v0", "v0 value");
    assertTrue(GenericData.get().validate(schemas[0], record));
    // Write datum out to a stream
    GenericDatumWriter<GenericRecord> gdw = new GenericDatumWriter<GenericRecord>(schemas[0]);
    DataFileWriter<GenericRecord> dfw = new DataFileWriter<GenericRecord>(gdw);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    dfw.create(schemas[0], baos);
    dfw.append(record);
    dfw.close();
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>();
    gdr.setExpected(schemas[1]);
    DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, gdr);
    assertTrue(dfs.hasNext());
    GenericRecord next = dfs.next();
    assertEquals("v0 value", next.get("v0").toString());
    assertEquals("v1_default", next.get("v1").toString());
    // Now the most important check - when we query this record for its schema,
    // we should get back the latest, reader schema:
    assertEquals(schemas[1], next.getSchema());
}
Also used: GenericDatumReader (org.apache.avro.generic.GenericDatumReader), Schema (org.apache.avro.Schema), DataFileWriter (org.apache.avro.file.DataFileWriter), GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter), ByteArrayOutputStream (java.io.ByteArrayOutputStream), DataFileStream (org.apache.avro.file.DataFileStream), ByteArrayInputStream (java.io.ByteArrayInputStream), GenericRecord (org.apache.avro.generic.GenericRecord), Test (org.junit.Test)
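
Calling setExpected is one of two equivalent ways to request schema resolution; because the writer schema travels in the container-file header, the reader schema can also be supplied at construction time. A small sketch reusing the schemas and buffer from the test above:

// Equivalent to gdr.setExpected(schemas[1]); the file header supplies the
// writer schema, and Avro resolves each datum to the reader schema.
GenericDatumReader<GenericRecord> resolvingReader =
        new GenericDatumReader<GenericRecord>(schemas[0], schemas[1]);
DataFileStream<GenericRecord> stream = new DataFileStream<GenericRecord>(
        new ByteArrayInputStream(baos.toByteArray()), resolvingReader);
assertEquals(schemas[1], stream.next().getSchema());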

Aggregations

DataFileStream (org.apache.avro.file.DataFileStream): 18 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 13 usages
Test (org.junit.Test): 10 usages
Schema (org.apache.avro.Schema): 9 usages
GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 9 usages
FileInputStream (java.io.FileInputStream): 8 usages
File (java.io.File): 5 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 4 usages
HashMap (java.util.HashMap): 4 usages
SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader): 4 usages
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 4 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 4 usages
Path (org.apache.hadoop.fs.Path): 4 usages
IOException (java.io.IOException): 3 usages
Field (org.apache.avro.Schema.Field): 3 usages
DataFileWriter (org.apache.avro.file.DataFileWriter): 3 usages
Category (org.junit.experimental.categories.Category): 3 usages
DimensionFieldSpec (com.linkedin.pinot.common.data.DimensionFieldSpec): 2 usages
FieldSpec (com.linkedin.pinot.common.data.FieldSpec): 2 usages
MetricFieldSpec (com.linkedin.pinot.common.data.MetricFieldSpec): 2 usages