use of org.apache.avro.file.DataFileStream in project flink by apache.
the class RollingSinkITCase method testNonRollingAvroKeyValueWithoutCompressionWriter.
/**
* This tests {@link AvroKeyValueSinkWriter}
* with non-rolling output and without compression.
*/
@Test
public void testNonRollingAvroKeyValueWithoutCompressionWriter() throws Exception {
final int NUM_ELEMENTS = 20;
final int PARALLELISM = 2;
final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(PARALLELISM);
DataStream<Tuple2<Integer, String>> source = env.addSource(new TestSourceFunction(NUM_ELEMENTS)).broadcast().filter(new OddEvenFilter());
Map<String, String> properties = new HashMap<>();
Schema keySchema = Schema.create(Type.INT);
Schema valueSchema = Schema.create(Type.STRING);
properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
RollingSink<Tuple2<Integer, String>> sink = new RollingSink<Tuple2<Integer, String>>(outPath).setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties)).setBucketer(new NonRollingBucketer()).setPartPrefix("part").setPendingPrefix("").setPendingSuffix("");
source.addSink(sink);
env.execute("RollingSink Avro KeyValue Writer Test");
GenericData.setStringType(valueSchema, StringType.String);
Schema elementSchema = AvroKeyValue.getSchema(keySchema, valueSchema);
FSDataInputStream inStream = dfs.open(new Path(outPath + "/part-0-0"));
SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<GenericRecord>(elementSchema);
DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(inStream, elementReader);
for (int i = 0; i < NUM_ELEMENTS; i += 2) {
AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValue<Integer, String>(dataFileStream.next());
int key = wrappedEntry.getKey().intValue();
Assert.assertEquals(i, key);
String value = wrappedEntry.getValue();
Assert.assertEquals("message #" + i, value);
}
dataFileStream.close();
inStream.close();
inStream = dfs.open(new Path(outPath + "/part-1-0"));
dataFileStream = new DataFileStream<GenericRecord>(inStream, elementReader);
for (int i = 1; i < NUM_ELEMENTS; i += 2) {
AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValue<Integer, String>(dataFileStream.next());
int key = wrappedEntry.getKey().intValue();
Assert.assertEquals(i, key);
String value = wrappedEntry.getValue();
Assert.assertEquals("message #" + i, value);
}
dataFileStream.close();
inStream.close();
}
use of org.apache.avro.file.DataFileStream in project flink by apache.
the class BucketingSinkTest method testUserDefinedConfiguration.
/**
* This tests user defined hdfs configuration
* @throws Exception
*/
@Test
public void testUserDefinedConfiguration() throws Exception {
final String outPath = hdfsURI + "/string-non-rolling-with-config";
final int numElements = 20;
Map<String, String> properties = new HashMap<>();
Schema keySchema = Schema.create(Schema.Type.INT);
Schema valueSchema = Schema.create(Schema.Type.STRING);
properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);
Configuration conf = new Configuration();
conf.set("io.file.buffer.size", "40960");
BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath).setFSConfig(conf).setWriter(new StreamWriterWithConfigCheck<Integer, String>(properties, "io.file.buffer.size", "40960")).setBucketer(new BasePathBucketer<Tuple2<Integer, String>>()).setPartPrefix(PART_PREFIX).setPendingPrefix("").setPendingSuffix("");
OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness = createTestSink(sink, 1, 0);
testHarness.setProcessingTime(0L);
testHarness.setup();
testHarness.open();
for (int i = 0; i < numElements; i++) {
testHarness.processElement(new StreamRecord<>(Tuple2.of(i, "message #" + Integer.toString(i))));
}
testHarness.close();
GenericData.setStringType(valueSchema, GenericData.StringType.String);
Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);
FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));
SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
for (int i = 0; i < numElements; i++) {
AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
int key = wrappedEntry.getKey();
Assert.assertEquals(i, key);
String value = wrappedEntry.getValue();
Assert.assertEquals("message #" + i, value);
}
dataFileStream.close();
inStream.close();
}
use of org.apache.avro.file.DataFileStream in project flink by apache.
the class BucketingSinkTest method testNonRollingAvroKeyValueWithCompressionWriter.
/**
* This tests {@link AvroKeyValueSinkWriter}
* with non-rolling output and with compression.
*/
@Test
public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";
final int numElements = 20;
Map<String, String> properties = new HashMap<>();
Schema keySchema = Schema.create(Schema.Type.INT);
Schema valueSchema = Schema.create(Schema.Type.STRING);
properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);
BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath).setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties)).setBucketer(new BasePathBucketer<Tuple2<Integer, String>>()).setPartPrefix(PART_PREFIX).setPendingPrefix("").setPendingSuffix("");
OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness = createTestSink(sink, 1, 0);
testHarness.setProcessingTime(0L);
testHarness.setup();
testHarness.open();
for (int i = 0; i < numElements; i++) {
testHarness.processElement(new StreamRecord<>(Tuple2.of(i, "message #" + Integer.toString(i))));
}
testHarness.close();
GenericData.setStringType(valueSchema, GenericData.StringType.String);
Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);
FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));
SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
for (int i = 0; i < numElements; i++) {
AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
int key = wrappedEntry.getKey();
Assert.assertEquals(i, key);
String value = wrappedEntry.getValue();
Assert.assertEquals("message #" + i, value);
}
dataFileStream.close();
inStream.close();
}
use of org.apache.avro.file.DataFileStream in project pinot by linkedin.
the class SegmentTestUtils method getColumnNamesFromAvro.
public static List<String> getColumnNamesFromAvro(File avro) throws FileNotFoundException, IOException {
List<String> ret = new ArrayList<String>();
DataFileStream<GenericRecord> dataStream = new DataFileStream<GenericRecord>(new FileInputStream(avro), new GenericDatumReader<GenericRecord>());
for (final Field field : dataStream.getSchema().getFields()) {
ret.add(field.name());
}
return ret;
}
use of org.apache.avro.file.DataFileStream in project pinot by linkedin.
the class SegmentTestUtils method extractSchemaFromAvro.
public static Schema extractSchemaFromAvro(File avroFile, Map<String, FieldType> fieldTypeMap, TimeUnit granularity) throws IOException {
DataFileStream<GenericRecord> dataStream = new DataFileStream<>(new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>());
Schema schema = new Schema();
for (final Field field : dataStream.getSchema().getFields()) {
final String columnName = field.name();
FieldType fieldType = fieldTypeMap.get(columnName);
Preconditions.checkNotNull(fieldType);
switch(fieldType) {
case TIME:
final TimeGranularitySpec gSpec = new TimeGranularitySpec(getColumnType(field), granularity, columnName);
final TimeFieldSpec fSpec = new TimeFieldSpec(gSpec);
schema.addField(fSpec);
continue;
case DIMENSION:
final FieldSpec dimensionFieldSpec = new DimensionFieldSpec(columnName, getColumnType(field), isSingleValueField(field));
schema.addField(dimensionFieldSpec);
continue;
case METRIC:
final FieldSpec metricFieldSpec = new MetricFieldSpec(columnName, getColumnType(field));
schema.addField(metricFieldSpec);
continue;
default:
throw new UnsupportedOperationException("Unsupported field type: " + fieldType);
}
}
dataStream.close();
return schema;
}
Aggregations