Use of org.apache.avro.file.DataFileStream in project pinot by linkedin.
The class BitmapInvertedIndexTest, method testBitMapInvertedIndex.
void testBitMapInvertedIndex(ReadMode readMode) throws Exception {
  IndexLoadingConfigMetadata indexLoadingConfig = new IndexLoadingConfigMetadata(new PropertiesConfiguration());
  indexLoadingConfig.initLoadingInvertedIndexColumnSet(invertedIndexColumns);
  final IndexSegmentImpl mmapSegment = (IndexSegmentImpl) ColumnarSegmentLoader.load(segmentDirectory, readMode, indexLoadingConfig);
  // Compare the loaded inverted index against the records in the Avro file.
  final DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(new FileInputStream(new File(getClass().getClassLoader().getResource(AVRO_DATA).getFile())), new GenericDatumReader<GenericRecord>());
  int docId = 0;
  while (reader.hasNext()) {
    final GenericRecord rec = reader.next();
    for (final String column : ((SegmentMetadataImpl) mmapSegment.getSegmentMetadata()).getColumnMetadataMap().keySet()) {
      Object entry = rec.get(column);
      if (entry instanceof Utf8) {
        entry = ((Utf8) entry).toString();
      }
      final int dicId = mmapSegment.getDictionaryFor(column).indexOf(entry);
      // Make sure the docId for this dicId exists in the inverted index.
      Assert.assertTrue(mmapSegment.getInvertedIndexFor(column).getImmutable(dicId).contains(docId));
      final int size = mmapSegment.getDictionaryFor(column).length();
      for (int i = 0; i < size; ++i) {
        // Remove this for-loop for a quick test.
        if (i == dicId) {
          continue;
        }
        // Make sure the docId does not appear under any other dictionary id.
        Assert.assertFalse(mmapSegment.getInvertedIndexFor(column).getImmutable(i).contains(docId));
      }
    }
    ++docId;
  }
}
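Stripped of the Pinot-specific dictionary and inverted-index checks, the DataFileStream usage above is a simple iteration pattern: open the stream over the raw Avro file and pull GenericRecords one at a time, tracking the document id yourself. A minimal standalone sketch of that pattern follows; the file path and field name are placeholders, not part of the Pinot test.

import java.io.File;
import java.io.FileInputStream;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.util.Utf8;

public class AvroIterationSketch {
  public static void main(String[] args) throws Exception {
    File avroFile = new File("data.avro"); // placeholder path
    try (DataFileStream<GenericRecord> reader =
        new DataFileStream<>(new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>())) {
      int docId = 0;
      while (reader.hasNext()) {
        GenericRecord rec = reader.next();
        Object entry = rec.get("someColumn"); // placeholder field name
        if (entry instanceof Utf8) {
          // Avro string fields come back as Utf8; normalize to java.lang.String before comparisons.
          entry = entry.toString();
        }
        // ... look the value up in a dictionary / inverted index here ...
        ++docId;
      }
      System.out.println("records read: " + docId);
    }
  }
}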
Use of org.apache.avro.file.DataFileStream in project pinot by linkedin.
The class SegmentTestUtils, method extractSchemaFromAvroWithoutTime.
public static Schema extractSchemaFromAvroWithoutTime(File avroFile) throws FileNotFoundException, IOException {
  DataFileStream<GenericRecord> dataStream = new DataFileStream<GenericRecord>(new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>());
  Schema schema = new Schema();
  for (final Field field : dataStream.getSchema().getFields()) {
    try {
      getColumnType(field);
    } catch (Exception e) {
      LOGGER.warn("Caught exception while converting Avro field {} of type {}, field will not be in schema.", field.name(), field.schema().getType());
      continue;
    }
    final String columnName = field.name();
    final String pinotType = field.getProp("pinotType");
    final FieldSpec fieldSpec;
    if (pinotType != null && "METRIC".equals(pinotType)) {
      fieldSpec = new MetricFieldSpec();
    } else {
      fieldSpec = new DimensionFieldSpec();
    }
    fieldSpec.setName(columnName);
    fieldSpec.setDataType(getColumnType(dataStream.getSchema().getField(columnName)));
    fieldSpec.setSingleValueField(isSingleValueField(dataStream.getSchema().getField(columnName)));
    schema.addField(fieldSpec);
  }
  dataStream.close();
  return schema;
}
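Setting the Pinot FieldSpec plumbing aside, the schema extraction boils down to DataFileStream.getSchema(), which returns the writer schema stored in the Avro container header, and Field.getProp() for custom string properties such as the "pinotType" used above. A minimal sketch of just that part (the file name is a placeholder):

import java.io.File;
import java.io.FileInputStream;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class AvroSchemaSketch {
  public static void main(String[] args) throws Exception {
    try (DataFileStream<GenericRecord> stream =
        new DataFileStream<>(new FileInputStream(new File("input.avro")), new GenericDatumReader<GenericRecord>())) {
      Schema writerSchema = stream.getSchema(); // schema embedded in the file header
      for (Schema.Field field : writerSchema.getFields()) {
        // getProp() returns null if the property is not set on the field.
        System.out.println(field.name() + " -> " + field.schema().getType()
            + ", pinotType=" + field.getProp("pinotType"));
      }
    }
  }
}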
Use of org.apache.avro.file.DataFileStream in project beam by apache.
The class AvroIOTest, method testAvroIOCompressedWriteAndReadASingleFile.
@Test
@SuppressWarnings("unchecked")
@Category(NeedsRunner.class)
public void testAvroIOCompressedWriteAndReadASingleFile() throws Throwable {
  List<GenericClass> values = ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar"));
  File outputFile = tmpFolder.newFile("output.avro");
  p.apply(Create.of(values)).apply(AvroIO.write(GenericClass.class).to(outputFile.getAbsolutePath()).withoutSharding().withCodec(CodecFactory.deflateCodec(9)));
  p.run();
  PCollection<GenericClass> input = p.apply(AvroIO.read(GenericClass.class).from(outputFile.getAbsolutePath()));
  PAssert.that(input).containsInAnyOrder(values);
  p.run();
  DataFileStream dataFileStream = new DataFileStream(new FileInputStream(outputFile), new GenericDatumReader());
  assertEquals("deflate", dataFileStream.getMetaString("avro.codec"));
}
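The last two lines are the DataFileStream part that matters here: an Avro container file records its compression codec under the "avro.codec" metadata key, so getMetaString("avro.codec") returns "deflate" for a file written with the deflate codec the Beam test configures. The check can be reproduced without Beam at all; the sketch below writes a tiny file with DataFileWriter (the record schema, field name, and file path are made up for illustration) and reads the codec back.

import java.io.File;
import java.io.FileInputStream;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;

public class AvroCodecSketch {
  public static void main(String[] args) throws Exception {
    Schema schema = SchemaBuilder.record("Example").fields().requiredInt("id").endRecord();
    File file = new File("compressed.avro"); // placeholder path
    try (DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
      writer.setCodec(CodecFactory.deflateCodec(9)); // same codec level the Beam test sets via withCodec()
      writer.create(schema, file);
      writer.append(new GenericRecordBuilder(schema).set("id", 1).build());
    }
    try (DataFileStream<GenericRecord> stream =
        new DataFileStream<>(new FileInputStream(file), new GenericDatumReader<GenericRecord>())) {
      // The codec name is stored in the file's metadata block.
      System.out.println(stream.getMetaString("avro.codec")); // prints "deflate"
    }
  }
}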
Use of org.apache.avro.file.DataFileStream in project flink by apache.
The class RollingSinkITCase, method testNonRollingAvroKeyValueWithCompressionWriter.
/**
 * This tests {@link AvroKeyValueSinkWriter}
 * with non-rolling output and with compression.
 */
@Test
public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
  final int NUM_ELEMENTS = 20;
  final int PARALLELISM = 2;
  final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";
  StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
  env.setParallelism(PARALLELISM);
  DataStream<Tuple2<Integer, String>> source = env.addSource(new TestSourceFunction(NUM_ELEMENTS)).broadcast().filter(new OddEvenFilter());
  Map<String, String> properties = new HashMap<>();
  Schema keySchema = Schema.create(Type.INT);
  Schema valueSchema = Schema.create(Type.STRING);
  properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
  properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
  properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
  properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);
  RollingSink<Tuple2<Integer, String>> sink = new RollingSink<Tuple2<Integer, String>>(outPath).setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties)).setBucketer(new NonRollingBucketer()).setPartPrefix("part").setPendingPrefix("").setPendingSuffix("");
  source.addSink(sink);
  env.execute("RollingSink Avro KeyValue Writer Test");
  GenericData.setStringType(valueSchema, StringType.String);
  Schema elementSchema = AvroKeyValue.getSchema(keySchema, valueSchema);
  FSDataInputStream inStream = dfs.open(new Path(outPath + "/part-0-0"));
  SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<GenericRecord>(elementSchema);
  DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(inStream, elementReader);
  for (int i = 0; i < NUM_ELEMENTS; i += 2) {
    AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValue<Integer, String>(dataFileStream.next());
    int key = wrappedEntry.getKey().intValue();
    Assert.assertEquals(i, key);
    String value = wrappedEntry.getValue();
    Assert.assertEquals("message #" + i, value);
  }
  dataFileStream.close();
  inStream.close();
  inStream = dfs.open(new Path(outPath + "/part-1-0"));
  dataFileStream = new DataFileStream<GenericRecord>(inStream, elementReader);
  for (int i = 1; i < NUM_ELEMENTS; i += 2) {
    AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValue<Integer, String>(dataFileStream.next());
    int key = wrappedEntry.getKey().intValue();
    Assert.assertEquals(i, key);
    String value = wrappedEntry.getValue();
    Assert.assertEquals("message #" + i, value);
  }
  dataFileStream.close();
  inStream.close();
}
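Once the sink has written the part files, verification is plain Avro: open the stream with a DataFileStream and unwrap each key/value record. Leaving the Flink and HDFS setup aside, the read side amounts to the sketch below; it assumes the conventional key/value record layout with fields named "key" and "value" (as produced by AvroKeyValue wrappers), and the local file path is a placeholder for the HDFS part file.

import java.io.FileInputStream;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class AvroKeyValueReadSketch {
  public static void main(String[] args) throws Exception {
    try (DataFileStream<GenericRecord> stream =
        new DataFileStream<>(new FileInputStream("part-0-0"), new GenericDatumReader<GenericRecord>())) {
      while (stream.hasNext()) {
        GenericRecord entry = stream.next();
        // Assumed layout: an int "key" field and a string "value" field per record.
        int key = (Integer) entry.get("key");
        String value = entry.get("value").toString();
        System.out.println(key + " -> " + value);
      }
    }
  }
}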
Use of org.apache.avro.file.DataFileStream in project nifi by apache.
The class ConvertAvroToJSON, method onTrigger.
@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
  FlowFile flowFile = session.get();
  if (flowFile == null) {
    return;
  }
  final String containerOption = context.getProperty(CONTAINER_OPTIONS).getValue();
  final boolean useContainer = containerOption.equals(CONTAINER_ARRAY);
  // Wrap a single record (including the no-record case) only when a container is being used.
  final boolean wrapSingleRecord = context.getProperty(WRAP_SINGLE_RECORD).asBoolean() && useContainer;
  final String stringSchema = context.getProperty(SCHEMA).getValue();
  final boolean schemaLess = stringSchema != null;
  try {
    flowFile = session.write(flowFile, new StreamCallback() {
      @Override
      public void process(final InputStream rawIn, final OutputStream rawOut) throws IOException {
        final GenericData genericData = GenericData.get();
        if (schemaLess) {
          if (schema == null) {
            schema = new Schema.Parser().parse(stringSchema);
          }
          try (final InputStream in = new BufferedInputStream(rawIn);
              final OutputStream out = new BufferedOutputStream(rawOut)) {
            final DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
            final BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(in, null);
            final GenericRecord record = reader.read(null, decoder);
            // Both flags need to be true before we wrap the single record in an array.
            if (useContainer && wrapSingleRecord) {
              out.write('[');
            }
            final byte[] outputBytes = (record == null) ? EMPTY_JSON_OBJECT : genericData.toString(record).getBytes(StandardCharsets.UTF_8);
            out.write(outputBytes);
            if (useContainer && wrapSingleRecord) {
              out.write(']');
            }
          }
        } else {
          try (final InputStream in = new BufferedInputStream(rawIn);
              final OutputStream out = new BufferedOutputStream(rawOut);
              final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
            int recordCount = 0;
            GenericRecord currRecord = null;
            if (reader.hasNext()) {
              currRecord = reader.next();
              recordCount++;
            }
            // Open the array when a container is used and more records follow, or when configured to wrap a single record.
            if (reader.hasNext() && useContainer || wrapSingleRecord) {
              out.write('[');
            }
            // Determine the initial output record, covering the case of an empty set of Avro records.
            final byte[] outputBytes = (currRecord == null) ? EMPTY_JSON_OBJECT : genericData.toString(currRecord).getBytes(StandardCharsets.UTF_8);
            out.write(outputBytes);
            while (reader.hasNext()) {
              if (useContainer) {
                out.write(',');
              } else {
                out.write('\n');
              }
              currRecord = reader.next(currRecord);
              out.write(genericData.toString(currRecord).getBytes(StandardCharsets.UTF_8));
              recordCount++;
            }
            // Close the array when it was opened above.
            if (recordCount > 1 && useContainer || wrapSingleRecord) {
              out.write(']');
            }
          }
        }
      }
    });
  } catch (final ProcessException pe) {
    getLogger().error("Failed to convert {} from Avro to JSON due to {}; transferring to failure", new Object[] { flowFile, pe });
    session.transfer(flowFile, REL_FAILURE);
    return;
  }
  flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/json");
  session.transfer(flowFile, REL_SUCCESS);
}
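Outside the NiFi session and FlowFile machinery, the conversion itself is GenericData.toString(), which renders a GenericRecord as JSON text, paired with a DataFileStream to walk the container file. A stripped-down sketch of that datafile-to-JSON-array path (the input path is a placeholder, error handling omitted):

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class AvroToJsonSketch {
  public static void main(String[] args) throws Exception {
    StringBuilder json = new StringBuilder("[");
    try (InputStream in = new BufferedInputStream(new FileInputStream("records.avro")); // placeholder path
        DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
      boolean first = true;
      GenericRecord reuse = null;
      while (reader.hasNext()) {
        reuse = reader.next(reuse); // reuse the record object, as the processor above does
        if (!first) {
          json.append(',');
        }
        json.append(GenericData.get().toString(reuse)); // GenericData renders a record as JSON
        first = false;
      }
    }
    json.append(']');
    System.out.println(json);
  }
}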