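The examples below show org.apache.avro.file.DataFileStream, Avro's streaming reader for object container files, in use across several open-source projects. As orientation, here is a minimal, self-contained round-trip sketch that writes a container file with DataFileWriter and reads it back with DataFileStream; the schema, record shape, and temp-file name are hypothetical, chosen only for illustration.

import java.io.File;
import java.io.FileInputStream;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class DataFileStreamRoundTrip {

    public static void main(String[] args) throws Exception {
        // Hypothetical single-field schema, just to have something to write.
        Schema schema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Rec\",\"fields\":[{\"name\":\"id\",\"type\":\"int\"}]}");
        File file = File.createTempFile("round-trip", ".avro");
        // Write two records with DataFileWriter, the writing counterpart of DataFileStream.
        DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(schema));
        writer.create(schema, file);
        for (int i = 0; i < 2; i++) {
            GenericRecord rec = new GenericData.Record(schema);
            rec.put("id", i);
            writer.append(rec);
        }
        writer.close();
        // Read the records back; DataFileStream picks the schema up from the container header.
        DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(new FileInputStream(file), new GenericDatumReader<GenericRecord>());
        while (reader.hasNext()) {
            System.out.println(reader.next());
        }
        reader.close();
    }
}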

Example 1 with DataFileStream

Use of org.apache.avro.file.DataFileStream in project pinot by linkedin.

From class BitmapInvertedIndexTest, method testBitMapInvertedIndex:

void testBitMapInvertedIndex(ReadMode readMode) throws Exception {
    IndexLoadingConfigMetadata indexLoadingConfig = new IndexLoadingConfigMetadata(new PropertiesConfiguration());
    indexLoadingConfig.initLoadingInvertedIndexColumnSet(invertedIndexColumns);
    final IndexSegmentImpl mmapSegment = (IndexSegmentImpl) ColumnarSegmentLoader.load(segmentDirectory, readMode, indexLoadingConfig);
    // compare the loaded inverted index with the records in the Avro file
    final DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(new FileInputStream(new File(getClass().getClassLoader().getResource(AVRO_DATA).getFile())), new GenericDatumReader<GenericRecord>());
    int docId = 0;
    while (reader.hasNext()) {
        final GenericRecord rec = reader.next();
        for (final String column : ((SegmentMetadataImpl) mmapSegment.getSegmentMetadata()).getColumnMetadataMap().keySet()) {
            Object entry = rec.get(column);
            if (entry instanceof Utf8) {
                entry = ((Utf8) entry).toString();
            }
            final int dicId = mmapSegment.getDictionaryFor(column).indexOf(entry);
            // make sure that docId for dicId exists in the inverted index
            Assert.assertTrue(mmapSegment.getInvertedIndexFor(column).getImmutable(dicId).contains(docId));
            final int size = mmapSegment.getDictionaryFor(column).length();
            for (int i = 0; i < size; ++i) {
                // exhaustive negative check; remove this for-loop for a quicker test run
                if (i == dicId) {
                    continue;
                }
                // make sure that docId for dicId does not exist in the inverted index
                Assert.assertFalse(mmapSegment.getInvertedIndexFor(column).getImmutable(i).contains(docId));
            }
        }
        ++docId;
    }
    reader.close();
}
Also used : IndexLoadingConfigMetadata(com.linkedin.pinot.common.metadata.segment.IndexLoadingConfigMetadata) DataFileStream(org.apache.avro.file.DataFileStream) PropertiesConfiguration(org.apache.commons.configuration.PropertiesConfiguration) FileInputStream(java.io.FileInputStream) IndexSegmentImpl(com.linkedin.pinot.core.segment.index.IndexSegmentImpl) Utf8(org.apache.avro.util.Utf8) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)
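The instanceof Utf8 branch in this example exists because Avro's generic datum reader returns string fields as org.apache.avro.util.Utf8 rather than java.lang.String. A minimal sketch of that normalization step, with a hypothetical column name:

// Avro's generic reader yields Utf8 for string fields by default;
// convert to java.lang.String before comparing with dictionary entries.
Object entry = rec.get("someColumn"); // "someColumn" is a hypothetical field name
if (entry instanceof Utf8) {
    entry = ((Utf8) entry).toString();
}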

Example 2 with DataFileStream

Use of org.apache.avro.file.DataFileStream in project pinot by linkedin.

From class SegmentTestUtils, method extractSchemaFromAvroWithoutTime:

public static Schema extractSchemaFromAvroWithoutTime(File avroFile) throws FileNotFoundException, IOException {
    DataFileStream<GenericRecord> dataStream = new DataFileStream<GenericRecord>(new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>());
    Schema schema = new Schema();
    for (final Field field : dataStream.getSchema().getFields()) {
        try {
            getColumnType(field);
        } catch (Exception e) {
            LOGGER.warn("Caught exception while converting Avro field {} of type {}, field will not be in schema.", field.name(), field.schema().getType());
            continue;
        }
        final String columnName = field.name();
        final String pinotType = field.getProp("pinotType");
        final FieldSpec fieldSpec;
        if (pinotType != null && "METRIC".equals(pinotType)) {
            fieldSpec = new MetricFieldSpec();
        } else {
            fieldSpec = new DimensionFieldSpec();
        }
        fieldSpec.setName(columnName);
        fieldSpec.setDataType(getColumnType(dataStream.getSchema().getField(columnName)));
        fieldSpec.setSingleValueField(isSingleValueField(dataStream.getSchema().getField(columnName)));
        schema.addField(fieldSpec);
    }
    dataStream.close();
    return schema;
}
Also used : Field(org.apache.avro.Schema.Field) Schema(com.linkedin.pinot.common.data.Schema) DataFileStream(org.apache.avro.file.DataFileStream) GenericRecord(org.apache.avro.generic.GenericRecord) MetricFieldSpec(com.linkedin.pinot.common.data.MetricFieldSpec) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) TimeFieldSpec(com.linkedin.pinot.common.data.TimeFieldSpec) FieldSpec(com.linkedin.pinot.common.data.FieldSpec) DimensionFieldSpec(com.linkedin.pinot.common.data.DimensionFieldSpec)
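Example 2 chooses METRIC vs. DIMENSION from a custom "pinotType" property on the Avro field. Avro preserves unknown JSON attributes of a field as string props that Schema.Field.getProp reads back, which is exactly what field.getProp("pinotType") above relies on. A short sketch of declaring and reading such a prop; Schema here is org.apache.avro.Schema (not Pinot's), and the record and field names are hypothetical:

// Extra JSON attributes on an Avro field survive parsing as string props.
Schema avroSchema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Row\",\"fields\":[{\"name\":\"clicks\",\"type\":\"long\",\"pinotType\":\"METRIC\"}]}");
String pinotType = avroSchema.getField("clicks").getProp("pinotType");
System.out.println(pinotType); // prints METRIC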

Example 3 with DataFileStream

Use of org.apache.avro.file.DataFileStream in project beam by apache.

From class AvroIOTest, method testAvroIOCompressedWriteAndReadASingleFile:

@Test
@SuppressWarnings("unchecked")
@Category(NeedsRunner.class)
public void testAvroIOCompressedWriteAndReadASingleFile() throws Throwable {
    List<GenericClass> values = ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar"));
    File outputFile = tmpFolder.newFile("output.avro");
    p.apply(Create.of(values)).apply(AvroIO.write(GenericClass.class).to(outputFile.getAbsolutePath()).withoutSharding().withCodec(CodecFactory.deflateCodec(9)));
    p.run();
    PCollection<GenericClass> input = p.apply(AvroIO.read(GenericClass.class).from(outputFile.getAbsolutePath()));
    PAssert.that(input).containsInAnyOrder(values);
    p.run();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(new FileInputStream(outputFile), new GenericDatumReader<GenericRecord>());
    assertEquals("deflate", dataFileStream.getMetaString("avro.codec"));
    dataFileStream.close();
}
Also used : GenericDatumReader(org.apache.avro.generic.GenericDatumReader) DataFileStream(org.apache.avro.file.DataFileStream) File(java.io.File) FileInputStream(java.io.FileInputStream) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
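getMetaString reads entries from the container-file header, where Avro records the compression codec under the reserved key "avro.codec". The same check can be run standalone against any Avro container file; the path below is hypothetical:

// The header key "avro.codec" holds the codec name recorded at write
// time, e.g. "null", "deflate", or "snappy".
DataFileStream<GenericRecord> stream = new DataFileStream<GenericRecord>(new FileInputStream(new File("/tmp/output.avro")), new GenericDatumReader<GenericRecord>());
System.out.println(stream.getMetaString("avro.codec"));
stream.close();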

Example 4 with DataFileStream

Use of org.apache.avro.file.DataFileStream in project flink by apache.

From class RollingSinkITCase, method testNonRollingAvroKeyValueWithCompressionWriter:

/**
	 * This tests {@link AvroKeyValueSinkWriter}
	 * with non-rolling output and with compression.
	 */
@Test
public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
    final int NUM_ELEMENTS = 20;
    final int PARALLELISM = 2;
    final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);
    DataStream<Tuple2<Integer, String>> source = env.addSource(new TestSourceFunction(NUM_ELEMENTS)).broadcast().filter(new OddEvenFilter());
    Map<String, String> properties = new HashMap<>();
    Schema keySchema = Schema.create(Type.INT);
    Schema valueSchema = Schema.create(Type.STRING);
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
    properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);
    RollingSink<Tuple2<Integer, String>> sink = new RollingSink<Tuple2<Integer, String>>(outPath).setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties)).setBucketer(new NonRollingBucketer()).setPartPrefix("part").setPendingPrefix("").setPendingSuffix("");
    source.addSink(sink);
    env.execute("RollingSink Avro KeyValue Writer Test");
    GenericData.setStringType(valueSchema, StringType.String);
    Schema elementSchema = AvroKeyValue.getSchema(keySchema, valueSchema);
    FSDataInputStream inStream = dfs.open(new Path(outPath + "/part-0-0"));
    SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<GenericRecord>(elementSchema);
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(inStream, elementReader);
    for (int i = 0; i < NUM_ELEMENTS; i += 2) {
        AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValue<Integer, String>(dataFileStream.next());
        int key = wrappedEntry.getKey().intValue();
        Assert.assertEquals(i, key);
        String value = wrappedEntry.getValue();
        Assert.assertEquals("message #" + i, value);
    }
    dataFileStream.close();
    inStream.close();
    inStream = dfs.open(new Path(outPath + "/part-1-0"));
    dataFileStream = new DataFileStream<GenericRecord>(inStream, elementReader);
    for (int i = 1; i < NUM_ELEMENTS; i += 2) {
        AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValue<Integer, String>(dataFileStream.next());
        int key = wrappedEntry.getKey().intValue();
        Assert.assertEquals(i, key);
        String value = wrappedEntry.getValue();
        Assert.assertEquals("message #" + i, value);
    }
    dataFileStream.close();
    inStream.close();
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) Schema(org.apache.avro.Schema) DataFileStream(org.apache.avro.file.DataFileStream) AvroKeyValue(org.apache.flink.streaming.connectors.fs.AvroKeyValueSinkWriter.AvroKeyValue) Tuple2(org.apache.flink.api.java.tuple.Tuple2) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) SpecificDatumReader(org.apache.avro.specific.SpecificDatumReader) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)
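Example 4 calls next() a fixed number of times because the test controls exactly how many records land in each part file. When the record count is unknown, DataFileStream also implements Iterable, so a for-each loop is the safer pattern; a short sketch, with the input stream and the per-record handling left hypothetical:

// DataFileStream implements Iterable<D>, so records can be consumed
// with for-each instead of paired hasNext()/next() calls.
DataFileStream<GenericRecord> stream = new DataFileStream<GenericRecord>(inStream, new GenericDatumReader<GenericRecord>());
for (GenericRecord record : stream) {
    System.out.println(record); // stand-in for real per-record handling
}
stream.close();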

Example 5 with DataFileStream

Use of org.apache.avro.file.DataFileStream in project nifi by apache.

From class ConvertAvroToJSON, method onTrigger:

@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final String containerOption = context.getProperty(CONTAINER_OPTIONS).getValue();
    final boolean useContainer = containerOption.equals(CONTAINER_ARRAY);
    // Wrap a single record (including the zero-record case) only when a container is being used
    final boolean wrapSingleRecord = context.getProperty(WRAP_SINGLE_RECORD).asBoolean() && useContainer;
    final String stringSchema = context.getProperty(SCHEMA).getValue();
    final boolean schemaLess = stringSchema != null;
    try {
        flowFile = session.write(flowFile, new StreamCallback() {

            @Override
            public void process(final InputStream rawIn, final OutputStream rawOut) throws IOException {
                final GenericData genericData = GenericData.get();
                if (schemaLess) {
                    if (schema == null) {
                        schema = new Schema.Parser().parse(stringSchema);
                    }
                    try (final InputStream in = new BufferedInputStream(rawIn);
                        final OutputStream out = new BufferedOutputStream(rawOut)) {
                        final DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
                        final BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(in, null);
                        final GenericRecord record = reader.read(null, decoder);
                        // both useContainer and wrapSingleRecord need to be true before we wrap the record in an array
                        if (useContainer && wrapSingleRecord) {
                            out.write('[');
                        }
                        final byte[] outputBytes = (record == null) ? EMPTY_JSON_OBJECT : genericData.toString(record).getBytes(StandardCharsets.UTF_8);
                        out.write(outputBytes);
                        if (useContainer && wrapSingleRecord) {
                            out.write(']');
                        }
                    }
                } else {
                    try (final InputStream in = new BufferedInputStream(rawIn);
                        final OutputStream out = new BufferedOutputStream(rawOut);
                        final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
                        int recordCount = 0;
                        GenericRecord currRecord = null;
                        if (reader.hasNext()) {
                            currRecord = reader.next();
                            recordCount++;
                        }
                        // open the array container if more records follow, or if configured to wrap a single record
                        if (reader.hasNext() && useContainer || wrapSingleRecord) {
                            out.write('[');
                        }
                        // Determine the initial output record; fall back to an empty JSON object when the Avro file contains no records
                        final byte[] outputBytes = (currRecord == null) ? EMPTY_JSON_OBJECT : genericData.toString(currRecord).getBytes(StandardCharsets.UTF_8);
                        out.write(outputBytes);
                        while (reader.hasNext()) {
                            if (useContainer) {
                                out.write(',');
                            } else {
                                out.write('\n');
                            }
                            currRecord = reader.next(currRecord);
                            out.write(genericData.toString(currRecord).getBytes(StandardCharsets.UTF_8));
                            recordCount++;
                        }
                        // close the array container if more than one record was written, or if configured to wrap a single record
                        if (recordCount > 1 && useContainer || wrapSingleRecord) {
                            out.write(']');
                        }
                    }
                }
            }
        });
    } catch (final ProcessException pe) {
        getLogger().error("Failed to convert {} from Avro to JSON due to {}; transferring to failure", new Object[] { flowFile, pe });
        session.transfer(flowFile, REL_FAILURE);
        return;
    }
    flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/json");
    session.transfer(flowFile, REL_SUCCESS);
}
Also used : FlowFile(org.apache.nifi.flowfile.FlowFile) BufferedInputStream(java.io.BufferedInputStream) InputStream(java.io.InputStream) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) BufferedOutputStream(java.io.BufferedOutputStream) OutputStream(java.io.OutputStream) DataFileStream(org.apache.avro.file.DataFileStream) GenericData(org.apache.avro.generic.GenericData) StreamCallback(org.apache.nifi.processor.io.StreamCallback) BinaryDecoder(org.apache.avro.io.BinaryDecoder) ProcessException(org.apache.nifi.processor.exception.ProcessException) BufferedInputStream(java.io.BufferedInputStream) GenericRecord(org.apache.avro.generic.GenericRecord) BufferedOutputStream(java.io.BufferedOutputStream)
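One detail worth noting in Example 5: reader.next(currRecord) hands the previous record back to Avro so it can be overwritten in place rather than reallocated for every row. A minimal sketch of that reuse pattern; the input stream "in" is hypothetical:

// next(reuse) refills the previous object in place when possible,
// cutting per-record allocation on large files.
GenericRecord reuse = null;
DataFileStream<GenericRecord> stream = new DataFileStream<GenericRecord>(in, new GenericDatumReader<GenericRecord>());
while (stream.hasNext()) {
    reuse = stream.next(reuse);
    System.out.println(reuse);
}
stream.close();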

Aggregations

DataFileStream (org.apache.avro.file.DataFileStream): 53 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 39 usages
GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 34 usages
Test (org.junit.Test): 25 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 20 usages
Schema (org.apache.avro.Schema): 20 usages
InputStream (java.io.InputStream): 19 usages
IOException (java.io.IOException): 12 usages
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 11 usages
File (java.io.File): 9 usages
FileInputStream (java.io.FileInputStream): 9 usages
ResultSet (java.sql.ResultSet): 9 usages
HashMap (java.util.HashMap): 9 usages
MockFlowFile (org.apache.nifi.util.MockFlowFile): 9 usages
Statement (java.sql.Statement): 8 usages
BufferedInputStream (java.io.BufferedInputStream): 7 usages
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 7 usages
Path (org.apache.hadoop.fs.Path): 7 usages
Connection (java.sql.Connection): 6 usages
DataFileWriter (org.apache.avro.file.DataFileWriter): 6 usages