Example 11 with DataFileStream

use of org.apache.avro.file.DataFileStream in project hive by apache.

the class AvroLazyObjectInspector method retrieveSchemaFromBytes.

/**
   * Retrieve the Avro {@link Schema schema} embedded in the given bytes,
   * which must be in the Avro container-file format
   *
   * @param data container-file bytes whose header carries the writer schema
   * @return the retrieved {@link Schema schema}
   */
private Schema retrieveSchemaFromBytes(byte[] data) {
    ByteArrayInputStream bais = new ByteArrayInputStream(data);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    Schema schema = null;
    try {
        // dfs is never closed here; the resource warning is suppressed because
        // it only wraps an in-memory ByteArrayInputStream, so nothing leaks
        @SuppressWarnings("resource") DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, reader);
        schema = dfs.getSchema();
    } catch (IOException ioe) {
        throw new AvroObjectInspectorException("An error occurred retrieving schema from bytes", ioe);
    }
    return schema;
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) IOException(java.io.IOException) GenericRecord(org.apache.avro.generic.GenericRecord) DataFileStream(org.apache.avro.file.DataFileStream)
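
For context, the bytes this method expects are an Avro container file, whose header embeds the writer schema. A minimal sketch of producing such bytes (the class and method names here are illustrative, not part of the Hive source):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class SchemaBytesSketch {

    // Serialize one record as Avro container-file bytes; the container header
    // carries the writer schema that retrieveSchemaFromBytes later reads back.
    static byte[] toContainerBytes(Schema schema, GenericRecord record) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(schema));
        writer.create(schema, baos);
        writer.append(record);
        writer.close();
        return baos.toByteArray();
    }
}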

Example 12 with DataFileStream

use of org.apache.avro.file.DataFileStream in project haivvreo by jghoman.

the class TestThatEvolvedSchemasActAsWeWant method resolvedSchemasShouldReturnReaderSchema.

@Test
public void resolvedSchemasShouldReturnReaderSchema() throws IOException {
    // Need to verify that when reading a datum with an updated reader schema
    // that the datum then returns the reader schema as its own, since we
    // depend on this behavior in order to avoid re-encoding the datum
    // in the serde.
    String v0 = "{\n" + "    \"namespace\": \"com.linkedin.haivvreo\",\n" + "    \"name\": \"SomeStuff\",\n" + "    \"type\": \"record\",\n" + "    \"fields\": [\n" + "        {\n" + "            \"name\":\"v0\",\n" + "            \"type\":\"string\"\n" + "        }\n" + "    ]\n" + "}";
    String v1 = "{\n" + "    \"namespace\": \"com.linkedin.haivvreo\",\n" + "    \"name\": \"SomeStuff\",\n" + "    \"type\": \"record\",\n" + "    \"fields\": [\n" + "        {\n" + "            \"name\":\"v0\",\n" + "            \"type\":\"string\"\n" + "        },\n" + "        {\n" + "            \"name\":\"v1\",\n" + "            \"type\":\"string\",\n" + "            \"default\":\"v1_default\"" + "        }\n" + "    ]\n" + "}";
    Schema[] schemas = { Schema.parse(v0), Schema.parse(v1) };
    // Encode a record with the v0 schema and write it out.
    GenericRecord record = new GenericData.Record(schemas[0]);
    record.put("v0", "v0 value");
    assertTrue(GenericData.get().validate(schemas[0], record));
    // Write datum out to a stream
    GenericDatumWriter<GenericRecord> gdw = new GenericDatumWriter<GenericRecord>(schemas[0]);
    DataFileWriter<GenericRecord> dfw = new DataFileWriter<GenericRecord>(gdw);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    dfw.create(schemas[0], baos);
    dfw.append(record);
    dfw.close();
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>();
    gdr.setExpected(schemas[1]);
    DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, gdr);
    assertTrue(dfs.hasNext());
    GenericRecord next = dfs.next();
    assertEquals("v0 value", next.get("v0").toString());
    assertEquals("v1_default", next.get("v1").toString());
    // Now the most important check - when we query this record for its schema,
    // we should get back the latest, reader schema:
    assertEquals(schemas[1], next.getSchema());
}
Also used : GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ByteArrayOutputStream(java.io.ByteArrayOutputStream) DataFileStream(org.apache.avro.file.DataFileStream) ByteArrayInputStream(java.io.ByteArrayInputStream) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)
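
Note that Schema.parse(String) is deprecated in newer Avro releases. Assuming Avro 1.5 or later, the same setup can be written with the non-deprecated parser (a sketch, not part of the haivvreo source; v0 and v1 are the JSON strings above):

// Non-deprecated parsing via Schema.Parser (Avro 1.5+).
Schema writerSchema = new Schema.Parser().parse(v0);
Schema readerSchema = new Schema.Parser().parse(v1);
// The writer schema is read from the container header at decode time, so only
// the expected (reader) schema needs to be supplied for resolution.
GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>();
gdr.setExpected(readerSchema);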

Example 13 with DataFileStream

use of org.apache.avro.file.DataFileStream in project pinot by linkedin.

the class SegmentTestUtils method getColumnNamesFromAvro.

public static List<String> getColumnNamesFromAvro(File avro) throws IOException {
    List<String> ret = new ArrayList<String>();
    DataFileStream<GenericRecord> dataStream = new DataFileStream<GenericRecord>(new FileInputStream(avro), new GenericDatumReader<GenericRecord>());
    try {
        for (final Field field : dataStream.getSchema().getFields()) {
            ret.add(field.name());
        }
    } finally {
        // Close the stream (and the underlying FileInputStream) to avoid leaking a file handle.
        dataStream.close();
    }
    return ret;
}
Also used : Field(org.apache.avro.Schema.Field) ArrayList(java.util.ArrayList) DataFileStream(org.apache.avro.file.DataFileStream) GenericRecord(org.apache.avro.generic.GenericRecord) FileInputStream(java.io.FileInputStream)
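
Hypothetical usage of the helper above (the file path is illustrative; java.io.File and java.util.List imports assumed):

// List the column names of an Avro file on local disk.
List<String> columns = SegmentTestUtils.getColumnNamesFromAvro(new File("/tmp/events.avro"));
for (String column : columns) {
    System.out.println(column);
}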

Example 14 with DataFileStream

use of org.apache.avro.file.DataFileStream in project pinot by linkedin.

the class SegmentTestUtils method extractSchemaFromAvro.

public static Schema extractSchemaFromAvro(File avroFile, Map<String, FieldType> fieldTypeMap, TimeUnit granularity) throws IOException {
    DataFileStream<GenericRecord> dataStream = new DataFileStream<>(new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>());
    Schema schema = new Schema();
    try {
        for (final Field field : dataStream.getSchema().getFields()) {
            final String columnName = field.name();
            FieldType fieldType = fieldTypeMap.get(columnName);
            Preconditions.checkNotNull(fieldType, "No field type configured for column: " + columnName);
            switch (fieldType) {
                case TIME:
                    final TimeGranularitySpec gSpec = new TimeGranularitySpec(getColumnType(field), granularity, columnName);
                    final TimeFieldSpec fSpec = new TimeFieldSpec(gSpec);
                    schema.addField(fSpec);
                    continue;
                case DIMENSION:
                    final FieldSpec dimensionFieldSpec = new DimensionFieldSpec(columnName, getColumnType(field), isSingleValueField(field));
                    schema.addField(dimensionFieldSpec);
                    continue;
                case METRIC:
                    final FieldSpec metricFieldSpec = new MetricFieldSpec(columnName, getColumnType(field));
                    schema.addField(metricFieldSpec);
                    continue;
                default:
                    throw new UnsupportedOperationException("Unsupported field type: " + fieldType);
            }
        }
    } finally {
        // Close the stream even if an unsupported field type aborts the loop.
        dataStream.close();
    }
    return schema;
}
Also used : TimeGranularitySpec(com.linkedin.pinot.common.data.TimeGranularitySpec) Schema(com.linkedin.pinot.common.data.Schema) TimeFieldSpec(com.linkedin.pinot.common.data.TimeFieldSpec) DataFileStream(org.apache.avro.file.DataFileStream) MetricFieldSpec(com.linkedin.pinot.common.data.MetricFieldSpec) FileInputStream(java.io.FileInputStream) FieldSpec(com.linkedin.pinot.common.data.FieldSpec) DimensionFieldSpec(com.linkedin.pinot.common.data.DimensionFieldSpec) FieldType(com.linkedin.pinot.common.data.FieldSpec.FieldType) Field(org.apache.avro.Schema.Field) GenericRecord(org.apache.avro.generic.GenericRecord)
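
A hedged usage sketch: the caller classifies each Avro column as TIME, DIMENSION, or METRIC before deriving the Pinot schema. The column names and file path below are illustrative, and java.util and java.util.concurrent.TimeUnit imports are assumed:

Map<String, FieldType> fieldTypeMap = new HashMap<String, FieldType>();
fieldTypeMap.put("daysSinceEpoch", FieldType.TIME);
fieldTypeMap.put("country", FieldType.DIMENSION);
fieldTypeMap.put("clicks", FieldType.METRIC);
// Every column in the Avro schema must be mapped, or checkNotNull fails.
Schema pinotSchema = SegmentTestUtils.extractSchemaFromAvro(new File("/tmp/events.avro"), fieldTypeMap, TimeUnit.DAYS);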

Example 15 with DataFileStream

use of org.apache.avro.file.DataFileStream in project voldemort by voldemort.

the class AvroUtils method getSchemaFromPath.

/**
     * Pull the schema off of the given path. If it is a file, read the schema
     * directly. If it is a directory or a wildcard pattern, pull the schemas
     * off of all matched subfiles and check that they are all the same schema;
     * if so, return that schema, otherwise throw an exception. All I/O errors
     * are wrapped in RuntimeException.
     * 
     * @param fs The filesystem to use
     * @param path The path from which to get the schema
     * @return The schema of this file or all its subfiles
     */
@SuppressWarnings({ "unchecked", "rawtypes" })
private static Schema getSchemaFromPath(FileSystem fs, Path path) {
    try {
        if (fs.isFile(path)) {
            BufferedInputStream inStream = null;
            try {
                inStream = new BufferedInputStream(fs.open(path));
            } catch (IOException e1) {
                throw new RuntimeException("Unable to open " + path, e1);
            }
            GenericDatumReader datum = new GenericDatumReader();
            DataFileStream reader = null;
            try {
                reader = new DataFileStream(inStream, datum);
                // Only the header is needed; close the stream after reading it.
                return reader.getSchema();
            } catch (IOException e) {
                throw new RuntimeException("Invalid avro format, path " + path, e);
            } finally {
                if (reader != null) {
                    reader.close();
                } else {
                    inStream.close();
                }
            }
        } else {
            FileStatus[] statuses = null;
            if (fs.isDirectory(path)) {
                // this is a directory, get schemas from all subfiles
                statuses = fs.listStatus(path);
                if (statuses == null || statuses.length == 0)
                    throw new IllegalArgumentException("No files in directory " + path);
            } else {
                // this is wildcard path, get schemas from all matched files
                statuses = fs.globStatus(path);
                if (statuses == null || statuses.length == 0)
                    throw new IllegalArgumentException("No matches for path pattern " + path);
            }
            List<Schema> schemas = new ArrayList<Schema>();
            for (FileStatus status : statuses) {
                if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    schemas.add(getSchemaFromPath(fs, status.getPath()));
                }
            }
            // now check that all the schemas are the same
            if (schemas.size() > 0) {
                Schema schema = schemas.get(0);
                for (int i = 1; i < schemas.size(); i++) {
                    if (!schema.equals(schemas.get(i))) {
                        throw new IllegalArgumentException("The directory " + path + " contains heterogeneous schemas: found both '" + schema + "' and '" + schemas.get(i) + "'.");
                    }
                }
                return schema;
            } else {
                throw new IllegalArgumentException("No valid metadata file found for path " + path);
            }
        }
    } catch (Exception e) {
        throw new RuntimeException("Error getting schema for path " + path, e);
    }
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) IOException(java.io.IOException) DataFileStream(org.apache.avro.file.DataFileStream) BufferedInputStream(java.io.BufferedInputStream)
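
A usage sketch, assuming the method were exposed by AvroUtils (it is private in the Voldemort source); the paths are illustrative, and org.apache.hadoop.conf.Configuration and org.apache.hadoop.fs imports are assumed:

// Resolve a single schema for a directory of Avro part files; throws if the
// parts disagree on their schemas.
FileSystem fs = FileSystem.get(new Configuration());
Schema schema = getSchemaFromPath(fs, new Path("/data/events/2017-01-01"));
// A wildcard pattern works the same way via globStatus:
Schema sameSchema = getSchemaFromPath(fs, new Path("/data/events/*/part-*.avro"));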

Aggregations

DataFileStream (org.apache.avro.file.DataFileStream): 18 uses
GenericRecord (org.apache.avro.generic.GenericRecord): 13 uses
Test (org.junit.Test): 10 uses
Schema (org.apache.avro.Schema): 9 uses
GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 9 uses
FileInputStream (java.io.FileInputStream): 8 uses
File (java.io.File): 5 uses
ByteArrayInputStream (java.io.ByteArrayInputStream): 4 uses
HashMap (java.util.HashMap): 4 uses
SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader): 4 uses
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 4 uses
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 4 uses
Path (org.apache.hadoop.fs.Path): 4 uses
IOException (java.io.IOException): 3 uses
Field (org.apache.avro.Schema.Field): 3 uses
DataFileWriter (org.apache.avro.file.DataFileWriter): 3 uses
Category (org.junit.experimental.categories.Category): 3 uses
DimensionFieldSpec (com.linkedin.pinot.common.data.DimensionFieldSpec): 2 uses
FieldSpec (com.linkedin.pinot.common.data.FieldSpec): 2 uses
MetricFieldSpec (com.linkedin.pinot.common.data.MetricFieldSpec): 2 uses