Example 51 with DataFileStream

Use of org.apache.avro.file.DataFileStream in the components project by Talend.

From the mergeOutput method of the AvroHdfsFileSink class.

@Override
protected boolean mergeOutput(FileSystem fs, String sourceFolder, String targetFile) {
    try (DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>())) {
        FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
        Schema schema = null;
        String inputCodec = null;
        // Once writer.create() is called below, the writer owns this stream
        // and closes it when the try-with-resources block closes the writer.
        OutputStream output = new BufferedOutputStream(fs.create(new Path(targetFile)));
        for (FileStatus sourceStatus : sourceStatuses) {
            try (DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(new BufferedInputStream(fs.open(sourceStatus.getPath())), new GenericDatumReader<GenericRecord>())) {
                if (schema == null) {
                    // Initialize the writer from the first file's header:
                    // copy its schema, non-reserved metadata, and codec.
                    schema = reader.getSchema();
                    for (String key : reader.getMetaKeys()) {
                        if (!DataFileWriter.isReservedMeta(key)) {
                            writer.setMeta(key, reader.getMeta(key));
                        }
                    }
                    inputCodec = reader.getMetaString(DataFileConstants.CODEC);
                    if (inputCodec == null) {
                        inputCodec = DataFileConstants.NULL_CODEC;
                    }
                    writer.setCodec(CodecFactory.fromString(inputCodec));
                    writer.create(schema, output);
                }
                // recompress=false: append the encoded blocks verbatim, which
                // is valid because the writer's codec matches the input files.
                writer.appendAllFrom(reader, false);
            }
        }
    } catch (Exception e) {
        LOG.error("Error when merging files in {}.\n{}", sourceFolder, e.getMessage());
        return false;
    }
    return true;
}
Also used: Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) DataFileWriter(org.apache.avro.file.DataFileWriter) Schema(org.apache.avro.Schema) BufferedOutputStream(java.io.BufferedOutputStream) OutputStream(java.io.OutputStream) DataFileStream(org.apache.avro.file.DataFileStream) BufferedInputStream(java.io.BufferedInputStream) GenericRecord(org.apache.avro.generic.GenericRecord)
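
The heart of this merge is DataFileWriter.appendAllFrom(reader, recompress): with recompress set to false, Avro copies the compressed blocks from each reader verbatim instead of decoding and re-encoding every record, which is why the writer's codec must first be set to match the input files. Below is a minimal local-filesystem sketch of the same pattern, assuming all inputs share one schema and use the null codec (the LocalAvroMerge class and its names are illustrative, not part of the Talend code):

import java.io.File;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class LocalAvroMerge {

    // Merge local Avro files that share a schema into a single target file.
    // Assumes every input uses the null codec; appendAllFrom(reader, false)
    // requires the writer's codec to match the inputs.
    public static void merge(File[] inputs, File target) throws IOException {
        try (DataFileWriter<GenericRecord> writer =
                new DataFileWriter<>(new GenericDatumWriter<GenericRecord>())) {
            Schema schema = null;
            for (File input : inputs) {
                try (DataFileReader<GenericRecord> reader =
                        new DataFileReader<>(input, new GenericDatumReader<GenericRecord>())) {
                    if (schema == null) {
                        // Initialize the writer from the first file's header.
                        schema = reader.getSchema();
                        writer.setCodec(CodecFactory.nullCodec());
                        writer.create(schema, target);
                    }
                    // false = copy encoded blocks as-is, no decode/re-encode.
                    writer.appendAllFrom(reader, false);
                }
            }
        }
    }
}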

Example 52 with DataFileStream

Use of org.apache.avro.file.DataFileStream in the hive project by Apache.

From the retrieveSchemaFromBytes method of the AvroLazyObjectInspector class.

/**
 * Retrieve the schema from the given Avro data file bytes.
 *
 * @param data the contents of an Avro data file as a byte array
 * @return the retrieved {@link Schema schema}
 */
private Schema retrieveSchemaFromBytes(byte[] data) {
    ByteArrayInputStream bais = new ByteArrayInputStream(data);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    Schema schema = null;
    try {
        // dfs is AutoCloseable, but it only wraps a ByteArrayInputStream,
        // so leaving it unclosed leaks nothing; hence the suppression.
        @SuppressWarnings("resource") DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, reader);
        schema = dfs.getSchema();
    } catch (IOException ioe) {
        throw new AvroObjectInspectorException("An error occurred retrieving schema from bytes", ioe);
    }
    return schema;
}
Also used: ByteArrayInputStream(java.io.ByteArrayInputStream) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) IOException(java.io.IOException) GenericRecord(org.apache.avro.generic.GenericRecord) DataFileStream(org.apache.avro.file.DataFileStream)
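
This works because the Avro container format stores the writer's schema in the file header, so getSchema() is available as soon as the DataFileStream is constructed and no records have to be consumed. A self-contained round-trip sketch of the same idea (the Point record is illustrative, not taken from Hive):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class SchemaRoundTrip {

    public static void main(String[] args) throws IOException {
        Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"Point\",\"fields\":["
                        + "{\"name\":\"x\",\"type\":\"int\"}]}");

        // Write one record into an in-memory Avro container.
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (DataFileWriter<GenericRecord> writer =
                new DataFileWriter<>(new GenericDatumWriter<GenericRecord>())) {
            writer.create(schema, baos);
            GenericRecord point = new GenericData.Record(schema);
            point.put("x", 42);
            writer.append(point);
        }

        // Recover the schema from the bytes alone, exactly as
        // retrieveSchemaFromBytes above does; no record is read.
        try (DataFileStream<GenericRecord> dfs = new DataFileStream<>(
                new ByteArrayInputStream(baos.toByteArray()),
                new GenericDatumReader<GenericRecord>())) {
            System.out.println(dfs.getSchema().equals(schema)); // true
        }
    }
}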

Example 53 with DataFileStream

Use of org.apache.avro.file.DataFileStream in the hive project by Apache.

From the resolvedSchemasShouldReturnReaderSchema test method of the TestThatEvolvedSchemasActAsWeWant class.

@Test
public void resolvedSchemasShouldReturnReaderSchema() throws IOException {
    // Need to verify that when reading a datum with an updated reader schema
    // that the datum then returns the reader schema as its own, since we
    // depend on this behavior in order to avoid re-encoding the datum
    // in the serde.
    String v0 = "{\n" + "    \"namespace\": \"org.apache.hadoop.hive\",\n" + "    \"name\": \"SomeStuff\",\n" + "    \"type\": \"record\",\n" + "    \"fields\": [\n" + "        {\n" + "            \"name\":\"v0\",\n" + "            \"type\":\"string\"\n" + "        }\n" + "    ]\n" + "}";
    String v1 = "{\n" + "    \"namespace\": \"org.apache.hadoop.hive\",\n" + "    \"name\": \"SomeStuff\",\n" + "    \"type\": \"record\",\n" + "    \"fields\": [\n" + "        {\n" + "            \"name\":\"v0\",\n" + "            \"type\":\"string\"\n" + "        },\n" + "        {\n" + "            \"name\":\"v1\",\n" + "            \"type\":\"string\",\n" + "            \"default\":\"v1_default\"" + "        }\n" + "    ]\n" + "}";
    Schema[] schemas = { AvroSerdeUtils.getSchemaFor(v0), AvroSerdeUtils.getSchemaFor(v1) };
    // Encode a schema with v0, write out.
    GenericRecord record = new GenericData.Record(schemas[0]);
    record.put("v0", "v0 value");
    assertTrue(GenericData.get().validate(schemas[0], record));
    // Write datum out to a stream
    GenericDatumWriter<GenericRecord> gdw = new GenericDatumWriter<GenericRecord>(schemas[0]);
    DataFileWriter<GenericRecord> dfw = new DataFileWriter<GenericRecord>(gdw);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    dfw.create(schemas[0], baos);
    dfw.append(record);
    dfw.close();
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>();
    gdr.setExpected(schemas[1]);
    DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, gdr);
    assertTrue(dfs.hasNext());
    GenericRecord next = dfs.next();
    assertEquals("v0 value", next.get("v0").toString());
    assertEquals("v1_default", next.get("v1").toString());
    // Now the most important check - when we query this record for its schema,
    // we should get back the latest, reader schema:
    assertEquals(schemas[1], next.getSchema());
}
Also used: GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ByteArrayOutputStream(java.io.ByteArrayOutputStream) DataFileStream(org.apache.avro.file.DataFileStream) ByteArrayInputStream(java.io.ByteArrayInputStream) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)
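
The resolution exercised by this test does not depend on the container file: a GenericDatumReader constructed with both a writer and a reader schema fills in defaults for fields the writer never wrote, and the decoded record reports the reader schema as its own. A minimal sketch against raw binary data rather than a data file (the schema strings are illustrative):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.EncoderFactory;

public class ResolveDatum {

    public static void main(String[] args) throws IOException {
        Schema writerSchema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"R\",\"fields\":["
                        + "{\"name\":\"v0\",\"type\":\"string\"}]}");
        Schema readerSchema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"R\",\"fields\":["
                        + "{\"name\":\"v0\",\"type\":\"string\"},"
                        + "{\"name\":\"v1\",\"type\":\"string\",\"default\":\"v1_default\"}]}");

        // Encode a datum with the old writer schema.
        GenericRecord r = new GenericData.Record(writerSchema);
        r.put("v0", "v0 value");
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        BinaryEncoder enc = EncoderFactory.get().binaryEncoder(out, null);
        new GenericDatumWriter<GenericRecord>(writerSchema).write(r, enc);
        enc.flush();

        // Decode with writer + reader schemas: the missing field picks up
        // its default, and the record carries the reader schema.
        Decoder dec = DecoderFactory.get().binaryDecoder(out.toByteArray(), null);
        GenericRecord resolved = new GenericDatumReader<GenericRecord>(
                writerSchema, readerSchema).read(null, dec);
        System.out.println(resolved.get("v1")); // v1_default
        System.out.println(resolved.getSchema().equals(readerSchema)); // true
    }
}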

Example 54 with DataFileStream

Use of org.apache.avro.file.DataFileStream in the haivvreo project by jghoman.

From the resolvedSchemasShouldReturnReaderSchema test method of the TestThatEvolvedSchemasActAsWeWant class.

@Test
public void resolvedSchemasShouldReturnReaderSchema() throws IOException {
    // Need to verify that when reading a datum with an updated reader schema
    // that the datum then returns the reader schema as its own, since we
    // depend on this behavior in order to avoid re-encoding the datum
    // in the serde.
    String v0 = "{\n" + "    \"namespace\": \"com.linkedin.haivvreo\",\n" + "    \"name\": \"SomeStuff\",\n" + "    \"type\": \"record\",\n" + "    \"fields\": [\n" + "        {\n" + "            \"name\":\"v0\",\n" + "            \"type\":\"string\"\n" + "        }\n" + "    ]\n" + "}";
    String v1 = "{\n" + "    \"namespace\": \"com.linkedin.haivvreo\",\n" + "    \"name\": \"SomeStuff\",\n" + "    \"type\": \"record\",\n" + "    \"fields\": [\n" + "        {\n" + "            \"name\":\"v0\",\n" + "            \"type\":\"string\"\n" + "        },\n" + "        {\n" + "            \"name\":\"v1\",\n" + "            \"type\":\"string\",\n" + "            \"default\":\"v1_default\"" + "        }\n" + "    ]\n" + "}";
    Schema[] schemas = { Schema.parse(v0), Schema.parse(v1) };
    // Encode a schema with v0, write out.
    GenericRecord record = new GenericData.Record(schemas[0]);
    record.put("v0", "v0 value");
    assertTrue(GenericData.get().validate(schemas[0], record));
    // Write datum out to a stream
    GenericDatumWriter<GenericRecord> gdw = new GenericDatumWriter<GenericRecord>(schemas[0]);
    DataFileWriter<GenericRecord> dfw = new DataFileWriter<GenericRecord>(gdw);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    dfw.create(schemas[0], baos);
    dfw.append(record);
    dfw.close();
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>();
    gdr.setExpected(schemas[1]);
    DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, gdr);
    assertTrue(dfs.hasNext());
    GenericRecord next = dfs.next();
    assertEquals("v0 value", next.get("v0").toString());
    assertEquals("v1_default", next.get("v1").toString());
    // Now the most important check - when we query this record for its schema,
    // we should get back the latest, reader schema:
    assertEquals(schemas[1], next.getSchema());
}
Also used: GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ByteArrayOutputStream(java.io.ByteArrayOutputStream) DataFileStream(org.apache.avro.file.DataFileStream) ByteArrayInputStream(java.io.ByteArrayInputStream) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)

Example 55 with DataFileStream

Use of org.apache.avro.file.DataFileStream in the voldemort project by voldemort.

From the getSchemaFromPath method of the AvroUtils class.

/**
 * Pull the schema off of the given file (if it is a file). If it is a
 * directory or a wildcard path, pull the schemas off of all matched
 * files and check that they are all the same schema. If so, return
 * that schema; otherwise throw an exception.
 *
 * @param fs the filesystem to use
 * @param path the path from which to get the schema
 * @return the schema of this file or all its subfiles
 * @throws RuntimeException if a file cannot be read, is not valid Avro,
 *         or the schemas are heterogeneous
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
private static Schema getSchemaFromPath(FileSystem fs, Path path) {
    try {
        if (fs.isFile(path)) {
            BufferedInputStream inStream = null;
            try {
                inStream = new BufferedInputStream(fs.open(path));
            } catch (IOException e1) {
                throw new RuntimeException("Unable to open " + path, e1);
            }
            GenericDatumReader datum = new GenericDatumReader();
            DataFileStream reader = null;
            try {
                reader = new DataFileStream(inStream, datum);
            } catch (IOException e) {
                throw new RuntimeException("Invalid avro format, path " + path, e);
            }
            try {
                // The schema is read from the file header when the stream is
                // constructed, so it is available before any record is read.
                return reader.getSchema();
            } finally {
                // Close the stream instead of leaking the HDFS connection;
                // an IOException here is wrapped by the outer catch block.
                reader.close();
            }
        } else {
            FileStatus[] statuses = null;
            if (fs.isDirectory(path)) {
                // this is a directory, get schemas from all subfiles
                statuses = fs.listStatus(path);
                if (statuses == null || statuses.length == 0)
                    throw new IllegalArgumentException("No files in directory " + path);
            } else {
                // this is wildcard path, get schemas from all matched files
                statuses = fs.globStatus(path);
                if (statuses == null || statuses.length == 0)
                    throw new IllegalArgumentException("No matches for path pattern " + path);
            }
            List<Schema> schemas = new ArrayList<Schema>();
            for (FileStatus status : statuses) {
                if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    schemas.add(getSchemaFromPath(fs, status.getPath()));
                }
            }
            // now check that all the schemas are the same
            if (schemas.size() > 0) {
                Schema schema = schemas.get(0);
                for (int i = 1; i < schemas.size(); i++) {
                    if (!schema.equals(schemas.get(i))) {
                        throw new IllegalArgumentException("The directory " + path + " contains heterogeneous schemas: found both '" + schema + "' and '" + schemas.get(i) + "'.");
                    }
                }
                return schema;
            } else {
                throw new IllegalArgumentException("No valid metadata file found for path " + path);
            }
        }
    } catch (Exception e) {
        throw new RuntimeException("Error getting schema for path " + path, e);
    }
}
Also used: FileStatus(org.apache.hadoop.fs.FileStatus) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) IOException(java.io.IOException) DataFileStream(org.apache.avro.file.DataFileStream) BufferedInputStream(java.io.BufferedInputStream)
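
A simpler local-filesystem analogue of the same check, sketched under the assumption that the files are reachable through java.io.File (the SchemaCheck class is illustrative); unlike the HDFS version above, it closes every stream before moving on:

import java.io.File;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class SchemaCheck {

    // Return the schema shared by all given Avro files, or throw if any
    // two of them disagree.
    public static Schema sharedSchema(File... files) throws IOException {
        Schema first = null;
        for (File f : files) {
            try (DataFileReader<GenericRecord> reader =
                    new DataFileReader<>(f, new GenericDatumReader<GenericRecord>())) {
                Schema s = reader.getSchema();
                if (first == null) {
                    first = s;
                } else if (!first.equals(s)) {
                    throw new IllegalArgumentException(
                            "Heterogeneous schemas: " + f + " has " + s);
                }
            }
        }
        if (first == null) {
            throw new IllegalArgumentException("No files given");
        }
        return first;
    }
}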

Aggregations

DataFileStream (org.apache.avro.file.DataFileStream): 59 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 39 usages
GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 34 usages
Test (org.junit.Test): 26 usages
Schema (org.apache.avro.Schema): 21 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 20 usages
InputStream (java.io.InputStream): 19 usages
IOException (java.io.IOException): 13 usages
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 11 usages
File (java.io.File): 9 usages
FileInputStream (java.io.FileInputStream): 9 usages
ResultSet (java.sql.ResultSet): 9 usages
HashMap (java.util.HashMap): 9 usages
MockFlowFile (org.apache.nifi.util.MockFlowFile): 9 usages
Statement (java.sql.Statement): 8 usages
BufferedInputStream (java.io.BufferedInputStream): 7 usages
HashSet (java.util.HashSet): 7 usages
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 7 usages
DataFileWriter (org.apache.avro.file.DataFileWriter): 7 usages
Path (org.apache.hadoop.fs.Path): 7 usages