Search in sources :

Example 76 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project mist by snuspl.

From the class DefaultGroupCheckpointStore, the method saveQuery:

@Override
public boolean saveQuery(final AvroDag avroDag) {
    final String queryId = avroDag.getQueryId();
    try {
        final File storedFile = getQueryStoreFile(queryId);
        // Remove any stale copy first so create() starts from a fresh file.
        if (storedFile.exists()) {
            if (storedFile.delete()) {
                LOG.log(Level.INFO, "Deleting a duplicate query file");
            } else {
                // delete() can fail silently; surface it instead of ignoring the result.
                LOG.log(Level.WARNING, "Could not delete the duplicate query file for query {0}", queryId);
            }
        }
        // try-with-resources guarantees the writer (and its file handle) is
        // closed even when create()/append() throws.
        try (DataFileWriter<AvroDag> dataFileWriter = new DataFileWriter<>(avroDagDatumWriter)) {
            dataFileWriter.create(avroDag.getSchema(), storedFile);
            dataFileWriter.append(avroDag);
        }
        LOG.log(Level.INFO, "Query {0} has been stored to disk.", queryId);
        return true;
    } catch (final Exception e) {
        // Log through the logger (with the full stack trace) rather than
        // printStackTrace(), so the failure appears in the application logs.
        LOG.log(Level.SEVERE, "Failed to store query " + queryId + " to disk.", e);
        return false;
    }
}
Also used : AvroDag(edu.snu.mist.formats.avro.AvroDag) DataFileWriter(org.apache.avro.file.DataFileWriter) File(java.io.File) IOException(java.io.IOException)

Example 77 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project components by Talend.

From the class AvroHdfsFileSink, the method mergeOutput:

@Override
protected boolean mergeOutput(FileSystem fs, String sourceFolder, String targetFile) {
    // Merges all Avro part-files under sourceFolder into a single Avro file at
    // targetFile, copying the schema, non-reserved metadata and codec from the
    // first input file and using appendAllFrom for efficient block transfer.
    //
    // The raw output stream is declared as a resource too: if the source folder
    // yields no files, writer.create() is never called and the stream would
    // otherwise leak. Resources close in reverse order — the writer first
    // (which flushes and closes the stream), then the stream itself (a second
    // close on a BufferedOutputStream is a no-op).
    try (OutputStream output = new BufferedOutputStream(fs.create(new Path(targetFile)));
            DataFileWriter<GenericRecord> writer =
                    new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>())) {
        FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
        Schema schema = null;
        for (FileStatus sourceStatus : sourceStatuses) {
            try (DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(
                    new BufferedInputStream(fs.open(sourceStatus.getPath())),
                    new GenericDatumReader<GenericRecord>())) {
                if (schema == null) {
                    // First file defines the schema, metadata and codec of the merged output.
                    schema = reader.getSchema();
                    for (String key : reader.getMetaKeys()) {
                        if (!DataFileWriter.isReservedMeta(key)) {
                            writer.setMeta(key, reader.getMeta(key));
                        }
                    }
                    String inputCodec = reader.getMetaString(DataFileConstants.CODEC);
                    if (inputCodec == null) {
                        inputCodec = DataFileConstants.NULL_CODEC;
                    }
                    writer.setCodec(CodecFactory.fromString(inputCodec));
                    writer.create(schema, output);
                }
                // false: do not recompress blocks whose codec already matches.
                writer.appendAllFrom(reader, false);
            }
        }
    } catch (Exception e) {
        // Pass the throwable itself so SLF4J logs the full stack trace,
        // not just the (possibly null) exception message.
        LOG.error("Error when merging files in {}.", sourceFolder, e);
        return false;
    }
    return true;
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) DataFileWriter(org.apache.avro.file.DataFileWriter) Schema(org.apache.avro.Schema) BufferedOutputStream(java.io.BufferedOutputStream) OutputStream(java.io.OutputStream) DataFileStream(org.apache.avro.file.DataFileStream) BufferedInputStream(java.io.BufferedInputStream) GenericRecord(org.apache.avro.generic.GenericRecord) BufferedOutputStream(java.io.BufferedOutputStream)

Example 78 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project components by Talend.

From the class RecordSetUtil, the method writeRandomAvroFile:

/**
 * Writes all records from the test set into a single Avro file on the file system.
 *
 * @param fs The filesystem.
 * @param path The path of the file on the filesystem.
 * @param td The test data to write.
 * @throws IOException If there was an exception writing to the filesystem.
 */
public static void writeRandomAvroFile(FileSystem fs, String path, RecordSet td) throws IOException {
    // Both the output stream and the writer are try-with-resources so the
    // writer is flushed/closed even if append() throws mid-way. Close order is
    // reverse of declaration: writer first (flushes its buffer into the
    // stream), then the stream.
    try (OutputStream out = fs.create(new Path(path));
            DataFileWriter<IndexedRecord> dataFileWriter =
                    new DataFileWriter<>(new GenericDatumWriter<IndexedRecord>(td.getSchema()))) {
        dataFileWriter.create(td.getSchema(), out);
        for (List<IndexedRecord> partition : td.getPartitions()) {
            for (IndexedRecord record : partition) {
                dataFileWriter.append(record);
            }
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) IndexedRecord(org.apache.avro.generic.IndexedRecord) OutputStream(java.io.OutputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter)

Example 79 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project kylo by Teradata.

From the class JdbcCommon, the method convertToAvroStream:

/**
 * Streams a JDBC result set to the given output stream as an Avro data file,
 * invoking the visitor (if non-null) once per row and once per column.
 *
 * @param rs        the result set to convert; consumed to exhaustion
 * @param outStream the stream that receives the Avro-encoded records
 * @param visitor   optional row/column callback; may be null
 * @param schema    the Avro schema whose fields match the result-set columns by position
 * @return the number of rows written
 * @throws SQLException if reading from the result set fails
 * @throws IOException  if writing the Avro output fails
 */
public static long convertToAvroStream(final ResultSet rs, final OutputStream outStream, final RowVisitor visitor, final Schema schema) throws SQLException, IOException {
    int dateConversionWarning = 0;
    // The same record instance is reused for every row; every field is
    // overwritten before each append, so no stale values can leak across rows.
    final GenericRecord rec = new GenericData.Record(schema);
    final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    // DateTimeFormatters are immutable and thread-safe; build them once
    // instead of once per date/time cell.
    final DateTimeFormatter dateTimeFormatter = ISODateTimeFormat.dateTime().withZoneUTC();
    final DateTimeFormatter timeFormatter = ISODateTimeFormat.time().withZoneUTC();
    try (final DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(schema, outStream);
        final ResultSetMetaData meta = rs.getMetaData();
        final int nrOfColumns = meta.getColumnCount();
        long nrOfRows = 0;
        while (rs.next()) {
            if (visitor != null) {
                visitor.visitRow(rs);
            }
            for (int i = 1; i <= nrOfColumns; i++) {
                final int javaSqlType = meta.getColumnType(i);
                final Object value = rs.getObject(i);
                if (value == null) {
                    rec.put(i - 1, null);
                } else if (javaSqlType == BINARY || javaSqlType == VARBINARY || javaSqlType == LONGVARBINARY || javaSqlType == ARRAY || javaSqlType == BLOB || javaSqlType == CLOB) {
                    // bytes requires little bit different handling
                    byte[] bytes = rs.getBytes(i);
                    ByteBuffer bb = ByteBuffer.wrap(bytes);
                    rec.put(i - 1, bb);
                } else if (value instanceof Byte) {
                    // tinyint(1) type is returned by JDBC driver as java.sql.Types.TINYINT
                    // But value is returned by JDBC as java.lang.Byte
                    // (at least H2 JDBC works this way)
                    // direct put to avro record results:
                    // org.apache.avro.AvroRuntimeException: Unknown datum type java.lang.Byte
                    rec.put(i - 1, ((Byte) value).intValue());
                } else if (value instanceof BigDecimal || value instanceof BigInteger) {
                    // Avro can't handle BigDecimal and BigInteger as numbers - it will throw an AvroRuntimeException such as: "Unknown datum type: java.math.BigDecimal: 38"
                    rec.put(i - 1, value.toString());
                } else if (value instanceof Number || value instanceof Boolean) {
                    rec.put(i - 1, value);
                } else if (value instanceof Date) {
                    // Dates/times become ISO-8601 strings in UTC since Avro has
                    // no native timestamp type in this schema.
                    rec.put(i - 1, dateTimeFormatter.print(new DateTime(((Date) value).getTime())));
                } else if (value instanceof Time) {
                    rec.put(i - 1, timeFormatter.print(new DateTime(((Time) value).getTime())));
                } else if (value instanceof Timestamp) {
                    rec.put(i - 1, dateTimeFormatter.print(new DateTime(((Timestamp) value).getTime())));
                } else {
                    // The different types that we support are numbers (int, long, double, float),
                    // as well as boolean values and Strings. Since Avro doesn't provide
                    // timestamp types, we want to convert those to Strings. So we will cast anything other
                    // than numbers or booleans to strings by using the toString() method.
                    rec.put(i - 1, value.toString());
                }
                // notify the visitor
                if (javaSqlType == Types.DATE || javaSqlType == Types.TIMESTAMP) {
                    Timestamp sqlDate = null;
                    try {
                        // Extract timestamp
                        sqlDate = extractSqlDate(rs, i);
                    } catch (Exception e) {
                        // Cap the warnings at 10 so a wholly unconvertible
                        // column does not flood the log.
                        if (dateConversionWarning++ < 10) {
                            log.warn("{} is not convertible to timestamp or date", meta.getColumnName(i));
                        }
                    }
                    if (visitor != null) {
                        visitor.visitColumn(meta.getColumnName(i), javaSqlType, sqlDate);
                    }
                } else if (javaSqlType == Types.TIME) {
                    Time time = rs.getTime(i);
                    if (visitor != null) {
                        visitor.visitColumn(meta.getColumnName(i), javaSqlType, time);
                    }
                } else {
                    if (visitor != null) {
                        visitor.visitColumn(meta.getColumnName(i), javaSqlType, (value != null) ? value.toString() : null);
                    }
                }
            }
            dataFileWriter.append(rec);
            nrOfRows += 1;
        }
        return nrOfRows;
    }
}
Also used : DataFileWriter(org.apache.avro.file.DataFileWriter) Time(java.sql.Time) DateTime(org.joda.time.DateTime) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ByteBuffer(java.nio.ByteBuffer) Timestamp(java.sql.Timestamp) BigDecimal(java.math.BigDecimal) Date(java.sql.Date) DateTime(org.joda.time.DateTime) SQLException(java.sql.SQLException) IOException(java.io.IOException) ResultSetMetaData(java.sql.ResultSetMetaData) BigInteger(java.math.BigInteger) GenericRecord(org.apache.avro.generic.GenericRecord) GenericRecord(org.apache.avro.generic.GenericRecord) DateTimeFormatter(org.joda.time.format.DateTimeFormatter)

Example 80 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project kylo by Teradata.

From the class JdbcCommon, the method convertToAvroStream (overload without a visitor):

/**
 * converts a JDBC result set to an Avro stream
 *
 * @param rs        The result set of the JDBC query
 * @param outStream The output stream to for the Avro formatted records
 * @return the number of rows converted to Avro format
 * @throws SQLException if errors occur while reading data from the database
 * @throws IOException  if unable to convert to Avro format
 */
public static long convertToAvroStream(final ResultSet rs, final OutputStream outStream) throws SQLException, IOException {
    // Derive the Avro schema from the result-set metadata; fields map to
    // columns by position (column i -> record field i-1).
    final Schema schema = createSchema(rs);
    // One record instance is reused for every row; each field is overwritten
    // before every append, so no stale values leak between rows.
    final GenericRecord rec = new GenericData.Record(schema);
    final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    // try-with-resources flushes and closes the writer even if a row fails.
    try (final DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(schema, outStream);
        final ResultSetMetaData meta = rs.getMetaData();
        final int nrOfColumns = meta.getColumnCount();
        long nrOfRows = 0;
        while (rs.next()) {
            for (int i = 1; i <= nrOfColumns; i++) {
                final int javaSqlType = meta.getColumnType(i);
                final Object value = rs.getObject(i);
                if (value == null) {
                    rec.put(i - 1, null);
                } else if (javaSqlType == BINARY || javaSqlType == VARBINARY || javaSqlType == LONGVARBINARY || javaSqlType == ARRAY || javaSqlType == BLOB || javaSqlType == CLOB) {
                    // bytes requires little bit different handling
                    byte[] bytes = rs.getBytes(i);
                    ByteBuffer bb = ByteBuffer.wrap(bytes);
                    rec.put(i - 1, bb);
                } else if (value instanceof Byte) {
                    // tinyint(1) type is returned by JDBC driver as java.sql.Types.TINYINT
                    // But value is returned by JDBC as java.lang.Byte
                    // (at least H2 JDBC works this way)
                    // direct put to avro record results:
                    // org.apache.avro.AvroRuntimeException: Unknown datum type java.lang.Byte
                    rec.put(i - 1, ((Byte) value).intValue());
                } else if (value instanceof BigDecimal || value instanceof BigInteger) {
                    // Avro can't handle BigDecimal and BigInteger as numbers - it will throw an AvroRuntimeException such as: "Unknown datum type: java.math.BigDecimal: 38"
                    rec.put(i - 1, value.toString());
                } else if (value instanceof Number || value instanceof Boolean) {
                    rec.put(i - 1, value);
                } else {
                    // The different types that we support are numbers (int, long, double, float),
                    // as well as boolean values and Strings. Since Avro doesn't provide
                    // timestamp types, we want to convert those to Strings. So we will cast anything other
                    // than numbers or booleans to strings by using the toString() method.
                    rec.put(i - 1, value.toString());
                }
            }
            dataFileWriter.append(rec);
            nrOfRows += 1;
        }
        return nrOfRows;
    }
}
Also used : Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ByteBuffer(java.nio.ByteBuffer) BigDecimal(java.math.BigDecimal) ResultSetMetaData(java.sql.ResultSetMetaData) BigInteger(java.math.BigInteger) GenericRecord(org.apache.avro.generic.GenericRecord) GenericRecord(org.apache.avro.generic.GenericRecord)

Aggregations

DataFileWriter (org.apache.avro.file.DataFileWriter)102 GenericRecord (org.apache.avro.generic.GenericRecord)58 Schema (org.apache.avro.Schema)50 GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter)47 File (java.io.File)38 ByteArrayOutputStream (java.io.ByteArrayOutputStream)22 IOException (java.io.IOException)22 GenericData (org.apache.avro.generic.GenericData)17 FileOutputStream (java.io.FileOutputStream)15 Test (org.junit.Test)14 HashMap (java.util.HashMap)11 InputStream (java.io.InputStream)10 SpecificDatumWriter (org.apache.avro.specific.SpecificDatumWriter)10 ArrayList (java.util.ArrayList)9 Path (org.apache.hadoop.fs.Path)9 ByteArrayInputStream (java.io.ByteArrayInputStream)8 OutputStream (java.io.OutputStream)8 ByteBuffer (java.nio.ByteBuffer)7 GenericDatumReader (org.apache.avro.generic.GenericDatumReader)7 MockFlowFile (org.apache.nifi.util.MockFlowFile)7