Use of org.apache.avro.file.DataFileWriter in project mist by snuspl: class DefaultGroupCheckpointStore, method saveQuery.
@Override
public boolean saveQuery(final AvroDag avroDag) {
    final String queryId = avroDag.getQueryId();
    try {
        final File storedFile = getQueryStoreFile(queryId);
        if (storedFile.exists()) {
            storedFile.delete();
            LOG.log(Level.INFO, "Deleting a duplicate query file");
        }
        // try-with-resources ensures the writer (and the file) is closed even if append fails
        try (DataFileWriter<AvroDag> dataFileWriter = new DataFileWriter<>(avroDagDatumWriter)) {
            dataFileWriter.create(avroDag.getSchema(), storedFile);
            dataFileWriter.append(avroDag);
        }
        LOG.log(Level.INFO, "Query {0} has been stored to disk.", queryId);
        return true;
    } catch (final Exception e) {
        e.printStackTrace();
        return false;
    }
}
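A minimal sketch of reading such a stored query back, e.g. to verify the file on disk. This assumes AvroDag is the Avro-generated record class used by saveQuery; the helper class below is illustrative, not part of DefaultGroupCheckpointStore:

import java.io.File;
import java.io.IOException;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.specific.SpecificDatumReader;

public final class QueryFileCheck {
    public static AvroDag loadQuery(final File storedFile) throws IOException {
        final SpecificDatumReader<AvroDag> datumReader = new SpecificDatumReader<>(AvroDag.class);
        // DataFileReader iterates over the records that DataFileWriter appended
        try (DataFileReader<AvroDag> reader = new DataFileReader<>(storedFile, datumReader)) {
            return reader.next(); // saveQuery appends exactly one record per file
        }
    }
}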
Use of org.apache.avro.file.DataFileWriter in project components by Talend: class AvroHdfsFileSink, method mergeOutput.
@Override
protected boolean mergeOutput(FileSystem fs, String sourceFolder, String targetFile) {
    try (DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>())) {
        FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
        Schema schema = null;
        String inputCodec = null;
        OutputStream output = new BufferedOutputStream(fs.create(new Path(targetFile)));
        for (FileStatus sourceStatus : sourceStatuses) {
            try (DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(
                    new BufferedInputStream(fs.open(sourceStatus.getPath())),
                    new GenericDatumReader<GenericRecord>())) {
                if (schema == null) {
                    // Adopt the schema, user metadata and codec of the first input file
                    schema = reader.getSchema();
                    for (String key : reader.getMetaKeys()) {
                        if (!DataFileWriter.isReservedMeta(key)) {
                            writer.setMeta(key, reader.getMeta(key));
                        }
                    }
                    inputCodec = reader.getMetaString(DataFileConstants.CODEC);
                    if (inputCodec == null) {
                        inputCodec = DataFileConstants.NULL_CODEC;
                    }
                    writer.setCodec(CodecFactory.fromString(inputCodec));
                    writer.create(schema, output);
                }
                // recompress = false copies the encoded blocks without re-encoding them
                writer.appendAllFrom(reader, false);
            }
        }
    } catch (Exception e) {
        LOG.error("Error when merging files in {}.\n{}", sourceFolder, e.getMessage());
        return false;
    }
    return true;
}
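The same pattern works on a local filesystem without HDFS. A hedged sketch under that assumption (class and file names here are illustrative); like mergeOutput above, it takes the schema and codec from the first input, and the raw block copy via appendAllFrom(reader, false) assumes every input shares them:

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileConstants;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public final class LocalAvroMerge {
    public static void merge(final File target, final File... sources) throws IOException {
        try (DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>())) {
            boolean created = false;
            for (final File source : sources) {
                try (DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(
                        new BufferedInputStream(new FileInputStream(source)),
                        new GenericDatumReader<GenericRecord>())) {
                    if (!created) {
                        // Mirror mergeOutput: adopt the first file's codec and schema
                        String codec = reader.getMetaString(DataFileConstants.CODEC);
                        writer.setCodec(CodecFactory.fromString(
                                codec == null ? DataFileConstants.NULL_CODEC : codec));
                        writer.create(reader.getSchema(), target);
                        created = true;
                    }
                    // Blocks are copied verbatim; inputs must match the writer's schema and codec
                    writer.appendAllFrom(reader, false);
                }
            }
        }
    }
}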
Use of org.apache.avro.file.DataFileWriter in project components by Talend: class RecordSetUtil, method writeRandomAvroFile.
/**
 * Writes all records from the test set into a single Avro file on the file system.
 *
 * @param fs The filesystem.
 * @param path The path of the file on the filesystem.
 * @param td The test data to write.
 * @throws IOException If there was an exception writing to the filesystem.
 */
public static void writeRandomAvroFile(FileSystem fs, String path, RecordSet td) throws IOException {
    try (OutputStream out = fs.create(new Path(path))) {
        DatumWriter<IndexedRecord> datumWriter = new GenericDatumWriter<>(td.getSchema());
        DataFileWriter<IndexedRecord> dataFileWriter = new DataFileWriter<>(datumWriter);
        dataFileWriter.create(td.getSchema(), out);
        for (List<IndexedRecord> partition : td.getPartitions()) {
            for (IndexedRecord record : partition) {
                dataFileWriter.append(record);
            }
        }
        dataFileWriter.close();
    }
}
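For context, IndexedRecords like the ones this writer consumes can be built with GenericData.Record against any schema. A small sketch; the schema and field names are assumptions made for illustration, not part of RecordSetUtil:

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.IndexedRecord;

public final class SampleRecords {
    // An ad-hoc record schema with two required fields
    static final Schema SCHEMA = SchemaBuilder.record("Sample").fields()
            .requiredInt("id")
            .requiredString("name")
            .endRecord();

    static IndexedRecord record(final int id, final String name) {
        final GenericData.Record rec = new GenericData.Record(SCHEMA);
        rec.put("id", id);
        rec.put("name", name);
        return rec;
    }
}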
Use of org.apache.avro.file.DataFileWriter in project kylo by Teradata: class JdbcCommon, method convertToAvroStream.
public static long convertToAvroStream(final ResultSet rs, final OutputStream outStream, final RowVisitor visitor, final Schema schema) throws SQLException, IOException {
    int dateConversionWarning = 0;
    final GenericRecord rec = new GenericData.Record(schema);
    final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    try (final DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(schema, outStream);
        final ResultSetMetaData meta = rs.getMetaData();
        final int nrOfColumns = meta.getColumnCount();
        long nrOfRows = 0;
        while (rs.next()) {
            if (visitor != null) {
                visitor.visitRow(rs);
            }
            for (int i = 1; i <= nrOfColumns; i++) {
                final int javaSqlType = meta.getColumnType(i);
                final Object value = rs.getObject(i);
                if (value == null) {
                    rec.put(i - 1, null);
                } else if (javaSqlType == BINARY || javaSqlType == VARBINARY || javaSqlType == LONGVARBINARY || javaSqlType == ARRAY || javaSqlType == BLOB || javaSqlType == CLOB) {
                    // Binary types need slightly different handling: wrap the raw bytes in a ByteBuffer
                    byte[] bytes = rs.getBytes(i);
                    ByteBuffer bb = ByteBuffer.wrap(bytes);
                    rec.put(i - 1, bb);
                } else if (value instanceof Byte) {
                    // tinyint(1) is reported by the JDBC driver as java.sql.Types.TINYINT,
                    // but the value arrives as java.lang.Byte (at least with the H2 driver).
                    // Putting the Byte directly into the Avro record throws
                    // org.apache.avro.AvroRuntimeException: Unknown datum type java.lang.Byte
                    rec.put(i - 1, ((Byte) value).intValue());
                } else if (value instanceof BigDecimal || value instanceof BigInteger) {
                    // Avro can't handle BigDecimal and BigInteger as numbers - it throws an
                    // AvroRuntimeException such as: "Unknown datum type: java.math.BigDecimal: 38"
                    rec.put(i - 1, value.toString());
                } else if (value instanceof Number || value instanceof Boolean) {
                    rec.put(i - 1, value);
                } else if (value instanceof Date) {
                    final DateTimeFormatter formatter = ISODateTimeFormat.dateTime().withZoneUTC();
                    rec.put(i - 1, formatter.print(new DateTime(((Date) value).getTime())));
                } else if (value instanceof Time) {
                    final DateTimeFormatter formatter = ISODateTimeFormat.time().withZoneUTC();
                    rec.put(i - 1, formatter.print(new DateTime(((Time) value).getTime())));
                } else if (value instanceof Timestamp) {
                    final DateTimeFormatter formatter = ISODateTimeFormat.dateTime().withZoneUTC();
                    rec.put(i - 1, formatter.print(new DateTime(((Timestamp) value).getTime())));
                } else {
                    // The supported types are numbers (int, long, double, float), booleans and
                    // Strings. Since Avro doesn't provide timestamp types, anything other than
                    // numbers or booleans is converted to a String via toString().
                    rec.put(i - 1, value.toString());
                }
                // notify the visitor
                if (javaSqlType == Types.DATE || javaSqlType == Types.TIMESTAMP) {
                    Timestamp sqlDate = null;
                    try {
                        // Extract the timestamp
                        sqlDate = extractSqlDate(rs, i);
                    } catch (Exception e) {
                        if (dateConversionWarning++ < 10) {
                            log.warn("{} is not convertible to timestamp or date", rs.getMetaData().getColumnName(i));
                        }
                    }
                    if (visitor != null) {
                        visitor.visitColumn(rs.getMetaData().getColumnName(i), javaSqlType, sqlDate);
                    }
                } else if (javaSqlType == Types.TIME) {
                    Time time = rs.getTime(i);
                    if (visitor != null) {
                        visitor.visitColumn(rs.getMetaData().getColumnName(i), javaSqlType, time);
                    }
                } else {
                    if (visitor != null) {
                        visitor.visitColumn(rs.getMetaData().getColumnName(i), javaSqlType, (value != null) ? value.toString() : null);
                    }
                }
            }
            dataFileWriter.append(rec);
            nrOfRows += 1;
        }
        return nrOfRows;
    }
}
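The temporal branches above all follow the same Joda-Time pattern: the JDBC value becomes an ISO-8601 string because Avro, as used here, has no native timestamp type. A standalone sketch of that conversion (the helper class and sample value are illustrative):

import java.sql.Timestamp;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;

public final class TemporalToString {
    public static String timestampToIso(final Timestamp ts) {
        final DateTimeFormatter formatter = ISODateTimeFormat.dateTime().withZoneUTC();
        // Produces e.g. "2017-01-01T00:00:00.000Z"
        return formatter.print(new DateTime(ts.getTime()));
    }
}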
Use of org.apache.avro.file.DataFileWriter in project kylo by Teradata: class JdbcCommon, method convertToAvroStream (overload without a RowVisitor).
/**
 * Converts a JDBC result set to an Avro stream.
 *
 * @param rs The result set of the JDBC query
 * @param outStream The output stream for the Avro-formatted records
 * @return the number of rows converted to Avro format
 * @throws SQLException if errors occur while reading data from the database
 * @throws IOException if unable to convert to Avro format
 */
public static long convertToAvroStream(final ResultSet rs, final OutputStream outStream) throws SQLException, IOException {
    final Schema schema = createSchema(rs);
    final GenericRecord rec = new GenericData.Record(schema);
    final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    try (final DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(schema, outStream);
        final ResultSetMetaData meta = rs.getMetaData();
        final int nrOfColumns = meta.getColumnCount();
        long nrOfRows = 0;
        while (rs.next()) {
            for (int i = 1; i <= nrOfColumns; i++) {
                final int javaSqlType = meta.getColumnType(i);
                final Object value = rs.getObject(i);
                if (value == null) {
                    rec.put(i - 1, null);
                } else if (javaSqlType == BINARY || javaSqlType == VARBINARY || javaSqlType == LONGVARBINARY || javaSqlType == ARRAY || javaSqlType == BLOB || javaSqlType == CLOB) {
                    // Binary types need slightly different handling: wrap the raw bytes in a ByteBuffer
                    byte[] bytes = rs.getBytes(i);
                    ByteBuffer bb = ByteBuffer.wrap(bytes);
                    rec.put(i - 1, bb);
                } else if (value instanceof Byte) {
                    // tinyint(1) is reported by the JDBC driver as java.sql.Types.TINYINT,
                    // but the value arrives as java.lang.Byte (at least with the H2 driver).
                    // Putting the Byte directly into the Avro record throws
                    // org.apache.avro.AvroRuntimeException: Unknown datum type java.lang.Byte
                    rec.put(i - 1, ((Byte) value).intValue());
                } else if (value instanceof BigDecimal || value instanceof BigInteger) {
                    // Avro can't handle BigDecimal and BigInteger as numbers - it throws an
                    // AvroRuntimeException such as: "Unknown datum type: java.math.BigDecimal: 38"
                    rec.put(i - 1, value.toString());
                } else if (value instanceof Number || value instanceof Boolean) {
                    rec.put(i - 1, value);
                } else {
                    // The supported types are numbers (int, long, double, float), booleans and
                    // Strings. Since Avro doesn't provide timestamp types, anything other than
                    // numbers or booleans is converted to a String via toString().
                    rec.put(i - 1, value.toString());
                }
            }
            dataFileWriter.append(rec);
            nrOfRows += 1;
        }
        return nrOfRows;
    }
}
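A usage sketch for this overload, assuming an in-memory H2 database is on the classpath; the table and its contents are illustrative:

import java.io.ByteArrayOutputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public final class ConvertExample {
    public static void main(final String[] args) throws Exception {
        try (Connection conn = DriverManager.getConnection("jdbc:h2:mem:demo");
             Statement stmt = conn.createStatement()) {
            stmt.execute("CREATE TABLE t (id INT, name VARCHAR(32))");
            stmt.execute("INSERT INTO t VALUES (1, 'a'), (2, 'b')");
            try (ResultSet rs = stmt.executeQuery("SELECT * FROM t")) {
                final ByteArrayOutputStream out = new ByteArrayOutputStream();
                // The schema is derived from the ResultSet metadata by createSchema(rs)
                final long rows = JdbcCommon.convertToAvroStream(rs, out);
                System.out.println(rows + " rows written as Avro"); // expected: 2
            }
        }
    }
}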