Example usage of org.apache.avro.file.SeekableInput from the Apache parquet-mr project: class SchemaCommand, method getParquetSchema.
/**
 * Returns the Parquet schema of the given source file as a String.
 *
 * @param source path of the file to inspect
 * @return the Parquet message type, rendered via {@code toString()}
 * @throws IOException if the source cannot be opened or read
 * @throws IllegalArgumentException if the source is not a Parquet file
 */
private String getParquetSchema(String source) throws IOException {
  try (SeekableInput in = openSeekable(source)) {
    // Sniff the format from the stream head, then rewind for any further reads.
    Formats.Format detected = Formats.detectFormat((InputStream) in);
    in.seek(0);
    if (detected == Formats.Format.PARQUET) {
      // Footer-only read: NO_FILTER keeps all row-group metadata.
      try (ParquetFileReader reader = new ParquetFileReader(getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)) {
        return reader.getFileMetaData().getSchema().toString();
      }
    }
    throw new IllegalArgumentException(String.format("Could not get a Parquet schema for format %s: %s", detected, source));
  }
}
Example usage of org.apache.avro.file.SeekableInput from the Apache parquet-mr project: class BaseCommand, method getAvroSchema.
/**
 * Derives an Avro schema from the given source file.
 *
 * <p>Supports Parquet and Avro data files directly; for text sources, the
 * file extension decides between an Avro schema file (.avsc) and JSON data
 * (.json), from which a schema is inferred.
 *
 * @param source path of the file to inspect
 * @return the Avro {@link Schema} for the source
 * @throws IOException if the source cannot be opened or read
 * @throws IllegalArgumentException if the format cannot be determined
 */
protected Schema getAvroSchema(String source) throws IOException {
  Formats.Format format;
  try (SeekableInput in = openSeekable(source)) {
    // Sniff the format from the stream head, then rewind.
    format = Formats.detectFormat((InputStream) in);
    in.seek(0);
    if (format == Formats.Format.PARQUET) {
      return Schemas.fromParquet(getConf(), qualifiedURI(source));
    }
    if (format == Formats.Format.AVRO) {
      return Schemas.fromAvro(open(source));
    }
    if (format == Formats.Format.TEXT) {
      if (source.endsWith("avsc")) {
        return Schemas.fromAvsc(open(source));
      }
      if (source.endsWith("json")) {
        return Schemas.fromJSON("json", open(source));
      }
      // A text file with any other extension is unrecognized: fall through.
    }
    throw new IllegalArgumentException(String.format("Could not determine file format of %s.", source));
  }
}
Example usage of org.apache.avro.file.SeekableInput from the Apache incubator-gobblin project: class AvroExternalTable, method getSchemaFromAvroDataFile.
/**
 * Reads the Avro schema embedded in the first data file found under this
 * table's HDFS data location.
 *
 * @return the {@link Schema} stored in the data file header
 * @throws IOException if the data file cannot be located or opened
 */
private Schema getSchemaFromAvroDataFile() throws IOException {
  String firstDataFilePath = HdfsReader.getFirstDataFilePathInDir(this.dataLocationInHdfs);
  LOG.info("Extracting schema for table " + this.name + " from avro data file " + firstDataFilePath);
  // Both resources go in try-with-resources: previously the SeekableInput was
  // opened outside the try, so it leaked if the DataFileReader constructor threw.
  try (SeekableInput sin = new HdfsReader(firstDataFilePath).getFsInput();
      DataFileReader<Void> dfr = new DataFileReader<>(sin, new GenericDatumReader<Void>())) {
    return dfr.getSchema();
  }
}
Example usage of org.apache.avro.file.SeekableInput from the Cloudera crunch project: class AvroRecordReader, method initialize.
/**
 * Opens the Avro data file backing this split and positions the reader at
 * the first sync marker at or after the split's start offset.
 *
 * @param genericSplit the input split (must be a {@link FileSplit})
 * @param context the task attempt context supplying the configuration
 * @throws IOException if the file cannot be opened or synced
 * @throws InterruptedException declared by the InputFormat contract
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration conf = context.getConfiguration();
  SeekableInput in = new FsInput(split.getPath(), conf);
  // No null initialization needed: every branch assigns the reader.
  // Reuse the conf local instead of re-fetching it from the context.
  DatumReader<T> datumReader;
  if (conf.getBoolean(AvroJob.INPUT_IS_REFLECT, true)) {
    ReflectDataFactory factory = Avros.getReflectDataFactory(conf);
    datumReader = factory.getReader(schema);
  } else {
    datumReader = new SpecificDatumReader<T>(schema);
  }
  this.reader = DataFileReader.openReader(in, datumReader);
  // Skip forward to the first record block boundary inside this split.
  reader.sync(split.getStart());
  this.start = reader.tell();
  this.end = split.getStart() + split.getLength();
}
Example usage of org.apache.avro.file.SeekableInput from the Apache Flink project: class AvroInputFormat, method initReader.
/**
 * Creates a {@link DataFileReader} for the given split, choosing a generic,
 * specific, or reflect datum reader based on the configured value type.
 *
 * @param split the file split to read
 * @return an open reader over the split's file
 * @throws IOException if the file status or stream cannot be obtained
 */
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
  DatumReader<E> datumReader;
  if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
    datumReader = new GenericDatumReader<E>();
  } else if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
    // Generated (specific) record classes get the faster specific reader.
    datumReader = new SpecificDatumReader<E>(avroValueType);
  } else {
    // Plain POJOs fall back to reflection-based decoding.
    datumReader = new ReflectDatumReader<E>(avroValueType);
  }
  if (LOG.isInfoEnabled()) {
    LOG.info("Opening split {}", split);
  }
  SeekableInput in = new FSDataInputStreamWrapper(stream, split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen());
  // openReader returns the raw FileReader supertype; the cast to the
  // parameterized DataFileReader<E> replaces the previous raw-type cast.
  @SuppressWarnings("unchecked")
  DataFileReader<E> dataFileReader = (DataFileReader<E>) DataFileReader.openReader(in, datumReader);
  if (LOG.isDebugEnabled()) {
    LOG.debug("Loaded SCHEMA: {}", dataFileReader.getSchema());
  }
  end = split.getStart() + split.getLength();
  recordsReadSinceLastSync = 0;
  return dataFileReader;
}
Aggregations