use of org.bdgenomics.formats.avro.AlignmentRecord in project gatk by broadinstitute.
the class GATKReadAdaptersUnitTest method basicReadBackedByADAMRecord.
private static GATKRead basicReadBackedByADAMRecord(final SAMRecord sam) {
    final AlignmentRecord record = new AlignmentRecord();
    record.setContigName(sam.getContig());
    record.setRecordGroupSample(sam.getReadGroup().getSample());
    record.setReadName(sam.getReadName());
    record.setSequence(new String(sam.getReadBases()));
    // ADAM records are 0-based; SAM alignment coordinates are 1-based
    record.setStart((long) sam.getAlignmentStart() - 1);
    record.setEnd((long) sam.getAlignmentEnd() - 1);
    record.setReadMapped(!sam.getReadUnmappedFlag());
    record.setCigar(sam.getCigarString());
    return new BDGAlignmentRecordToGATKReadAdapter(record, getSAMHeader());
}
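For context, here is a hedged sketch of how this test helper might be exercised with a hand-built htsjdk SAMRecord. The header, read group, and read values below are illustrative inventions, not taken from the GATK test suite; note that sam.getReadGroup() in the helper only resolves if the record carries an RG attribute matching a read group in its header:

    // Illustrative setup only: a one-contig header with a single read group.
    final SAMFileHeader samHeader = new SAMFileHeader();
    samHeader.setSequenceDictionary(new SAMSequenceDictionary(
            Collections.singletonList(new SAMSequenceRecord("1", 1_000_000))));
    final SAMReadGroupRecord readGroup = new SAMReadGroupRecord("rg1");
    readGroup.setSample("sample1");
    samHeader.addReadGroup(readGroup);

    // A minimal mapped read; the alignment start is 1-based in SAM and becomes 99
    // in the 0-based AlignmentRecord built by the helper.
    final SAMRecord sam = new SAMRecord(samHeader);
    sam.setReadName("read1");
    sam.setReferenceName("1");
    sam.setAlignmentStart(100);
    sam.setReadBases("ACGT".getBytes());
    sam.setCigarString("4M");
    sam.setAttribute("RG", "rg1");

    final GATKRead adamBacked = basicReadBackedByADAMRecord(sam);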
use of org.bdgenomics.formats.avro.AlignmentRecord in project gatk by broadinstitute.
the class ReadsSparkSink method writeReadsADAM.
private static void writeReadsADAM(final JavaSparkContext ctx, final String outputFile, final JavaRDD<SAMRecord> reads, final SAMFileHeader header) throws IOException {
    final SequenceDictionary seqDict = SequenceDictionary.fromSAMSequenceDictionary(header.getSequenceDictionary());
    final RecordGroupDictionary readGroups = RecordGroupDictionary.fromSAMHeader(header);
    final JavaPairRDD<Void, AlignmentRecord> rddAlignmentRecords = reads.map(read -> {
        read.setHeaderStrict(header);
        final AlignmentRecord alignmentRecord = GATKReadToBDGAlignmentRecordConverter.convert(read, seqDict, readGroups);
        read.setHeaderStrict(null);
        return alignmentRecord;
    }).mapToPair(alignmentRecord -> new Tuple2<>(null, alignmentRecord));
    // Instantiating a Job is necessary here in order to set the Hadoop Configuration:
    // setSchema below writes the Avro schema to the Configuration as a JSON string, a
    // config property that AvroParquetOutputFormat needs when writing data. The
    // AvroParquetOutputFormat class knows how to translate objects in the Avro data
    // model to the Parquet primitives that get written.
    final Job job = Job.getInstance(ctx.hadoopConfiguration());
    AvroParquetOutputFormat.setSchema(job, AlignmentRecord.getClassSchema());
    deleteHadoopFile(outputFile, ctx.hadoopConfiguration());
    rddAlignmentRecords.saveAsNewAPIHadoopFile(outputFile, Void.class, AlignmentRecord.class, AvroParquetOutputFormat.class, job.getConfiguration());
}
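The deleteHadoopFile helper referenced above is a private GATK utility not shown on this page. Clearing the output path first matters because Hadoop output formats refuse to write into an existing directory. A plausible sketch using the standard org.apache.hadoop.fs API follows; this is an assumption about the helper's body, not the GATK source:

    // Assumed sketch of the deleteHadoopFile helper; the real GATK utility may differ.
    private static void deleteHadoopFile(final String pathToDelete, final Configuration conf) throws IOException {
        final Path path = new Path(pathToDelete);
        final FileSystem fs = path.getFileSystem(conf);
        // Recursive delete clears a pre-existing output directory so that the
        // subsequent saveAsNewAPIHadoopFile call does not fail on an existing path.
        fs.delete(path, true);
    }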
use of org.bdgenomics.formats.avro.AlignmentRecord in project gatk by broadinstitute.
the class ReadsSparkSource method getADAMReads.
/**
 * Loads ADAM reads stored as Parquet.
 * @param inputPath path to the Parquet data
 * @param intervals intervals to filter on; reads whose SAMRecord form overlaps them are kept
 * @param header SAM header used to interpret the records and convert them for interval filtering
 * @return RDD of (ADAM-backed) GATKReads from the file.
 */
public JavaRDD<GATKRead> getADAMReads(final String inputPath, final List<SimpleInterval> intervals, final SAMFileHeader header) throws IOException {
    final Job job = Job.getInstance(ctx.hadoopConfiguration());
    AvroParquetInputFormat.setAvroReadSchema(job, AlignmentRecord.getClassSchema());
    final Broadcast<SAMFileHeader> bHeader;
    if (header == null) {
        bHeader = ctx.broadcast(null);
    } else {
        bHeader = ctx.broadcast(header);
    }
    @SuppressWarnings("unchecked")
    final JavaRDD<AlignmentRecord> recordsRdd = ctx.newAPIHadoopFile(inputPath, AvroParquetInputFormat.class, Void.class, AlignmentRecord.class, job.getConfiguration()).values();
    final JavaRDD<GATKRead> readsRdd = recordsRdd.map(record -> new BDGAlignmentRecordToGATKReadAdapter(record, bHeader.getValue()));
    final JavaRDD<GATKRead> filteredRdd = readsRdd.filter(record -> samRecordOverlaps(record.convertToSAMRecord(header), intervals));
    return putPairsInSamePartition(header, filteredRdd);
}
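A hedged end-to-end sketch of driving this loader from a small Spark program follows. The master setting, paths, interval, minimal header, and the one-argument ReadsSparkSource constructor are illustrative assumptions; in a real pipeline the header would come from the BAM the Parquet was generated from, and getADAMReads throws IOException, so the surrounding method must declare or handle it:

    // Illustrative driver; the paths, contig, and interval are made up.
    final JavaSparkContext ctx = new JavaSparkContext(
            new SparkConf().setAppName("getADAMReadsDemo").setMaster("local[*]"));
    final ReadsSparkSource source = new ReadsSparkSource(ctx); // constructor shape assumed

    // Minimal stand-in header; real code would reuse the source BAM's header.
    final SAMFileHeader header = new SAMFileHeader();
    header.setSequenceDictionary(new SAMSequenceDictionary(
            Collections.singletonList(new SAMSequenceRecord("1", 1_000_000))));

    final List<SimpleInterval> intervals = Collections.singletonList(new SimpleInterval("1", 1, 1_000_000));
    final JavaRDD<GATKRead> reads = source.getADAMReads("hdfs:///tmp/reads.parquet", intervals, header);
    System.out.println("loaded " + reads.count() + " overlapping reads");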