Use of org.broadinstitute.hellbender.utils.text.XReadLines in project gatk by broadinstitute.
The class AlleleBiasedDownsamplingUtils, method loadContaminationFile.
/**
* Create sample-contamination maps from file.
* The format is: tab-separated with no header,
* each line is: sampleID contaminationFraction
*
* @param file Filename containing two columns: SampleID and Contamination
* @param defaultContaminationFraction default contamination fraction, used for samples that do not specify one
* @param sampleIDs Set of Samples of interest (no reason to include every sample in file) or null to turn off checking
* @param logger for logging output
* @return sample-contamination Map. The returned map is a {@link DefaultedMap} that defaults to the defaultContaminationFraction for unspecified samples
* @throws UserException if there's an IO problem reading the file.
* @throws UserException if the file is malformed
*/
public static DefaultedMap<String, Double> loadContaminationFile(final File file, final double defaultContaminationFraction, final Set<String> sampleIDs, final Logger logger) {
final DefaultedMap<String, Double> sampleContamination = new DefaultedMap<>(defaultContaminationFraction);
final Set<String> nonSamplesInContaminationFile = new LinkedHashSet<>(sampleContamination.keySet());
try (final XReadLines reader = new XReadLines(file, true)) {
for (final String line : reader) {
if (line.isEmpty()) {
continue;
}
final String[] fields = line.split("\t");
if (fields.length != 2) {
throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. Offending line:\n" + line);
}
if (fields[0].isEmpty() || fields[1].isEmpty()) {
throw new UserException.MalformedFile("Contamination file can not have empty strings in either column. Offending line:\n" + line);
}
final double contamination;
try {
contamination = Double.parseDouble(fields[1]);
} catch (final NumberFormatException e) {
throw new UserException.MalformedFile("Contamination file contains unparsable double in the second field. Offending line: " + line);
}
final String sampleName = fields[0];
if (sampleContamination.containsKey(sampleName)) {
throw new UserException.MalformedFile("Contamination file contains duplicate entries for input name " + sampleName);
}
if (contamination < 0.0 || contamination > 1.0) {
throw new UserException.MalformedFile("Contamination file contains unacceptable contamination value (must be 0<=x<=1): " + line);
}
if (sampleIDs == null || sampleIDs.contains(sampleName)) {
sampleContamination.put(sampleName, contamination);
} else {
nonSamplesInContaminationFile.add(sampleName);
}
}
//output to the user info lines telling which samples are in the Contamination File
if (!sampleContamination.isEmpty()) {
logger.info(String.format("The following samples were found in the Contamination file and will be processed at the contamination level therein: %s", sampleContamination.keySet().toString()));
//output to the user info lines telling which samples are NOT in the Contamination File
if (sampleIDs != null) {
final Set<String> samplesNotInContaminationFile = Sets.difference(sampleIDs, sampleContamination.keySet());
if (!samplesNotInContaminationFile.isEmpty()) {
logger.info(String.format("The following samples were NOT found in the Contamination file and will be processed at the default contamination level: %s", samplesNotInContaminationFile.toString()));
}
}
}
//output to the user Samples that do not have lines in the Contamination File
if (!nonSamplesInContaminationFile.isEmpty()) {
logger.info(String.format("The following entries were found in the Contamination file but were not SAMPLEIDs. They will be ignored: %s", nonSamplesInContaminationFile.toString()));
}
return sampleContamination;
} catch (IOException e) {
throw new UserException.CouldNotReadInputFile("I/O Error while reading sample-contamination file " + file.getAbsolutePath() + ": " + e.getMessage());
}
}
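For orientation, here is a minimal usage sketch of the method above. It is a fragment rather than a complete class (the usual java.util and log4j imports are assumed), and the file name, sample IDs, and default fraction are invented for illustration; the sketch only assumes the two-column, tab-separated contamination file described in the Javadoc.
// Hypothetical file "contamination.tab" (tab-separated, no header):
//   NA12878<TAB>0.05
//   NA12891<TAB>0.10
final File contaminationFile = new File("contamination.tab");
final Set<String> samplesOfInterest = new LinkedHashSet<>(Arrays.asList("NA12878", "NA12891", "NA12892"));
final Logger logger = LogManager.getLogger("ContaminationExample");
final Map<String, Double> contamination =
        AlleleBiasedDownsamplingUtils.loadContaminationFile(contaminationFile, 0.0, samplesOfInterest, logger);
// NA12892 has no line in the file, so the returned DefaultedMap falls back to the default fraction (0.0 here).
final double na12892Fraction = contamination.get("NA12892");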
Use of org.broadinstitute.hellbender.utils.text.XReadLines in project gatk by broadinstitute.
The class IntervalUtils, method intervalFileToList.
/**
* Read a file of genome locations to process. The file may be in Picard
* or GATK interval format.
*
* @param glParser GenomeLocParser
* @param fileName interval file
* @return List<GenomeLoc> List of Genome Locs that have been parsed from file
*/
public static List<GenomeLoc> intervalFileToList(final GenomeLocParser glParser, final String fileName) {
Utils.nonNull(glParser, "glParser is null");
Utils.nonNull(fileName, "file name is null");
final File inputFile = new File(fileName);
final List<GenomeLoc> ret = new ArrayList<>();
/**
* First try to read the file as a Picard interval file since that's well structured --
* we'll fail quickly if it's not a valid file.
*/
boolean isPicardInterval = false;
try {
// Note: Picard will skip over intervals with contigs not in the sequence dictionary
final IntervalList il = IntervalList.fromFile(inputFile);
isPicardInterval = true;
for (final Interval interval : il.getIntervals()) {
// https://github.com/broadinstitute/gatk/issues/2089
if (interval.getStart() - interval.getEnd() == 1) {
logger.warn("Ignoring possibly incorrectly converted length 1 interval : " + interval);
} else if (glParser.isValidGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true)) {
ret.add(glParser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getEnd(), true));
} else {
throw new UserException(inputFile.getAbsolutePath() + " has an invalid interval : " + interval);
}
}
}
// if that didn't work, try parsing file as a GATK interval file
catch (final Exception e) {
// definitely a picard file, but we failed to parse
if (isPicardInterval) {
throw new UserException.CouldNotReadInputFile(inputFile, e);
} else {
try (XReadLines reader = new XReadLines(new File(fileName))) {
for (final String line : reader) {
if (!line.trim().isEmpty()) {
ret.add(glParser.parseGenomeLoc(line));
}
}
} catch (final IOException e2) {
throw new UserException.CouldNotReadInputFile(inputFile, e2);
}
}
}
if (ret.isEmpty()) {
throw new UserException.MalformedFile(new File(fileName), "It contains no intervals.");
}
return ret;
}
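A hedged usage sketch follows. The contig name, length, and interval file name are illustrative, and the GenomeLocParser constructor taking an htsjdk SAMSequenceDictionary is an assumption about the GATK API rather than something shown in the snippet above.
// Minimal sequence dictionary for a hypothetical reference with a single contig.
final SAMSequenceDictionary dict = new SAMSequenceDictionary(
        Collections.singletonList(new SAMSequenceRecord("chr1", 248956422)));
final GenomeLocParser parser = new GenomeLocParser(dict);
// "targets.intervals" is a hypothetical GATK-style interval file with one interval per line, e.g. "chr1:1000-2000".
final List<GenomeLoc> intervals = IntervalUtils.intervalFileToList(parser, "targets.intervals");
intervals.forEach(loc -> System.out.println(loc));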
Use of org.broadinstitute.hellbender.utils.text.XReadLines in project gatk by broadinstitute.
The class ClipReadsIntegrationTest, method testClipper.
@Test(dataProvider = "clipOptions")
public void testClipper(String inBam, String reference, String extension, String option, String optAbrv, boolean doStats) throws IOException {
final String tmpBAMOutName = BaseTest.createTempFile(inBam + "." + optAbrv, extension).getAbsolutePath();
String tmpStatOutName = null;
if (doStats) {
tmpStatOutName = BaseTest.createTempFile(inBam + "." + optAbrv, ".tmp").getAbsolutePath();
}
final List<String> args = new ArrayList<>();
args.addAll(Arrays.<String>asList("--input", new File(localTestData, inBam + extension).getAbsolutePath(), "--output", tmpBAMOutName));
if (doStats) {
args.addAll(Arrays.<String>asList("-os", tmpStatOutName));
}
File referenceFile = null;
if (reference != null) {
referenceFile = new File(getTestDataDir(), reference);
args.add("-R");
args.add(referenceFile.getAbsolutePath());
}
args.addAll(Arrays.asList(option.split("\\s+")));
final ClipReads.ClippingData res = (ClipReads.ClippingData) this.runCommandLine(args);
System.out.println(res);
final File outFileBam = new File(tmpBAMOutName);
final File expectedOutBam = new File(localTestData, "expected." + inBam + "." + optAbrv + extension);
Assert.assertTrue(expectedOutBam.exists(), "expected output read file exists " + expectedOutBam.getAbsolutePath());
Assert.assertTrue(outFileBam.exists(), "actual output read file exists " + outFileBam.getAbsolutePath());
SamAssertionUtils.assertSamsEqual(expectedOutBam, outFileBam, referenceFile);
if (doStats) {
final File outFileStat = new File(tmpStatOutName);
final File expectedOutStat = new File(localTestData, "expected." + inBam + "." + optAbrv + ".tmp");
Assert.assertTrue(expectedOutStat.exists(), "expected output stat file exists " + expectedOutStat.getAbsolutePath());
Assert.assertTrue(outFileStat.exists(), "actual output stat file exists " + outFileStat.getAbsolutePath());
List<String> actualLines = new XReadLines(new File(tmpStatOutName)).readLines();
List<String> expectedLines = new XReadLines(expectedOutStat).readLines();
Assert.assertEquals(actualLines.toString(), expectedLines.toString());
}
}
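Note that the two XReadLines instances used for the stat-file comparison are never closed. Since XReadLines is used with try-with-resources elsewhere on this page, the same comparison could be written as the following sketch (identifiers reused from the test above):
try (final XReadLines actual = new XReadLines(new File(tmpStatOutName));
     final XReadLines expected = new XReadLines(expectedOutStat)) {
    // readLines() drains the files; closing the readers releases the underlying handles.
    Assert.assertEquals(actual.readLines().toString(), expected.readLines().toString());
}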
Use of org.broadinstitute.hellbender.utils.text.XReadLines in project gatk by broadinstitute.
The class CountReadsSparkIntegrationTest, method testCountReadsWithIntervals.
@Test(dataProvider = "intervals", groups = "spark")
public void testCountReadsWithIntervals(final String interval_args, final long expectedCount) throws Exception {
final File ORIG_BAM = new File(getTestDataDir(), "count_reads_sorted.bam");
final File outputFile = createTempFile("count_reads_spark", "count");
ArgumentsBuilder args = new ArgumentsBuilder();
args.addInput(ORIG_BAM);
args.add(interval_args);
args.addOutput(outputFile);
this.runCommandLine(args.getArgsArray());
try (XReadLines output = new XReadLines(outputFile)) {
Assert.assertEquals((long) Long.valueOf(output.next()), expectedCount);
}
}
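The assertion above parses only the first line of the count file. Assuming XReadLines exposes the usual Iterator hasNext() alongside the next() call used above, a slightly more defensive variant of the same check might look like this sketch:
try (final XReadLines output = new XReadLines(outputFile)) {
    // Fail with a clear message if the tool wrote an empty file instead of a count.
    Assert.assertTrue(output.hasNext(), "count file is empty: " + outputFile);
    Assert.assertEquals(Long.parseLong(output.next().trim()), expectedCount);
}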
Use of org.broadinstitute.hellbender.utils.text.XReadLines in project gatk by broadinstitute.
The class TrancheUnitTest, method readData.
private ArrayList<VariantDatum> readData() throws IOException {
ArrayList<VariantDatum> vd = new ArrayList<>();
try (XReadLines xrl = new XReadLines(QUAL_DATA, true)) {
for (String line : xrl) {
String[] parts = line.split("\t");
// QUAL,TRANSITION,ID,LOD,FILTER
if (!parts[0].equals("QUAL")) {
VariantDatum datum = new VariantDatum();
datum.lod = Double.valueOf(parts[3]);
datum.isTransition = parts[1].equals("1");
datum.isKnown = !parts[2].equals(".");
datum.isSNP = true;
datum.atTruthSite = datum.isKnown;
vd.add(datum);
}
}
}
return vd;
}
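For reference, a single data line in the format this helper expects (column order QUAL, TRANSITION, ID, LOD, FILTER, per the comment above) would be parsed as in the sketch below; the concrete values are invented for illustration.
// Hypothetical tab-separated data line in the QUAL_DATA format:
final String line = "354.77\t1\trs12345\t4.38\tPASS";
final String[] parts = line.split("\t");
final VariantDatum datum = new VariantDatum();
datum.lod = Double.valueOf(parts[3]);        // 4.38
datum.isTransition = parts[1].equals("1");   // true: a transition
datum.isKnown = !parts[2].equals(".");       // true: a dbSNP ID is present
datum.isSNP = true;
datum.atTruthSite = datum.isKnown;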