Use of org.apache.commons.collections4.map.DefaultedMap in the project gatk by broadinstitute.
The class AlleleBiasedDownsamplingUtils, method loadContaminationFile.
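Before the GATK method itself, a minimal, self-contained sketch of the DefaultedMap behavior the method relies on (not from GATK; the class name and values are illustrative): get() returns the supplied default for keys that were never put(), without inserting them, so such keys do not appear in keySet().

import org.apache.commons.collections4.map.DefaultedMap;

public class DefaultedMapDemo {
    public static void main(final String[] args) {
        // The default value (0.05 here) is returned for any key that was never put().
        final DefaultedMap<String, Double> contamination = new DefaultedMap<>(0.05);
        contamination.put("sampleA", 0.10);

        System.out.println(contamination.get("sampleA"));         // 0.10
        System.out.println(contamination.get("sampleB"));         // 0.05 (default; key is absent)
        System.out.println(contamination.containsKey("sampleB")); // false -- get() does not insert the key
        System.out.println(contamination.keySet());               // [sampleA]
    }
}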
/**
* Create sample-contamination maps from file.
* The format is: tab-separated with no header,
* each line is: sampleID contaminationFraction
*
* @param file Filename containing two columns: SampleID and Contamination
* @param defaultContaminationFraction default contamination fraction, used for samples that do not specify one
* @param sampleIDs Set of samples of interest (no need to include every sample in the file), or null to turn off checking
* @param logger for logging output
* @return sample-contamination Map. The returned map is a {@link DefaultedMap} that defaults to the defaultContaminationFraction for unspecified samples
* @throws UserException if there's an IO problem reading the file.
* @throws UserException if the file is malformed
*/
public static DefaultedMap<String, Double> loadContaminationFile(final File file, final double defaultContaminationFraction, final Set<String> sampleIDs, final Logger logger) {
    final DefaultedMap<String, Double> sampleContamination = new DefaultedMap<>(defaultContaminationFraction);
    final Set<String> nonSamplesInContaminationFile = new LinkedHashSet<>(sampleContamination.keySet());
    try (final XReadLines reader = new XReadLines(file, true)) {
        for (final String line : reader) {
            if (line.isEmpty()) {
                continue;
            }
            final String[] fields = line.split("\t");
            if (fields.length != 2) {
                throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. Offending line:\n" + line);
            }
            if (fields[0].isEmpty() || fields[1].isEmpty()) {
                throw new UserException.MalformedFile("Contamination file can not have empty strings in either column. Offending line:\n" + line);
            }
            final double contamination;
            try {
                contamination = Double.parseDouble(fields[1]);
            } catch (final NumberFormatException e) {
                throw new UserException.MalformedFile("Contamination file contains unparsable double in the second field. Offending line: " + line);
            }
            final String sampleName = fields[0];
            if (sampleContamination.containsKey(sampleName)) {
                throw new UserException.MalformedFile("Contamination file contains duplicate entries for input name " + sampleName);
            }
            if (contamination < 0.0 || contamination > 1.0) {
                throw new UserException.MalformedFile("Contamination file contains unacceptable contamination value (must be 0<=x<=1): " + line);
            }
            if (sampleIDs == null || sampleIDs.contains(sampleName)) {
                sampleContamination.put(sampleName, contamination);
            } else {
                nonSamplesInContaminationFile.add(sampleName);
            }
        }
        //output to the user info lines telling which samples are in the Contamination File
        if (!sampleContamination.isEmpty()) {
            logger.info(String.format("The following samples were found in the Contamination file and will be processed at the contamination level therein: %s", sampleContamination.keySet().toString()));
            //output to the user info lines telling which samples are NOT in the Contamination File
            if (sampleIDs != null) {
                final Set<String> samplesNotInContaminationFile = Sets.difference(sampleIDs, sampleContamination.keySet());
                if (!samplesNotInContaminationFile.isEmpty()) {
                    logger.info(String.format("The following samples were NOT found in the Contamination file and will be processed at the default contamination level: %s", samplesNotInContaminationFile.toString()));
                }
            }
        }
        //output to the user Samples that do not have lines in the Contamination File
        if (!nonSamplesInContaminationFile.isEmpty()) {
            logger.info(String.format("The following entries were found in the Contamination file but were not SAMPLEIDs. They will be ignored: %s", nonSamplesInContaminationFile.toString()));
        }
        return sampleContamination;
    } catch (IOException e) {
        throw new UserException.CouldNotReadInputFile("I/O Error while reading sample-contamination file " + file.getAbsolutePath() + ": " + e.getMessage());
    }
}
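A hedged usage sketch of loadContaminationFile (the file name, sample IDs, and logger setup are hypothetical, and the file is assumed to follow the two-column, tab-separated format described in the Javadoc):

// contamination.tab (tab-separated, no header) might contain, for example:
//   NA12878<TAB>0.02
//   NA12891<TAB>0.01

final Logger logger = LogManager.getLogger(AlleleBiasedDownsamplingUtils.class); // assuming log4j2
final Set<String> samplesOfInterest = new LinkedHashSet<>(Arrays.asList("NA12878", "NA12892"));

final DefaultedMap<String, Double> contamination = AlleleBiasedDownsamplingUtils.loadContaminationFile(
        new File("contamination.tab"), 0.0, samplesOfInterest, logger);

// NA12878 is listed in the file, so the value from the file is used;
// NA12892 is not listed, so the DefaultedMap falls back to the default fraction (0.0 here);
// NA12891 is in the file but not in samplesOfInterest, so it is only logged and otherwise ignored.
final double na12878 = contamination.get("NA12878"); // 0.02
final double na12892 = contamination.get("NA12892"); // 0.0 (default)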