Use of org.broadinstitute.hdf5.HDF5File in project gatk-protected by broadinstitute.
The class CreatePanelOfNormalsIntegrationTest, method assertBasicPoNAssumptions.
private void assertBasicPoNAssumptions(final File ponFile, final File initialTargetsFileUsedToCreatePoN) {
    try (final HDF5File ponHDF5File = new HDF5File(ponFile)) {
        final HDF5PCACoveragePoN pon = new HDF5PCACoveragePoN(ponHDF5File);
        Assert.assertTrue(pon.getTargets().size() >= pon.getPanelTargets().size());
        Assert.assertTrue(pon.getRawTargets().size() > pon.getTargets().size());
        Assert.assertEquals(pon.getTargetNames().size(), pon.getTargets().size());
        Assert.assertEquals(pon.getPanelTargetNames().size(), pon.getPanelTargets().size());
        Assert.assertEquals(pon.getRawTargetNames().size(), pon.getRawTargets().size());
        if (initialTargetsFileUsedToCreatePoN != null) {
            final TargetCollection<Target> tc = TargetArgumentCollection.readTargetCollection(initialTargetsFileUsedToCreatePoN);
            Assert.assertEquals(pon.getRawTargets().size(), tc.targetCount());
            // Check that the raw targets match the targets in the original file, element by element.
            Assert.assertTrue(IntStream.range(0, pon.getRawTargets().size())
                    .allMatch(i -> pon.getRawTargets().get(i).equals(tc.target(i))));
        }
    }
}
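For context, a hedged sketch of how a test might invoke this helper; the file variables and the createTestPoN helper are hypothetical stand-ins for whatever produces the PoN in a given test:

// Hypothetical test usage; createTestPoN(...) stands in for the PoN-producing step.
final File ponFile = createTestPoN(targetsFile);
assertBasicPoNAssumptions(ponFile, targetsFile);  // also checks raw targets against the original targets file
assertBasicPoNAssumptions(ponFile, null);         // skips the raw-target comparison when no targets file is given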
Use of org.broadinstitute.hdf5.HDF5File in project gatk by broadinstitute.
The class AllelicPanelOfNormals, method read.
/**
 * Reads an allelic panel of normals from an HDF5 or tab-separated file (the file type is detected automatically).
 * Tab-separated files should specify the global hyperparameter values alpha and beta in comment lines
 * denoted by {@code GLOBAL_ALPHA_COMMENT_STRING} and {@code GLOBAL_BETA_COMMENT_STRING}:
 * <p>
 * #GLOBAL_ALPHA=...<br>
 * #GLOBAL_BETA=...
 * </p>
 * followed by lines specifying the hyperparameter values at each site,
 * with column headers as in {@link AllelicPanelOfNormalsTableColumn}:
 * <p>
 * CONTIG \t POSITION \t ALPHA \t BETA
 * </p>
 *
 * Note that we opt for a static read method, as opposed to a constructor that takes a file parameter, because
 * the former allows us to return the static {@code EMPTY_PON} if the allelic panel of normals is not present in
 * an HDF5 file.
 * @param inputFile an HDF5 file containing a coverage panel of normals created by {@link CreatePanelOfNormals}
 *                  and an allelic panel of normals created and set by {@link CreateAllelicPanelOfNormals},
 *                  or a tab-separated file containing global hyperparameters in comment lines followed by
 *                  lines specifying the hyperparameter values at each site
 * @return the allelic panel of normals read from the file, or {@code EMPTY_PON} if the input is an HDF5 file
 *         in which the allelic panel of normals was never set
 */
public static AllelicPanelOfNormals read(final File inputFile) {
    IOUtils.canReadFile(inputFile);
    if (isHDF5File(inputFile)) {
        // construct from an HDF5 file
        try (final HDF5File hdf5File = new HDF5File(inputFile)) {
            final AllelicPanelOfNormals allelicPoN = HDF5AllelicPoNUtils.read(hdf5File);
            logger.info(String.format("Loaded allelic panel of normals from HDF5 file: %s.", inputFile));
            return allelicPoN;
        }
    } else {
        // construct from a TSV file
        try (final AllelicPanelOfNormalsReader reader = new AllelicPanelOfNormalsReader(inputFile)) {
            // parse comment lines for the global hyperparameter values
            final AllelicPanelOfNormals.HyperparameterValues globalHyperparameterValues = parseGlobalHyperparameterValues(inputFile);
            // parse data lines for the local (per-site) hyperparameter values
            final Map<SimpleInterval, HyperparameterValues> siteToHyperparameterValuesMap = new HashMap<>();
            reader.stream().forEach(s -> siteToHyperparameterValuesMap.put(s.getKey(), s.getValue()));
            final AllelicPanelOfNormals allelicPoN = new AllelicPanelOfNormals(globalHyperparameterValues, siteToHyperparameterValuesMap);
            logger.info(String.format("Loaded allelic panel of normals from TSV file: %s.", inputFile));
            return allelicPoN;
        } catch (final IOException | UncheckedIOException ex) {
            throw new UserException.CouldNotReadInputFile(inputFile, ex);
        }
    }
}
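To make the TSV contract above concrete, a minimal input in the described format might look like this (the numeric values are made up, and the columns are tab-separated):

#GLOBAL_ALPHA=10.0
#GLOBAL_BETA=25.0
CONTIG	POSITION	ALPHA	BETA
1	10000	5.5	10.2
1	20000	6.1	12.7

Such a file could then be loaded with a call like the following (the file name is a placeholder):

// read() detects HDF5 vs. TSV input automatically.
final AllelicPanelOfNormals allelicPoN = AllelicPanelOfNormals.read(new File("allelic-pon.tsv"));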
Use of org.broadinstitute.hdf5.HDF5File in project gatk by broadinstitute.
The class HDF5PCACoveragePoN, method write.
/**
 * Write all of the coverage PoN fields to HDF5.
 */
public static void write(final File outFile, final HDF5File.OpenMode openMode, final List<Target> rawTargets, final ReadCountCollection normalizedCounts, final ReadCountCollection logNormalizedCounts, final double[] targetFactors, final double[] targetVariances, final ReductionResult reduction) {
    Utils.nonNull(outFile);
    Utils.nonNull(normalizedCounts);
    Utils.nonNull(logNormalizedCounts);
    Utils.nonNull(rawTargets);
    Utils.nonNull(targetFactors);
    Utils.nonNull(targetVariances);
    Utils.nonNull(reduction);
    try (final HDF5File file = new HDF5File(outFile, openMode)) {
        logger.info("Creating " + outFile.getAbsolutePath() + "...");
        final HDF5PCACoveragePoN pon = new HDF5PCACoveragePoN(file);
        logger.info("Setting version number (" + CURRENT_PON_VERSION + ")...");
        pon.setVersion(CURRENT_PON_VERSION);
        final List<Target> targets = normalizedCounts.targets();
        final List<Target> panelTargets = logNormalizedCounts.targets();
        logger.info("Setting targets...");
        pon.setTargets(targets);
        logger.info("Setting raw targets...");
        pon.setRawTargets(rawTargets);
        logger.info("Setting reduced panel targets...");
        pon.setPanelTargets(panelTargets);
        logger.info("Setting target factors (" + targetFactors.length + ")...");
        pon.setTargetFactors(targetFactors);
        logger.info("Setting target variances...");
        pon.setTargetVariances(targetVariances);
        logger.info("Setting normalized counts (" + normalizedCounts.counts().getRowDimension() + " x " + normalizedCounts.counts().getColumnDimension() + ") (T)...");
        pon.setNormalizedCounts(normalizedCounts.counts());
        logger.info("Setting log-normalized counts (" + logNormalizedCounts.counts().getRowDimension() + " x " + logNormalizedCounts.counts().getColumnDimension() + ") (T)...");
        pon.setLogNormalizedCounts(logNormalizedCounts.counts());
        logger.info("Setting log-normalized pseudoinverse (" + reduction.getPseudoInverse().getRowDimension() + " x " + reduction.getPseudoInverse().getColumnDimension() + ")...");
        pon.setLogNormalizedPInverseCounts(reduction.getPseudoInverse());
        logger.info("Setting reduced panel counts (" + reduction.getReducedCounts().getRowDimension() + " x " + reduction.getReducedCounts().getColumnDimension() + ") (T)...");
        pon.setReducedPanelCounts(reduction.getReducedCounts());
        logger.info("Setting reduced panel pseudoinverse (" + reduction.getReducedPseudoInverse().getRowDimension() + " x " + reduction.getReducedPseudoInverse().getColumnDimension() + ")...");
        pon.setReducedPanelPInverseCounts(reduction.getReducedPseudoInverse());
        final List<String> targetNames = normalizedCounts.targets().stream().map(Target::getName).collect(Collectors.toList());
        final List<String> rawTargetNames = rawTargets.stream().map(Target::getName).collect(Collectors.toList());
        final List<String> panelTargetNames = logNormalizedCounts.targets().stream().map(Target::getName).collect(Collectors.toList());
        logger.info("Setting target names...");
        pon.setTargetNames(targetNames);
        logger.info("Setting raw target names...");
        pon.setRawTargetNames(rawTargetNames);
        logger.info("Setting reduced target names...");
        pon.setPanelTargetNames(panelTargetNames);
        final List<String> sampleNames = normalizedCounts.columnNames();
        final List<String> panelSampleNames = logNormalizedCounts.columnNames();
        logger.info("Setting sample names...");
        pon.setSampleNames(sampleNames);
        logger.info("Setting reduced sample names...");
        pon.setPanelSampleNames(panelSampleNames);
    }
}
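A hedged usage sketch of write; the variable names are placeholders for inputs computed upstream (e.g. during PoN creation), and the output file name is made up:

// Hypothetical invocation: create a new PoN file from already-computed components.
HDF5PCACoveragePoN.write(new File("coverage.pon"), HDF5File.OpenMode.CREATE,
        rawTargets, normalizedCounts, logNormalizedCounts,
        targetFactors, targetVariances, reduction);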
Use of org.broadinstitute.hdf5.HDF5File in project gatk by broadinstitute.
The class HDF5PCACoveragePoNCreationUtils, method redoReduction.
/**
 * Creates a new PoN file using values from a given PoN file, except for the reduction, which is
 * redone and used to populate the new PoN file.
 *
 * Please note that this code redoes the reduction even if the number of eigensamples is the same
 * in both the input and the desired output.
 *
 * @param ctx {@code null} is okay if not using Spark
 * @param newNumberOfEigensamples desired number of eigensamples in the new PoN reduction
 * @param inputHDF5Filename input PoN file
 * @param outputHDF5Filename output PoN file, which will be the same as the input except for the fields regarding the PoN reduction
 * @param openMode desired {@link HDF5File.OpenMode} (if {@code HDF5File.OpenMode.READ_ONLY}, an exception will be thrown)
 */
public static void redoReduction(final JavaSparkContext ctx, final OptionalInt newNumberOfEigensamples, final File inputHDF5Filename, final File outputHDF5Filename, final HDF5File.OpenMode openMode) {
    Utils.nonNull(newNumberOfEigensamples);
    IOUtils.canReadFile(inputHDF5Filename);
    Utils.nonNull(outputHDF5Filename);
    if (inputHDF5Filename.getAbsolutePath().equals(outputHDF5Filename.getAbsolutePath())) {
        throw new UserException.CouldNotCreateOutputFile(outputHDF5Filename, "Cannot create a new PoN overwriting an old one.");
    }
    try (final HDF5File ponReader = new HDF5File(inputHDF5Filename, HDF5File.OpenMode.READ_ONLY)) {
        final PCACoveragePoN inputPoN = new HDF5PCACoveragePoN(ponReader);
        final ReadCountCollection normalizedCounts = new ReadCountCollection(inputPoN.getTargets(), inputPoN.getSampleNames(), inputPoN.getNormalizedCounts());
        final ReadCountCollection logNormalizedCounts = new ReadCountCollection(inputPoN.getPanelTargets(), inputPoN.getPanelSampleNames(), inputPoN.getLogNormalizedCounts());
        final ReductionResult newReduction = calculateReducedPanelAndPInverses(logNormalizedCounts, newNumberOfEigensamples, logger, ctx);
        final List<String> panelTargetNames = logNormalizedCounts.targets().stream().map(Target::getName).collect(Collectors.toList());
        final double[] targetVariances = calculateTargetVariances(normalizedCounts, panelTargetNames, newReduction, ctx);
        HDF5PCACoveragePoN.write(outputHDF5Filename, openMode, inputPoN.getRawTargets(), normalizedCounts, logNormalizedCounts, inputPoN.getTargetFactors(), targetVariances, newReduction);
    }
}
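A minimal sketch of a call; the eigensample count and file names are assumptions, and a null JavaSparkContext is explicitly allowed per the javadoc above:

// Hypothetical: rebuild the reduction with 20 eigensamples, without Spark.
HDF5PCACoveragePoNCreationUtils.redoReduction(null, OptionalInt.of(20),
        new File("input.pon"), new File("output.pon"), HDF5File.OpenMode.CREATE);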
Use of org.broadinstitute.hdf5.HDF5File in project gatk by broadinstitute.
The class HDF5LibraryUnitTest, method testMakeNaNDouble.
@Test
public void testMakeNaNDouble() throws IOException {
    final File testFile = File.createTempFile("hdf5", ".hd5");
    HDF5File file = new HDF5File(testFile, HDF5File.OpenMode.CREATE);
    file.makeGroup("test-group/double-group");
    Assert.assertTrue(file.makeDouble("test-group/double-group/my-double", Double.NaN));
    System.err.println(testFile);
    file.close();
    final long time = System.currentTimeMillis();
    Assert.assertTrue(testFile.length() > 0);
    Assert.assertTrue(testFile.lastModified() <= time);
    file = new HDF5File(testFile, HDF5File.OpenMode.READ_ONLY);
    final double theDouble = file.readDouble("test-group/double-group/my-double");
    Assert.assertTrue(Double.isNaN(theDouble));
    file.close();
}