Search in sources :

Example 11 with Tuple4

use of scala.Tuple4 in project gatk by broadinstitute.

the class SVDiscoveryTestDataProvider method forSimpleInversionFromLongCtg1WithStrangeLeftBreakpoint.

/**
 * Test fixture for a simple inversion, built from {@code LONG_CONTIG1}, whose left breakpoint
 * is "strange".
 *
 * @return a tuple of (first alignment, second alignment, novel adjacency computed from the
 *         first chimeric alignment of the contig, contig name)
 */
private static Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String> forSimpleInversionFromLongCtg1WithStrangeLeftBreakpoint() throws IOException {
    final String contigName = "asm702700:tig00001";
    final byte[] contigBases = LONG_CONTIG1.getBytes();

    // First alignment: flagged 'false' (the second one is flagged 'true'); its CIGAR opens with a
    // 1986-base soft clip, the same 1986 that is subtracted from the contig length below.
    final SimpleInterval firstRefSpan = new SimpleInterval(chrForLongContig1, 20138007, 20142231);
    final AlignedAssembly.AlignmentInterval firstAlignment = new AlignedAssembly.AlignmentInterval(
            firstRefSpan, 1, contigBases.length - 1986,
            TextCigarCodec.decode("1986S236M2D1572M1I798M5D730M1I347M4I535M"), false, 60, 36);

    // Second alignment: covers contig positions 3604 through the end of the contig.
    final SimpleInterval secondRefSpan = new SimpleInterval(chrForLongContig1, 20152030, 20154634);
    final AlignedAssembly.AlignmentInterval secondAlignment = new AlignedAssembly.AlignmentInterval(
            secondRefSpan, 3604, contigBases.length,
            TextCigarCodec.decode("3603H24M1I611M1I1970M"), true, 60, 36);

    final AlignedContig contig = new AlignedContig(contigName, contigBases, Arrays.asList(firstAlignment, secondAlignment));

    // Only the first chimeric alignment derived from the split alignments is used.
    final NovelAdjacencyReferenceLocations novelAdjacency = new NovelAdjacencyReferenceLocations(
            ChimericAlignment.fromSplitAlignments(contig, SVConstants.DiscoveryStepConstants.DEFAULT_MIN_ALIGNMENT_LENGTH).get(0),
            contigBases);

    return new Tuple4<>(firstAlignment, secondAlignment, novelAdjacency, contigName);
}
Also used : Tuple4(scala.Tuple4) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 12 with Tuple4

use of scala.Tuple4 in project gatk by broadinstitute.

the class SVDiscoveryTestDataProvider method forComplexTandemDuplication.

/**
 * Test data for complex tandem duplication events, based on a real observation on a
 * locally-assembled contig
 * "TGCCAGGTTACATGGCAAAGAGGGTAGATATGGGGAGCTGTGAAGAATGGAGCCAGTAATTAAATTCACTGAAGTCTCCACAGGAGGGCAAGGTGGACAATCTGTCCCATAGGAGGGGGATTCATGAGGGGAGCTGTGAAGAATGGAGCCAGTAATTAAATTCACTGAAGTCTCCACAGGAGGGCAAGGTGGACAATCTGTCCCATAGGAGGGGGATTCAGGAGGGCAGCTGTGGATGGTGCAAATGCCATTTATGCTCCTCTCCACCCATATCC"
 * with two alignment records chr18:312579-312718 140M135S
 *                            chr18:312610-312757 127S148M
 * for a tandem repeat expansion event from 1 copy to 2 copies that also carries a pseudo-homology.
 * In this fixture the alignments are placed on chromosome "20" rather than chr18.
 *
 * Returns a list of eight entries, i.e. the positive strand representation followed by the
 * reverse strand representation for each of:
 * 1. expansion from 1 unit to 2 units with pseudo-homology
 * 2. contraction from 2 units to 1 unit with pseudo-homology
 * 3. contraction from 3 units to 2 units without pseudo-homology
 * 4. expansion from 2 units to 3 units without pseudo-homology
 *
 * Each entry is a tuple of (first alignment, second alignment, novel adjacency, contig name).
 */
private static List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> forComplexTandemDuplication() throws IOException {
    final List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> result = new ArrayList<>();
    final String chr = "20";
    final String contigName = "asm000001:tig00001";

    // 31 bases
    final String leftRefFlank = "TGCCAGGTTACATGGCAAAGAGGGTAGATAT";
    // 39 bases
    final String rightRefFlank = "TGGTGCAAATGCCATTTATGCTCCTCTCCACCCATATCC";
    // 96 bases
    final String firstRepeat = "GGGGAGCTGTGAAGAATGGAGCCAGTAATTAAATTCACTGAAGTCTCCACAGGAGGGCAAGGTGGACAATCTGTCCCATAGGAGGGGGATTCATGA";
    // 96 bases
    final String secondRepeat = "GGGGAGCTGTGAAGAATGGAGCCAGTAATTAAATTCACTGAAGTCTCCACAGGAGGGCAAGGTGGACAATCTGTCCCATAGGAGGGGGATTCAGGA";
    // 13 bases
    final String pseudoHomology = "GGGCAGCTGTGGA";

    // The four sequences below double as "reference" in one test and "contig" in its mirror test.
    // Only the contig side is ever handed to NovelAdjacencyReferenceLocations, so no byte[] is
    // materialized for the reference side of any test.
    // 179 bases: reference of test 1, contig of test 2
    final byte[] oneUnitWithPseudoHomology = String.format("%s%s%s%s", leftRefFlank, firstRepeat, pseudoHomology, rightRefFlank).getBytes();
    // 275 bases: contig of test 1, reference of test 2
    final byte[] twoUnitsWithPseudoHomology = String.format("%s%s%s%s%s", leftRefFlank, firstRepeat, secondRepeat, pseudoHomology, rightRefFlank).getBytes();
    // 262 bases: contig of test 3, reference of test 4
    final byte[] twoUnitsNoPseudoHomology = String.format("%s%s%s%s", leftRefFlank, firstRepeat, secondRepeat, rightRefFlank).getBytes();
    // 358 bases: reference of test 3, contig of test 4
    final byte[] threeUnitsNoPseudoHomology = String.format("%s%s%s%s%s", leftRefFlank, firstRepeat, secondRepeat, firstRepeat, rightRefFlank).getBytes();

    // first test (the original observed event, but assigned to a different chromosome): expansion from 1 unit to 2 units with pseudo-homology
    AlignedAssembly.AlignmentInterval region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312579, 312718), 1, 140, TextCigarCodec.decode("140M135S"), true, 60, 0);
    AlignedAssembly.AlignmentInterval region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312610, 312757), 128, 275, TextCigarCodec.decode("127S148M"), true, 60, 0);
    NovelAdjacencyReferenceLocations breakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), contigName), twoUnitsWithPseudoHomology);
    result.add(new Tuple4<>(region1, region2, breakpoints, contigName));

    // reverse strand representation: reverse-complement a copy so the forward array stays intact
    final byte[] twoUnitsWithPseudoHomologyReverseStrand = Arrays.copyOf(twoUnitsWithPseudoHomology, twoUnitsWithPseudoHomology.length);
    SequenceUtil.reverseComplement(twoUnitsWithPseudoHomologyReverseStrand);
    region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312610, 312757), 1, 148, TextCigarCodec.decode("148M127S"), false, 60, 0);
    region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312579, 312718), 136, 275, TextCigarCodec.decode("135S140M"), false, 60, 0);
    breakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), contigName), twoUnitsWithPseudoHomologyReverseStrand);
    result.add(new Tuple4<>(region1, region2, breakpoints, contigName));

    // second test: contraction from 2 units to 1 unit with pseudo-homology
    region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312579, 312718), 1, 140, TextCigarCodec.decode("140M39S"), true, 60, 0);
    region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312706, 312853), 32, 179, TextCigarCodec.decode("31S148M"), true, 60, 0);
    breakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), contigName), oneUnitWithPseudoHomology);
    result.add(new Tuple4<>(region1, region2, breakpoints, contigName));

    final byte[] oneUnitWithPseudoHomologyReverseStrand = Arrays.copyOf(oneUnitWithPseudoHomology, oneUnitWithPseudoHomology.length);
    SequenceUtil.reverseComplement(oneUnitWithPseudoHomologyReverseStrand);
    region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312706, 312853), 1, 148, TextCigarCodec.decode("148M31S"), false, 60, 0);
    region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312579, 312718), 40, 179, TextCigarCodec.decode("39S140M"), false, 60, 0);
    breakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), contigName), oneUnitWithPseudoHomologyReverseStrand);
    result.add(new Tuple4<>(region1, region2, breakpoints, contigName));

    // third test: contraction from 3 units to 2 units without pseudo-homology
    region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312579, 312801), 1, 223, TextCigarCodec.decode("223M39S"), true, 60, 0);
    region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312706, 312936), 32, 262, TextCigarCodec.decode("31S231M"), true, 60, 0);
    breakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), contigName), twoUnitsNoPseudoHomology);
    result.add(new Tuple4<>(region1, region2, breakpoints, contigName));

    final byte[] twoUnitsNoPseudoHomologyReverseStrand = Arrays.copyOf(twoUnitsNoPseudoHomology, twoUnitsNoPseudoHomology.length);
    SequenceUtil.reverseComplement(twoUnitsNoPseudoHomologyReverseStrand);
    region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312706, 312936), 1, 231, TextCigarCodec.decode("231M31S"), false, 60, 0);
    region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312579, 312801), 40, 262, TextCigarCodec.decode("39S223M"), false, 60, 0);
    breakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), contigName), twoUnitsNoPseudoHomologyReverseStrand);
    result.add(new Tuple4<>(region1, region2, breakpoints, contigName));

    // fourth test: expansion from 2 units to 3 units without pseudo-homology
    region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312579, 312801), 1, 223, TextCigarCodec.decode("223M135S"), true, 60, 0);
    region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312610, 312840), 128, 358, TextCigarCodec.decode("127S231M"), true, 60, 0);
    breakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), contigName), threeUnitsNoPseudoHomology);
    result.add(new Tuple4<>(region1, region2, breakpoints, contigName));

    final byte[] threeUnitsNoPseudoHomologyReverseStrand = Arrays.copyOf(threeUnitsNoPseudoHomology, threeUnitsNoPseudoHomology.length);
    SequenceUtil.reverseComplement(threeUnitsNoPseudoHomologyReverseStrand);
    region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312610, 312840), 1, 231, TextCigarCodec.decode("231M127S"), false, 60, 0);
    region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval(chr, 312579, 312801), 136, 358, TextCigarCodec.decode("135S223M"), false, 60, 0);
    breakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), contigName), threeUnitsNoPseudoHomologyReverseStrand);
    result.add(new Tuple4<>(region1, region2, breakpoints, contigName));

    return result;
}
Also used : Tuple4(scala.Tuple4) ArrayList(java.util.ArrayList) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 13 with Tuple4

use of scala.Tuple4 in project gatk by broadinstitute.

the class SVDiscoveryTestDataProvider method forSimpleTandemDuplicationExpansion.

/**
 * Test data for a simple tandem duplication expansion: 40-'A' + 10-'C' + 40-'G' is expanded to
 * 40-'A' + 20-'C' + 40-'G' (forward strand representation).
 *
 * @param outputStream scratch stream used to concatenate the contig pieces; it is reset before
 *                     each contig is assembled
 * @return a list of two entries: the '+' strand representation followed by the '-' strand one
 * @throws IOException declared because the stream writes can throw it
 */
private static List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> forSimpleTandemDuplicationExpansion(final ByteArrayOutputStream outputStream) throws IOException {
    final String chr = "21";
    final String contigName = "asm000001:tig00001";
    final List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> testData = new ArrayList<>();

    final byte[] flankOfAs = makeDummySequence(40, (byte) 'A');
    final byte[] flankOfGs = makeDummySequence(40, (byte) 'G');
    final byte[] duplicatedCs = makeDummySequence(20, (byte) 'C');

    // '+' strand representation: contig is A-flank, doubled C unit, G-flank
    outputStream.reset();
    outputStream.write(flankOfAs);
    outputStream.write(duplicatedCs);
    outputStream.write(flankOfGs);
    final byte[] forwardContig = outputStream.toByteArray();

    final AlignedAssembly.AlignmentInterval forwardFirst = new AlignedAssembly.AlignmentInterval(
            new SimpleInterval(chr, 100001, 100050), 1, 50, TextCigarCodec.decode("50M50S"), true, 60, 0);
    final AlignedAssembly.AlignmentInterval forwardSecond = new AlignedAssembly.AlignmentInterval(
            new SimpleInterval(chr, 100041, 100090), 51, 100, TextCigarCodec.decode("50S50M"), true, 60, 0);
    final ChimericAlignment forwardChimera = new ChimericAlignment(forwardFirst, forwardSecond, Collections.emptyList(), contigName);
    final NovelAdjacencyReferenceLocations forwardBreakpoints = new NovelAdjacencyReferenceLocations(forwardChimera, forwardContig);
    testData.add(new Tuple4<>(forwardFirst, forwardSecond, forwardBreakpoints, contigName));

    // '-' strand representation: reverse-complement each piece in place, then write them in the
    // opposite order
    SequenceUtil.reverseComplement(flankOfAs);
    SequenceUtil.reverseComplement(flankOfGs);
    SequenceUtil.reverseComplement(duplicatedCs);
    outputStream.reset();
    outputStream.write(flankOfGs);
    outputStream.write(duplicatedCs);
    outputStream.write(flankOfAs);
    final byte[] reverseContig = outputStream.toByteArray();

    final AlignedAssembly.AlignmentInterval reverseFirst = new AlignedAssembly.AlignmentInterval(
            new SimpleInterval(chr, 100041, 100090), 1, 50, TextCigarCodec.decode("50M50S"), false, 60, 0);
    final AlignedAssembly.AlignmentInterval reverseSecond = new AlignedAssembly.AlignmentInterval(
            new SimpleInterval(chr, 100001, 100050), 51, 100, TextCigarCodec.decode("50S50M"), false, 60, 0);
    final ChimericAlignment reverseChimera = new ChimericAlignment(reverseFirst, reverseSecond, Collections.emptyList(), contigName);
    final NovelAdjacencyReferenceLocations reverseBreakpoints = new NovelAdjacencyReferenceLocations(reverseChimera, reverseContig);
    testData.add(new Tuple4<>(reverseFirst, reverseSecond, reverseBreakpoints, contigName));

    return testData;
}
Also used : Tuple4(scala.Tuple4) ArrayList(java.util.ArrayList) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 14 with Tuple4

use of scala.Tuple4 in project gatk by broadinstitute.

the class SVDiscoveryTestDataProvider method forSimpleDeletion.

/**
 * Test data for a simple deletion: 40-'A' + 10-'C'+10-'T' + 40-'G' where the segment
 * 10-'C'+10-'T' is deleted (forward strand representation description).
 *
 * @param outputStream scratch stream used to concatenate the contig pieces; it is reset before
 *                     each contig is assembled
 * @return a list of two entries: the '+' strand representation followed by the '-' strand one
 * @throws IOException declared because the stream writes can throw it
 */
private static List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> forSimpleDeletion(final ByteArrayOutputStream outputStream) throws IOException {
    final String chr = "21";
    final String contigName = "asm000001:tig00001";
    final List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> testData = new ArrayList<>();

    final byte[] flankOfAs = makeDummySequence(40, (byte) 'A');
    final byte[] flankOfGs = makeDummySequence(40, (byte) 'G');

    // '+' strand representation: the contig is just the two flanks joined together
    outputStream.reset();
    outputStream.write(flankOfAs);
    outputStream.write(flankOfGs);
    final byte[] forwardContig = outputStream.toByteArray();

    final AlignedAssembly.AlignmentInterval forwardFirst = new AlignedAssembly.AlignmentInterval(
            new SimpleInterval(chr, 100001, 100040), 1, 40, TextCigarCodec.decode("40M40S"), true, 60, 0);
    final AlignedAssembly.AlignmentInterval forwardSecond = new AlignedAssembly.AlignmentInterval(
            new SimpleInterval(chr, 100061, 100100), 41, 80, TextCigarCodec.decode("40S40M"), true, 60, 0);
    final ChimericAlignment forwardChimera = new ChimericAlignment(forwardFirst, forwardSecond, Collections.emptyList(), contigName);
    final NovelAdjacencyReferenceLocations forwardBreakpoints = new NovelAdjacencyReferenceLocations(forwardChimera, forwardContig);
    testData.add(new Tuple4<>(forwardFirst, forwardSecond, forwardBreakpoints, contigName));

    // '-' strand representation: reverse-complement both flanks in place and swap their order
    SequenceUtil.reverseComplement(flankOfAs);
    SequenceUtil.reverseComplement(flankOfGs);
    outputStream.reset();
    outputStream.write(flankOfGs);
    outputStream.write(flankOfAs);
    final byte[] reverseContig = outputStream.toByteArray();

    final AlignedAssembly.AlignmentInterval reverseFirst = new AlignedAssembly.AlignmentInterval(
            new SimpleInterval(chr, 100061, 100100), 1, 40, TextCigarCodec.decode("40M40S"), false, 60, 0);
    final AlignedAssembly.AlignmentInterval reverseSecond = new AlignedAssembly.AlignmentInterval(
            new SimpleInterval(chr, 100001, 100040), 41, 80, TextCigarCodec.decode("40S40M"), false, 60, 0);
    final ChimericAlignment reverseChimera = new ChimericAlignment(reverseFirst, reverseSecond, Collections.emptyList(), contigName);
    final NovelAdjacencyReferenceLocations reverseBreakpoints = new NovelAdjacencyReferenceLocations(reverseChimera, reverseContig);
    testData.add(new Tuple4<>(reverseFirst, reverseSecond, reverseBreakpoints, contigName));

    return testData;
}
Also used : Tuple4(scala.Tuple4) ArrayList(java.util.ArrayList) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Aggregations

Tuple4 (scala.Tuple4)14 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)11 ArrayList (java.util.ArrayList)9 Comparator (java.util.Comparator)3 List (java.util.List)2 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)2 JavaRDD (org.apache.spark.api.java.JavaRDD)2 Function (org.apache.spark.api.java.function.Function)2 PairFunction (org.apache.spark.api.java.function.PairFunction)2 Tuple2 (scala.Tuple2)2 AtomicLong (java.util.concurrent.atomic.AtomicLong)1 Nullable (javax.annotation.Nullable)1 LongWritable (org.apache.hadoop.io.LongWritable)1 Text (org.apache.hadoop.io.Text)1 SequenceFileOutputFormat (org.apache.hadoop.mapred.SequenceFileOutputFormat)1 JavaDoubleRDD (org.apache.spark.api.java.JavaDoubleRDD)1 VoidFunction (org.apache.spark.api.java.function.VoidFunction)1