Search in sources :

Example 6 with Tuple4

use of scala.Tuple4 in project gatk by broadinstitute.

the class SVDiscoveryTestDataProvider method forSimpleTandemDuplicationContraction.

/**
 * Test data for a simple tandem duplication contraction:
 * 40-'A' + 20-'C' + 40-'G' is shrunk to 40-'A' + 10-'C' + 40-'G' (forward strand representation).
 * <p>
 * Each entry carries the two alignment intervals of the contig, the novel adjacency built from
 * them, and the contig name.
 *
 * @return a list of two entries: the '+' strand representation first, then the '-' strand one
 * @throws IOException if building the test data fails
 */
private static List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> forSimpleTandemDuplicationContraction() throws IOException {
    final List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> result = new ArrayList<>();
    // simple tandem duplication contraction '+' strand representation
    final byte[] leftRefFlank = makeDummySequence(40, (byte) 'A');
    final byte[] rightRefFlank = makeDummySequence(40, (byte) 'G');
    final byte[] doubleDup = makeDummySequence(20, (byte) 'C');
    // Only 10 of the 20 duplicated bases are copied onto the contig: 40 + 10 + 40 = 90 bases.
    final byte[] contigSeq = new byte[90];
    System.arraycopy(leftRefFlank, 0, contigSeq, 0, 40);
    System.arraycopy(doubleDup, 0, contigSeq, 40, 10);
    System.arraycopy(rightRefFlank, 0, contigSeq, 50, 40);
    AlignedAssembly.AlignmentInterval region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100001, 100050), 1, 50, TextCigarCodec.decode("50M40S"), true, 60, 0);
    // "40S50M" on the 90-base contig spans contig positions 41-90; the end used to be written as 100,
    // which lies past the end of the contig and disagrees with the CIGAR.
    AlignedAssembly.AlignmentInterval region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100051, 100100), 41, 90, TextCigarCodec.decode("40S50M"), true, 60, 0);
    final NovelAdjacencyReferenceLocations breakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), "asm000001:tig00001"), contigSeq);
    result.add(new Tuple4<>(region1, region2, breakpoints, "asm000001:tig00001"));
    // simple tandem duplication contraction '-' strand representation
    SequenceUtil.reverseComplement(leftRefFlank);
    SequenceUtil.reverseComplement(rightRefFlank);
    SequenceUtil.reverseComplement(doubleDup);
    // Use a separate array rather than overwriting contigSeq, which was already handed to the
    // '+' strand breakpoints object above (whether that object keeps a reference is not visible here).
    final byte[] contigSeqReverseStrand = new byte[contigSeq.length];
    System.arraycopy(rightRefFlank, 0, contigSeqReverseStrand, 0, 40);
    System.arraycopy(doubleDup, 0, contigSeqReverseStrand, 40, 10);
    System.arraycopy(leftRefFlank, 0, contigSeqReverseStrand, 50, 40);
    region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100051, 100100), 1, 50, TextCigarCodec.decode("50M40S"), false, 60, 0);
    // Same correction as on the '+' strand: the aligned part of "40S50M" ends at contig position 90.
    region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100001, 100050), 41, 90, TextCigarCodec.decode("40S50M"), false, 60, 0);
    final NovelAdjacencyReferenceLocations breakpointsDetectedFromReverseStrand = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), "asm000001:tig00001"), contigSeqReverseStrand);
    result.add(new Tuple4<>(region1, region2, breakpointsDetectedFromReverseStrand, "asm000001:tig00001"));
    return result;
}
Also used : Tuple4(scala.Tuple4) ArrayList(java.util.ArrayList) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 7 with Tuple4

use of scala.Tuple4 in project gatk by broadinstitute.

the class SVDiscoveryTestDataProvider method forDeletionWithHomology.

/**
 * Test data for a deletion flanked by homologous bases:
 * 40-'C' + 'ATCG' + 34 bases of unique sequence + 'ATCG' + 40-'T' is shrunk to
 * 40-'C' + 'ATCG' + 40-'T' (forward strand representation).
 *
 * @param outputStream scratch stream used to concatenate the contig bases; it is reset before each use
 * @return a list of two entries: the '+' strand representation first, then the '-' strand one
 * @throws IOException if writing to {@code outputStream} fails
 */
private static List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> forDeletionWithHomology(final ByteArrayOutputStream outputStream) throws IOException {
    final String contigName = "asm000001:tig00001";
    final List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> testData = new ArrayList<>();

    // '+' strand representation of the simple deletion with homology
    final byte[] flankOfC = makeDummySequence(40, (byte) 'C');
    final byte[] flankOfT = makeDummySequence(40, (byte) 'T');
    final byte[] sharedBases = new byte[] { 'A', 'T', 'C', 'G' };
    outputStream.reset();
    outputStream.write(flankOfC);
    outputStream.write(sharedBases);
    outputStream.write(flankOfT);
    final byte[] forwardContig = outputStream.toByteArray();
    final AlignedAssembly.AlignmentInterval forwardHead = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100001, 100044), 1, 44, TextCigarCodec.decode("44M40S"), true, 60, 0);
    final AlignedAssembly.AlignmentInterval forwardTail = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100079, 100122), 41, 84, TextCigarCodec.decode("40S44M"), true, 60, 0);
    final ChimericAlignment forwardChimera = new ChimericAlignment(forwardHead, forwardTail, Collections.emptyList(), contigName);
    testData.add(new Tuple4<>(forwardHead, forwardTail, new NovelAdjacencyReferenceLocations(forwardChimera, forwardContig), contigName));

    // '-' strand representation: every piece is reverse-complemented in place and written in the opposite order
    SequenceUtil.reverseComplement(flankOfC);
    SequenceUtil.reverseComplement(flankOfT);
    SequenceUtil.reverseComplement(sharedBases);
    outputStream.reset();
    outputStream.write(flankOfT);
    outputStream.write(sharedBases);
    outputStream.write(flankOfC);
    final byte[] reverseContig = outputStream.toByteArray();
    final AlignedAssembly.AlignmentInterval reverseHead = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100079, 100122), 1, 44, TextCigarCodec.decode("44M40S"), false, 60, 0);
    final AlignedAssembly.AlignmentInterval reverseTail = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100001, 100044), 41, 84, TextCigarCodec.decode("40S44M"), false, 60, 0);
    final ChimericAlignment reverseChimera = new ChimericAlignment(reverseHead, reverseTail, Collections.emptyList(), contigName);
    testData.add(new Tuple4<>(reverseHead, reverseTail, new NovelAdjacencyReferenceLocations(reverseChimera, reverseContig), contigName));

    return testData;
}
Also used : Tuple4(scala.Tuple4) ArrayList(java.util.ArrayList) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 8 with Tuple4

use of scala.Tuple4 in project gatk by broadinstitute.

the class SVDiscoveryTestDataProvider method forLongRangeSubstitution.

/**
 * Test data for a long range substitution:
 * 50-'A' + 50-'G' where the middle 10-'A' + 10-'G' is substituted with 10-'C'
 * (forward strand representation), giving a 90-base contig of 40-'A' + 10-'C' + 40-'G'.
 *
 * @return a list of two entries: the '+' strand representation first, then the '-' strand one
 * @throws IOException if building the test data fails
 */
private static List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> forLongRangeSubstitution() throws IOException {
    final List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> result = new ArrayList<>();
    // long range substitution '+' strand representation
    final byte[] leftRefFlank = makeDummySequence(50, (byte) 'A');
    final byte[] rightRefFlank = makeDummySequence(50, (byte) 'G');
    final byte[] substitution = makeDummySequence(10, (byte) 'C');
    // Only 40 bases of each 50-base flank reach the contig; the substitution fills the gap between them.
    final byte[] contigSeq = new byte[leftRefFlank.length + rightRefFlank.length - 10];
    System.arraycopy(leftRefFlank, 0, contigSeq, 0, 40);
    System.arraycopy(substitution, 0, contigSeq, 40, substitution.length);
    System.arraycopy(rightRefFlank, 0, contigSeq, 40 + substitution.length, 40);
    AlignedAssembly.AlignmentInterval region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100001, 100040), 1, 40, TextCigarCodec.decode("40M50S"), true, 60, 0);
    AlignedAssembly.AlignmentInterval region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100061, 100100), 51, 90, TextCigarCodec.decode("50S40M"), true, 60, 0);
    NovelAdjacencyReferenceLocations breakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), "asm000001:tig00001"), contigSeq);
    result.add(new Tuple4<>(region1, region2, breakpoints, "asm000001:tig00001"));
    // long range substitution '-' strand representation
    SequenceUtil.reverseComplement(leftRefFlank);
    SequenceUtil.reverseComplement(rightRefFlank);
    SequenceUtil.reverseComplement(substitution);
    // Use a separate array rather than overwriting contigSeq, which was already handed to the
    // '+' strand breakpoints object above (whether that object keeps a reference is not visible here).
    final byte[] contigSeqReverseStrand = new byte[contigSeq.length];
    System.arraycopy(rightRefFlank, 0, contigSeqReverseStrand, 0, 40);
    System.arraycopy(substitution, 0, contigSeqReverseStrand, 40, substitution.length);
    System.arraycopy(leftRefFlank, 0, contigSeqReverseStrand, 40 + substitution.length, 40);
    region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100061, 100100), 1, 40, TextCigarCodec.decode("40M50S"), false, 60, 0);
    region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100001, 100040), 51, 90, TextCigarCodec.decode("50S40M"), false, 60, 0);
    final NovelAdjacencyReferenceLocations breakpointsDetectedFromReverseStrand = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), "asm000001:tig00001"), contigSeqReverseStrand);
    result.add(new Tuple4<>(region1, region2, breakpointsDetectedFromReverseStrand, "asm000001:tig00001"));
    return result;
}
Also used : Tuple4(scala.Tuple4) ArrayList(java.util.ArrayList) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 9 with Tuple4

use of scala.Tuple4 in project gatk by broadinstitute.

the class SVDiscoveryTestDataProvider method forSimpleInversionWithNovelInsertion_leftFlankingForwardStrandOnly.

/**
 * Test data for an inversion with inserted sequence: the contig is 146-'A', then a single 'T',
 * then 50-'C'. The first alignment interval is on the forward strand and the second on the
 * reverse strand.
 *
 * @return the two alignment intervals, the novel adjacency built from them, and the contig name
 * @throws IOException if building the test data fails
 */
private static Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String> forSimpleInversionWithNovelInsertion_leftFlankingForwardStrandOnly() throws IOException {
    final String contigName = "asm000001:tig00001";

    // inversion with inserted sequence: flank, one inserted base, reverse-complemented flank
    final byte[] forwardFlank = makeDummySequence(146, (byte) 'A');
    final byte[] reverseComplementedFlank = makeDummySequence(50, (byte) 'C');
    final int insertedBaseOffset = forwardFlank.length;
    final byte[] contigBases = new byte[insertedBaseOffset + 1 + reverseComplementedFlank.length];
    System.arraycopy(forwardFlank, 0, contigBases, 0, insertedBaseOffset);
    contigBases[insertedBaseOffset] = (byte) 'T';
    System.arraycopy(reverseComplementedFlank, 0, contigBases, insertedBaseOffset + 1, reverseComplementedFlank.length);

    final AlignedAssembly.AlignmentInterval forwardPart = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 108569149, 108569294), 1, 146, TextCigarCodec.decode("146M51S"), true, 60, 0);
    final AlignedAssembly.AlignmentInterval invertedPart = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 108569315, 108569364), 148, 197, TextCigarCodec.decode("147S50M"), false, 60, 0);

    // The name and sequence are read back from the AlignedContig rather than reused directly.
    final AlignedContig contig = new AlignedContig(contigName, contigBases, Arrays.asList(forwardPart, invertedPart));
    final ChimericAlignment chimera = new ChimericAlignment(forwardPart, invertedPart, Collections.emptyList(), contig.contigName);
    final NovelAdjacencyReferenceLocations novelAdjacency = new NovelAdjacencyReferenceLocations(chimera, contig.contigSequence);
    return new Tuple4<>(forwardPart, invertedPart, novelAdjacency, contigName);
}
Also used : Tuple4(scala.Tuple4) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 10 with Tuple4

use of scala.Tuple4 in project gatk by broadinstitute.

the class SVDiscoveryTestDataProvider method forSimpleInsertion.

/**
 * Test data for a simple insertion:
 * 100-'A' + 100-'T' and a 50 bases of 'C' is inserted at the A->T junction point
 * (forward strand description), giving a 250-base contig.
 *
 * @param outputStream scratch stream used to concatenate the contig bases; it is reset before each use
 * @return a list of two entries: the '+' strand representation first, then the '-' strand one
 * @throws IOException if writing to {@code outputStream} fails
 */
private static List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> forSimpleInsertion(final ByteArrayOutputStream outputStream) throws IOException {
    final List<Tuple4<AlignedAssembly.AlignmentInterval, AlignedAssembly.AlignmentInterval, NovelAdjacencyReferenceLocations, String>> result = new ArrayList<>();
    // simple insertion '+' strand representation
    final byte[] leftRefFlank = makeDummySequence(100, (byte) 'A');
    final byte[] insertedSeq = makeDummySequence(50, (byte) 'C');
    final byte[] rightRefFlank = makeDummySequence(100, (byte) 'T');
    outputStream.reset();
    outputStream.write(leftRefFlank);
    outputStream.write(insertedSeq);
    outputStream.write(rightRefFlank);
    byte[] contigSeq = outputStream.toByteArray();
    // The contig is 100 + 50 + 100 = 250 bases, so the clipped part of each CIGAR is 150 bases
    // (inserted sequence plus the other flank). The CIGARs used to clip only 100 bases, which
    // described a 200-base contig and disagreed with the contig span 151-250 of region2.
    AlignedAssembly.AlignmentInterval region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100001, 100100), 1, 100, TextCigarCodec.decode("100M150S"), true, 60, 0);
    AlignedAssembly.AlignmentInterval region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100101, 100200), 151, 250, TextCigarCodec.decode("150S100M"), true, 60, 0);
    final NovelAdjacencyReferenceLocations breakpoints = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), "asm000001:tig00001"), contigSeq);
    result.add(new Tuple4<>(region1, region2, breakpoints, "asm000001:tig00001"));
    // simple insertion '-' strand representation
    SequenceUtil.reverseComplement(leftRefFlank);
    SequenceUtil.reverseComplement(rightRefFlank);
    SequenceUtil.reverseComplement(insertedSeq);
    outputStream.reset();
    outputStream.write(rightRefFlank);
    outputStream.write(insertedSeq);
    outputStream.write(leftRefFlank);
    contigSeq = outputStream.toByteArray();
    // Same 150-base clipping correction as on the '+' strand.
    region1 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100101, 100200), 1, 100, TextCigarCodec.decode("100M150S"), false, 60, 0);
    region2 = new AlignedAssembly.AlignmentInterval(new SimpleInterval("21", 100001, 100100), 151, 250, TextCigarCodec.decode("150S100M"), false, 60, 0);
    final NovelAdjacencyReferenceLocations breakpointsDetectedFromReverseStrand = new NovelAdjacencyReferenceLocations(new ChimericAlignment(region1, region2, Collections.emptyList(), "asm000001:tig00001"), contigSeq);
    result.add(new Tuple4<>(region1, region2, breakpointsDetectedFromReverseStrand, "asm000001:tig00001"));
    return result;
}
Also used : Tuple4(scala.Tuple4) ArrayList(java.util.ArrayList) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Aggregations

Tuple4 (scala.Tuple4): 14; SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 11; ArrayList (java.util.ArrayList): 9; Comparator (java.util.Comparator): 3; List (java.util.List): 2; JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 2; JavaRDD (org.apache.spark.api.java.JavaRDD): 2; Function (org.apache.spark.api.java.function.Function): 2; PairFunction (org.apache.spark.api.java.function.PairFunction): 2; Tuple2 (scala.Tuple2): 2; AtomicLong (java.util.concurrent.atomic.AtomicLong): 1; Nullable (javax.annotation.Nullable): 1; LongWritable (org.apache.hadoop.io.LongWritable): 1; Text (org.apache.hadoop.io.Text): 1; SequenceFileOutputFormat (org.apache.hadoop.mapred.SequenceFileOutputFormat): 1; JavaDoubleRDD (org.apache.spark.api.java.JavaDoubleRDD): 1; VoidFunction (org.apache.spark.api.java.function.VoidFunction): 1