Example 81 with Max

use of org.apache.commons.math3.stat.descriptive.rank.Max in project chordatlas by twak.

the class SkelFootprint method findRoofColor.

private static void findRoofColor(HalfMesh2 mesh) {
    class Wrapper implements Clusterable {

        double[] col;

        public Wrapper(float[] col) {
            this.col = new double[] { col[0], col[1], col[2] };
        }

        @Override
        public double[] getPoint() {
            return col;
        }
    }
    for (HalfFace hf : mesh.faces) {
        List<Wrapper> toCluster = new ArrayList<>();
        SuperFace sf = (SuperFace) hf;
        if (sf.colors == null || sf.colors.isEmpty()) {
            float grey = (float) (Math.random() * 0.3 + 0.2);
            sf.roofColor = new float[] { grey, grey, grey };
            continue;
        }
        for (float[] v : sf.colors) toCluster.add(new Wrapper(v));
        sf.colors = null;
        DBSCANClusterer<Wrapper> cr = new DBSCANClusterer<>(0.2, 5);
        List<Cluster<Wrapper>> results = cr.cluster(toCluster);
        float[] col = new float[] { 0.3f, 0.3f, 0.3f };
        try {
            Cluster<Wrapper> biggest = results.stream().max((a, b) -> Double.compare(a.getPoints().size(), b.getPoints().size())).get();
            col = new float[3];
            for (Wrapper w : biggest.getPoints()) for (int i = 0; i < 3; i++) col[i] += (float) w.col[i];
            int size = biggest.getPoints().size();
            for (int i = 0; i < 3; i++) col[i] /= size;
        } catch (NoSuchElementException e) {
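            // no cluster found (Optional.get() threw): keep the default grey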
        }
        sf.roofColor = col;
    }
}
Also used : Color(java.awt.Color) XStream(com.thoughtworks.xstream.XStream) Arrays(java.util.Arrays) FloatBuffer(java.nio.FloatBuffer) HalfFace(org.twak.utils.geom.HalfMesh2.HalfFace) Type(com.jme3.scene.VertexBuffer.Type) Arrayz(org.twak.utils.collections.Arrayz) Loop(org.twak.utils.collections.Loop) Node(com.jme3.scene.Node) SkelGen(org.twak.tweed.gen.skel.SkelGen) MutableDouble(org.twak.utils.MutableDouble) ColorRGBAPainter(org.twak.viewTrace.ColorRGBAPainter) Map(java.util.Map) Cache(org.twak.utils.Cache) Material(com.jme3.material.Material) ChangeListener(javax.swing.event.ChangeListener) Streamz(org.twak.utils.collections.Streamz) Point3d(javax.vecmath.Point3d) ChangeEvent(javax.swing.event.ChangeEvent) VertexBuffer(com.jme3.scene.VertexBuffer) LoopL(org.twak.utils.collections.LoopL) Predicate(java.util.function.Predicate) Line(org.twak.utils.Line) HalfEdge(org.twak.utils.geom.HalfMesh2.HalfEdge) Set(java.util.Set) HalfMesh2(org.twak.utils.geom.HalfMesh2) CollisionResult(com.jme3.collision.CollisionResult) Vector2d(javax.vecmath.Vector2d) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) MFPoint(org.twak.tweed.gen.FeatureCache.MFPoint) List(java.util.List) JSlider(javax.swing.JSlider) Optional(java.util.Optional) Rainbow(org.twak.utils.ui.Rainbow) CollisionResults(com.jme3.collision.CollisionResults) Mesh(com.jme3.scene.Mesh) Geometry(com.jme3.scene.Geometry) IntStream(java.util.stream.IntStream) ActionListener(java.awt.event.ActionListener) DBSCANClusterer(org.apache.commons.math3.ml.clustering.DBSCANClusterer) Vector2f(com.jme3.math.Vector2f) HashMap(java.util.HashMap) MiniFacade(org.twak.viewTrace.facades.MiniFacade) Cach(org.twak.utils.Cach) Plot(org.twak.utils.ui.Plot) SwingConstants(javax.swing.SwingConstants) TreeSet(java.util.TreeSet) Clusterable(org.apache.commons.math3.ml.clustering.Clusterable) Tweed(org.twak.tweed.Tweed) IndexBuffer(com.jme3.scene.mesh.IndexBuffer) ArrayList(java.util.ArrayList) TweedSettings(org.twak.tweed.TweedSettings) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) Mathz(org.twak.utils.Mathz) PaintThing(org.twak.utils.PaintThing) NoSuchElementException(java.util.NoSuchElementException) ProgressMonitor(javax.swing.ProgressMonitor) LinkedHashSet(java.util.LinkedHashSet) MatParam(com.jme3.material.MatParam) JButton(javax.swing.JButton) Texture2D(com.jme3.texture.Texture2D) Iterator(java.util.Iterator) MultiMap(org.twak.utils.collections.MultiMap) BufferUtils(com.jme3.util.BufferUtils) Vector3f(com.jme3.math.Vector3f) MeshBuilder(org.twak.siteplan.jme.MeshBuilder) MegaFacade(org.twak.tweed.gen.ProfileGen.MegaFacade) ModeCollector(org.twak.viewTrace.ModeCollector) JOptionPane(javax.swing.JOptionPane) MegaFeatures(org.twak.tweed.gen.FeatureCache.MegaFeatures) ActionEvent(java.awt.event.ActionEvent) File(java.io.File) Loopz(org.twak.utils.collections.Loopz) Point2d(javax.vecmath.Point2d) Jme3z(org.twak.siteplan.jme.Jme3z) Ray(com.jme3.math.Ray) SuperLine(org.twak.viewTrace.SuperLine) LineHeight(org.twak.viewTrace.facades.LineHeight) Format(com.jme3.scene.VertexBuffer.Format) Cluster(org.apache.commons.math3.ml.clustering.Cluster) ColorRGBA(com.jme3.math.ColorRGBA) FileReader(java.io.FileReader) ImageRaster(com.jme3.texture.image.ImageRaster) Comparator(java.util.Comparator) Collections(java.util.Collections)
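
A minimal, self-contained sketch of the same clustering pattern may help; the class name and sample values below are invented for illustration. It keeps findRoofColor's DBSCAN parameters (eps 0.2, minPts 5), averages the most populous cluster, and keeps the grey default when nothing clusters:

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

import org.apache.commons.math3.ml.clustering.Cluster;
import org.apache.commons.math3.ml.clustering.Clusterable;
import org.apache.commons.math3.ml.clustering.DBSCANClusterer;

public class RoofColorSketch {

    // wraps an RGB triple so commons-math3 can cluster it in colour space
    static class ColorPoint implements Clusterable {
        final double[] rgb;

        ColorPoint(double r, double g, double b) {
            rgb = new double[] { r, g, b };
        }

        @Override
        public double[] getPoint() {
            return rgb;
        }
    }

    public static void main(String[] args) {
        // six reddish samples (one cluster) and a green outlier (noise)
        List<ColorPoint> samples = Arrays.asList(
            new ColorPoint(0.80, 0.20, 0.20), new ColorPoint(0.82, 0.22, 0.19),
            new ColorPoint(0.81, 0.21, 0.21), new ColorPoint(0.79, 0.18, 0.22),
            new ColorPoint(0.78, 0.23, 0.20), new ColorPoint(0.80, 0.19, 0.18),
            new ColorPoint(0.10, 0.90, 0.10));

        // same parameters as findRoofColor: eps = 0.2 in RGB space, minPts = 5
        DBSCANClusterer<ColorPoint> dbscan = new DBSCANClusterer<>(0.2, 5);
        List<Cluster<ColorPoint>> clusters = dbscan.cluster(samples);

        // average the most populous cluster; keep the grey fallback if none formed
        double[] col = { 0.3, 0.3, 0.3 };
        clusters.stream()
                .max(Comparator.comparingInt(c -> c.getPoints().size()))
                .ifPresent(biggest -> {
                    Arrays.fill(col, 0);
                    int n = biggest.getPoints().size();
                    for (ColorPoint p : biggest.getPoints())
                        for (int i = 0; i < 3; i++) col[i] += p.rgb[i] / n;
                });
        System.out.println(Arrays.toString(col));
    }
}

Using Optional.ifPresent as above also sidesteps the get()/empty-catch pair in findRoofColor.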

Example 82 with Max

use of org.apache.commons.math3.stat.descriptive.rank.Max in project systemml by apache.

the class DataGenMR method runJob.

/**
 * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
 *
 * @param inst MR job instruction
 * @param dataGenInstructions array of data gen instructions
 * @param instructionsInMapper instructions in mapper
 * @param aggInstructionsInReducer aggregate instructions in reducer
 * @param otherInstructionsInReducer other instructions in reducer
 * @param numReducers number of reducers
 * @param replication file replication
 * @param resultIndexes result indexes for each random object
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs output file for each random object
 * @param outputInfos output information for each random object
 * @return matrix characteristics for each random object
 * @throws Exception if an exception occurs
 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");
    // whether to use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);
    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];
    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;
    for (int i = 0; i < dataGenInstructions.length; i++) {
        dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];
        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRType mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;
        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();
        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);
        if (mrtype == MRType.Rand) {
            RandInstruction randInst = (RandInstruction) mrins;
            inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
            maxsparsity = Math.max(maxsparsity, randInst.getSparsity());
            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));
                // for obj reuse and preventing repeated buffer re-allocations
                StringBuilder sb = new StringBuilder();
                // seed generation
                Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
                for (long r = 0; r < Math.max(rlens[i], 1); r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    for (long c = 0; c < Math.max(clens[i], 1); c += bclens[i]) {
                        long curBlockColSize = Math.min(bclens[i], (clens[i] - c));
                        sb.append((r / brlens[i]) + 1);
                        sb.append(',');
                        sb.append((c / bclens[i]) + 1);
                        sb.append(',');
                        sb.append(curBlockRowSize);
                        sb.append(',');
                        sb.append(curBlockColSize);
                        sb.append(',');
                        sb.append(bigrand.nextLong());
                        pw.println(sb.toString());
                        sb.setLength(0);
                        numblocks++;
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else if (mrtype == MRType.Seq) {
            SeqInstruction seqInst = (SeqInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
            // always dense
            maxsparsity = 1.0;
            double from = seqInst.fromValue;
            double to = seqInst.toValue;
            double incr = seqInst.incrValue;
            // handle default 1 to -1 for special case of from>to
            incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);
            // Correctness checks on (from, to, incr)
            boolean neg = (from > to);
            if (incr == 0)
                throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");
            if (neg != (incr < 0))
                throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");
            // Compute the number of rows in the sequence
            long numrows = UtilFunctions.getSeqLength(from, to, incr);
            if (rlens[i] > 0) {
                if (numrows != rlens[i])
                    throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Expected number of rows does not match given number: " + rlens[i] + " != " + numrows);
            } else {
                rlens[i] = numrows;
            }
            if (clens[i] > 0 && clens[i] != 1)
                throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Number of columns (" + clens[i] + ") must be equal to 1.");
            else
                clens[i] = 1;
            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));
                StringBuilder sb = new StringBuilder();
                double temp = from;
                double block_from, block_to;
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval)
                    long bid_i = ((r / brlens[i]) + 1);
                    long bid_j = 1;
                    block_from = temp;
                    block_to = temp + (curBlockRowSize - 1) * incr;
                    // next block starts from here
                    temp = block_to + incr;
                    sb.append(bid_i);
                    sb.append(',');
                    sb.append(bid_j);
                    sb.append(',');
                    sb.append(block_from);
                    sb.append(',');
                    sb.append(block_to);
                    sb.append(',');
                    sb.append(incr);
                    pw.println(sb.toString());
                    sb.setLength(0);
                    numblocks++;
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else {
            throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
    }
    // remove the first ","
    dataGenInsStr = dataGenInsStr.substring(1);
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
        // set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
        // set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);
        // set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
        MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
        // set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
        // set up the rand Instructions
        MRJobConfiguration.setRandInstructions(job, dataGenInsStr);
        // set up the unary instructions that will be performed in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
        // set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
        // set up the instructions that will happen in the reducer, after the aggregation instructions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
        // set up the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
        // set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
        // set up custom map/reduce configurations
        MRJobConfiguration.setupCustomMRConfigurations(job, config);
        // determine degree of parallelism (nmappers: 1<=n<=capacity)
        // TODO use maxsparsity whenever we have a way of generating sparse rand data
        int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
        // correct the max number of mappers on YARN clusters
        if (InfrastructureAnalyzer.isYarnEnabled())
            capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
        int nmappers = Math.max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
        job.setNumMapTasks(nmappers);
        // set up what matrices are needed to pass from the mapper to reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
        stats = ret.stats;
        // set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
        // print the complete MRJob instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);
        // Update resultDimsUnknown based on computed "stats"
        byte[] resultDimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                resultDimsUnknown[i] = (byte) 1;
            } else {
                resultDimsUnknown[i] = (byte) 0;
            }
        }
        boolean mayContainCtable = instructionsInMapper.contains("ctabletransform") || instructionsInMapper.contains("groupedagg");
        // set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);
        // configure mapper and the mapper output key value pairs
        job.setMapperClass(DataGenMapper.class);
        if (numReducers == 0) {
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(Writable.class);
        } else {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        }
        // set up combiner
        if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
            job.setCombinerClass(GMRCombiner.class);
        // configure reducer
        job.setReducerClass(GMRReducer.class);
        // job.setReducerClass(PassThroughReducer.class);
        // By default, the job executes in "cluster" mode.
        // Determine if we can optimize and run it in "local" mode.
        MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
        }
        // set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);
        runjob = JobClient.runJob(job);
        /* Process different counters */
        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }
        String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
        stats = MapReduceTool.processDimsFiles(dir, stats);
        MapReduceTool.deleteFileIfExistOnHDFS(dir);
    } finally {
        for (String input : inputs) MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Also used : Group(org.apache.hadoop.mapred.Counters.Group) DataGenMRInstruction(org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) GMRCombiner(org.apache.sysml.runtime.matrix.mapred.GMRCombiner) FileSystem(org.apache.hadoop.fs.FileSystem) MRInstruction(org.apache.sysml.runtime.instructions.mr.MRInstruction) JobConf(org.apache.hadoop.mapred.JobConf) PrintWriter(java.io.PrintWriter) Path(org.apache.hadoop.fs.Path) DMLConfig(org.apache.sysml.conf.DMLConfig) SeqInstruction(org.apache.sysml.runtime.instructions.mr.SeqInstruction) RandInstruction(org.apache.sysml.runtime.instructions.mr.RandInstruction) MRType(org.apache.sysml.runtime.instructions.mr.MRInstruction.MRType) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixChar_N_ReducerGroups(org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups) RunningJob(org.apache.hadoop.mapred.RunningJob) Well1024a(org.apache.commons.math3.random.Well1024a)
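
The Rand branch above writes one metadata line per matrix block: the 1-based block row and column index, the block's actual dimensions, and a per-block seed. Below is a hedged, standalone sketch of that layout; the matrix dimensions are made up, and seeding a Well1024a directly from a long stands in for LibMatrixDatagen.setupSeedsForRand:

import org.apache.commons.math3.random.Well1024a;

public class RandSeedFileSketch {
    public static void main(String[] args) {
        long rlen = 2500, clen = 1200; // hypothetical matrix dimensions
        int brlen = 1000, bclen = 1000; // block dimensions
        Well1024a bigrand = new Well1024a(7L); // per-block seed source

        // reused to avoid repeated buffer re-allocations, as in runJob
        StringBuilder sb = new StringBuilder();
        for (long r = 0; r < Math.max(rlen, 1); r += brlen) {
            long curBlockRowSize = Math.min(brlen, rlen - r);
            for (long c = 0; c < Math.max(clen, 1); c += bclen) {
                long curBlockColSize = Math.min(bclen, clen - c);
                // block row index, block column index, rows, cols, seed
                sb.append(r / brlen + 1).append(',')
                  .append(c / bclen + 1).append(',')
                  .append(curBlockRowSize).append(',')
                  .append(curBlockColSize).append(',')
                  .append(bigrand.nextLong());
                System.out.println(sb);
                sb.setLength(0);
            }
        }
    }
}

With rlen=2500, clen=1200 and 1000x1000 blocks this prints a 3x2 grid of blocks, the final block row and column truncated to 500 and 200 cells respectively.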

Example 83 with Max

use of org.apache.commons.math3.stat.descriptive.rank.Max in project polyGembler by c-zhou.

the class Anchor1 method run.

// private static DijkstraShortestPath<Integer, DefaultWeightedEdge> dijkstra_paths;
@Override
public void run() {
    // TODO Auto-generated method stub
    long tic = System.nanoTime();
    sub_seqs = Sequence.parseFastaFileAsMap(subject_file);
    qry_seqs = Sequence.parseFastaFileWithRevCmpAsMap(query_file);
    // this.makeAssemblyGraph(query_file);
    this.makeAssemblyGraph(query_file);
    bfs = new BFSShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
    // dijkstra_paths = new DijkstraShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
    // find 'N/n's in subject/reference sequences
    // which could affect parsing of the blast records
    final Map<String, TreeRangeSet<Integer>> sub_gaps = new HashMap<String, TreeRangeSet<Integer>>();
    final Map<String, List<Blast6Segment>> anchored_records = new HashMap<String, List<Blast6Segment>>();
    for (Map.Entry<String, Sequence> entry : sub_seqs.entrySet()) {
        String seq_sn = entry.getKey();
        String seq_str = entry.getValue().seq_str();
        final TreeRangeSet<Integer> tmp_rangeSet = TreeRangeSet.create();
        for (int j = 0; j < seq_str.length(); j++) {
            if (seq_str.charAt(j) == 'N' || seq_str.charAt(j) == 'n')
                // blast records use 1-based closed coordinates
                tmp_rangeSet.add(Range.closed(j + 1, j + 1).canonical(DiscreteDomain.integers()));
        }
        int seq_ln = seq_str.length();
        final TreeRangeSet<Integer> range_set = TreeRangeSet.create();
        for (Range<Integer> range : tmp_rangeSet.asRanges()) {
            int lowerend = range.hasLowerBound() ? Math.max(0, range.lowerEndpoint() - gap_buff) : 0;
            int upperend = range.hasUpperBound() ? Math.min(seq_ln, range.upperEndpoint() + gap_buff - 1) : seq_ln;
            range_set.add(Range.closed(lowerend, upperend).canonical(DiscreteDomain.integers()));
        }
        sub_gaps.put(seq_sn, range_set);
        // initialise an array list for each reference chromosome
        anchored_records.put(seq_sn, new ArrayList<Blast6Segment>());
    }
    // parse blast records
    // blast records buffer
    final List<Blast6Segment> buff = new ArrayList<Blast6Segment>();
    // selected blast records
    final List<Blast6Segment> sel_recs = new ArrayList<Blast6Segment>();
    // temp list
    final List<Blast6Segment> tmp_records = new ArrayList<Blast6Segment>();
    try {
        BufferedReader br_blast = Utils.getBufferedReader(blast_out);
        Blast6Segment tmp_record = Blast6Segment.blast6Segment(br_blast.readLine());
        Blast6Segment primary_record, secondary_record;
        String qry;
        double qry_ln, aln_frac;
        while (tmp_record != null) {
            qry = tmp_record.qseqid();
            qry_ln = qry_seqs.get(qry).seq_ln();
            buff.clear();
            buff.add(tmp_record);
            while ((tmp_record = Blast6Segment.blast6Segment(br_blast.readLine())) != null && tmp_record.qseqid().equals(qry)) {
                // filter by identity
                if (tmp_record.pident() >= this.min_ident)
                    buff.add(tmp_record);
            }
            sel_recs.clear();
            // merge collinear records
            for (String sub : sub_seqs.keySet()) {
                // get all records for subject/reference sequence sub_seq
                tmp_records.clear();
                for (Blast6Segment record : buff) if (record.sseqid().equals(sub))
                    tmp_records.add(record);
                if (tmp_records.isEmpty())
                    continue;
                // find alignment segments that can be deleted
                // those that are subsets of larger alignment segments
                // (Sstart, (sstart, send), Send) and (Qstart, (qstart, qend), Qend)
                Collections.sort(tmp_records, new Blast6Segment.SegmentSizeComparator());
                final Set<int[][]> ranges = new HashSet<int[][]>();
                outerloop: for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    int[][] range = new int[2][2];
                    if (primary_record.sstart() < primary_record.send()) {
                        range[0][0] = primary_record.sstart();
                        range[0][1] = primary_record.send();
                    } else {
                        range[0][0] = primary_record.send();
                        range[0][1] = primary_record.sstart();
                    }
                    if (primary_record.qstart() < primary_record.qend()) {
                        range[1][0] = primary_record.qstart();
                        range[1][1] = primary_record.qend();
                    } else {
                        range[1][0] = primary_record.qend();
                        range[1][1] = primary_record.qstart();
                    }
                    for (int[][] r : ranges) {
                        if (r[0][0] <= range[0][0] && r[0][1] >= range[0][1] && r[1][0] <= range[1][0] && r[1][1] >= range[1][1]) {
                            tmp_records.remove(i--);
                            continue outerloop;
                        }
                    }
                    ranges.add(range);
                }
                // TODO rewrite this part
                /**
                 *					// find collinear alignment segments that can be merged
                 *					Collections.sort(tmp_records, new BlastRecord.SInterceptComparator());
                 *
                 *					collinear_merged.clear();
                 *					final List<Blast6Record> temp = new ArrayList<Blast6Record>();
                 *					for(int i=0; i<tmp_records.size(); ) {
                 *						Blast6Record record = tmp_records.get(i);
                 *						double max_shift;
                 *						temp.clear();
                 *						temp.add(record);
                 *
                 *						// find collinear alignment segments
                 *						outerloop:
                 *							while( (++i)<tmp_records.size() ) {
                 *								record = tmp_records.get(i);
                 *								// check if is collinear with other alignment segments
                 *								for(Blast6Record r : temp) {
                 *									max_shift = collinear_shift*
                 *											Math.min(r.length(), record.length());
                 *									if(BlastRecord.sdistance(r, record)<=max_shift &&
                 *											BlastRecord.qdistance(r, record)<=max_shift &&
                 *											BlastRecord.pdistance(r, record)<=max_shift) {
                 *										temp.add(record);
                 *										continue outerloop;
                 *									}
                 *								}
                 *								break;
                 *							}
                 *
                 *						// merge collinear alignment segments
                 *						int qstart = Integer.MAX_VALUE;
                 *						int qend = Integer.MIN_VALUE;
                 *						int sstart = Integer.MAX_VALUE;
                 *						int send = Integer.MIN_VALUE;
                 *						double pident = 0;
                 *						int length = 0;
                 *
                 *						// qstart is always smaller than qend
                 *						for(int j=0; j<temp.size(); j++) {
                 *							record = temp.get(j);
                 *							if(record.qstart()<qstart) {
                 *								qstart = record.qstart();
                 *								sstart = record.sstart();
                 *							}
                 *							if(record.qend()>qend) {
                 *								qend = record.qend();
                 *								send = record.send();
                 *							}
                 *							if(record.pident()>pident)
                 *								pident = record.pident();
                 *							length += record.length();
                 *						}
                 *
                 *						collinear_merged.add(new Blast6Record(qry,sub,pident,length,-1,-1,qstart,qend,sstart,send,-1,-1));
                 *					}
                 */
                // find collinear alignment segments that can be merged
                // more accurate but slower
                Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
                Blast6Segment record;
                for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    for (int j = i + 1; j < tmp_records.size(); j++) {
                        secondary_record = tmp_records.get(j);
                        double max_shift = collinear_shift * Math.min(primary_record.length(), secondary_record.length());
                        if ((record = Blast6Segment.collinear(primary_record, secondary_record, max_shift)) != null) {
                            tmp_records.set(i, record);
                            tmp_records.remove(j);
                            --i;
                            break;
                        }
                    }
                }
                // process blast records that are clipped by gaps
                // (sstart, send)---(start2, send2)
                // (sstart  ...  ---  ...    send2)
                TreeRangeSet<Integer> sub_gap = sub_gaps.get(sub);
                Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
                for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    if (sub_gap.contains(primary_record.true_send())) {
                        secondary_record = null;
                        int sec_j = -1;
                        for (int j = i + 1; j < tmp_records.size(); j++) {
                            if (tmp_records.get(j).true_sstart() >= primary_record.true_send()) {
                                secondary_record = tmp_records.get(j);
                                sec_j = j;
                                break;
                            }
                        }
                        if (secondary_record == null || AlignmentSegment.reverse(primary_record, secondary_record)) {
                            // reverse alignment segments
                            continue;
                        }
                        if (sub_gap.contains(secondary_record.true_sstart()) && sub_gap.rangeContaining(primary_record.true_send()).equals(sub_gap.rangeContaining(secondary_record.true_sstart()))) {
                            // clipping
                            // merge two alignment segments
                            double pident = Math.max(primary_record.pident(), secondary_record.pident());
                            int qstart = Math.min(primary_record.true_qstart(), secondary_record.true_qstart());
                            int qend = Math.max(primary_record.true_qend(), secondary_record.true_qend());
                            int sstart = Math.min(primary_record.true_sstart(), secondary_record.true_sstart());
                            int send = Math.max(primary_record.true_send(), secondary_record.true_send());
                            int length = qend - qstart + 1;
                            // replace primary record with merged record
                            // delete secondary record
                            Blast6Segment merged_record = primary_record.forward() ? new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, sstart, send, -1, -1) : new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, send, sstart, -1, -1);
                            tmp_records.set(i, merged_record);
                            tmp_records.remove(sec_j);
                            // the merged records need to be processed
                            --i;
                        }
                    }
                }
                // add to sel_recs
                sel_recs.addAll(tmp_records);
            }
            // filter by alignment fraction
            buff.clear();
            buff.addAll(sel_recs);
            sel_recs.clear();
            for (Blast6Segment record : buff) {
                if (record.length() / qry_ln >= this.min_frac)
                    sel_recs.add(record);
            }
            if (sel_recs.isEmpty()) {
                // continue
                continue;
            }
            // filter blast records
            Collections.sort(sel_recs, new Blast6Segment.MatchIndentityComparator());
            // process primary alignment
            primary_record = sel_recs.get(0);
            anchored_records.get(primary_record.sseqid()).add(primary_record);
            aln_frac = primary_record.length() / qry_ln;
            // and process
            for (int i = 1; i < sel_recs.size(); i++) {
                secondary_record = sel_recs.get(i);
                if (secondary_record.pident() + this.diff_ident < primary_record.pident() || secondary_record.length() / qry_ln + this.diff_frac < aln_frac) {
                    break;
                } else {
                    anchored_records.get(secondary_record.sseqid()).add(secondary_record);
                }
            }
        }
        br_blast.close();
        for (Map.Entry<String, List<Blast6Segment>> entry : anchored_records.entrySet()) {
            System.out.println(entry.getKey() + ": " + entry.getValue().size());
        }
        final BufferedWriter bw_map = Utils.getBufferedWriter(out_prefix + ".map");
        final BufferedWriter bw_fa = Utils.getBufferedWriter(out_prefix + ".fa");
        final Set<String> anchored_seqs = new HashSet<String>();
        final List<String> sub_list = Sequence.parseSeqList(subject_file);
        final List<GraphPath<Integer, DefaultWeightedEdge>> paths = new ArrayList<GraphPath<Integer, DefaultWeightedEdge>>();
        for (String sub_sn : sub_list) {
            blast6_records = anchored_records.get(sub_sn);
            int nV = blast6_records.size(), count = 0;
            // sort blast records
            Collections.sort(blast6_records, new Blast6Segment.SubjectCoordinationComparator());
            // consensus
            int posUpto = 0, send_clip = 0;
            int sstart, send, qstart, qend, qlen, tmp_int, qstart_clip, qend_clip, gap_size;
            // distance to last 'N', start from next position to find longest common suffix-prefix
            int prev_n = Integer.MAX_VALUE;
            int mol_len = 0;
            int bfs_dist;
            String qseq;
            int nS, nQ;
            // first step: construct super scaffold
            // will form a tree graph indicating the path through the contigs
            // convert contig names to integer indices
            // one contig could end up with multiple indices due to repeats
            Map<Integer, String> ss_coordinate = new HashMap<Integer, String>();
            int index = 0;
            for (Blast6Segment record : blast6_records) ss_coordinate.put(index++, record.qseqid() + (record.forward() ? "" : "'"));
            int qseqid1, qseqid2;
            String os;
            List<Double> dist_stats = new ArrayList<Double>();
            for (int i = 0; i != nV; i++) {
                qseqid1 = contig_coordinate.getKey(ss_coordinate.get(i));
                os = blast6_records.get(i).qseqid() + (blast6_records.get(i).forward() ? "" : "'");
                System.out.print(Utils.fixedLengthPaddingString(os, 40) + "\t");
                double distance = Double.MAX_VALUE;
                for (int j = 1; j <= window_size; j++) {
                    if (i + j >= nV) {
                        System.out.print(Utils.fixedLengthPaddingString("", 10) + ",\t");
                        continue;
                    }
                    qseqid2 = contig_coordinate.getKey(ss_coordinate.get(i + j));
                    GraphPath<Integer, DefaultWeightedEdge> e = bfs.getPath(qseqid1, qseqid2);
                    // GraphPath<Integer, DefaultWeightedEdge> e = dijkstra_paths.getPath( qseqid1, qseqid2 );
                    if (e != null) {
                        double d = e.getLength();
                        if (distance > d)
                            distance = d;
                    }
                    int l = 0;
                    if (e != null) {
                        List<Integer> vertices = e.getVertexList();
                        for (int k = 1; k < vertices.size() - 1; k++) l += qry_seqs.get(contig_coordinate.get(vertices.get(k))).seq_ln();
                        l -= (vertices.size() - 2) * this.kmer_size;
                    }
                    System.out.print((e == null ? Utils.fixedLengthPaddingString("Inf " + l, 10) + ",\t" : Utils.fixedLengthPaddingString(e.getLength() + " " + l, 10) + ",\t"));
                }
                System.out.println();
                dist_stats.add(distance);
            }
            double[] dist_arr = ArrayUtils.toPrimitive(dist_stats.toArray(new Double[dist_stats.size()]));
            Arrays.sort(dist_arr);
            Percentile percentile = new Percentile();
            double q1 = percentile.evaluate(dist_arr, 25);
            double q3 = percentile.evaluate(dist_arr, 75);
            // upper bound for outliers
            // q3+1.5*IQR
            double outlier = q3 + 1.5 * (q3 - q1);
            System.out.println("Outlier upper bound: " + outlier);
            // construct a tree graph
            pseudo_tree = new DirectedWeightedPseudograph<Integer, DefaultWeightedEdge>(DefaultWeightedEdge.class);
            for (int i : ss_coordinate.keySet()) pseudo_tree.addVertex(i);
            Blast6Segment record1, record2;
            // select the nearest neighbour(s)
            for (int i = 0; i != nV; i++) {
                System.out.println(i);
                record1 = blast6_records.get(i);
                send = record1.true_send();
                qseqid1 = contig_coordinate.getKey(ss_coordinate.get(i));
                paths.clear();
                bfs_dist = Integer.MAX_VALUE;
                for (int j = i + 1; j < nV; j++) {
                    record2 = blast6_records.get(j);
                    sstart = record2.true_sstart();
                    if (send < sstart)
                        break;
                    qseqid2 = contig_coordinate.getKey(ss_coordinate.get(j));
                    GraphPath<Integer, DefaultWeightedEdge> e = bfs.getPath(qseqid1, qseqid2);
                    // GraphPath<Integer, DefaultWeightedEdge> e = dijkstra_paths.getPath( qseqid1, qseqid2 );
                    if (e != null) {
                        tmp_int = e.getLength();
                        if (tmp_int == 0)
                            continue;
                        if (tmp_int < bfs_dist)
                            paths.clear();
                        paths.add(e);
                        bfs_dist = tmp_int;
                    }
                }
                if (paths.isEmpty())
                    continue;
                for (GraphPath<Integer, DefaultWeightedEdge> path : paths) pseudo_tree.setEdgeWeight(pseudo_tree.addEdge(path.getStartVertex(), path.getEndVertex()), 1d);
            }
            // secondly fill gaps
            long toc = System.nanoTime();
            System.out.println(toc - tic + " nano secs.");
            if (true)
                throw new RuntimeException("!!!");
            // second step: construct pseudo chromosomes
            // 1. choose the first contig
            int firstv = 0;
            while (true) {
                // int v2 = next(firstv+1);
                if (true)
                    break;
            }
            // 2. extend pseudo chromosomes
            StringBuilder seq_str = new StringBuilder();
            for (int v = firstv; v < nV - 1; v++) {
                if (++count % 10000 == 0)
                    myLogger.info(sub_sn + " " + count + "/" + nV + " done.");
                Blast6Segment record = blast6_records.get(v);
                qlen = qry_seqs.get(record.qseqid()).seq_ln();
                sstart = record.sstart();
                send = record.send();
                qstart = record.qstart();
                qend = record.qend();
                if (sstart > send) {
                    // make sure sstart<send
                    tmp_int = sstart;
                    sstart = send;
                    send = tmp_int;
                    tmp_int = qstart;
                    qstart = qend;
                    qend = tmp_int;
                }
                if (qstart > qend) {
                    qstart_clip = qlen - qstart;
                    qend_clip = qend - 1;
                    qseq = Sequence.revCompSeq(qry_seqs.get(record.qseqid()).seq_str());
                } else {
                    qstart_clip = qstart - 1;
                    qend_clip = qlen - qend;
                    qseq = qry_seqs.get(record.qseqid()).seq_str();
                }
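                // precedence note: && binds tighter than ||, so this reads
                // send<posUpto OR (sstart<posUpto AND qstart_clip>max_clip)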
                if (send < posUpto || sstart < posUpto && qstart_clip > max_clip) {
                    // TODO process
                    continue;
                }
                // find longest suffix-prefix
                nS = seq_str.length();
                nQ = qseq.length();
                int nO = Math.min(prev_n, Math.min(nS, nQ));
                outerloop: for (; nO >= min_overlap; nO--) {
                    int nS_i = nS - nO;
                    for (int i = 0; i < nO; i++) {
                        if (seq_str.charAt(nS_i + i) != qseq.charAt(i))
                            continue outerloop;
                    }
                    break outerloop;
                }
                if (nO < min_overlap) {
                    // otherwise will insert a large GAP max(pseudo_distance, max_gap)
                    if (posUpto > 0) {
                        if (sstart <= posUpto) {
                            // if too much overlap, then treat it as a contradiction
                            if (posUpto - sstart > min_overlap) {
                                // discard
                                continue;
                            } else {
                                // insert a min_gap
                                gap_size = min_gap;
                            }
                        } else {
                            // estimate gap size
                            gap_size = (sstart - posUpto) - (send_clip + qstart_clip);
                            if (gap_size < max_gap)
                                gap_size = max_gap;
                        }
                        bw_map.write("GAP\t" + gap_size + "\t0\t" + gap_size + "\t+\t" + sub_sn + "\t" + mol_len + "\t" + (mol_len + gap_size) + "\n");
                        seq_str.append(Sequence.polyN(gap_size));
                        mol_len += gap_size;
                    }
                    bw_map.write(record.qseqid() + "\t" + qlen + "\t");
                    if (qstart > qend) {
                        // reverse
                        bw_map.write("0\t" + qlen + "\t-\t");
                    } else {
                        // forward
                        bw_map.write("0\t" + qlen + "\t+\t");
                    }
                    seq_str.append(qseq);
                    bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen) + "\n");
                    mol_len += qlen;
                    prev_n = qlen;
                    anchored_seqs.add(record.qseqid());
                } else {
                    // overlap found
                    // will not insert gap
                    // ====================
                    // /-------\
                    // /----\
                    // calculate overlaps
                    // process overlap
                    qstart = nO;
                    if (qstart == qlen)
                        continue;
                    bw_map.write(record.qseqid() + "\t" + (qlen - qstart) + "\t");
                    if (qstart > qend) {
                        // reverse
                        bw_map.write(0 + "\t" + (qlen - qstart) + "\t-\t");
                    } else {
                        // forward
                        bw_map.write(qstart + "\t" + qlen + "\t+\t");
                    }
                    bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen - qstart) + "\n");
                    mol_len += qlen - qstart;
                    prev_n += qlen - qstart;
                    seq_str.append(qseq.substring(qstart));
                    anchored_seqs.add(record.qseqid());
                }
                posUpto = send;
                send_clip = qend_clip;
            }
            if (seq_str.length() > 0)
                bw_fa.write(Sequence.formatOutput(sub_sn, seq_str.toString()));
        }
        bw_fa.close();
        bw_map.close();
        final BufferedWriter bw_ufa = Utils.getBufferedWriter(out_prefix + "_unplaced.fa");
        for (String seq : qry_seqs.keySet()) if (!anchored_seqs.contains(seq))
            bw_ufa.write(qry_seqs.get(seq).formatOutput());
        bw_ufa.close();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Also used : Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) TreeRangeSet(com.google.common.collect.TreeRangeSet) HashMap(java.util.HashMap) Blast6Segment(cz1.ngs.model.Blast6Segment) ArrayList(java.util.ArrayList) GraphPath(org.jgrapht.GraphPath) BufferedWriter(java.io.BufferedWriter) List(java.util.List) HashSet(java.util.HashSet) Sequence(cz1.ngs.model.Sequence) IOException(java.io.IOException) DefaultWeightedEdge(org.jgrapht.graph.DefaultWeightedEdge) AlignmentSegment(cz1.ngs.model.AlignmentSegment) BufferedReader(java.io.BufferedReader) Map(java.util.Map) BidiMap(org.apache.commons.collections4.BidiMap) DualHashBidiMap(org.apache.commons.collections4.bidimap.DualHashBidiMap)
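
The outlier bound computed in run() is a plain Tukey fence over the BFS path lengths. Here is a minimal sketch with invented distances; note that commons-math3's Percentile copies and sorts its input internally, so the explicit sort only mirrors the original code:

import java.util.Arrays;

import org.apache.commons.math3.stat.descriptive.rank.Percentile;

public class OutlierBoundSketch {
    public static void main(String[] args) {
        // hypothetical BFS distances between neighbouring anchored contigs
        double[] dist = { 1, 1, 1, 1, 2, 2, 2, 3, 3, 14 };
        Arrays.sort(dist); // mirrors Anchor1.run; not required by Percentile

        Percentile percentile = new Percentile();
        double q1 = percentile.evaluate(dist, 25);
        double q3 = percentile.evaluate(dist, 75);

        // Tukey's fence: values above q3 + 1.5*IQR are treated as outliers
        double outlier = q3 + 1.5 * (q3 - q1);
        System.out.println("Outlier upper bound: " + outlier);
    }
}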

Example 84 with Max

use of org.apache.commons.math3.stat.descriptive.rank.Max in project SeqMonk by s-andrews.

the class MacsPeakCaller method run.

public void run() {
    // for (int i=0;i<selectedChIPStores.length;i++) {
    // System.err.println("Selcted ChIP="+selectedChIPStores[i]);
    // }
    // for (int i=0;i<selectedInputStores.length;i++) {
    // System.err.println("Selcted Input="+selectedInputStores[i]);
    // }
    // First find the tag offsets between the watson and crick strands
    // Work out the total average coverage for all of the combined ChIP samples
    long totalChIPCoverage = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        totalChIPCoverage += selectedChIPStores[i].getTotalReadLength();
    }
    if (cancel) {
        generationCancelled();
        return;
    }
    double averageChIPCoveragePerBase = totalChIPCoverage / (double) collection.genome().getTotalGenomeLength();
    double lowerCoverage = averageChIPCoveragePerBase * minFoldEnrichment;
    double upperCoverage = averageChIPCoveragePerBase * maxFoldEnrichment;
    System.err.println("Coverage range for high confidence peaks is " + lowerCoverage + " - " + upperCoverage);
    // Now we go through the data to find locations for our high confidence peaks so we can
    // randomly select 1000 of these to use to find the offset between the two strands
    Chromosome[] chromosomes = collection.genome().getAllChromosomes();
    Vector<Probe> potentialHighConfidencePeaks = new Vector<Probe>();
    for (int c = 0; c < chromosomes.length; c++) {
        if (cancel) {
            generationCancelled();
            return;
        }
        // Time for an update
        updateGenerationProgress("Finding high confidence peaks on chromosome " + chromosomes[c].name(), c, chromosomes.length);
        Probe lastValidProbe = null;
        for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize; startPosition += fragmentSize / 2) {
            // See if we need to quit
            if (cancel) {
                generationCancelled();
                return;
            }
            long totalLength = 0;
            Probe probe = new Probe(chromosomes[c], startPosition, startPosition + fragmentSize);
            for (int s = 0; s < selectedChIPStores.length; s++) {
                long[] reads = selectedChIPStores[s].getReadsForProbe(probe);
                for (int j = 0; j < reads.length; j++) {
                    totalLength += SequenceRead.length(reads[j]);
                }
            }
            if (totalLength >= (lowerCoverage * probe.length()) && totalLength <= upperCoverage * probe.length()) {
                if (lastValidProbe != null && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
                    lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
                } else if (lastValidProbe != null) {
                    // Check that the overall density over the region falls within our limits
                    totalLength = 0;
                    for (int s = 0; s < selectedChIPStores.length; s++) {
                        long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
                        for (int j = 0; j < reads.length; j++) {
                            totalLength += SequenceRead.length(reads[j]);
                        }
                    }
                    if (totalLength >= (lowerCoverage * lastValidProbe.length()) && totalLength <= upperCoverage * lastValidProbe.length()) {
                        potentialHighConfidencePeaks.add(lastValidProbe);
                    }
                    lastValidProbe = probe;
                } else {
                    lastValidProbe = probe;
                }
            }
        }
        if (lastValidProbe != null) {
            long totalLength = 0;
            for (int s = 0; s < selectedChIPStores.length; s++) {
                long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
                for (int j = 0; j < reads.length; j++) {
                    totalLength += SequenceRead.length(reads[j]);
                }
            }
            if (totalLength >= (lowerCoverage * lastValidProbe.length()) && totalLength <= upperCoverage * lastValidProbe.length()) {
                potentialHighConfidencePeaks.add(lastValidProbe);
            }
        }
    }
    if (potentialHighConfidencePeaks.size() == 0) {
        JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No high confidence peaks found", "Quitting generator", JOptionPane.INFORMATION_MESSAGE);
        generationCancelled();
        return;
    }
    // System.err.println("Found "+potentialHighConfidencePeaks.size()+" high confidence peaks");
    // Now we select 1000 random probes from this set
    Probe[] highConfidencePeaks = potentialHighConfidencePeaks.toArray(new Probe[0]);
    Collections.shuffle(Arrays.asList(highConfidencePeaks));
    Probe[] randomHighConfidenceProbes = new Probe[Math.min(highConfidencePeaks.length, 1000)];
    for (int i = 0; i < randomHighConfidenceProbes.length; i++) {
        randomHighConfidenceProbes[i] = highConfidencePeaks[i];
    }
    // Now find the average distance between forward / reverse reads in the candidate peaks
    int[] distances = new int[randomHighConfidenceProbes.length];
    // Sort the candidates so we don't do stupid stuff with the cache
    Arrays.sort(randomHighConfidenceProbes);
    for (int p = 0; p < randomHighConfidenceProbes.length; p++) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        distances[p] = getInterStrandDistance(randomHighConfidenceProbes[p], selectedChIPStores);
    }
    int medianInterStrandDistance = (int) SimpleStats.median(distances);
    if (medianInterStrandDistance < 0)
        medianInterStrandDistance = 0;
    // System.err.println("Median inter strand difference = "+medianInterStrandDistance);
    // Now we find the depth cutoff for overrepresented single tags using a binomial distribution
    int totalReadCount = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        totalReadCount += selectedChIPStores[i].getTotalReadCount();
    }
    BinomialDistribution bin = new BinomialDistribution(totalReadCount, 1d / collection.genome().getTotalGenomeLength());
    // We want to know what depth has a chance of less than 10^-5
    int redundantThreshold = bin.inverseCumulativeProbability(1 - 0.00001d);
    if (redundantThreshold < 1)
        redundantThreshold = 1;
    // System.err.println("Redundancy threshold is "+redundantThreshold);
    // Now we construct a poisson distribution to work out the threshold to use for
    // constructing a full candidate peak set.
    updateGenerationProgress("Counting non-redundant reads", 0, 1);
    // To do this we need to get the full non-redundant length from the whole set
    int totalNonRedCount = getNonRedundantReadCount(selectedChIPStores, redundantThreshold);
    // System.err.println("Total non-redundant sequences is "+totalNonRedCount);
    // We need to know the median read length for the data
    int readLength = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        readLength += selectedChIPStores[i].getTotalReadLength() / selectedChIPStores[i].getTotalReadCount();
    }
    readLength /= selectedChIPStores.length;
    double expectedCountsPerWindow = getExpectedCountPerWindow(totalNonRedCount, collection.genome().getTotalGenomeLength(), fragmentSize, readLength);
    PoissonDistribution poisson = new PoissonDistribution(expectedCountsPerWindow);
    int readCountCutoff = poisson.inverseCumulativeProbability(1 - pValue);
    // System.err.println("Threshold for enrichment in a window is "+readCountCutoff+" reads using a p-value of "+pValue+" and a mean of "+(totalNonRedCount/(collection.genome().getTotalGenomeLength()/(double)fragmentSize)));
    // Now we go back through the whole dataset to do a search for all possible candidate probes
    // We re-use the peak vector we came up with before.
    potentialHighConfidencePeaks.clear();
    for (int c = 0; c < chromosomes.length; c++) {
        // Time for an update
        updateGenerationProgress("Finding candidate peaks on chromosome " + chromosomes[c].name(), c, chromosomes.length);
        Probe lastValidProbe = null;
        for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize; startPosition += fragmentSize / 2) {
            // See if we need to quit
            if (cancel) {
                generationCancelled();
                return;
            }
            // We expand the region we're looking at by the inter-strand distance as we're going to
            // be adjusting the read positions
            Probe probe = new Probe(chromosomes[c], startPosition, (startPosition + fragmentSize - 1));
            long[] mergedProbeReads = getReadsFromDataStoreCollection(probe, selectedChIPStores, medianInterStrandDistance);
            mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
            SequenceRead.sort(mergedProbeReads);
            int thisProbeOverlapCount = 0;
            for (int i = 0; i < mergedProbeReads.length; i++) {
                if (SequenceRead.overlaps(mergedProbeReads[i], probe.packedPosition())) {
                    ++thisProbeOverlapCount;
                }
            }
            if (thisProbeOverlapCount > readCountCutoff) {
                if (lastValidProbe != null && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
                    lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
                } else if (lastValidProbe != null) {
                    potentialHighConfidencePeaks.add(lastValidProbe);
                    lastValidProbe = probe;
                } else {
                    lastValidProbe = probe;
                }
            }
        }
        if (lastValidProbe != null) {
            potentialHighConfidencePeaks.add(lastValidProbe);
        }
    }
    // Finally we re-filter the peaks we have using local poisson distributions with densities taken
    // from either the input samples (if there are any), or the local region.  The densities are
    // estimated over 1, 5 and 10 kb around the peak and genome-wide, and the max of these is taken.
    // If there is no input then the 1kb region is not used.
    Probe[] allCandidateProbes = potentialHighConfidencePeaks.toArray(new Probe[0]);
    // Work out which stores we're using to validate against.
    DataStore[] validationStores;
    boolean useInput = false;
    double inputCorrection = 1;
    int validationNonRedCount;
    if (selectedInputStores != null && selectedInputStores.length > 0) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        validationStores = selectedInputStores;
        useInput = true;
        // We also need to work out the total number of non-redundant sequences
        // in the input so we can work out a scaling factor so that the densities
        // for input and chip are comparable.
        validationNonRedCount = getNonRedundantReadCount(validationStores, redundantThreshold);
        inputCorrection = totalNonRedCount / (double) validationNonRedCount;
        System.err.println("From chip=" + totalNonRedCount + " input=" + validationNonRedCount + " correction is " + inputCorrection);
    } else {
        validationStores = selectedChIPStores;
        validationNonRedCount = totalNonRedCount;
    }
    Vector<Probe> finalValidatedProbes = new Vector<Probe>();
    for (int p = 0; p < allCandidateProbes.length; p++) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        if (p % 100 == 0) {
            updateGenerationProgress("Validated " + p + " out of " + allCandidateProbes.length + " raw peaks", p, allCandidateProbes.length);
        }
        // System.err.println("Validating "+allCandidateProbes[p].chromosome()+":"+allCandidateProbes[p].start()+"-"+allCandidateProbes[p].end());
        // We now need to find the maximum read density per 2*bandwidth against which
        // we're going to validate this peak
        // We're going to get all reads within 10kb of the peak, and then we can subselect from there
        int midPoint = allCandidateProbes[p].middle();
        Probe region10kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 5000, 1), Math.min(midPoint + 4999, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
        Probe region5kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 2500, 1), Math.min(midPoint + 2499, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
        Probe region1kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 500, 1), Math.min(midPoint + 499, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
        // Get the probes for the largest region
        long[] thisRegionReads = getReadsFromDataStoreCollection(region10kb, validationStores, 0);
        // Deduplicate so it's a fair comparison
        // Should we recalculate the redundant threshold based on the input coverage?
        thisRegionReads = deduplicateReads(thisRegionReads, redundantThreshold);
        int region10kbcount = thisRegionReads.length;
        int region5kbcount = 0;
        int region1kbcount = 0;
        // Go through the reads seeing if they fit into the 5 or 1kb regions
        for (int r = 0; r < thisRegionReads.length; r++) {
            if (SequenceRead.overlaps(region5kb.packedPosition(), thisRegionReads[r]))
                ++region5kbcount;
            if (SequenceRead.overlaps(region1kb.packedPosition(), thisRegionReads[r]))
                ++region1kbcount;
        }
        // System.err.println("Input counts 10kb="+region10kbcount+" 5kb="+region5kbcount+" 1kb="+region1kbcount);
        // Convert to densities per window and adjust for global coverage
        double globalDensity = getExpectedCountPerWindow(validationNonRedCount, collection.genome().getTotalGenomeLength(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density10kb = getExpectedCountPerWindow(region10kbcount, region10kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density5kb = getExpectedCountPerWindow(region5kbcount, region5kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density1kb = getExpectedCountPerWindow(region1kbcount, region1kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        // Find the highest density to use for the validation
        double highestDensity = globalDensity;
        if (density10kb > highestDensity)
            highestDensity = density10kb;
        if (density5kb > highestDensity)
            highestDensity = density5kb;
        if (useInput && density1kb > highestDensity)
            highestDensity = density1kb;
        // System.err.println("Global="+globalDensity+" 10kb="+density10kb+" 5kb="+density5kb+" 1kb="+density1kb+" using="+highestDensity);
        // Construct a poisson distribution with this density
        PoissonDistribution localPoisson = new PoissonDistribution(highestDensity);
        // System.err.println("Cutoff from global="+(new PoissonDistribution(globalDensity)).inverseCumulativeProbability(1-pValue)+" 10kb="+(new PoissonDistribution(density10kb)).inverseCumulativeProbability(1-pValue)+" 5kb="+(new PoissonDistribution(density5kb)).inverseCumulativeProbability(1-pValue)+" 1kb="+(new PoissonDistribution(density1kb)).inverseCumulativeProbability(1-pValue));
        // Now check to see if the actual count from this peak is enough to still pass
        long[] mergedProbeReads = getReadsFromDataStoreCollection(allCandidateProbes[p], selectedChIPStores, medianInterStrandDistance);
        mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
        SequenceRead.sort(mergedProbeReads);
        int thisProbeOverlapCount = 0;
        for (int i = 0; i < mergedProbeReads.length; i++) {
            if (SequenceRead.overlaps(mergedProbeReads[i], allCandidateProbes[p].packedPosition())) {
                ++thisProbeOverlapCount;
            }
        }
        if (thisProbeOverlapCount > localPoisson.inverseCumulativeProbability(1 - pValue)) {
            finalValidatedProbes.add(allCandidateProbes[p]);
        // System.err.println("Adding probe to final set");
        }
    }
    // System.err.println("From "+allCandidateProbes.length+" candidates "+finalValidatedProbes.size()+" peaks were validated");
    ProbeSet finalSet = new ProbeSet(getDescription(), finalValidatedProbes.toArray(new Probe[0]));
    generationComplete(finalSet);
}
Also used : PoissonDistribution(org.apache.commons.math3.distribution.PoissonDistribution) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) ProbeSet(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet) DataStore(uk.ac.babraham.SeqMonk.DataTypes.DataStore) BinomialDistribution(org.apache.commons.math3.distribution.BinomialDistribution) Vector(java.util.Vector) LongVector(uk.ac.babraham.SeqMonk.Utilities.LongVector)
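
The window cutoff above comes straight from Commons Math: PoissonDistribution.inverseCumulativeProbability(1 - pValue) inverts the CDF into a count threshold. Below is a minimal, self-contained sketch of the same calculation; the depth, genome size and window size are invented for illustration, and the uniform-coverage formula stands in for SeqMonk's getExpectedCountPerWindow, whose exact read-length adjustment is not shown in this excerpt.

import org.apache.commons.math3.distribution.PoissonDistribution;

public class PoissonCutoffSketch {

    public static void main(String[] args) {
        long nonRedundantReads = 10_000_000L;  // assumed sequencing depth
        long genomeLength = 3_000_000_000L;    // assumed genome size in bp
        int windowSize = 300;                  // assumed fragment size
        double pValue = 1e-5;

        // Expected reads per window under uniform coverage.
        double lambda = nonRedundantReads * (windowSize / (double) genomeLength);

        PoissonDistribution poisson = new PoissonDistribution(lambda);

        // Smallest k with P(X <= k) >= 1 - pValue: windows exceeding this
        // count are enriched at the chosen p-value.
        int readCountCutoff = poisson.inverseCumulativeProbability(1 - pValue);

        System.out.println("lambda=" + lambda + " cutoff=" + readCountCutoff);
    }
}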

Example 85 with Max

use of org.apache.commons.math3.stat.descriptive.rank.Max in project SeqMonk by s-andrews.

the class AntisenseTranscriptionPipeline method startPipeline.

protected void startPipeline() {
    // We first need to generate probes over all of the features listed in
    // the feature types.  The probes should cover the whole area of the
    // feature regardless of where it splices.
    Vector<Probe> probes = new Vector<Probe>();
    double pValue = optionsPanel.pValue();
    QuantitationStrandType readFilter = optionsPanel.readFilter();
    long[] senseCounts = new long[data.length];
    long[] antisenseCounts = new long[data.length];
    Chromosome[] chrs = collection().genome().getAllChromosomes();
    for (int c = 0; c < chrs.length; c++) {
        if (cancel) {
            progressCancelled();
            return;
        }
        progressUpdated("Getting total antisense rate for chr" + chrs[c].name(), c, chrs.length * 2);
        Feature[] features = getValidFeatures(chrs[c]);
        for (int f = 0; f < features.length; f++) {
            if (cancel) {
                progressCancelled();
                return;
            }
            Probe p = new Probe(chrs[c], features[f].location().start(), features[f].location().end(), features[f].location().strand(), features[f].name());
            probes.add(p);
            for (int d = 0; d < data.length; d++) {
                long[] reads = data[d].getReadsForProbe(p);
                for (int r = 0; r < reads.length; r++) {
                    if (readFilter.useRead(p, reads[r])) {
                        senseCounts[d] += SequenceRead.length(reads[r]);
                    } else {
                        antisenseCounts[d] += SequenceRead.length(reads[r]);
                    }
                }
            }
        }
    }
    Probe[] allProbes = probes.toArray(new Probe[0]);
    collection().setProbeSet(new ProbeSet("Features over " + optionsPanel.getSelectedFeatureType(), allProbes));
    // Now we can work out the overall antisense rate
    double[] antisenseProbability = new double[data.length];
    for (int d = 0; d < data.length; d++) {
        System.err.println("Antisense counts are " + antisenseCounts[d] + " sense counts are " + senseCounts[d]);
        antisenseProbability[d] = antisenseCounts[d] / (double) (antisenseCounts[d] + senseCounts[d]);
        System.err.println("Antisense probability for " + data[d].name() + " is " + antisenseProbability[d]);
    }
    // Now we can quantitate each individual feature and test for whether it is significantly
    // showing antisense expression
    ArrayList<Vector<ProbeTTestValue>> significantProbes = new ArrayList<Vector<ProbeTTestValue>>();
    for (int d = 0; d < data.length; d++) {
        significantProbes.add(new Vector<ProbeTTestValue>());
    }
    int[] readLengths = new int[data.length];
    for (int d = 0; d < readLengths.length; d++) {
        readLengths[d] = data[d].getMaxReadLength();
        System.err.println("For " + data[d].name() + " max read len is " + readLengths[d]);
    }
    for (int c = 0; c < chrs.length; c++) {
        if (cancel) {
            progressCancelled();
            return;
        }
        progressUpdated("Quantitating features on chr" + chrs[c].name(), chrs.length + c, chrs.length * 2);
        Probe[] thisChrProbes = collection().probeSet().getProbesForChromosome(chrs[c]);
        for (int p = 0; p < thisChrProbes.length; p++) {
            for (int d = 0; d < data.length; d++) {
                if (cancel) {
                    progressCancelled();
                    return;
                }
                long senseCount = 0;
                long antisenseCount = 0;
                long[] reads = data[d].getReadsForProbe(thisChrProbes[p]);
                for (int r = 0; r < reads.length; r++) {
                    if (readFilter.useRead(thisChrProbes[p], reads[r])) {
                        // TODO: Just count overlap?
                        senseCount += SequenceRead.length(reads[r]);
                    } else {
                        antisenseCount += SequenceRead.length(reads[r]);
                    }
                }
                // if (thisChrProbes[p].name().equals("RP4-798A10.2")) {
                // System.err.println("Raw base counts are sense="+senseCount+" anti="+antisenseCount+" from "+reads.length+" reads");
                // }
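                // Dividing summed base coverage by the longest read length gives a
                // conservative (under-)estimate of the number of contributing reads.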
                int senseReads = (int) (senseCount / readLengths[d]);
                int antisenseReads = (int) (antisenseCount / readLengths[d]);
                // if (thisChrProbes[p].name().equals("RP4-798A10.2")) {
                // System.err.println("Raw read counts are sense="+senseReads+" anti="+antisenseReads+" from "+reads.length+" reads");
                // }
                BinomialDistribution bd = new BinomialDistribution(senseReads + antisenseReads, antisenseProbability[d]);
                // cumulativeProbability(k) gives P(X <= k), so we evaluate it at
                // antisenseReads - 1 to get the probability of seeing this many
                // antisense reads or more under the background rate.
                if (antisenseReads == 0)
                    thisPValue = 1;
                // We have to add all results at this stage so we don't mess up the multiple
                // testing correction later on.
                significantProbes.get(d).add(new ProbeTTestValue(thisChrProbes[p], thisPValue));
                double expected = ((senseReads + antisenseReads) * antisenseProbability[d]);
                if (expected < 1)
                    expected = 1;
                float obsExp = antisenseReads / (float) expected;
                data[d].setValueForProbe(thisChrProbes[p], obsExp);
            }
        }
    }
    // Now we apply multiple testing correction to each data store's p-values, keeping those which pass our p-value cutoff
    for (int d = 0; d < data.length; d++) {
        ProbeTTestValue[] ttestResults = significantProbes.get(d).toArray(new ProbeTTestValue[0]);
        BenjHochFDR.calculateQValues(ttestResults);
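        // Benjamini-Hochberg q-values: with p-values sorted ascending over m tests,
        // q(i) = min over j >= i of p(j) * m / j, i.e. the smallest FDR at which
        // probe i would first be accepted.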
        ProbeList newList = new ProbeList(collection().probeSet(), "Antisense < " + pValue + " in " + data[d].name(), "Probes showing significant antisense transcription from a basal level of " + antisenseProbability[d] + " with a cutoff of " + pValue, "FDR");
        for (int i = 0; i < ttestResults.length; i++) {
            if (ttestResults[i].probe.name().equals("RP4-798A10.2")) {
                System.err.println("Raw p=" + ttestResults[i].p + " q=" + ttestResults[i].q);
            }
            if (ttestResults[i].q < pValue) {
                newList.addProbe(ttestResults[i].probe, (float) ttestResults[i].q);
            }
        }
    }
    StringBuffer quantitationDescription = new StringBuffer();
    quantitationDescription.append("Antisense transcription pipeline quantitation ");
    quantitationDescription.append(". Directionality was ");
    quantitationDescription.append(optionsPanel.libraryTypeBox.getSelectedItem());
    if (optionsPanel.ignoreOverlaps()) {
        quantitationDescription.append(". Ignoring existing overlaps");
    }
    quantitationDescription.append(". P-value cutoff was ");
    quantitationDescription.append(optionsPanel.pValue());
    collection().probeSet().setCurrentQuantitation(quantitationDescription.toString());
    quantitatonComplete();
}
Also used : ArrayList(java.util.ArrayList) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) ProbeSet(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet) ProbeTTestValue(uk.ac.babraham.SeqMonk.Analysis.Statistics.ProbeTTestValue) QuantitationStrandType(uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType) Vector(java.util.Vector) ProbeList(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) BinomialDistribution(org.apache.commons.math3.distribution.BinomialDistribution)
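
The per-feature test above is a one-sided binomial test: given n = sense + antisense reads and the genome-wide antisense probability, how surprising is the observed antisense count? A minimal sketch of that tail calculation follows; the counts and rate are invented for illustration.

import org.apache.commons.math3.distribution.BinomialDistribution;

public class AntisenseBinomialSketch {

    // P(X >= observed) for X ~ Binomial(n, p). cumulativeProbability(k) is
    // P(X <= k), so evaluating it at observed - 1 leaves the upper tail.
    static double upperTailP(int n, double p, int observed) {
        if (observed == 0)
            return 1; // zero antisense reads are never significant
        BinomialDistribution bd = new BinomialDistribution(n, p);
        return 1 - bd.cumulativeProbability(observed - 1);
    }

    public static void main(String[] args) {
        int senseReads = 480;               // assumed example counts
        int antisenseReads = 20;
        double antisenseProbability = 0.02; // assumed genome-wide rate

        double pVal = upperTailP(senseReads + antisenseReads, antisenseProbability, antisenseReads);
        System.out.println("one-sided binomial p = " + pVal);
    }
}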

Aggregations

ArrayList (java.util.ArrayList)26 List (java.util.List)19 Collectors (java.util.stream.Collectors)13 DescriptiveStatistics (org.apache.commons.math3.stat.descriptive.DescriptiveStatistics)13 Arrays (java.util.Arrays)11 Map (java.util.Map)11 IntStream (java.util.stream.IntStream)10 RandomGenerator (org.apache.commons.math3.random.RandomGenerator)10 Array2DRowRealMatrix (org.apache.commons.math3.linear.Array2DRowRealMatrix)9 RealMatrix (org.apache.commons.math3.linear.RealMatrix)9 Plot2 (ij.gui.Plot2)8 File (java.io.File)8 IOException (java.io.IOException)8 TooManyEvaluationsException (org.apache.commons.math3.exception.TooManyEvaluationsException)7 Test (org.testng.annotations.Test)7 StoredDataStatistics (gdsc.core.utils.StoredDataStatistics)6 Collections (java.util.Collections)6 HashMap (java.util.HashMap)6 Random (java.util.Random)6 UnivariateFunction (org.apache.commons.math3.analysis.UnivariateFunction)6