Use of org.apache.commons.math3.stat.descriptive.rank.Max in project chordatlas by twak.
The class SkelFootprint, method findRoofColor.
private static void findRoofColor(HalfMesh2 mesh) {
class Wrapper implements Clusterable {
double[] col;
public Wrapper(float[] col) {
this.col = new double[] { col[0], col[1], col[2] };
}
@Override
public double[] getPoint() {
return col;
}
}
for (HalfFace hf : mesh.faces) {
List<Wrapper> toCluster = new ArrayList<>();
SuperFace sf = (SuperFace) hf;
if (sf.colors == null || sf.colors.isEmpty()) {
float grey = (float) (Math.random() * 0.3 + 0.2);
sf.roofColor = new float[] { grey, grey, grey };
continue;
}
for (float[] v : sf.colors) toCluster.add(new Wrapper(v));
sf.colors = null;
DBSCANClusterer<Wrapper> cr = new DBSCANClusterer<>(0.2, 5);
List<Cluster<Wrapper>> results = cr.cluster(toCluster);
float[] col = new float[] { 0.3f, 0.3f, 0.3f };
try {
Cluster<Wrapper> biggest = results.stream().max((a, b) -> Double.compare(a.getPoints().size(), b.getPoints().size())).get();
col = new float[3];
for (Wrapper w : biggest.getPoints()) for (int i = 0; i < 3; i++) col[i] += (float) w.col[i];
int size = biggest.getPoints().size();
for (int i = 0; i < 3; i++) col[i] /= size;
} catch (NoSuchElementException e) {
// no cluster was found; keep the default grey colour
}
sf.roofColor = col;
}
}
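The snippet above clusters per-face colour samples with DBSCANClusterer and keeps the centroid of the largest cluster. A minimal, self-contained sketch of the same idea, using the library's built-in DoublePoint instead of the Wrapper class and made-up RGB samples (the eps/minPts values mirror the ones above, but nothing here is taken from chordatlas), might look like this:

import java.util.Arrays;
import java.util.List;
import org.apache.commons.math3.ml.clustering.Cluster;
import org.apache.commons.math3.ml.clustering.DBSCANClusterer;
import org.apache.commons.math3.ml.clustering.DoublePoint;
import org.apache.commons.math3.stat.descriptive.rank.Max;

public class RoofColorSketch {
    public static void main(String[] args) {
        // hypothetical RGB samples; DoublePoint already implements Clusterable
        List<DoublePoint> samples = Arrays.asList(
                new DoublePoint(new double[] { 0.31, 0.30, 0.29 }),
                new DoublePoint(new double[] { 0.32, 0.31, 0.30 }),
                new DoublePoint(new double[] { 0.30, 0.29, 0.28 }),
                new DoublePoint(new double[] { 0.33, 0.32, 0.31 }),
                new DoublePoint(new double[] { 0.31, 0.31, 0.30 }),
                new DoublePoint(new double[] { 0.29, 0.30, 0.31 }),
                new DoublePoint(new double[] { 0.90, 0.10, 0.10 })); // outlier, ends up as noise
        // same parameters as the method above: eps = 0.2, minPts = 5
        DBSCANClusterer<DoublePoint> clusterer = new DBSCANClusterer<>(0.2, 5);
        List<Cluster<DoublePoint>> clusters = clusterer.cluster(samples);
        // Max from the same library reports the size of the biggest cluster
        double[] sizes = clusters.stream().mapToDouble(c -> c.getPoints().size()).toArray();
        System.out.println("clusters=" + clusters.size() + " largest=" + new Max().evaluate(sizes));
    }
}

With minPts = 5, commons-math3 only promotes a point to a core point once it has at least five other samples within eps, which is why the dense group above needs six members to form a cluster.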
Use of org.apache.commons.math3.stat.descriptive.rank.Max in project systemml by apache.
The class DataGenMR, method runJob.
/**
* <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
*
* @param inst MR job instruction
* @param dataGenInstructions array of data gen instructions
* @param instructionsInMapper instructions in mapper
* @param aggInstructionsInReducer aggregate instructions in reducer
* @param otherInstructionsInReducer other instructions in reducer
* @param numReducers number of reducers
* @param replication file replication
* @param resultIndexes result indexes for each random object
* @param dimsUnknownFilePrefix file path prefix when dimensions unknown
* @param outputs output file for each random object
* @param outputInfos output information for each random object
* @return matrix characteristics for each random object
* @throws Exception if the data generation input files cannot be written or the MapReduce job fails
*/
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
JobConf job = new JobConf(DataGenMR.class);
job.setJobName("DataGen-MR");
// whether to use block representation or cell representation
MRJobConfiguration.setMatrixValueClass(job, true);
byte[] realIndexes = new byte[dataGenInstructions.length];
for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
String[] inputs = new String[dataGenInstructions.length];
InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
long[] rlens = new long[dataGenInstructions.length];
long[] clens = new long[dataGenInstructions.length];
int[] brlens = new int[dataGenInstructions.length];
int[] bclens = new int[dataGenInstructions.length];
FileSystem fs = FileSystem.get(job);
String dataGenInsStr = "";
int numblocks = 0;
int maxbrlen = -1, maxbclen = -1;
double maxsparsity = -1;
for (int i = 0; i < dataGenInstructions.length; i++) {
dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];
MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
MRType mrtype = mrins.getMRInstructionType();
DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;
rlens[i] = genInst.getRows();
clens[i] = genInst.getCols();
brlens[i] = genInst.getRowsInBlock();
bclens[i] = genInst.getColsInBlock();
maxbrlen = Math.max(maxbrlen, brlens[i]);
maxbclen = Math.max(maxbclen, bclens[i]);
if (mrtype == MRType.Rand) {
RandInstruction randInst = (RandInstruction) mrins;
inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
maxsparsity = Math.max(maxsparsity, randInst.getSparsity());
PrintWriter pw = null;
try {
pw = new PrintWriter(fs.create(new Path(inputs[i])));
// for object reuse and to prevent repeated buffer re-allocations
StringBuilder sb = new StringBuilder();
// seed generation
Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
for (long r = 0; r < Math.max(rlens[i], 1); r += brlens[i]) {
long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
for (long c = 0; c < Math.max(clens[i], 1); c += bclens[i]) {
long curBlockColSize = Math.min(bclens[i], (clens[i] - c));
sb.append((r / brlens[i]) + 1);
sb.append(',');
sb.append((c / bclens[i]) + 1);
sb.append(',');
sb.append(curBlockRowSize);
sb.append(',');
sb.append(curBlockColSize);
sb.append(',');
sb.append(bigrand.nextLong());
pw.println(sb.toString());
sb.setLength(0);
numblocks++;
}
}
} finally {
IOUtilFunctions.closeSilently(pw);
}
inputInfos[i] = InputInfo.TextCellInputInfo;
} else if (mrtype == MRType.Seq) {
SeqInstruction seqInst = (SeqInstruction) mrins;
inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
// always dense
maxsparsity = 1.0;
double from = seqInst.fromValue;
double to = seqInst.toValue;
double incr = seqInst.incrValue;
// handle the default increment: 1 is switched to -1 for the special case of from > to
incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);
// Correctness checks on (from, to, incr)
boolean neg = (from > to);
if (incr == 0)
throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");
if (neg != (incr < 0))
throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");
// Compute the number of rows in the sequence
long numrows = UtilFunctions.getSeqLength(from, to, incr);
if (rlens[i] > 0) {
if (numrows != rlens[i])
throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Expected number of rows does not match given number: " + rlens[i] + " != " + numrows);
} else {
rlens[i] = numrows;
}
if (clens[i] > 0 && clens[i] != 1)
throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Number of columns (" + clens[i] + ") must be equal to 1.");
else
clens[i] = 1;
PrintWriter pw = null;
try {
pw = new PrintWriter(fs.create(new Path(inputs[i])));
StringBuilder sb = new StringBuilder();
double temp = from;
double block_from, block_to;
for (long r = 0; r < rlens[i]; r += brlens[i]) {
long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
// block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval)
long bid_i = ((r / brlens[i]) + 1);
long bid_j = 1;
block_from = temp;
block_to = temp + (curBlockRowSize - 1) * incr;
// next block starts from here
temp = block_to + incr;
sb.append(bid_i);
sb.append(',');
sb.append(bid_j);
sb.append(',');
sb.append(block_from);
sb.append(',');
sb.append(block_to);
sb.append(',');
sb.append(incr);
pw.println(sb.toString());
sb.setLength(0);
numblocks++;
}
} finally {
IOUtilFunctions.closeSilently(pw);
}
inputInfos[i] = InputInfo.TextCellInputInfo;
} else {
throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
}
}
// remove the leading instruction delimiter
dataGenInsStr = dataGenInsStr.substring(1);
RunningJob runjob;
MatrixCharacteristics[] stats;
try {
// set up the block size
MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
// set up the input files and their format information
MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);
// set up the dimensions of input matrices
MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
// set up the block size
MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
// set up the rand Instructions
MRJobConfiguration.setRandInstructions(job, dataGenInsStr);
// set up unary instructions that will perform in the mapper
MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
// set up the aggregate instructions that will happen in the combiner and reducer
MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
// set up the instructions that will happen in the reducer, after the aggregation instructions
MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
// set up the replication factor for the results
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
// set up map/reduce memory configurations (if in AM context)
DMLConfig config = ConfigurationManager.getDMLConfig();
DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
// set up custom map/reduce configurations
MRJobConfiguration.setupCustomMRConfigurations(job, config);
// determine degree of parallelism (nmappers: 1<=n<=capacity)
// TODO use maxsparsity whenever we have a way of generating sparse rand data
int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
// correct the max number of mappers on YARN clusters
if (InfrastructureAnalyzer.isYarnEnabled())
capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
int nmappers = Math.max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
job.setNumMapTasks(nmappers);
// set up what matrices are needed to pass from the mapper to reducer
HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
stats = ret.stats;
// set up the number of reducers
MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
// print the complete MRJob instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(stats);
// Update resultDimsUnknown based on computed "stats"
byte[] resultDimsUnknown = new byte[resultIndexes.length];
for (int i = 0; i < resultIndexes.length; i++) {
if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
resultDimsUnknown[i] = (byte) 1;
} else {
resultDimsUnknown[i] = (byte) 0;
}
}
boolean mayContainCtable = instructionsInMapper.contains("ctabletransform") || instructionsInMapper.contains("groupedagg");
// set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);
// configure mapper and the mapper output key value pairs
job.setMapperClass(DataGenMapper.class);
if (numReducers == 0) {
job.setMapOutputKeyClass(Writable.class);
job.setMapOutputValueClass(Writable.class);
} else {
job.setMapOutputKeyClass(MatrixIndexes.class);
job.setMapOutputValueClass(TaggedMatrixBlock.class);
}
// set up combiner
if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
job.setCombinerClass(GMRCombiner.class);
// configure reducer
job.setReducerClass(GMRReducer.class);
// job.setReducerClass(PassThroughReducer.class);
// By default, the job executes in "cluster" mode.
// Determine if we can optimize and run it in "local" mode.
MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
for (int i = 0; i < inputs.length; i++) {
inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
}
// set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
runjob = JobClient.runJob(job);
/* Process different counters */
Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
for (int i = 0; i < resultIndexes.length; i++) {
// number of non-zeros
stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
}
String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
stats = MapReduceTool.processDimsFiles(dir, stats);
MapReduceTool.deleteFileIfExistOnHDFS(dir);
} finally {
for (String input : inputs) MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
}
return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
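The Rand branch above writes one text line per matrix block, seeding each block from a shared Well1024a generator (obtained via SystemML's LibMatrixDatagen helper). A stand-alone sketch of that seed-file layout, with hypothetical dimensions and a plain Well1024a in place of the helper, could be:

import org.apache.commons.math3.random.Well1024a;

public class SeedFileSketch {
    public static void main(String[] args) {
        long rows = 2500, cols = 2000;  // hypothetical matrix dimensions
        int brlen = 1000, bclen = 1000; // hypothetical block sizes
        Well1024a bigrand = new Well1024a(7L); // fixed seed, stands in for setupSeedsForRand
        StringBuilder sb = new StringBuilder();
        for (long r = 0; r < rows; r += brlen) {
            long blockRows = Math.min(brlen, rows - r);
            for (long c = 0; c < cols; c += bclen) {
                long blockCols = Math.min(bclen, cols - c);
                // one line per block: block row index, block column index, block sizes, per-block seed
                sb.append(r / brlen + 1).append(',').append(c / bclen + 1).append(',')
                  .append(blockRows).append(',').append(blockCols).append(',')
                  .append(bigrand.nextLong()).append('\n');
            }
        }
        System.out.print(sb);
    }
}

Each mapper can then generate its block independently and reproducibly from the seed on its line, which is the point of materialising the seeds up front.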
Use of org.apache.commons.math3.stat.descriptive.rank.Max in project polyGembler by c-zhou.
The class Anchor1, method run.
// private static DijkstraShortestPath<Integer, DefaultWeightedEdge> dijkstra_paths;
@Override
public void run() {
// TODO Auto-generated method stub
long tic = System.nanoTime();
sub_seqs = Sequence.parseFastaFileAsMap(subject_file);
qry_seqs = Sequence.parseFastaFileWithRevCmpAsMap(query_file);
// this.makeAssemblyGraph(query_file);
this.makeAssemblyGraph(query_file);
bfs = new BFSShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
// dijkstra_paths = new DijkstraShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
// find 'N/n's in subject/reference sequences
// which could have impact on parsing the blast records
final Map<String, TreeRangeSet<Integer>> sub_gaps = new HashMap<String, TreeRangeSet<Integer>>();
final Map<String, List<Blast6Segment>> anchored_records = new HashMap<String, List<Blast6Segment>>();
for (Map.Entry<String, Sequence> entry : sub_seqs.entrySet()) {
String seq_sn = entry.getKey();
String seq_str = entry.getValue().seq_str();
final TreeRangeSet<Integer> tmp_rangeSet = TreeRangeSet.create();
for (int j = 0; j < seq_str.length(); j++) {
if (seq_str.charAt(j) == 'N' || seq_str.charAt(j) == 'n')
// blast records use 1-based closed coordinates
tmp_rangeSet.add(Range.closed(j + 1, j + 1).canonical(DiscreteDomain.integers()));
}
int seq_ln = seq_str.length();
final TreeRangeSet<Integer> range_set = TreeRangeSet.create();
for (Range<Integer> range : tmp_rangeSet.asRanges()) {
int lowerend = range.hasLowerBound() ? Math.max(0, range.lowerEndpoint() - gap_buff) : 0;
int upperend = range.hasUpperBound() ? Math.min(seq_ln, range.upperEndpoint() + gap_buff - 1) : seq_ln;
range_set.add(Range.closed(lowerend, upperend).canonical(DiscreteDomain.integers()));
}
sub_gaps.put(seq_sn, range_set);
// initialise an array list for each reference chromosome
anchored_records.put(seq_sn, new ArrayList<Blast6Segment>());
}
// parse blast records
// blast records buffer
final List<Blast6Segment> buff = new ArrayList<Blast6Segment>();
// selected blast records
final List<Blast6Segment> sel_recs = new ArrayList<Blast6Segment>();
// temp list
final List<Blast6Segment> tmp_records = new ArrayList<Blast6Segment>();
try {
BufferedReader br_blast = Utils.getBufferedReader(blast_out);
Blast6Segment tmp_record = Blast6Segment.blast6Segment(br_blast.readLine());
Blast6Segment primary_record, secondary_record;
String qry;
double qry_ln, aln_frac;
while (tmp_record != null) {
qry = tmp_record.qseqid();
qry_ln = qry_seqs.get(qry).seq_ln();
buff.clear();
buff.add(tmp_record);
while ((tmp_record = Blast6Segment.blast6Segment(br_blast.readLine())) != null && tmp_record.qseqid().equals(qry)) {
// filter by identity
if (tmp_record.pident() >= this.min_ident)
buff.add(tmp_record);
}
sel_recs.clear();
// merge collinear records
for (String sub : sub_seqs.keySet()) {
// get all records for subject/reference sequence sub_seq
tmp_records.clear();
for (Blast6Segment record : buff) if (record.sseqid().equals(sub))
tmp_records.add(record);
if (tmp_records.isEmpty())
continue;
// find alignment segments that can be deleted
// those that are subsets of larger alignment segments
// (Sstart, (sstart, send), Send) and (Qstart, (qstart, qend), Qend)
Collections.sort(tmp_records, new Blast6Segment.SegmentSizeComparator());
final Set<int[][]> ranges = new HashSet<int[][]>();
outerloop: for (int i = 0; i < tmp_records.size(); i++) {
primary_record = tmp_records.get(i);
int[][] range = new int[2][2];
if (primary_record.sstart() < primary_record.send()) {
range[0][0] = primary_record.sstart();
range[0][1] = primary_record.send();
} else {
range[0][0] = primary_record.send();
range[0][1] = primary_record.sstart();
}
if (primary_record.qstart() < primary_record.qend()) {
range[1][0] = primary_record.qstart();
range[1][1] = primary_record.qend();
} else {
range[1][0] = primary_record.qend();
range[1][1] = primary_record.qstart();
}
for (int[][] r : ranges) {
if (r[0][0] <= range[0][0] && r[0][1] >= range[0][1] && r[1][0] <= range[1][0] && r[1][1] >= range[1][1]) {
tmp_records.remove(i--);
continue outerloop;
}
}
ranges.add(range);
}
// TODO rewrite this part
/**
* // find collinear alignment segments that can be merged
* Collections.sort(tmp_records, new BlastRecord.SInterceptComparator());
*
* collinear_merged.clear();
* final List<Blast6Record> temp = new ArrayList<Blast6Record>();
* for(int i=0; i<tmp_records.size(); ) {
* Blast6Record record = tmp_records.get(i);
* double max_shift;
* temp.clear();
* temp.add(record);
*
* // find collinear alignment segments
* outerloop:
* while( (++i)<tmp_records.size() ) {
* record = tmp_records.get(i);
* // check if is collinear with other alignment segments
* for(Blast6Record r : temp) {
* max_shift = collinear_shift*
* Math.min(r.length(), record.length());
* if(BlastRecord.sdistance(r, record)<=max_shift &&
* BlastRecord.qdistance(r, record)<=max_shift &&
* BlastRecord.pdistance(r, record)<=max_shift) {
* temp.add(record);
* continue outerloop;
* }
* }
* break;
* }
*
* // merge collinear alignment segments
* int qstart = Integer.MAX_VALUE;
* int qend = Integer.MIN_VALUE;
* int sstart = Integer.MAX_VALUE;
* int send = Integer.MIN_VALUE;
* double pident = 0;
* int length = 0;
*
* // qstart is always smaller than qend
* for(int j=0; j<temp.size(); j++) {
* record = temp.get(j);
* if(record.qstart()<qstart) {
* qstart = record.qstart();
* sstart = record.sstart();
* }
* if(record.qend()>qend) {
* qend = record.qend();
* send = record.send();
* }
* if(record.pident()>pident)
* pident = record.pident();
* length += record.length();
* }
*
* collinear_merged.add(new Blast6Record(qry,sub,pident,length,-1,-1,qstart,qend,sstart,send,-1,-1));
* }
*/
// find collinear alignment segments that can be merged
// more accurate but slower
Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
Blast6Segment record;
for (int i = 0; i < tmp_records.size(); i++) {
primary_record = tmp_records.get(i);
for (int j = i + 1; j < tmp_records.size(); j++) {
secondary_record = tmp_records.get(j);
double max_shift = collinear_shift * Math.min(primary_record.length(), secondary_record.length());
if ((record = Blast6Segment.collinear(primary_record, secondary_record, max_shift)) != null) {
tmp_records.set(i, record);
tmp_records.remove(j);
--i;
break;
}
}
}
// process blast records that are clipped by gaps
// (sstart, send)---(start2, send2)
// (sstart ... --- ... send2)
TreeRangeSet<Integer> sub_gap = sub_gaps.get(sub);
Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
for (int i = 0; i < tmp_records.size(); i++) {
primary_record = tmp_records.get(i);
if (sub_gap.contains(primary_record.true_send())) {
secondary_record = null;
int sec_j = -1;
for (int j = i + 1; j < tmp_records.size(); j++) {
if (tmp_records.get(j).true_sstart() >= primary_record.true_send()) {
secondary_record = tmp_records.get(j);
sec_j = j;
break;
}
}
if (secondary_record == null || AlignmentSegment.reverse(primary_record, secondary_record)) {
// reverse alignment segments
continue;
}
if (sub_gap.contains(secondary_record.true_sstart()) && sub_gap.rangeContaining(primary_record.true_send()).equals(sub_gap.rangeContaining(secondary_record.true_sstart()))) {
// clipping
// merge two alignment segments
double pident = Math.max(primary_record.pident(), secondary_record.pident());
int qstart = Math.min(primary_record.true_qstart(), secondary_record.true_qstart());
int qend = Math.max(primary_record.true_qend(), secondary_record.true_qend());
int sstart = Math.min(primary_record.true_sstart(), secondary_record.true_sstart());
int send = Math.max(primary_record.true_send(), secondary_record.true_send());
int length = qend - qstart + 1;
// replace primary record with merged record
// delete secondary record
Blast6Segment merged_record = primary_record.forward() ? new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, sstart, send, -1, -1) : new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, send, sstart, -1, -1);
tmp_records.set(i, merged_record);
tmp_records.remove(sec_j);
// the merged records need to be processed
--i;
}
}
}
// add to sel_recs
sel_recs.addAll(tmp_records);
}
// filter by alignment fraction
buff.clear();
buff.addAll(sel_recs);
sel_recs.clear();
for (Blast6Segment record : buff) {
if (record.length() / qry_ln >= this.min_frac)
sel_recs.add(record);
}
if (sel_recs.isEmpty()) {
// continue
continue;
}
// filter blast records
Collections.sort(sel_recs, new Blast6Segment.MatchIndentityComparator());
// process primary alignment
primary_record = sel_recs.get(0);
anchored_records.get(primary_record.sseqid()).add(primary_record);
aln_frac = primary_record.length() / qry_ln;
// and process
for (int i = 1; i < sel_recs.size(); i++) {
secondary_record = sel_recs.get(i);
if (secondary_record.pident() + this.diff_ident < primary_record.pident() || secondary_record.length() / qry_ln + this.diff_frac < aln_frac) {
break;
} else {
anchored_records.get(secondary_record.sseqid()).add(secondary_record);
}
}
}
br_blast.close();
for (Map.Entry<String, List<Blast6Segment>> entry : anchored_records.entrySet()) {
System.out.println(entry.getKey() + ": " + entry.getValue().size());
}
final BufferedWriter bw_map = Utils.getBufferedWriter(out_prefix + ".map");
final BufferedWriter bw_fa = Utils.getBufferedWriter(out_prefix + ".fa");
final Set<String> anchored_seqs = new HashSet<String>();
final List<String> sub_list = Sequence.parseSeqList(subject_file);
final List<GraphPath<Integer, DefaultWeightedEdge>> paths = new ArrayList<GraphPath<Integer, DefaultWeightedEdge>>();
for (String sub_sn : sub_list) {
blast6_records = anchored_records.get(sub_sn);
int nV = blast6_records.size(), count = 0;
// sort blast records
Collections.sort(blast6_records, new Blast6Segment.SubjectCoordinationComparator());
// consensus
int posUpto = 0, send_clip = 0;
int sstart, send, qstart, qend, qlen, tmp_int, qstart_clip, qend_clip, gap_size;
// distance to last 'N', start from next position to find longest common suffix-prefix
int prev_n = Integer.MAX_VALUE;
int mol_len = 0;
int bfs_dist;
String qseq;
int nS, nQ;
// first step: construct super scaffold
// will form a tree graph indicating the path through the contigs
// convert contig names to integer indices
// one contig could end up with multiple indices due to repeats
Map<Integer, String> ss_coordinate = new HashMap<Integer, String>();
int index = 0;
for (Blast6Segment record : blast6_records) ss_coordinate.put(index++, record.qseqid() + (record.forward() ? "" : "'"));
int qseqid1, qseqid2;
String os;
List<Double> dist_stats = new ArrayList<Double>();
for (int i = 0; i != nV; i++) {
qseqid1 = contig_coordinate.getKey(ss_coordinate.get(i));
os = blast6_records.get(i).qseqid() + (blast6_records.get(i).forward() ? "" : "'");
System.out.print(Utils.fixedLengthPaddingString(os, 40) + "\t");
double distance = Double.MAX_VALUE;
for (int j = 1; j <= window_size; j++) {
if (i + j >= nV) {
System.out.print(Utils.fixedLengthPaddingString("", 10) + ",\t");
continue;
}
qseqid2 = contig_coordinate.getKey(ss_coordinate.get(i + j));
GraphPath<Integer, DefaultWeightedEdge> e = bfs.getPath(qseqid1, qseqid2);
// GraphPath<Integer, DefaultWeightedEdge> e = dijkstra_paths.getPath( qseqid1, qseqid2 );
if (e != null) {
double d = e.getLength();
if (distance > d)
distance = d;
}
int l = 0;
if (e != null) {
List<Integer> vertices = e.getVertexList();
for (int k = 1; k < vertices.size() - 1; k++) l += qry_seqs.get(contig_coordinate.get(vertices.get(k))).seq_ln();
l -= (vertices.size() - 2) * this.kmer_size;
}
System.out.print((e == null ? Utils.fixedLengthPaddingString("Inf " + l, 10) + ",\t" : Utils.fixedLengthPaddingString(e.getLength() + " " + l, 10) + ",\t"));
}
System.out.println();
dist_stats.add(distance);
}
double[] dist_arr = ArrayUtils.toPrimitive(dist_stats.toArray(new Double[dist_stats.size()]));
Arrays.sort(dist_arr);
Percentile percentile = new Percentile();
double q1 = percentile.evaluate(dist_arr, 25);
double q3 = percentile.evaluate(dist_arr, 75);
// upper bound for outliers
// q3+1.5*IQR
double outlier = q3 + 1.5 * (q3 - q1);
System.out.println("Outlier upper bound: " + outlier);
// construct a tree graph
pseudo_tree = new DirectedWeightedPseudograph<Integer, DefaultWeightedEdge>(DefaultWeightedEdge.class);
for (int i : ss_coordinate.keySet()) pseudo_tree.addVertex(i);
Blast6Segment record1, record2;
// select the nearest neighbour(s)
for (int i = 0; i != nV; i++) {
System.out.println(i);
record1 = blast6_records.get(i);
send = record1.true_send();
qseqid1 = contig_coordinate.getKey(ss_coordinate.get(i));
paths.clear();
bfs_dist = Integer.MAX_VALUE;
for (int j = i + 1; j < nV; j++) {
record2 = blast6_records.get(j);
sstart = record2.true_sstart();
if (send < sstart)
break;
qseqid2 = contig_coordinate.getKey(ss_coordinate.get(j));
GraphPath<Integer, DefaultWeightedEdge> e = bfs.getPath(qseqid1, qseqid2);
// GraphPath<Integer, DefaultWeightedEdge> e = dijkstra_paths.getPath( qseqid1, qseqid2 );
if (e != null) {
tmp_int = e.getLength();
if (tmp_int == 0)
continue;
if (tmp_int < bfs_dist)
paths.clear();
paths.add(e);
bfs_dist = tmp_int;
}
}
if (paths.isEmpty())
continue;
for (GraphPath<Integer, DefaultWeightedEdge> path : paths) pseudo_tree.setEdgeWeight(pseudo_tree.addEdge(path.getStartVertex(), path.getEndVertex()), 1d);
}
// secondly fill gaps
long toc = System.nanoTime();
System.out.println(toc - tic + " nano secs.");
if (true)
throw new RuntimeException("!!!");
// second step: construct pseudo chromosomes
// 1. choose the first contig
int firstv = 0;
while (true) {
// int v2 = next(firstv+1);
if (true)
break;
}
// 2. extend pseudo chromosomes
StringBuilder seq_str = new StringBuilder();
for (int v = firstv; v < nV - 1; v++) {
if (++count % 10000 == 0)
myLogger.info(sub_sn + " " + count + "/" + nV + " done.");
Blast6Segment record = blast6_records.get(v);
qlen = qry_seqs.get(record.qseqid()).seq_ln();
sstart = record.sstart();
send = record.send();
qstart = record.qstart();
qend = record.qend();
if (sstart > send) {
// make sure sstart<send
tmp_int = sstart;
sstart = send;
send = tmp_int;
tmp_int = qstart;
qstart = qend;
qend = tmp_int;
}
if (qstart > qend) {
qstart_clip = qlen - qstart;
qend_clip = qend - 1;
qseq = Sequence.revCompSeq(qry_seqs.get(record.qseqid()).seq_str());
} else {
qstart_clip = qstart - 1;
qend_clip = qlen - qend;
qseq = qry_seqs.get(record.qseqid()).seq_str();
}
if (send < posUpto || sstart < posUpto && qstart_clip > max_clip) {
// TODO process
continue;
}
// find longest suffix-prefix
nS = seq_str.length();
nQ = qseq.length();
int nO = Math.min(prev_n, Math.min(nS, nQ));
outerloop: for (; nO >= min_overlap; nO--) {
int nS_i = nS - nO;
for (int i = 0; i < nO; i++) {
if (seq_str.charAt(nS_i + i) != qseq.charAt(i))
continue outerloop;
}
break outerloop;
}
if (nO < min_overlap) {
// otherwise will insert a large GAP max(pseudo_distance, max_gap)
if (posUpto > 0) {
if (sstart <= posUpto) {
// if too much overlap, then treat it as a contradiction
if (posUpto - sstart > min_overlap) {
// discard
continue;
} else {
// insert a min_gap
gap_size = min_gap;
}
} else {
// estimate gap size
gap_size = (sstart - posUpto) - (send_clip + qstart_clip);
if (gap_size < max_gap)
gap_size = max_gap;
}
bw_map.write("GAP\t" + gap_size + "\t0\t" + gap_size + "\t+\t" + sub_sn + "\t" + mol_len + "\t" + (mol_len + gap_size) + "\n");
seq_str.append(Sequence.polyN(gap_size));
mol_len += gap_size;
}
bw_map.write(record.qseqid() + "\t" + qlen + "\t");
if (qstart > qend) {
// reverse
bw_map.write("0\t" + qlen + "\t-\t");
} else {
// forward
bw_map.write("0\t" + qlen + "\t+\t");
}
seq_str.append(qseq);
bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen) + "\n");
mol_len += qlen;
prev_n = qlen;
anchored_seqs.add(record.qseqid());
} else {
// overlap found
// will not insert gap
// ====================
// /-------\
// /----\
// calculate overlaps
// process overlap
qstart = nO;
if (qstart == qlen)
continue;
bw_map.write(record.qseqid() + "\t" + (qlen - qstart) + "\t");
if (qstart > qend) {
// reverse
bw_map.write(0 + "\t" + (qlen - qstart) + "\t-\t");
} else {
// forward
bw_map.write(qstart + "\t" + qlen + "\t+\t");
}
bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen - qstart) + "\n");
mol_len += qlen - qstart;
prev_n += qlen - qstart;
seq_str.append(qseq.substring(qstart));
anchored_seqs.add(record.qseqid());
}
posUpto = send;
send_clip = qend_clip;
}
if (seq_str.length() > 0)
bw_fa.write(Sequence.formatOutput(sub_sn, seq_str.toString()));
}
bw_fa.close();
bw_map.close();
final BufferedWriter bw_ufa = Utils.getBufferedWriter(out_prefix + "_unplaced.fa");
for (String seq : qry_seqs.keySet()) if (!anchored_seqs.contains(seq))
bw_ufa.write(qry_seqs.get(seq).formatOutput());
bw_ufa.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
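The distances between neighbouring anchored contigs are summarised above with commons-math3's Percentile to derive a q3 + 1.5*IQR outlier bound. A small self-contained sketch of that computation (made-up distances, not polyGembler data; Max appears only because it is the class this listing is indexed under) might be:

import java.util.Arrays;
import org.apache.commons.math3.stat.descriptive.rank.Max;
import org.apache.commons.math3.stat.descriptive.rank.Percentile;

public class OutlierBoundSketch {
    public static void main(String[] args) {
        // hypothetical shortest-path distances between neighbouring anchored contigs
        double[] dist = { 1, 1, 2, 2, 2, 3, 3, 4, 5, 42 };
        Arrays.sort(dist);
        Percentile percentile = new Percentile();
        double q1 = percentile.evaluate(dist, 25);
        double q3 = percentile.evaluate(dist, 75);
        double outlier = q3 + 1.5 * (q3 - q1); // upper Tukey fence, as in the run() method
        System.out.println("q1=" + q1 + " q3=" + q3
                + " outlier bound=" + outlier + " max=" + new Max().evaluate(dist));
    }
}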
Use of org.apache.commons.math3.stat.descriptive.rank.Max in project SeqMonk by s-andrews.
The class MacsPeakCaller, method run.
public void run() {
// for (int i=0;i<selectedChIPStores.length;i++) {
// System.err.println("Selcted ChIP="+selectedChIPStores[i]);
// }
// for (int i=0;i<selectedInputStores.length;i++) {
// System.err.println("Selcted Input="+selectedInputStores[i]);
// }
// First find the tag offsets between the watson and crick strands
// Work out the total average coverage for all of the combined ChIP samples
long totalChIPCoverage = 0;
for (int i = 0; i < selectedChIPStores.length; i++) {
totalChIPCoverage += selectedChIPStores[i].getTotalReadLength();
}
if (cancel) {
generationCancelled();
return;
}
double averageChIPCoveragePerBase = totalChIPCoverage / (double) collection.genome().getTotalGenomeLength();
double lowerCoverage = averageChIPCoveragePerBase * minFoldEnrichment;
double upperCoverage = averageChIPCoveragePerBase * maxFoldEnrichment;
System.err.println("Coverage range for high confidence peaks is " + lowerCoverage + " - " + upperCoverage);
// Now we go through the data to find locations for our high confidence peaks so we can
// randomly select 1000 of these to use to find the offset between the two strands
Chromosome[] chromosomes = collection.genome().getAllChromosomes();
Vector<Probe> potentialHighConfidencePeaks = new Vector<Probe>();
for (int c = 0; c < chromosomes.length; c++) {
if (cancel) {
generationCancelled();
return;
}
// Time for an update
updateGenerationProgress("Finding high confidence peaks on chromosome " + chromosomes[c].name(), c, chromosomes.length);
Probe lastValidProbe = null;
for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize; startPosition += fragmentSize / 2) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
long totalLength = 0;
Probe probe = new Probe(chromosomes[c], startPosition, startPosition + fragmentSize);
for (int s = 0; s < selectedChIPStores.length; s++) {
long[] reads = selectedChIPStores[s].getReadsForProbe(probe);
for (int j = 0; j < reads.length; j++) {
totalLength += SequenceRead.length(reads[j]);
}
}
if (totalLength >= (lowerCoverage * probe.length()) && totalLength <= upperCoverage * probe.length()) {
if (lastValidProbe != null && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
} else if (lastValidProbe != null) {
// Check that the overall density over the region falls within our limits
totalLength = 0;
for (int s = 0; s < selectedChIPStores.length; s++) {
long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
for (int j = 0; j < reads.length; j++) {
totalLength += SequenceRead.length(reads[j]);
}
}
if (totalLength >= (lowerCoverage * lastValidProbe.length()) && totalLength <= upperCoverage * lastValidProbe.length()) {
potentialHighConfidencePeaks.add(lastValidProbe);
}
lastValidProbe = probe;
} else {
lastValidProbe = probe;
}
}
}
if (lastValidProbe != null) {
long totalLength = 0;
for (int s = 0; s < selectedChIPStores.length; s++) {
long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
for (int j = 0; j < reads.length; j++) {
totalLength += SequenceRead.length(reads[j]);
}
}
if (totalLength >= (lowerCoverage * lastValidProbe.length()) && totalLength <= upperCoverage * lastValidProbe.length()) {
potentialHighConfidencePeaks.add(lastValidProbe);
}
}
}
if (potentialHighConfidencePeaks.size() == 0) {
JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No high confidence peaks found", "Quitting generator", JOptionPane.INFORMATION_MESSAGE);
generationCancelled();
return;
}
// System.err.println("Found "+potentialHighConfidencePeaks.size()+" high confidence peaks");
// Now we select 1000 random probes from this set
Probe[] highConfidencePeaks = potentialHighConfidencePeaks.toArray(new Probe[0]);
Collections.shuffle(Arrays.asList(highConfidencePeaks));
Probe[] randomHighConfidenceProbes = new Probe[Math.min(highConfidencePeaks.length, 1000)];
for (int i = 0; i < randomHighConfidenceProbes.length; i++) {
randomHighConfidenceProbes[i] = highConfidencePeaks[i];
}
// Now find the average distance between forward / reverse reads in the candidate peaks
int[] distances = new int[randomHighConfidenceProbes.length];
// Sort the candidates so we don't do stupid stuff with the cache
Arrays.sort(randomHighConfidenceProbes);
for (int p = 0; p < randomHighConfidenceProbes.length; p++) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
distances[p] = getInterStrandDistance(randomHighConfidenceProbes[p], selectedChIPStores);
}
int medianInterStrandDistance = (int) SimpleStats.median(distances);
if (medianInterStrandDistance < 0)
medianInterStrandDistance = 0;
// System.err.println("Median inter strand difference = "+medianInterStrandDistance);
// Now we find the depth cutoff for overrepresented single tags using a binomial distribution
int totalReadCount = 0;
for (int i = 0; i < selectedChIPStores.length; i++) {
totalReadCount += selectedChIPStores[i].getTotalReadCount();
}
BinomialDistribution bin = new BinomialDistribution(totalReadCount, 1d / collection.genome().getTotalGenomeLength());
// We want to know what depth has a chance of less than 10^-5
int redundantThreshold = bin.inverseCumulativeProbability(1 - 0.00001d);
if (redundantThreshold < 1)
redundantThreshold = 1;
// System.err.println("Redundancy threshold is "+redundantThreshold);
// Now we construct a poisson distribution to work out the threshold to use for
// constructing a full candidate peak set.
updateGenerationProgress("Counting non-redundant reads", 0, 1);
// To do this we need to get the full non-redundant length from the whole set
int totalNonRedCount = getNonRedundantReadCount(selectedChIPStores, redundantThreshold);
// System.err.println("Total non-redundant sequences is "+totalNonRedCount);
// We need to know the median read length for the data
int readLength = 0;
for (int i = 0; i < selectedChIPStores.length; i++) {
readLength += selectedChIPStores[i].getTotalReadLength() / selectedChIPStores[i].getTotalReadCount();
}
readLength /= selectedChIPStores.length;
double expectedCountsPerWindow = getExpectedCountPerWindow(totalNonRedCount, collection.genome().getTotalGenomeLength(), fragmentSize, readLength);
PoissonDistribution poisson = new PoissonDistribution(expectedCountsPerWindow);
int readCountCutoff = poisson.inverseCumulativeProbability(1 - pValue);
// System.err.println("Threshold for enrichment in a window is "+readCountCutoff+" reads using a p-value of "+pValue+" and a mean of "+(totalNonRedCount/(collection.genome().getTotalGenomeLength()/(double)fragmentSize)));
// Now we go back through the whole dataset to do a search for all possible candidate probes
// We re-use the peak vector we came up with before.
potentialHighConfidencePeaks.clear();
for (int c = 0; c < chromosomes.length; c++) {
// Time for an update
updateGenerationProgress("Finding candidate peaks on chromosome " + chromosomes[c].name(), c, chromosomes.length);
Probe lastValidProbe = null;
for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize; startPosition += fragmentSize / 2) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
// We expand the region we're looking at by the inter-strand distance as we're going to
// be adjusting the read positions
Probe probe = new Probe(chromosomes[c], startPosition, (startPosition + fragmentSize - 1));
long[] mergedProbeReads = getReadsFromDataStoreCollection(probe, selectedChIPStores, medianInterStrandDistance);
mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
SequenceRead.sort(mergedProbeReads);
int thisProbeOverlapCount = 0;
for (int i = 0; i < mergedProbeReads.length; i++) {
if (SequenceRead.overlaps(mergedProbeReads[i], probe.packedPosition())) {
++thisProbeOverlapCount;
}
}
if (thisProbeOverlapCount > readCountCutoff) {
if (lastValidProbe != null && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
} else if (lastValidProbe != null) {
potentialHighConfidencePeaks.add(lastValidProbe);
lastValidProbe = probe;
} else {
lastValidProbe = probe;
}
}
}
if (lastValidProbe != null) {
potentialHighConfidencePeaks.add(lastValidProbe);
}
}
// Finally we re-filter the peaks we have using local poisson distributions with densities taken
// from either the input samples (if there are any), or the local region. The densities are
// estimated over 1, 5 and 10 kb around the peak and genome-wide, and the max of these is taken.
// If there is no input then the 1kb region is not used.
Probe[] allCandidateProbes = potentialHighConfidencePeaks.toArray(new Probe[0]);
// Work out which stores we're using to validate against.
DataStore[] validationStores;
boolean useInput = false;
double inputCorrection = 1;
int validationNonRedCount;
if (selectedInputStores != null && selectedInputStores.length > 0) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
validationStores = selectedInputStores;
useInput = true;
// We also need to work out the total number of nonredundant sequences
// in the input so we can work out a scaling factor so that the densities
// for input and chip are comparable.
validationNonRedCount = getNonRedundantReadCount(validationStores, redundantThreshold);
inputCorrection = totalNonRedCount / (double) validationNonRedCount;
System.err.println("From chip=" + totalNonRedCount + " input=" + validationNonRedCount + " correction is " + inputCorrection);
} else {
validationStores = selectedChIPStores;
validationNonRedCount = totalNonRedCount;
}
Vector<Probe> finalValidatedProbes = new Vector<Probe>();
for (int p = 0; p < allCandidateProbes.length; p++) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
if (p % 100 == 0) {
updateGenerationProgress("Validated " + p + " out of " + allCandidateProbes.length + " raw peaks", p, allCandidateProbes.length);
}
// System.err.println("Validating "+allCandidateProbes[p].chromosome()+":"+allCandidateProbes[p].start()+"-"+allCandidateProbes[p].end());
// We now need to find the maximum read density per 2*bandwidth against which
// we're going to validate this peak
// We're going to get all reads within 10kb of the peak, and then we can subselect from there
int midPoint = allCandidateProbes[p].middle();
Probe region10kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 5000, 1), Math.min(midPoint + 4999, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
Probe region5kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 2500, 1), Math.min(midPoint + 2499, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
Probe region1kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 500, 1), Math.min(midPoint + 499, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
// Get the probes for the largest region
long[] thisRegionReads = getReadsFromDataStoreCollection(region10kb, validationStores, 0);
// Deduplicate so it's a fair comparison
// Should we recalculate the redundant threshold based on the input coverage?
thisRegionReads = deduplicateReads(thisRegionReads, redundantThreshold);
int region10kbcount = thisRegionReads.length;
int region5kbcount = 0;
int region1kbcount = 0;
// Go through the reads seeing if they fit into the 5 or 1kb regions
for (int r = 0; r < thisRegionReads.length; r++) {
if (SequenceRead.overlaps(region5kb.packedPosition(), thisRegionReads[r]))
++region5kbcount;
if (SequenceRead.overlaps(region1kb.packedPosition(), thisRegionReads[r]))
++region1kbcount;
}
// System.err.println("Input counts 10kb="+region10kbcount+" 5kb="+region5kbcount+" 1kb="+region1kbcount);
// Convert to densities per window and adjust for global coverage
double globalDensity = getExpectedCountPerWindow(validationNonRedCount, collection.genome().getTotalGenomeLength(), allCandidateProbes[p].length(), readLength) * inputCorrection;
double density10kb = getExpectedCountPerWindow(region10kbcount, region10kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
double density5kb = getExpectedCountPerWindow(region5kbcount, region5kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
double density1kb = getExpectedCountPerWindow(region1kbcount, region1kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
// Find the highest density to use for the validation
double highestDensity = globalDensity;
if (density10kb > highestDensity)
highestDensity = density10kb;
if (density5kb > highestDensity)
highestDensity = density5kb;
if (useInput && density1kb > highestDensity)
highestDensity = density1kb;
// System.err.println("Global="+globalDensity+" 10kb="+density10kb+" 5kb="+density5kb+" 1kb="+density1kb+" using="+highestDensity);
// Construct a poisson distribution with this density
PoissonDistribution localPoisson = new PoissonDistribution(highestDensity);
// System.err.println("Cutoff from global="+(new PoissonDistribution(globalDensity)).inverseCumulativeProbability(1-pValue)+" 10kb="+(new PoissonDistribution(density10kb)).inverseCumulativeProbability(1-pValue)+" 5kb="+(new PoissonDistribution(density5kb)).inverseCumulativeProbability(1-pValue)+" 1kb="+(new PoissonDistribution(density1kb)).inverseCumulativeProbability(1-pValue));
// Now check to see if the actual count from this peak is enough to still pass
long[] mergedProbeReads = getReadsFromDataStoreCollection(allCandidateProbes[p], selectedChIPStores, medianInterStrandDistance);
mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
SequenceRead.sort(mergedProbeReads);
int thisProbeOverlapCount = 0;
for (int i = 0; i < mergedProbeReads.length; i++) {
if (SequenceRead.overlaps(mergedProbeReads[i], allCandidateProbes[p].packedPosition())) {
++thisProbeOverlapCount;
}
}
if (thisProbeOverlapCount > localPoisson.inverseCumulativeProbability(1 - pValue)) {
finalValidatedProbes.add(allCandidateProbes[p]);
// System.err.println("Adding probe to final set");
}
}
// System.err.println("From "+allCandidateProbes.length+" candidates "+finalValidatedProbes.size()+" peaks were validated");
ProbeSet finalSet = new ProbeSet(getDescription(), finalValidatedProbes.toArray(new Probe[0]));
generationComplete(finalSet);
}
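Two commons-math3 distributions drive the thresholds above: a BinomialDistribution for the per-position redundancy cutoff and a PoissonDistribution for the per-window enrichment cutoff. A compact sketch with invented figures, ignoring the read-length adjustment that getExpectedCountPerWindow applies, could look like this:

import org.apache.commons.math3.distribution.BinomialDistribution;
import org.apache.commons.math3.distribution.PoissonDistribution;

public class PeakThresholdSketch {
    public static void main(String[] args) {
        // invented figures: 20 million reads over a 3 Gb genome, 300 bp windows, p = 0.05
        int totalReadCount = 20_000_000;
        long genomeLength = 3_000_000_000L;
        int fragmentSize = 300;
        double pValue = 0.05;
        // depth at a single position with a chance below 1e-5 of arising at random
        BinomialDistribution bin = new BinomialDistribution(totalReadCount, 1d / genomeLength);
        int redundantThreshold = Math.max(1, bin.inverseCumulativeProbability(1 - 0.00001d));
        // expected reads per window, then the Poisson cutoff for calling a window enriched
        double expectedPerWindow = totalReadCount * (double) fragmentSize / genomeLength;
        PoissonDistribution poisson = new PoissonDistribution(expectedPerWindow);
        int readCountCutoff = poisson.inverseCumulativeProbability(1 - pValue);
        System.out.println("redundantThreshold=" + redundantThreshold
                + " readCountCutoff=" + readCountCutoff);
    }
}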
Use of org.apache.commons.math3.stat.descriptive.rank.Max in project SeqMonk by s-andrews.
The class AntisenseTranscriptionPipeline, method startPipeline.
protected void startPipeline() {
// We first need to generate probes over all of the features listed in
// the feature types. The probes should cover the whole area of the
// feature regardless of where it splices.
Vector<Probe> probes = new Vector<Probe>();
double pValue = optionsPanel.pValue();
QuantitationStrandType readFilter = optionsPanel.readFilter();
long[] senseCounts = new long[data.length];
long[] antisenseCounts = new long[data.length];
Chromosome[] chrs = collection().genome().getAllChromosomes();
for (int c = 0; c < chrs.length; c++) {
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Getting total antisense rate for chr" + chrs[c].name(), c, chrs.length * 2);
Feature[] features = getValidFeatures(chrs[c]);
for (int f = 0; f < features.length; f++) {
if (cancel) {
progressCancelled();
return;
}
Probe p = new Probe(chrs[c], features[f].location().start(), features[f].location().end(), features[f].location().strand(), features[f].name());
probes.add(p);
for (int d = 0; d < data.length; d++) {
long[] reads = data[d].getReadsForProbe(p);
for (int r = 0; r < reads.length; r++) {
if (readFilter.useRead(p, reads[r])) {
senseCounts[d] += SequenceRead.length(reads[r]);
} else {
antisenseCounts[d] += SequenceRead.length(reads[r]);
}
}
}
}
}
Probe[] allProbes = probes.toArray(new Probe[0]);
collection().setProbeSet(new ProbeSet("Features over " + optionsPanel.getSelectedFeatureType(), allProbes));
// Now we can work out the overall antisense rate
double[] antisenseProbability = new double[data.length];
for (int d = 0; d < data.length; d++) {
System.err.println("Antisense counts are " + antisenseCounts[d] + " sense counts are " + senseCounts[d]);
antisenseProbability[d] = antisenseCounts[d] / (double) (antisenseCounts[d] + senseCounts[d]);
System.err.println("Antisense probability for " + data[d].name() + " is " + antisenseProbability[d]);
}
// Now we can quantitate each individual feature and test for whether it is significantly
// showing antisense expression
ArrayList<Vector<ProbeTTestValue>> significantProbes = new ArrayList<Vector<ProbeTTestValue>>();
for (int d = 0; d < data.length; d++) {
significantProbes.add(new Vector<ProbeTTestValue>());
}
int[] readLengths = new int[data.length];
for (int d = 0; d < readLengths.length; d++) {
readLengths[d] = data[d].getMaxReadLength();
System.err.println("For " + data[d].name() + " max read len is " + readLengths[d]);
}
for (int c = 0; c < chrs.length; c++) {
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Quantitating features on chr" + chrs[c].name(), chrs.length + c, chrs.length * 2);
Probe[] thisChrProbes = collection().probeSet().getProbesForChromosome(chrs[c]);
for (int p = 0; p < thisChrProbes.length; p++) {
for (int d = 0; d < data.length; d++) {
if (cancel) {
progressCancelled();
return;
}
long senseCount = 0;
long antisenseCount = 0;
long[] reads = data[d].getReadsForProbe(thisChrProbes[p]);
for (int r = 0; r < reads.length; r++) {
if (readFilter.useRead(thisChrProbes[p], reads[r])) {
// TODO: Just count overlap?
senseCount += SequenceRead.length(reads[r]);
} else {
antisenseCount += SequenceRead.length(reads[r]);
}
}
// if (thisChrProbes[p].name().equals("RP4-798A10.2")) {
// System.err.println("Raw base counts are sense="+senseCount+" anti="+antisenseCount+" from "+reads.length+" reads");
// }
int senseReads = (int) (senseCount / readLengths[d]);
int antisenseReads = (int) (antisenseCount / readLengths[d]);
// if (thisChrProbes[p].name().equals("RP4-798A10.2")) {
// System.err.println("Raw read counts are sense="+senseReads+" anti="+antisenseReads+" from "+reads.length+" reads");
// }
BinomialDistribution bd = new BinomialDistribution(senseReads + antisenseReads, antisenseProbability[d]);
// cumulativeProbability(k) gives the probability of a value less than or equal to k, so we
// evaluate at antisenseReads - 1 and subtract from one to get the probability of this many or more.
double thisPValue = 1 - bd.cumulativeProbability(antisenseReads - 1);
if (antisenseReads == 0)
thisPValue = 1;
// We have to add all results at this stage so we don't mess up the multiple
// testing correction later on.
significantProbes.get(d).add(new ProbeTTestValue(thisChrProbes[p], thisPValue));
double expected = ((senseReads + antisenseReads) * antisenseProbability[d]);
if (expected < 1)
expected = 1;
float obsExp = antisenseReads / (float) expected;
data[d].setValueForProbe(thisChrProbes[p], obsExp);
}
}
}
// filtering those which pass our p-value cutoff
for (int d = 0; d < data.length; d++) {
ProbeTTestValue[] ttestResults = significantProbes.get(d).toArray(new ProbeTTestValue[0]);
BenjHochFDR.calculateQValues(ttestResults);
ProbeList newList = new ProbeList(collection().probeSet(), "Antisense < " + pValue + " in " + data[d].name(), "Probes showing significant antisense transcription from a basal level of " + antisenseProbability[d] + " with a cutoff of " + pValue, "FDR");
for (int i = 0; i < ttestResults.length; i++) {
if (ttestResults[i].probe.name().equals("RP4-798A10.2")) {
System.err.println("Raw p=" + ttestResults[i].p + " q=" + ttestResults[i].q);
}
if (ttestResults[i].q < pValue) {
newList.addProbe(ttestResults[i].probe, (float) ttestResults[i].q);
}
}
}
StringBuffer quantitationDescription = new StringBuffer();
quantitationDescription.append("Antisense transcription pipeline quantitation ");
quantitationDescription.append(". Directionality was ");
quantitationDescription.append(optionsPanel.libraryTypeBox.getSelectedItem());
if (optionsPanel.ignoreOverlaps()) {
quantitationDescription.append(". Ignoring existing overlaps");
}
quantitationDescription.append(". P-value cutoff was ");
quantitationDescription.append(optionsPanel.pValue());
collection().probeSet().setCurrentQuantitation(quantitationDescription.toString());
quantitatonComplete();
}
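The per-feature test above asks how likely it is to see at least the observed number of antisense reads under the basal antisense rate. A minimal sketch of that binomial tail computation, with invented counts rather than SeqMonk data, might be:

import org.apache.commons.math3.distribution.BinomialDistribution;

public class AntisensePValueSketch {
    public static void main(String[] args) {
        // invented counts: 190 sense and 10 antisense reads, 3% basal antisense rate
        int senseReads = 190, antisenseReads = 10;
        double antisenseProbability = 0.03;
        BinomialDistribution bd = new BinomialDistribution(senseReads + antisenseReads, antisenseProbability);
        // cumulativeProbability(k) is P(X <= k), so evaluating at antisenseReads - 1
        // and subtracting from one gives P(X >= antisenseReads)
        double thisPValue = antisenseReads == 0 ? 1 : 1 - bd.cumulativeProbability(antisenseReads - 1);
        double expected = Math.max(1, (senseReads + antisenseReads) * antisenseProbability);
        System.out.println("p=" + thisPValue + " obs/exp=" + antisenseReads / expected);
    }
}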