Use of ini.trakem2.vector.Editions in project TrakEM2 (by trakem2).
From class Compare, the method reliabilityAnalysis:
public static final Bureaucrat reliabilityAnalysis(final String[] ignore, final boolean output_arff, final boolean weka_classify, final boolean show_dialog, final double delta, final double wi, final double wd, final double wm) {
// gather all open projects
final Project[] p = Project.getProjects().toArray(new Project[0]);
final Worker worker = new Worker("Reliability by name") {
@Override
public void run() {
startedWorking();
try {
final CATAParameters cp = new CATAParameters();
cp.delta = delta;
if (show_dialog && !cp.setup(false, null, false, false)) {
finishedWorking();
return;
}
Object[] ob = gatherChains(p, cp, ignore);
final ArrayList<Chain> chains = (ArrayList<Chain>) ob[0];
// to keep track of each project's chains
final ArrayList[] p_chains = (ArrayList[]) ob[1];
ob = null;
if (null == chains) {
finishedWorking();
return;
}
// For each pipe in a brain:
// - score against all other brains in which that pipe name exists,
// - record the score position within that brain.
//
final ExecutorService exec = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
// for each individual lineage:
final TreeMap<String, ArrayList<Integer>> indices = new TreeMap<String, ArrayList<Integer>>();
final ArrayList<CITuple> cin = new ArrayList<CITuple>();
// for each family:
final TreeMap<String, ArrayList<Integer>> indices_f = new TreeMap<String, ArrayList<Integer>>();
final ArrayList<CITuple> cin_f = new ArrayList<CITuple>();
final ArrayList<Future> fus = new ArrayList<Future>();
// For neural network analysis:
final StringBuilder arff = output_arff ? new StringBuilder("@RELATION Lineages\n\n") : null;
if (output_arff) {
arff.append("@ATTRIBUTE APD NUMERIC\n");
arff.append("@ATTRIBUTE CPD NUMERIC\n");
arff.append("@ATTRIBUTE STD NUMERIC\n");
arff.append("@ATTRIBUTE MPD NUMERIC\n");
arff.append("@ATTRIBUTE PM NUMERIC\n");
arff.append("@ATTRIBUTE LEV NUMERIC\n");
arff.append("@ATTRIBUTE SIM NUMERIC\n");
arff.append("@ATTRIBUTE PRX NUMERIC\n");
arff.append("@ATTRIBUTE PRM NUMERIC\n");
// length ratio: len(query) / len(ref)
arff.append("@ATTRIBUTE LR NUMERIC\n");
arff.append("@ATTRIBUTE TR NUMERIC\n");
arff.append("@ATTRIBUTE CLASS {false,true}\n");
arff.append("\n@DATA\n");
}
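// Illustrative only (made-up values): each @DATA row emitted below carries the
// 11 numeric attributes in the order declared above, then the class label, e.g.:
// 12.3,45.6,7.8,11.2,0.4,210.0,0.87,9.1,8.3,0.95,1.02,true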
// Count the number of times the decision tree classifies a match as good, versus the number of times it should be good
// observed
final AtomicInteger obs_good = new AtomicInteger(0);
// observed wrong
final AtomicInteger obs_wrong = new AtomicInteger(0);
// expected
final AtomicInteger exp_good = new AtomicInteger(0);
final AtomicInteger exp_bad = new AtomicInteger(0);
final AtomicInteger obs_bad_classified_good_ones = new AtomicInteger(0);
final AtomicInteger obs_well_classified_bad_ones = new AtomicInteger(0);
// incremented by one when a lineage to compare is not found at all in the brain used as reference
final AtomicInteger not_found = new AtomicInteger(0);
final AtomicInteger already_classified = new AtomicInteger(0);
Method classify_ = null;
if (weka_classify) {
try {
classify_ = Class.forName("lineage.LineageClassifier").getDeclaredMethod("classify", new Class[] { double[].class });
} catch (final Exception e) {
IJError.print(e);
}
}
final Method classify = classify_;
// All ordered pairs of distinct projects: (i,j) and (j,i) are both visited, since the roles of query and reference differ, even though each pipe pairwise comparison itself is symmetric.
for (int _i = 0; _i < p_chains.length; _i++) {
final int i = _i;
Utils.log2("Project " + p[i] + " has " + p_chains[i].size() + " chains.");
for (int _j = 0; _j < p_chains.length; _j++) {
final int j = _j;
// skip same project (would have a score of zero, identical.)
if (i == j)
continue;
final String[] titles_j = new String[p_chains[j].size()];
int next = 0;
for (final Chain cj : (ArrayList<Chain>) p_chains[j]) {
final String t = cj.getCellTitle();
titles_j[next++] = t.substring(0, t.indexOf(' '));
}
// families:
final TreeSet<String> ts_families = new TreeSet<String>();
for (int f = 0; f < titles_j.length; f++) {
// extract family name from title: read the first continuous string of capital letters
final String title = titles_j[f];
int u = 0;
for (; u < title.length(); u++) {
if (!Character.isUpperCase(title.charAt(u)))
break;
}
ts_families.add(title.substring(0, u));
}
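// Example (hypothetical title): "DPLM2" yields the family "DPLM", the leading
// run of upper-case letters; a title starting lower-case yields "" as its family.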
final ArrayList<String> families = new ArrayList<String>(ts_families);
fus.add(exec.submit(new Callable() {
@Override
public Object call() {
// All chains of one project to all chains of the other:
for (final Chain chain : (ArrayList<Chain>) p_chains[i]) {
final VectorString3D vs1 = chain.vs;
// Prepare title
String title = chain.getCellTitle();
title = title.substring(0, title.indexOf(' '));
// check if the other project j contains a chain of name chain.getCellTitle() up to the space.
int title_index = -1;
for (int k = 0; k < titles_j.length; k++) {
if (title.equals(titles_j[k])) {
title_index = k;
break;
}
}
if (-1 == title_index) {
Utils.log2(title + " not found in project " + p[j]);
if (weka_classify)
not_found.incrementAndGet();
continue;
}
// should be there:
if (weka_classify) {
exp_good.incrementAndGet();
exp_bad.addAndGet(titles_j.length - 1);
}
final ArrayList<ChainMatch> list = new ArrayList<ChainMatch>();
// extract family name from title: read the first continuous string of capital letters
int u = 0;
for (; u < title.length(); u++) {
if (!Character.isUpperCase(title.charAt(u)))
break;
}
final String family_name = title.substring(0, u);
String last_classify = null;
int g = 0;
for (final Chain cj : (ArrayList<Chain>) p_chains[j]) {
final VectorString3D vs2 = cj.vs;
final Object[] ob = findBestMatch(vs1, vs2, cp.delta, cp.skip_ends, cp.max_mut, cp.min_chunk, cp.distance_type, cp.direct, cp.substring_matching, wi, wd, wm);
final Editions ed = (Editions) ob[0];
final double[] stats = ed.getStatistics(cp.skip_ends, cp.max_mut, cp.min_chunk, cp.score_mut_only);
final ChainMatch cm = new ChainMatch(cj, null, ed, stats, score(ed.getSimilarity(), ed.getDistance(), stats[3], Compare.W));
cm.title = titles_j[g];
list.add(cm);
g++;
if (weka_classify) {
// from decision tree: is it good?
final double[] param = new double[11];
for (int s = 0; s < stats.length; s++) param[s] = stats[s];
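// The 11 slots presumably mirror the 11 numeric ARFF attributes declared above;
// any slots beyond stats.length (if fewer) remain 0.0.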
try {
if (((Boolean) classify.invoke(null, param)).booleanValue()) {
if (null != last_classify) {
Utils.log2("ALREADY CLASSIFIED " + title + " as " + last_classify + " (now: " + cm.title + " )");
already_classified.incrementAndGet();
}
last_classify = cm.title;
if (title.equals(cm.title)) {
obs_good.incrementAndGet();
} else {
Utils.log2("WRONG CLASSIFICATION of " + title + " as " + cm.title);
obs_wrong.incrementAndGet();
}
} else {
if (title.equals(cm.title)) {
obs_bad_classified_good_ones.incrementAndGet();
} else {
obs_well_classified_bad_ones.incrementAndGet();
}
}
} catch (final Exception ee) {
IJError.print(ee);
}
}
}
// sort scores:
Compare.sortMatches(list, cp.distance_type, cp.distance_type_2, cp.min_matches);
if (output_arff) {
// Take the top 8 matches (bounded by the list size) and put them into the training set for WEKA, in ARFF format
for (int h = 0; h < Math.min(8, list.size()); h++) {
final ChainMatch cm = list.get(h);
final StringBuilder sb = new StringBuilder();
sb.append(cm.phys_dist).append(',').append(cm.cum_phys_dist).append(',').append(cm.stdDev).append(',').append(cm.median).append(',').append(cm.prop_mut).append(',').append(cm.ed.getDistance()).append(',').append(cm.seq_sim).append(',').append(cm.proximity).append(',').append(cm.proximity_mut).append(',').append(cm.prop_len).append(',').append(cm.tortuosity_ratio).append(',').append(title.equals(cm.title)).append('\n');
//.append('-').append(cm.title.startsWith(family_name)).append('\n');
synchronized (arff) {
arff.append(sb);
}
}
}
// record scoring index
int f = 0;
boolean found_specific = false;
boolean found_family = false;
for (final ChainMatch cm : list) {
// Exact match: for each individual lineage
if (!found_specific && title.equals(cm.title)) {
synchronized (indices) {
ArrayList<Integer> al = indices.get(title);
if (null == al) {
al = new ArrayList<Integer>();
indices.put(title, al);
// so I can keep a list of chains sorted by name
cin.add(new CITuple(title, chain, al));
}
al.add(f);
}
found_specific = true;
}
if (!found_family && cm.title.startsWith(family_name)) {
synchronized (indices_f) {
ArrayList<Integer> al = indices_f.get(family_name);
if (null == al) {
al = new ArrayList<Integer>();
indices_f.put(family_name, al);
cin_f.add(new CITuple(family_name, chain, al));
}
al.add(f);
}
found_family = true;
}
if (found_specific && found_family) {
break;
}
//
f++;
}
if (!found_specific) {
Utils.log2("NOT FOUND any match for " + title + " within a list of size " + list.size() + ", in project " + chain.getRoot().getProject());
}
}
return null;
}
}));
}
}
for (final Future fu : fus) {
try {
fu.get();
} catch (final Exception e) {
IJError.print(e);
}
}
exec.shutdownNow();
if (weka_classify) {
// so stateful ... it's a sin.
try {
Class.forName("lineage.LineageClassifier").getDeclaredMethod("flush", new Class[] {}).invoke(null, new Object[] {});
} catch (final Exception e) {
IJError.print(e);
}
}
// export ARFF for neural network training
if (output_arff) {
Utils.saveToFile(new File(System.getProperty("user.dir") + "/lineages.arff"), arff.toString());
}
// Show the results from indices map
final StringBuilder sb = new StringBuilder();
// scoring index vs count of occurrences
final TreeMap<Integer, Integer> sum = new TreeMap<Integer, Integer>();
// best scoring index of best family member vs count of occurrences
final TreeMap<Integer, Integer> sum_f = new TreeMap<Integer, Integer>();
// scoring index vs count of occurrences, within each family
final TreeMap<String, TreeMap<Integer, Integer>> sum_fw = new TreeMap<String, TreeMap<Integer, Integer>>();
// From collected data, several kinds of results:
// - a list of how well each chain scores: its index position in the sorted list of scores of one to many.
// - a list of how well each chain scores relative to family: the lowest (best) index position of a lineage of the same family in the sorted list of scores.
sb.append("List of scoring indices for each (starting at index 1, aka best possible score):\n");
for (final CITuple ci : cin) {
// sort indices in place
Collections.sort(ci.list);
// count occurrences of each scoring index
// lowest possible index
int last = 0;
int count = 1;
for (final int i : ci.list) {
if (last == i)
count++;
else {
sb.append(ci.title).append(' ').append(last + 1).append(' ').append(count).append('\n');
// reset
last = i;
count = 1;
}
// global count of occurrences
final Integer oi = Integer.valueOf(i);
sum.put(oi, (sum.containsKey(oi) ? sum.get(oi) : 0) + 1);
// Same thing but not for all lineages, but only for lineages within a family:
// extract family name from title: read the first continuous string of capital letters
int u = 0;
for (; u < ci.title.length(); u++) {
if (!Character.isUpperCase(ci.title.charAt(u)))
break;
}
final String family_name = ci.title.substring(0, u);
TreeMap<Integer, Integer> sfw = sum_fw.get(family_name);
if (null == sfw) {
sfw = new TreeMap<Integer, Integer>();
sum_fw.put(family_name, sfw);
}
sfw.put(oi, (sfw.containsKey(oi) ? sfw.get(oi) : 0) + 1);
}
if (0 != count)
sb.append(ci.title).append(' ').append(last + 1).append(' ').append(count).append('\n');
// find the very-off ones:
if (last > 6) {
Utils.log2("BAD index " + last + " for chain " + ci.title + " " + ci.chain.getRoot() + " of project " + ci.chain.getRoot().getProject());
}
}
sb.append("===============================\n");
// Family score:
for (final CITuple ci : cin_f) {
// sort indices in place
Collections.sort(ci.list);
// count occurrences of each scoring index
// lowest possible index
int last = 0;
int count = 1;
for (final int i : ci.list) {
if (last == i)
count++;
else {
// reset
last = i;
count = 1;
}
// global count of occurrences
final Integer oi = Integer.valueOf(i);
sum_f.put(oi, (sum_f.containsKey(oi) ? sum_f.get(oi) : 0) + 1);
}
}
sb.append("===============================\n");
// - a summarizing histogram that collects how many 1st, how many 2nd, etc. in total, normalized to total number of one-to-many matches performed (i.e. the number of scoring indices recorded.)
//
{
sb.append("Global count of index ocurrences:\n");
int total = 0;
int top2 = 0;
int top5 = 0;
for (final Map.Entry<Integer, Integer> e : sum.entrySet()) {
sb.append(e.getKey()).append(' ').append(e.getValue()).append('\n');
total += e.getValue();
if (e.getKey() < 2)
top2 += e.getValue();
if (e.getKey() < 5)
top5 += e.getValue();
}
sb.append("total: ").append(total).append('\n');
sb.append("top1: ").append(sum.get(sum.firstKey()) / (float) total).append('\n');
sb.append("top2: ").append(top2 / (float) total).append('\n');
sb.append("top5: ").append(top5 / (float) total).append('\n');
sb.append("===============================\n");
}
sb.append("Family-wise count of index ocurrences:\n");
for (final Map.Entry<String, TreeMap<Integer, Integer>> fe : sum_fw.entrySet()) {
int total = 0;
int top5 = 0;
for (final Map.Entry<Integer, Integer> e : fe.getValue().entrySet()) {
sb.append(fe.getKey()).append(' ').append(e.getKey()).append(' ').append(e.getValue()).append('\n');
total += e.getValue();
if (e.getKey() < 5)
top5 += e.getValue();
}
sb.append("total: ").append(total).append('\n');
sb.append("top1: ").append(fe.getValue().get(fe.getValue().firstKey()) / (float) total).append('\n');
sb.append("top5: ").append(top5 / (float) total).append('\n');
}
sb.append("===============================\n");
// - the percent of first score being the correct one:
double first = 0;
double first_5 = 0;
double all = 0;
for (final Map.Entry<Integer, Integer> e : sum.entrySet()) {
final int k = e.getKey();
final int a = e.getValue();
all += a;
if (0 == k)
first = a;
if (k < 5)
first_5 += a;
}
// STORE the results: { top-1 ratio, top-5 ratio }
this.result = new double[] { first / all, first_5 / all };
sb.append("Global count of index occurrences family-wise:\n");
for (final Map.Entry<Integer, Integer> e : sum_f.entrySet()) {
sb.append(e.getKey()).append(' ').append(e.getValue()).append('\n');
}
sb.append("===============================\n");
// - a summarizing histogram of how well each chain scores (4/4, 3/4, 2/4, 1/4, 0/4 only for those that have 4 homologous members.)
// Must consider that there are 5 projects taken in pairs with repetition.
sb.append("A summarizing histogram of how well each chain scores, for those that have 4 homologous members. It's the number of 1st scores (zeroes) versus the total number of scores:\n");
// First, classify them in having 4, 3, 2, 1
// For 5 brains: 5! / (5-2)! = 5 * 4 = 20 --- 5 elements taken in groups of 2, where order matters
// For 4 brains: 4! / (4-2)! = 4 * 3 = 12
// For 3 brains: 3! / (3-2)! = 3 * 2 = 6;
final TreeMap<Integer, ArrayList<String>> hsc = new TreeMap<Integer, ArrayList<String>>();
for (final CITuple ci : cin) {
final int size = ci.list.size();
ArrayList<String> al = hsc.get(size);
if (null == al) {
al = new ArrayList<String>();
hsc.put(size, al);
}
// Count the number of 0s -- top scoring
int count = 0;
for (final Integer i : ci.list) {
if (0 == i)
count++;
else
break;
}
al.add(new StringBuilder(ci.title).append(" =").append(count).append('/').append(ci.list.size()).append('\n').toString());
}
// Then just print:
for (final Map.Entry<Integer, ArrayList<String>> e : hsc.entrySet()) {
sb.append("For ").append(e.getKey()).append(" matches:\n");
for (final String s : e.getValue()) sb.append(s);
}
sb.append("=========================\n");
// Family-wise, count the number of zeros per family:
sb.append("Number of top scoring per family:\n");
final TreeMap<String, String> family_scores = new TreeMap<String, String>();
for (final CITuple ci : cin_f) {
int count = 0;
for (final Integer i : ci.list) {
if (0 == i)
count++;
else
// ci.list is sorted
break;
}
family_scores.put(ci.title, new StringBuilder().append(ci.title).append(" =").append(count).append('/').append(ci.list.size()).append('\n').toString());
}
// Now print sorted by family name:
for (final String s : family_scores.values()) {
sb.append(s);
}
sb.append("=========================\n");
if (weka_classify) {
sb.append("Decision tree:\n");
sb.append("Expected good matches: " + exp_good.get() + "\n");
sb.append("Expected bad matches: " + exp_bad.get() + "\n");
sb.append("Observed good matches: " + obs_good.get() + "\n");
sb.append("Observed bad matches: " + obs_wrong.get() + "\n");
sb.append("Observed well classified bad ones: " + obs_well_classified_bad_ones.get() + "\n");
sb.append("Observed bad classified good ones: " + obs_bad_classified_good_ones.get() + "\n");
sb.append("Not found, so skipped: " + not_found.get() + "\n");
sb.append("Already classified: " + already_classified.get() + "\n");
sb.append("=========================\n");
}
if (output_arff) {
Utils.log(sb.toString());
} else {
Utils.log2(sb.toString());
}
} catch (final Exception e) {
e.printStackTrace();
} finally {
finishedWorking();
}
}
};
return Bureaucrat.createAndStart(worker, p);
}
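A minimal, hedged invocation sketch (the argument values below are illustrative placeholders, not recommended settings):
final Bureaucrat burro = Compare.reliabilityAnalysis(
new String[0], // ignore: nothing excluded (semantics defined by gatherChains)
true, // output_arff: write <user.dir>/lineages.arff and log via Utils.log
false, // weka_classify: skip the lineage.LineageClassifier pass
false, // show_dialog: use the given delta without the CATAParameters setup dialog
1.0, // delta: resampling step in calibrated units (placeholder)
1, 1, 1); // wi, wd, wm: insertion/deletion/mutation weights (placeholders)
// The returned Bureaucrat runs asynchronously; once the worker finishes, its
// 'result' field holds { top-1 ratio, top-5 ratio } as computed above.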
Use of ini.trakem2.vector.Editions in project TrakEM2 (by trakem2).
From class Compare, the method condense:
/**
* Do an all-to-all distance matrix of the given vs, then perform neighbor joining: at each step do a weighted merge of the two VectorString3D being joined, and finally output the resulting condensed, unique VectorString3D, whose source array is filled with all the points that contribute to each point in it. Expects VectorString3D that are already calibrated and transformed.
*/
public static VectorString3D condense(final CATAParameters cp, final VectorString3D[] vs, final Worker worker) throws Exception {
// Trivial case 1:
if (1 == vs.length)
return vs[0];
// Estimate delta
if (0 == cp.delta) {
for (int i = 0; i < vs.length; i++) {
cp.delta += vs[i].getAverageDelta();
}
cp.delta /= vs.length;
}
// Resample all:
for (int i = 0; i < vs.length; i++) vs[i].resample(cp.delta, true);
// Trivial case 2:
try {
if (2 == vs.length)
return VectorString3D.createInterpolatedPoints(new Editions(vs[0], vs[1], cp.delta, false), 0.5f);
} catch (final Exception e) {
IJError.print(e);
return null;
}
// Else, do neighbor joining
final float[][] scores = Compare.scoreAllToAll(vs, cp.distance_type, cp.delta, cp.skip_ends, cp.max_mut, cp.min_chunk, cp.direct, cp.substring_matching, worker);
final HashMap<Compare.Cell<VectorString3D>, Float> table = new HashMap<Compare.Cell<VectorString3D>, Float>();
// Put only the half matrix into the table, since it is mirrored, and skip the diagonal of zeros:
for (int i = 1; i < scores.length; i++) {
for (int j = 0; j < i; j++) {
table.put(new Cell<VectorString3D>(vs[i], vs[j]), scores[i][j]);
}
}
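// For n input strings this stores n*(n-1)/2 cells; e.g. with n = 3 only
// (vs[1],vs[0]), (vs[2],vs[0]) and (vs[2],vs[1]) are keyed.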
final HashSet<VectorString3D> remaining = new HashSet<VectorString3D>();
for (final VectorString3D v : vs) remaining.add(v);
while (table.size() > 0) {
if (null != worker && worker.hasQuitted()) {
return null;
}
// find smallest value
float min = Float.MAX_VALUE;
Cell<VectorString3D> cell = null;
for (final Map.Entry<Cell<VectorString3D>, Float> e : table.entrySet()) {
final float f = e.getValue();
if (f < min) {
min = f;
cell = e.getKey();
}
}
//table.remove(cell); // not needed: the removal loop below drops 'cell' along with every other cell that shares one of its members
for (final Iterator<Cell<VectorString3D>> it = table.keySet().iterator(); it.hasNext(); ) {
final Cell<VectorString3D> c = it.next();
if (c.t1 == cell.t1 || c.t2 == cell.t2 || c.t2 == cell.t1 || c.t1 == cell.t2) {
it.remove();
}
}
// pop the two merged VectorString3D
remaining.remove(cell.t1);
remaining.remove(cell.t2);
// merge, weighted by number of sources of each
// Note: in createInterpolatedPoints the alpha works the opposite of what one would expect: an alpha of 0.2 means a weight of 0.8 for the first string and 0.2 for the second, so the intuitive weight must be passed as 1 - alpha.
final double alpha = (double) (cell.t1.getNSources()) / (double) (cell.t1.getNSources() + cell.t2.getNSources());
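// Worked example: with 3 sources behind cell.t1 and 1 behind cell.t2,
// alpha = 3.0 / (3 + 1) = 0.75; how that weight maps onto each curve follows
// the inverted convention noted above.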
final Editions eds = new Editions(cell.t1, cell.t2, cp.delta, false);
VectorString3D vs_merged = null;
if (cp.cut_uneven_ends) {
// Crop the ends to eliminate runs of insertions or deletions that are interspersed with runs of at most cp.max_mut mutations.
// (This reduces or eliminates the variability noise caused by unequal sequence lengths.)
final int[][] editions = eds.getEditions();
int first = 0;
int last = editions.length - 1;
int n_mut = 0;
for (int i = 0; i < last; i++) {
if (Editions.MUTATION == editions[i][0]) {
n_mut++;
if (n_mut > cp.max_mut) {
first = i - n_mut + 1;
break;
}
}
}
// reset
n_mut = 0;
for (int i = last; i > first; i--) {
if (Editions.MUTATION == editions[i][0]) {
n_mut++;
if (n_mut > cp.max_mut) {
last = i + n_mut - 1;
break;
}
}
}
vs_merged = VectorString3D.createInterpolatedPoints(eds, alpha, first, last);
} else {
vs_merged = VectorString3D.createInterpolatedPoints(eds, alpha);
}
vs_merged.resample(cp.delta, true);
// add a new cell for each possible comparison with all other unique vs
for (final VectorString3D v : remaining) {
final Object[] ob = findBestMatch(vs_merged, v, cp.delta, cp.skip_ends, cp.max_mut, cp.min_chunk, cp.distance_type, cp.direct, cp.substring_matching);
final Editions ed = (Editions) ob[0];
final float score = (float) getScore(ed, cp.skip_ends, cp.max_mut, cp.min_chunk, cp.distance_type);
table.put(new Cell<VectorString3D>(vs_merged, v), score);
}
// add the new VectorString3D
remaining.add(vs_merged);
}
// test:
if (1 != remaining.size()) {
Utils.log2("WARNING: remaining.size() == " + remaining.size());
}
return remaining.iterator().next();
}
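A hedged usage sketch for condense; vs1, vs2 and vs3 are hypothetical VectorString3D instances, already calibrated and transformed as the Javadoc requires (condense is declared 'throws Exception', so call it from a context that handles that):
final CATAParameters cp = new CATAParameters();
cp.delta = 0; // 0 makes condense estimate delta as the mean of the inputs' average deltas
final VectorString3D condensed = Compare.condense(cp, new VectorString3D[] { vs1, vs2, vs3 }, null); // a null Worker is accepted: it is only polled for cancellation
// 'condensed' is the merged, resampled string; per the Javadoc, its source array
// records which input points contribute to each of its points.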