Use of org.gnf.clustering.DistanceMatrix in project mzmine2 by mzmine.
Class HierarAlignerGCTask, method run.
/**
 * Aligns the rows of all input feature lists into one new feature list by
 * hierarchical clustering.
 * <p>
 * Phases, in order:
 * <ol>
 * <li>Validate the m/z and RT weights and check that no raw data file appears in
 * more than one input feature list.</li>
 * <li>Fill a triangular distance matrix with the ranked distance of every pair of
 * rows (skipped when the matrix is not allocated).</li>
 * <li>Cluster the matrix (only the {@code CACHED} clusterer is implemented), sort
 * the resulting tree nodes and extract the validated clusters.</li>
 * <li>Optionally export the dendrogram as {@code .cdt} / {@code .gtr} files.</li>
 * <li>Create one aligned row per cluster, copy peaks and identities into it, and
 * add the aligned feature list to the project.</li>
 * </ol>
 * Failures detected by this method are reported through
 * {@code setStatus(TaskStatus.ERROR)} and {@code setErrorMessage(...)}; a
 * cancelled task returns without changing its status.
 *
 * @throws IllegalStateException if the configured clusterer type is
 *         {@code CLASSIC} or {@code HYBRID}, which are not implemented
 * @see Runnable#run()
 */
public void run() {
    // Check options validity: with both weights (near) zero nothing can be scored.
    if ((Math.abs(mzWeight) < EPSILON) && (Math.abs(rtWeight) < EPSILON)) {
        setStatus(TaskStatus.ERROR);
        setErrorMessage("Cannot run alignment, all the weight parameters are zero!");
        return;
    }
    setStatus(TaskStatus.PROCESSING);
    logger.info("Running join aligner");

    // Timing: only reported when DEBUG is on.
    final long startTime = System.currentTimeMillis();

    // Memory bookkeeping handed to printMemoryUsage() in DEBUG mode.
    Runtime runtime = Runtime.getRuntime();
    Long prevTotal = 0L;
    Long prevFree = runtime.freeMemory();
    if (DEBUG)
        printMemoryUsage(logger, runtime, prevTotal, prevFree, "START TASK...");

    // Progress total: every input row is counted three times.
    // NOTE(review): presumably one share per processing phase - confirm against
    // the progress getter.
    for (PeakList peakList : peakLists) {
        totalRows += peakList.getNumberOfRows() * 3;
    }

    // Collect all data files
    List<RawDataFile> allDataFiles = new ArrayList<RawDataFile>();
    for (PeakList peakList : peakLists) {
        for (RawDataFile dataFile : peakList.getRawDataFiles()) {
            // Each data file can only have one column in aligned feature list
            if (allDataFiles.contains(dataFile)) {
                setStatus(TaskStatus.ERROR);
                setErrorMessage("Cannot run alignment, because file " + dataFile + " is present in multiple feature lists");
                return;
            }
            allDataFiles.add(dataFile);
        }
    }

    // Create a new aligned feature list
    alignedPeakList = new SimplePeakList(peakListName, allDataFiles.toArray(new RawDataFile[0]));
    if (DEBUG)
        printMemoryUsage(logger, runtime, prevTotal, prevFree, "COMPOUND DETECTED");

    /*
     * Alignment mapping
     */
    // Aligned rows created below; its key set drives the final update() pass.
    Hashtable<PeakListRow, Object[]> infoRowsBackup = new Hashtable<PeakListRow, Object[]>();

    DistanceMatrix distancesGNF_Tri = null;
    DistanceMatrix distancesGNF_Tri_Bkp = null;

    int nbPeaks = 0;
    for (PeakList peakList : peakLists) {
        nbPeaks += peakList.getNumberOfRows();
    }

    // The matrix is only allocated for 'Hybrid', or when RAM is not being saved
    // at the expense of CPU.
    final boolean useDistanceMatrix = (CLUSTERER_TYPE == ClustererType.HYBRID) || !saveRAMratherThanCPU_1;
    if (useDistanceMatrix) {
        distancesGNF_Tri = new DistanceMatrixTriangular1D2D(nbPeaks);
    }

    // Flatten all rows of all lists (in list order) into one indexable list; the
    // clustering works on indices into this list.
    full_rows_list = new ArrayList<>();
    for (PeakList peakList : peakLists) {
        for (PeakListRow row : peakList.getRows()) {
            full_rows_list.add(row);
        }
    }

    RowVsRowDistanceProvider distProvider = new RowVsRowDistanceProvider(project, full_rows_list, mzWeight, rtWeight, maximumScore);

    if (useDistanceMatrix) {
        for (int x = 0; x < nbPeaks; ++x) {
            // This is the quadratic part of the task: honour cancellation once per row.
            if (isCanceled())
                return;
            for (int y = x; y < nbPeaks; ++y) {
                float dist = (float) distProvider.getRankedDistance(x, y, mzTolerance.getMzTolerance(), rtTolerance.getTolerance(), minScore);
                distancesGNF_Tri.setValue(x, y, dist);
            }
            processedRows++;
            if (DEBUG)
                logger.info("Treating lists: " + (Math.round(100 * processedRows / (double) nbPeaks)) + " %");
        }
    }
    if (DEBUG)
        printMemoryUsage(logger, runtime, prevTotal, prevFree, "DISTANCES COMPUTED");

    double max_dist = maximumScore;
    List<List<Integer>> gnfClusters = null;
    final boolean doPrint = exportDendrogramAsTxt;
    // Developer toggle: when false, no data source is handed to GenerateCDT.
    final boolean doData = false;
    org.gnf.clustering.Node[] arNodes = null;
    int nRowCount = full_rows_list.size();

    // Row labels are only needed for the dendrogram export.
    String[] rowNames = null;
    if (doPrint) {
        rowNames = new String[nRowCount];
        for (int i = 0; i < nRowCount; i++) {
            Feature peak = full_rows_list.get(i).getBestPeak();
            double rt = peak.getRT();
            String fileName = peak.getDataFile().getName();
            // Label with the file name up to its first space; a name without any
            // space is used whole (substring(0, -1) would throw).
            int end = fileName.indexOf(" ");
            String shortName = (end >= 0) ? fileName.substring(0, end) : fileName;
            rowNames[i] = "@" + rtFormat.format(rt) + "^[" + shortName + "]";
        }
    }

    if (CLUSTERER_TYPE == ClustererType.CLASSIC) {
        // Pure Hierar!
        throw new IllegalStateException("'" + ClustererType.CLASSIC.toString() + "' algorithm not yet implemented!");
    } else if (CLUSTERER_TYPE == ClustererType.CACHED) {
        if (DEBUG_2 && distancesGNF_Tri != null)
            logger.info(distancesGNF_Tri.toString());
        if (!saveRAMratherThanCPU_2) {
            // Back up the distance matrix: "clusterDM()" changes it deeply, after
            // which it is no longer exploitable. When saving RAM instead, no backup
            // is kept and the distances are recomputed on demand during
            // "getValidatedClusters_3()".
            distancesGNF_Tri_Bkp = new DistanceMatrixTriangular1D2D(distancesGNF_Tri);
            if (DEBUG)
                printMemoryUsage(logger, runtime, prevTotal, prevFree, "GNF CLUSTERER BACKUP MATRIX");
        }
        if (DEBUG)
            logger.info("Clustering...");
        if (distancesGNF_Tri != null) {
            arNodes = org.gnf.clustering.sequentialcache.SequentialCacheClustering.clusterDM(distancesGNF_Tri, linkageStartegyType, null, nRowCount);
            // Dump the post-clustering matrix while the reference is still alive.
            if (DEBUG_2)
                logger.info(distancesGNF_Tri.toString());
        }
        // NOTE(review): when the matrix was never allocated, arNodes stays null
        // here - confirm that the calls below cope with that.
        // Drop the (now consumed) matrix before asking for a collection.
        distancesGNF_Tri = null;
        System.gc();
        if (DEBUG)
            printMemoryUsage(logger, runtime, prevTotal, prevFree, "GNF CLUSTERER DONE");
        if (DEBUG_2 && arNodes != null) {
            for (int i = 0; i < arNodes.length; i++) {
                logger.info("Node " + i + ": " + arNodes[i]);
            }
        }
    } else if (CLUSTERER_TYPE == ClustererType.HYBRID) {
        throw new IllegalStateException("'" + ClustererType.HYBRID.toString() + "' algorithm not yet implemented!");
    }

    // Sort Nodes by correlation score (Required in 'getValidatedClusters_3')
    int[] rowOrder = new int[nRowCount];
    if (DEBUG)
        logger.info("Sorting tree nodes...");
    org.gnf.clustering.Utils.NodeSort(arNodes, nRowCount - 2, 0, rowOrder);

    gnfClusters = getValidatedClusters_3(arNodes, 0.0f, peakLists.length, max_dist, distancesGNF_Tri_Bkp, distProvider);
    if (DEBUG_2) {
        // One log line per cluster: "<index>^(<row id>, <average RT>) ..."
        for (List<Integer> cl : gnfClusters) {
            StringBuilder line = new StringBuilder();
            for (int r : cl) {
                line.append(r).append("^(").append(full_rows_list.get(r).getID()).append(", ").append(full_rows_list.get(r).getAverageRT()).append(")").append(" ");
            }
            logger.info(line.toString());
        }
    }

    // Dendrogram export. The target path is resolved only when the export is
    // enabled.
    if (doPrint) {
        String dendrogramPath = dendrogramTxtFilename.getAbsolutePath();
        // Strip the extension, if any: the dot must lie inside the file name and
        // not be its first character.
        int sepPos = dendrogramPath.lastIndexOf(java.io.File.separatorChar);
        int extPos = dendrogramPath.lastIndexOf(".");
        String outputPrefix = (extPos > sepPos + 1) ? dendrogramPath.substring(0, extPos) : dendrogramPath;
        String outGtr = outputPrefix + ".gtr";
        String outCdt = outputPrefix + ".cdt";
        if (DEBUG)
            logger.info("Writing output to file...");
        int nColCount = 1;
        String[] colNames = { "Id" };
        String sep = "\t";
        try {
            DataSource source = null;
            if (doData) {
                float[] arFloats = new float[nRowCount];
                for (int i = 0; i < arFloats.length; i++) {
                    arFloats[i] = i / 2.0f;
                }
                source = new FloatSource1D(arFloats, nRowCount, nColCount);
            }
            HierarAlignerGCTask.GenerateCDT(outCdt, source, nRowCount, nColCount, sep, rowNames, colNames, rowOrder);
        } catch (IOException e) {
            // The export is best-effort: report the failure and carry on aligning.
            logger.info("Could not write dendrogram CDT file '" + outCdt + "': " + e);
        }
        org.gnf.clustering.Utils.WriteTreeToFile(outGtr, nRowCount - 1, arNodes, true);
        if (DEBUG)
            printMemoryUsage(logger, runtime, prevTotal, prevFree, "GNF CLUSTERER FILES PRINTED");
    }

    // Translate clusters of row indices into clusters of feature list rows.
    List<List<PeakListRow>> clustersList = new ArrayList<>();
    for (List<Integer> cl : gnfClusters) {
        List<PeakListRow> rowsCluster = new ArrayList<>(cl.size());
        for (int rowIndex : cl) {
            rowsCluster.add(full_rows_list.get(rowIndex));
        }
        clustersList.add(rowsCluster);
        processedRows += rowsCluster.size();
    }
    if (DEBUG)
        printMemoryUsage(logger, runtime, prevTotal, prevFree, "GNF CLUSTERER CLUSTER_LIST");

    // Fill alignment table: One row per cluster
    for (List<PeakListRow> cluster : clustersList) {
        if (isCanceled())
            return;
        PeakListRow targetRow = new SimplePeakListRow(newRowID);
        newRowID++;
        alignedPeakList.addRow(targetRow);
        infoRowsBackup.put(targetRow, new Object[] { new HashMap<RawDataFile, Double[]>(), new HashMap<RawDataFile, PeakIdentity>(), new HashMap<RawDataFile, Double>() });
        for (PeakListRow row : cluster) {
            // Set the preferred identity (the last row of the cluster wins).
            targetRow.setPreferredPeakIdentity(row.getPreferredPeakIdentity());
            // Copy the peak of every data file this source row covers.
            for (RawDataFile file : alignedPeakList.getRawDataFiles()) {
                if (Arrays.asList(row.getRawDataFiles()).contains(file)) {
                    Feature originalPeak = row.getPeak(file);
                    if (originalPeak != null) {
                        targetRow.addPeak(file, originalPeak);
                    } else {
                        setStatus(TaskStatus.ERROR);
                        setErrorMessage("Cannot run alignment, no originalPeak");
                        return;
                    }
                }
            }
            // Add all non-existing identities from the original row to the
            // aligned row
            for (PeakIdentity identity : row.getPeakIdentities()) {
                PeakIdentity clonedIdentity = (PeakIdentity) identity.clone();
                if (!PeakUtils.containsIdentity(targetRow, clonedIdentity))
                    targetRow.addPeakIdentity(clonedIdentity, false);
            }
        }
    }

    /*
     * Post-processing...
     */
    for (PeakListRow targetRow : infoRowsBackup.keySet()) {
        if (isCanceled())
            return;
        // Refresh averaged RTs...
        ((SimplePeakListRow) targetRow).update();
    }

    if (DEBUG) {
        long endTime = System.currentTimeMillis();
        float ms = (endTime - startTime);
        logger.info("## >> Whole JoinAlignerGCTask processing took " + Float.toString(ms) + " ms.");
    }

    // ----------------------------------------------------------------------
    // Add new aligned feature list to the project
    this.project.addPeakList(alignedPeakList);
    if (DEBUG) {
        for (RawDataFile rdf : alignedPeakList.getRawDataFiles()) {
            logger.info("RDF: " + rdf);
        }
    }
    // Add task description to peakList
    alignedPeakList.addDescriptionOfAppliedTask(new SimplePeakListAppliedMethod(HierarAlignerGCTask.TASK_NAME, parameters));
    logger.info("Finished join aligner GC");
    setStatus(TaskStatus.FINISHED);
}
Aggregations