Search in sources :

Example 1 with DataSource

Use of org.gnf.clustering.DataSource in the mzmine2 project by mzmine.

In the class HierarAlignerGCTask, method run.

/**
 * Aligns the rows of all source feature lists into one new feature list and adds it to the
 * project.
 *
 * <p>Steps, in order:
 * <ol>
 * <li>validate the weights and make sure no raw data file appears in two source lists;</li>
 * <li>create the aligned feature list over all raw data files;</li>
 * <li>fill a triangular row-vs-row distance matrix (when one is kept in memory);</li>
 * <li>build the clustering tree with the cached sequential clusterer, sort its nodes and
 * extract the validated clusters;</li>
 * <li>optionally export the dendrogram as {@code .cdt} / {@code .gtr} files;</li>
 * <li>create one aligned row per cluster, copying peaks and identities from the clustered
 * rows, then refresh every aligned row.</li>
 * </ol>
 *
 * <p>The task ends in {@code TaskStatus.ERROR} when both weights are (near) zero, when a raw
 * data file is shared by several source lists, when no clustering tree could be built, or when
 * a row reports a raw data file but has no peak for it. The task returns silently once it is
 * canceled.
 *
 * @throws IllegalStateException if the configured clusterer is {@code CLASSIC} or
 *         {@code HYBRID}; neither is implemented
 * @see Runnable#run()
 */
public void run() {
    // Check options validity
    if ((Math.abs(mzWeight) < EPSILON) && (Math.abs(rtWeight) < EPSILON)) {
        setStatus(TaskStatus.ERROR);
        setErrorMessage("Cannot run alignment, all the weight parameters are zero!");
        return;
    }
    setStatus(TaskStatus.PROCESSING);
    logger.info("Running join aligner");

    // Always taken (it is cheap) so that the DEBUG report at the end never depends on DEBUG
    // being a compile-time constant to satisfy definite assignment.
    final long startTime = System.currentTimeMillis();

    // Memory bookkeeping, only reported when DEBUG is set
    Runtime run_time = Runtime.getRuntime();
    Long prevTotal = 0L;
    Long prevFree = run_time.freeMemory();
    if (DEBUG) {
        printMemoryUsage(logger, run_time, prevTotal, prevFree, "START TASK...");
    }

    // Every source row is counted three times in the progress total (the original note names
    // the third pass as the actual alignment).
    for (PeakList peakList : peakLists) {
        totalRows += peakList.getNumberOfRows() * 3;
    }

    // Collect all data files
    List<RawDataFile> allDataFiles = new ArrayList<>();
    for (PeakList peakList : peakLists) {
        for (RawDataFile dataFile : peakList.getRawDataFiles()) {
            // Each data file can only have one column in aligned feature list
            if (allDataFiles.contains(dataFile)) {
                setStatus(TaskStatus.ERROR);
                setErrorMessage("Cannot run alignment, because file " + dataFile + " is present in multiple feature lists");
                return;
            }
            allDataFiles.add(dataFile);
        }
    }

    // Create a new aligned feature list
    alignedPeakList = new SimplePeakList(peakListName, allDataFiles.toArray(new RawDataFile[0]));
    if (DEBUG) {
        printMemoryUsage(logger, run_time, prevTotal, prevFree, "COMPOUND DETECTED");
    }

    /*
     * Alignment mapping
     */
    // Clustering is order independent, so the source lists are simply used in their given
    // order.
    int nbPeaks = 0;
    for (PeakList peakList : peakLists) {
        nbPeaks += peakList.getNumberOfRows();
    }

    // A full distance matrix is kept in memory for the 'Hybrid' clusterer, or whenever the
    // "save RAM rather than CPU" option (1) is off.
    final boolean useDistanceMatrix = (CLUSTERER_TYPE == ClustererType.HYBRID) || !saveRAMratherThanCPU_1;
    DistanceMatrix distancesGNF_Tri = null;
    DistanceMatrix distancesGNF_Tri_Bkp = null;
    if (useDistanceMatrix) {
        distancesGNF_Tri = new DistanceMatrixTriangular1D2D(nbPeaks);
    }

    // Flatten all source rows into one list; matrix and cluster indices refer to positions
    // in this list.
    full_rows_list = new ArrayList<>();
    for (PeakList peakList : peakLists) {
        for (PeakListRow row : peakList.getRows()) {
            full_rows_list.add(row);
        }
    }

    RowVsRowDistanceProvider distProvider = new RowVsRowDistanceProvider(project, full_rows_list, mzWeight, rtWeight, maximumScore);

    // Fill the upper triangle (diagonal included) of the distance matrix
    if (useDistanceMatrix) {
        for (int x = 0; x < nbPeaks; ++x) {
            // This loop is quadratic in the number of rows: honour cancellation here too
            if (isCanceled()) {
                return;
            }
            for (int y = x; y < nbPeaks; ++y) {
                float dist = (float) distProvider.getRankedDistance(x, y, mzTolerance.getMzTolerance(), rtTolerance.getTolerance(), minScore);
                distancesGNF_Tri.setValue(x, y, dist);
            }
            processedRows++;
            if (DEBUG) {
                logger.info("Treating lists: " + (Math.round(100 * processedRows / (double) nbPeaks)) + " %");
            }
        }
    }
    if (DEBUG) {
        printMemoryUsage(logger, run_time, prevTotal, prevFree, "DISTANCES COMPUTED");
    }

    double max_dist = maximumScore;
    boolean do_verbose = true;
    boolean do_print = (exportDendrogramAsTxt);
    boolean do_data = false;
    org.gnf.clustering.Node[] arNodes = null;
    int nRowCount = full_rows_list.size();

    // Leaf labels for the exported dendrogram: "@<rt>^[<file name up to its first space>]"
    String[] rowNames = null;
    if (do_print) {
        rowNames = new String[nRowCount];
        for (int i = 0; i < nRowCount; i++) {
            Feature peak = full_rows_list.get(i).getBestPeak();
            double rt = peak.getRT();
            String fileName = peak.getDataFile().getName();
            int end = fileName.indexOf(" ");
            // A name without any space is used as a whole (substring(0, -1) would throw)
            String short_fname = (end < 0) ? fileName : fileName.substring(0, end);
            rowNames[i] = "@" + rtFormat.format(rt) + "^[" + short_fname + "]";
        }
    }

    if (CLUSTERER_TYPE == ClustererType.CLASSIC) {
        // Pure Hierar!
        throw new IllegalStateException("'" + ClustererType.CLASSIC.toString() + "' algorithm not yet implemented!");
    } else if (CLUSTERER_TYPE == ClustererType.CACHED) {
        // Without a distance matrix there is nothing to cluster here; arNodes then stays
        // null and is reported as an error just below.
        if (distancesGNF_Tri != null) {
            if (DEBUG_2) {
                logger.info(distancesGNF_Tri.toString());
            }
            if (!saveRAMratherThanCPU_2) {
                // Back up the distance matrix: per the original author's note, clusterDM()
                // alters it so deeply that it is no longer exploitable afterwards. With the
                // "save RAM" option (2) no backup is kept and the backup reference stays
                // null when handed to getValidatedClusters_3().
                distancesGNF_Tri_Bkp = new DistanceMatrixTriangular1D2D(distancesGNF_Tri);
                if (DEBUG) {
                    printMemoryUsage(logger, run_time, prevTotal, prevFree, "GNF CLUSTERER BACKUP MATRIX");
                }
            }
            if (DEBUG) {
                logger.info("Clustering...");
            }
            arNodes = org.gnf.clustering.sequentialcache.SequentialCacheClustering.clusterDM(distancesGNF_Tri, linkageStartegyType, null, nRowCount);
            // Drop the working matrix as early as possible. It must not be touched past
            // this point.
            distancesGNF_Tri = null;
            System.gc();
            if (DEBUG) {
                printMemoryUsage(logger, run_time, prevTotal, prevFree, "GNF CLUSTERER DONE");
            }
            if (DEBUG_2 && arNodes != null) {
                for (int i = 0; i < arNodes.length; i++) {
                    logger.info("Node " + i + ": " + arNodes[i]);
                }
            }
        }
    } else if (CLUSTERER_TYPE == ClustererType.HYBRID) {
        throw new IllegalStateException("'" + ClustererType.HYBRID.toString() + "' algorithm not yet implemented!");
    }

    // Everything below needs the clustering tree; fail explicitly rather than with a
    // NullPointerException further down.
    if (arNodes == null) {
        setStatus(TaskStatus.ERROR);
        setErrorMessage("Cannot run alignment, the '" + CLUSTERER_TYPE + "' clusterer produced no tree (distance matrix missing?)");
        return;
    }

    // Sort Nodes by correlation score (Required in 'getValidatedClusters_3')
    int[] rowOrder = new int[nRowCount];
    if (DEBUG) {
        logger.info("Sorting tree nodes...");
    }
    org.gnf.clustering.Utils.NodeSort(arNodes, nRowCount - 2, 0, rowOrder);

    List<List<Integer>> gnfClusters = getValidatedClusters_3(arNodes, 0.0f, peakLists.length, max_dist, distancesGNF_Tri_Bkp, distProvider);
    if (DEBUG_2 && do_verbose) {
        for (List<Integer> cl : gnfClusters) {
            StringBuilder str = new StringBuilder();
            for (int r : cl) {
                str.append(r).append("^(").append(full_rows_list.get(r).getID()).append(", ")
                        .append(full_rows_list.get(r).getAverageRT()).append(")").append(" ");
            }
            logger.info(str.toString());
        }
    }

    // File output (only when the dendrogram export was requested, so the output file is not
    // touched otherwise)
    if (do_print) {
        String outputPath = dendrogramTxtFilename.getAbsolutePath();
        int ext_pos = outputPath.lastIndexOf(".");
        int sep_pos = Math.max(outputPath.lastIndexOf('/'), outputPath.lastIndexOf('\\'));
        // Strip the extension only if the last dot belongs to the file name itself, not to
        // a parent directory; a path without extension is used as is.
        String outputPrefix = (ext_pos > sep_pos) ? outputPath.substring(0, ext_pos) : outputPath;
        String outGtr = outputPrefix + ".gtr";
        String outCdt = outputPrefix + ".cdt";
        if (DEBUG) {
            logger.info("Writing output to file...");
        }
        int nColCount = 1;
        String[] colNames = new String[nColCount];
        colNames[nColCount - 1] = "Id";
        String sep = "\t";
        try {
            // No data columns are exported unless do_data is switched on
            DataSource source = null;
            if (do_data) {
                float[] arFloats = new float[nRowCount];
                for (int i = 0; i < arFloats.length; i++) {
                    arFloats[i] = i / 2.0f;
                }
                source = new FloatSource1D(arFloats, nRowCount, nColCount);
            }
            HierarAlignerGCTask.GenerateCDT(outCdt, source, nRowCount, nColCount, sep, rowNames, colNames, rowOrder);
        } catch (IOException e) {
            // The export is best effort: report the failure and carry on with the alignment
            logger.info("Could not write dendrogram file '" + outCdt + "': " + e);
        }
        org.gnf.clustering.Utils.WriteTreeToFile(outGtr, nRowCount - 1, arNodes, true);
        if (DEBUG) {
            printMemoryUsage(logger, run_time, prevTotal, prevFree, "GNF CLUSTERER FILES PRINTED");
        }
    }

    // Build feature list row clusters: turn row indices back into rows
    List<List<PeakListRow>> clustersList = new ArrayList<>();
    for (List<Integer> cl : gnfClusters) {
        List<PeakListRow> rows_cluster = new ArrayList<>(cl.size());
        for (int rowIndex : cl) {
            rows_cluster.add(full_rows_list.get(rowIndex));
        }
        clustersList.add(rows_cluster);
        processedRows += rows_cluster.size();
    }
    if (DEBUG) {
        printMemoryUsage(logger, run_time, prevTotal, prevFree, "GNF CLUSTERER CLUSTER_LIST");
    }

    // Fill alignment table: One row per cluster
    List<SimplePeakListRow> alignedRows = new ArrayList<>(clustersList.size());
    for (List<PeakListRow> cluster : clustersList) {
        if (isCanceled()) {
            return;
        }
        SimplePeakListRow targetRow = new SimplePeakListRow(newRowID);
        newRowID++;
        alignedPeakList.addRow(targetRow);
        alignedRows.add(targetRow);
        for (PeakListRow row : cluster) {
            // Set the preferred identity (the last row of the cluster wins)
            targetRow.setPreferredPeakIdentity(row.getPreferredPeakIdentity());
            // Copy the row's peak for each aligned data file that the row covers
            List<RawDataFile> rowFiles = Arrays.asList(row.getRawDataFiles());
            for (RawDataFile file : alignedPeakList.getRawDataFiles()) {
                if (!rowFiles.contains(file)) {
                    continue;
                }
                Feature originalPeak = row.getPeak(file);
                if (originalPeak == null) {
                    setStatus(TaskStatus.ERROR);
                    setErrorMessage("Cannot run alignment, no originalPeak");
                    return;
                }
                targetRow.addPeak(file, originalPeak);
            }
            // Add all identities from the original row that the aligned row does not hold
            // yet
            for (PeakIdentity identity : row.getPeakIdentities()) {
                PeakIdentity clonedIdentity = (PeakIdentity) identity.clone();
                if (!PeakUtils.containsIdentity(targetRow, clonedIdentity)) {
                    targetRow.addPeakIdentity(clonedIdentity, false);
                }
            }
        }
    }

    /*
     * Post-processing...
     */
    for (SimplePeakListRow targetRow : alignedRows) {
        if (isCanceled()) {
            return;
        }
        // Refresh averaged RTs...
        targetRow.update();
    }

    if (DEBUG) {
        float ms = System.currentTimeMillis() - startTime;
        logger.info("## >> Whole JoinAlignerGCTask processing took " + Float.toString(ms) + " ms.");
    }
    // ----------------------------------------------------------------------
    // Add new aligned feature list to the project
    this.project.addPeakList(alignedPeakList);
    if (DEBUG) {
        for (RawDataFile rdf : alignedPeakList.getRawDataFiles()) {
            logger.info("RDF: " + rdf);
        }
    }
    // Add task description to peakList
    alignedPeakList.addDescriptionOfAppliedTask(new SimplePeakListAppliedMethod(HierarAlignerGCTask.TASK_NAME, parameters));
    logger.info("Finished join aligner GC");
    setStatus(TaskStatus.FINISHED);
}
Also used : ArrayList(java.util.ArrayList) SimplePeakListAppliedMethod(net.sf.mzmine.datamodel.impl.SimplePeakListAppliedMethod) Feature(net.sf.mzmine.datamodel.Feature) SimpleFeature(net.sf.mzmine.datamodel.impl.SimpleFeature) SimplePeakListRow(net.sf.mzmine.datamodel.impl.SimplePeakListRow) SimplePeakList(net.sf.mzmine.datamodel.impl.SimplePeakList) PeakList(net.sf.mzmine.datamodel.PeakList) ArrayList(java.util.ArrayList) List(java.util.List) SimplePeakList(net.sf.mzmine.datamodel.impl.SimplePeakList) DistanceMatrix(org.gnf.clustering.DistanceMatrix) Vector(java.util.Vector) SimpleFeature(net.sf.mzmine.datamodel.impl.SimpleFeature) DataSource(org.gnf.clustering.DataSource) PeakIdentity(net.sf.mzmine.datamodel.PeakIdentity) RawDataFile(net.sf.mzmine.datamodel.RawDataFile) SimplePeakListRow(net.sf.mzmine.datamodel.impl.SimplePeakListRow) PeakListRow(net.sf.mzmine.datamodel.PeakListRow) FloatSource1D(org.gnf.clustering.FloatSource1D) Hashtable(java.util.Hashtable) IOException(java.io.IOException) SimplePeakList(net.sf.mzmine.datamodel.impl.SimplePeakList) PeakList(net.sf.mzmine.datamodel.PeakList)

Aggregations

IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 Hashtable (java.util.Hashtable)1 List (java.util.List)1 Vector (java.util.Vector)1 Feature (net.sf.mzmine.datamodel.Feature)1 PeakIdentity (net.sf.mzmine.datamodel.PeakIdentity)1 PeakList (net.sf.mzmine.datamodel.PeakList)1 PeakListRow (net.sf.mzmine.datamodel.PeakListRow)1 RawDataFile (net.sf.mzmine.datamodel.RawDataFile)1 SimpleFeature (net.sf.mzmine.datamodel.impl.SimpleFeature)1 SimplePeakList (net.sf.mzmine.datamodel.impl.SimplePeakList)1 SimplePeakListAppliedMethod (net.sf.mzmine.datamodel.impl.SimplePeakListAppliedMethod)1 SimplePeakListRow (net.sf.mzmine.datamodel.impl.SimplePeakListRow)1 DataSource (org.gnf.clustering.DataSource)1 DistanceMatrix (org.gnf.clustering.DistanceMatrix)1 FloatSource1D (org.gnf.clustering.FloatSource1D)1