Search in sources :

Example 21 with LocalFile

use of ubic.gemma.model.common.description.LocalFile in project Gemma by PavlidisLab.

the class BatchInfoParser method matchBioAssaysToRawDataFiles.

/**
 * Matches raw data files to bioassays using the GEO sample accession ({@code GSMnnnn}) that prefixes each file
 * name. GEO names things consistently (??) so this should work, but it is not ideal and only works with GEO.
 * <p>
 * A file is skipped when its name does not start with "GSM", when its name contains one of the extensions .CHP,
 * .DAT, .EXP, .RPT or .TIF (case-insensitive), when nothing follows the accession in the name, or when the
 * accession has no entry in {@code assayAccessions}.
 * <p>
 * When several files map to the same bioassay: an already-registered file whose name contains ".CEL" is always
 * retained; a non-CEL file is silently displaced by a CEL file; otherwise the later file replaces the earlier one
 * and a warning is logged.
 *
 * @param files           candidate raw data files
 * @param assayAccessions bioassays keyed by GSM accession
 * @return map from bioassay to the file selected for it; bioassays without a matching file are absent
 */
private Map<BioAssay, File> matchBioAssaysToRawDataFiles(Collection<LocalFile> files, Map<String, BioAssay> assayAccessions) {
    Pattern regex = Pattern.compile("(GSM[0-9]+).+");
    Map<BioAssay, File> bioAssays2Files = new HashMap<>();
    for (LocalFile file : files) {
        File f = file.asFile();
        String n = f.getName();
        /*
         * We only support the newer style of storing these.
         */
        if (!n.startsWith("GSM")) {
            continue;
        }
        // Upper-case once, with a fixed locale: the default-locale variant maps 'i' to a dotted capital under
        // Turkish/Azeri locales, which would make the ".TIF" check below miss ".tif" files.
        String upperName = n.toUpperCase(java.util.Locale.ROOT);
        if (upperName.contains(".CHP") || upperName.contains(".DAT") || upperName.contains(".EXP")
                || upperName.contains(".RPT") || upperName.contains(".TIF")) {
            continue;
        }
        /*
         * Keep just the GSMNNNNNN part. FIXME: only works with GEO
         */
        Matcher matcher = regex.matcher(n);
        if (!matcher.matches()) {
            continue;
        }
        // Group 1 is "GSM" followed by digits by construction of the pattern.
        String acc = matcher.group(1);
        BioAssay ba = assayAccessions.get(acc);
        if (ba == null) {
            /*
             * Warn? Throw exception?
             */
            continue;
        }
        // Values stored in the map are never null, so a null lookup means "no file registered yet".
        File existing = bioAssays2Files.get(ba);
        if (existing != null) {
            /*
             * Don't clobber a valid file. For affymetrix, CEL is what we want. Other cases harder to predict, but
             * .txt files can be either good or bad. (We could do this check earlier)
             */
            if (existing.getName().toUpperCase(java.util.Locale.ROOT).contains(".CEL")) {
                BatchInfoParser.log.debug("Retaining CEL file, ignoring " + f.getName());
                continue;
            }
            // A CEL file displaces the old file without a warning; anything else is worth flagging.
            if (!upperName.contains(".CEL")) {
                BatchInfoParser.log.warn("Multiple files matching " + ba + ": " + existing + "; using new file: " + f);
            }
        }
        bioAssays2Files.put(ba, f);
    }
    return bioAssays2Files;
}
Also used : Pattern(java.util.regex.Pattern) LocalFile(ubic.gemma.model.common.description.LocalFile) Matcher(java.util.regex.Matcher) File(java.io.File) LocalFile(ubic.gemma.model.common.description.LocalFile) BioAssay(ubic.gemma.model.expression.bioAssay.BioAssay)

Example 22 with LocalFile

use of ubic.gemma.model.common.description.LocalFile in project Gemma by PavlidisLab.

the class DataUpdater method reprocessAffyThreePrimeArrayData.

/**
 * Replaces the existing raw data of an experiment with vectors computed from its raw (CEL) files. The files are
 * fetched using the experiment's accession, and each platform used by the experiment is processed with the CDF
 * located by {@code findCdf}. The same quantitation type is used for every platform.
 * <p>
 * Post-processing is only run when the experiment uses exactly one platform.
 *
 * @param ee the experiment to reprocess
 * @return the experiment as returned after its raw vectors were replaced
 * @throws RuntimeException      if no raw data files could be fetched
 * @throws IllegalStateException if processing produced no vectors
 */
// Possible external use
@SuppressWarnings("UnusedReturnValue")
public ExpressionExperiment reprocessAffyThreePrimeArrayData(ExpressionExperiment ee) {
    Collection<ArrayDesign> arrayDesignsUsed = this.experimentService.getArrayDesignsUsed(ee);
    ee = experimentService.thawLite(ee);
    RawDataFetcher f = new RawDataFetcher();
    Collection<LocalFile> files = f.fetch(ee.getAccession().getAccession());
    // Treat a null result like an empty one so a failed fetch is reported as such rather than as a bare NPE.
    if (files == null || files.isEmpty()) {
        throw new RuntimeException("Data was apparently not available");
    }
    Collection<RawExpressionDataVector> vectors = new HashSet<>();
    // Use the same QT for each one
    QuantitationType qt = AffyPowerToolsProbesetSummarize.makeAffyQuantitationType();
    qt = quantitationTypeService.create(qt);
    for (ArrayDesign ad : arrayDesignsUsed) {
        DataUpdater.log.info("Processing data for " + ad);
        String cdfFileName = this.findCdf(ad).getAbsolutePath();
        ad = arrayDesignService.thaw(ad);
        AffyPowerToolsProbesetSummarize apt = new AffyPowerToolsProbesetSummarize(qt);
        vectors.addAll(apt.processThreeprimeArrayData(ee, cdfFileName, ad, files));
    }
    if (vectors.isEmpty()) {
        throw new IllegalStateException("No vectors were returned for " + ee);
    }
    ee = experimentService.replaceRawVectors(ee, vectors);
    this.audit(ee, "Data vector computation from CEL files using AffyPowerTools for " + StringUtils.join(arrayDesignsUsed, "; "), true);
    if (arrayDesignsUsed.size() == 1) {
        this.postprocess(ee);
    } else {
        DataUpdater.log.warn("Skipping postprocessing for mult-platform experiment");
    }
    return ee;
}
Also used : ArrayDesign(ubic.gemma.model.expression.arrayDesign.ArrayDesign) AffyPowerToolsProbesetSummarize(ubic.gemma.core.loader.expression.AffyPowerToolsProbesetSummarize) LocalFile(ubic.gemma.model.common.description.LocalFile) RawExpressionDataVector(ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector) RawDataFetcher(ubic.gemma.core.loader.expression.geo.fetcher.RawDataFetcher)

Example 23 with LocalFile

use of ubic.gemma.model.common.description.LocalFile in project Gemma by PavlidisLab.

the class GeoDomainObjectGenerator method processPlatform.

/**
 * Fetches the file for a single GEO platform, parses it in platforms-only mode and returns the parsed platform.
 *
 * @param geoAccession accession of the platform to fetch and parse
 * @return the entry for {@code geoAccession} in the platform map of the first parse result; this is whatever the
 *         map holds for that key, so it may be null if the parser did not register the accession
 * @throws RuntimeException if no platform file was fetched, or if parsing fails with an I/O error (the
 *                          {@link IOException} is preserved as the cause)
 */
private GeoPlatform processPlatform(String geoAccession) {
    assert platformFetcher != null;
    Collection<LocalFile> platforms = platformFetcher.fetch(geoAccession);
    // An empty result would otherwise fail on iterator().next() with no hint of which accession was involved.
    if (platforms == null || platforms.isEmpty()) {
        throw new RuntimeException("No platform file found for " + geoAccession);
    }
    LocalFile platformFile = platforms.iterator().next();
    String platformPath = platformFile.getLocalURL().getPath();
    parser.setProcessPlatformsOnly(true);
    try {
        parser.parse(platformPath);
    } catch (IOException e1) {
        throw new RuntimeException(e1);
    }
    return ((GeoParseResult) parser.getResults().iterator().next()).getPlatformMap().get(geoAccession);
}
Also used : LocalFile(ubic.gemma.model.common.description.LocalFile) IOException(java.io.IOException)

Example 24 with LocalFile

use of ubic.gemma.model.common.description.LocalFile in project Gemma by PavlidisLab.

the class GeoDomainObjectGenerator method fetchDataSetToLocalFile.

/**
 * Fetches a GEO data set and reports where its single file was stored.
 *
 * @param geoDataSetAccession accession of the data set to fetch
 * @return path component of the fetched file's local URL, or null when the fetcher returned null
 * @throws IllegalStateException if the fetcher returned anything other than exactly one file
 */
private String fetchDataSetToLocalFile(String geoDataSetAccession) {
    Collection<LocalFile> fetched = datasetFetcher.fetch(geoDataSetAccession);
    if (fetched == null) {
        return null;
    }
    int fileCount = fetched.size();
    if (fileCount != 1) {
        throw new IllegalStateException("Got " + fileCount + " files for " + geoDataSetAccession + ", expected only one.");
    }
    // Exactly one element is present, so next() is safe here.
    return fetched.iterator().next().getLocalURL().getPath();
}
Also used : LocalFile(ubic.gemma.model.common.description.LocalFile)

Example 25 with LocalFile

use of ubic.gemma.model.common.description.LocalFile in project Gemma by PavlidisLab.

the class GeoDomainObjectGenerator method processSeriesPlatforms.

/**
 * Fetches the file for a GEO series, parses it (honouring this generator's platforms-only setting) and returns
 * the platforms found by the parser.
 *
 * @param seriesAccession series accession
 * @return the values of the platform map of the first parse result
 * @throws RuntimeException if no series file was fetched, or if parsing fails with an I/O error (the
 *                          {@link IOException} is preserved as the cause)
 */
private Collection<GeoPlatform> processSeriesPlatforms(String seriesAccession) {
    Collection<LocalFile> fullSeries = seriesFetcher.fetch(seriesAccession);
    // An empty result would otherwise fail on iterator().next() with no hint of which accession was involved.
    if (fullSeries == null || fullSeries.isEmpty()) {
        throw new RuntimeException("No series file found for " + seriesAccession);
    }
    LocalFile seriesFile = fullSeries.iterator().next();
    String seriesPath = seriesFile.getLocalURL().getPath();
    parser.setProcessPlatformsOnly(this.processPlatformsOnly);
    try {
        parser.parse(seriesPath);
    } catch (IOException e1) {
        throw new RuntimeException(e1);
    }
    return ((GeoParseResult) parser.getResults().iterator().next()).getPlatformMap().values();
}
Also used : LocalFile(ubic.gemma.model.common.description.LocalFile) IOException(java.io.IOException)

Aggregations

LocalFile (ubic.gemma.model.common.description.LocalFile) 40
File (java.io.File) 17
IOException (java.io.IOException) 11
MalformedURLException (java.net.MalformedURLException) 4
HashSet (java.util.HashSet) 4
ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign) 4
BioAssay (ubic.gemma.model.expression.bioAssay.BioAssay) 4
URL (java.net.URL) 3
SimpleDateFormat (java.text.SimpleDateFormat) 3
Date (java.util.Date) 3
BioMaterial (ubic.gemma.model.expression.biomaterial.BioMaterial) 3
Taxon (ubic.gemma.model.genome.Taxon) 3
StopWatch (org.apache.commons.lang3.time.StopWatch) 2
AffyPowerToolsProbesetSummarize (ubic.gemma.core.loader.expression.AffyPowerToolsProbesetSummarize) 2
RawDataFetcher (ubic.gemma.core.loader.expression.geo.fetcher.RawDataFetcher) 2
HttpFetcher (ubic.gemma.core.loader.util.fetcher.HttpFetcher) 2
RawExpressionDataVector (ubic.gemma.model.expression.bioAssayData.RawExpressionDataVector) 2
FileNotFoundException (java.io.FileNotFoundException) 1
InputStream (java.io.InputStream) 1
URISyntaxException (java.net.URISyntaxException) 1