Search in sources :

Example 71 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignSequenceProcessingServiceImpl method findOrUpdateSequences.

/**
 * Stores each retrieved sequence via {@code createOrUpdateGenbankSequence} and indexes the results by accession.
 * <p>
 * As a side effect, the accession of every processed sequence is removed from {@code accessionsToFetch}, so
 * whatever remains in that collection afterwards was not among the retrieved sequences.
 *
 * @param accessionsToFetch  accessions still outstanding; modified in place
 * @param retrievedSequences sequences to process; each one has its taxon overwritten with {@code taxon}
 * @param taxon              taxon assigned to every retrieved sequence before it is stored
 * @param force              If true, if an existing BioSequence that matches is found in the system, any existing
 *                           sequence information in the BioSequence will be overwritten.
 * @return Items that were found, keyed by the accession of their sequence database entry.
 */
private Map<String, BioSequence> findOrUpdateSequences(Collection<String> accessionsToFetch, Collection<BioSequence> retrievedSequences, Taxon taxon, boolean force) {
    Map<String, BioSequence> byAccession = new HashMap<>();
    for (BioSequence retrieved : retrievedSequences) {
        if (ArrayDesignSequenceProcessingServiceImpl.log.isDebugEnabled()) {
            ArrayDesignSequenceProcessingServiceImpl.log.debug("Processing retrieved sequence: " + retrieved);
        }
        retrieved.setTaxon(taxon);
        // From here on, work with the instance handed back by the create-or-update step.
        BioSequence stored = this.createOrUpdateGenbankSequence(retrieved, force);
        String accession = stored.getSequenceDatabaseEntry().getAccession();
        byAccession.put(accession, stored);
        accessionsToFetch.remove(accession);
    }
    return byAccession;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence)

Example 72 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class IlluminaProbeReader method parseOneLine.

/**
 * Parses one tab-delimited line into a {@link Reporter}.
 * <p>
 * The first field is used as the probe id (and reporter name) and the tenth field as the probe sequence.
 *
 * @param line a single tab-delimited line
 * @return the reporter built from the line, or null if the first field starts with "Search"
 * @throws IllegalArgumentException if the line has no fields, an empty probe id, fewer than ten fields, or an
 *                                  empty sequence
 */
@Override
public Reporter parseOneLine(String line) {
    String[] fields = line.split("\t");
    // String.split drops trailing empty fields, so a line made up solely of tabs yields an empty array.
    if (fields.length == 0) {
        throw new IllegalArgumentException("Line format is not valid");
    }

    String probeId = fields[0];
    if (probeId == null || probeId.isEmpty()) {
        throw new IllegalArgumentException("Probe id invalid");
    }
    if (probeId.startsWith("Search")) {
        return null;
    }

    if (fields.length < 10) {
        throw new IllegalArgumentException("Line format is not valid");
    }
    String probeSequence = fields[9];
    if (probeSequence == null || probeSequence.isEmpty()) {
        throw new IllegalArgumentException("Sequence is invalid");
    }

    Reporter reporter = Reporter.Factory.newInstance();
    reporter.setName(probeId);

    BioSequence immobilized = BioSequence.Factory.newInstance();
    immobilized.setSequence(probeSequence);
    reporter.setImmobilizedCharacteristic(immobilized);
    return reporter;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence)

Example 73 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class GeoConverterImpl method createMinimalBioSequence.

/**
 * Creates a new, otherwise empty BioSequence whose polymer type and sequence type are both DNA.
 *
 * @param taxon taxon to set on the new sequence; can be null (it is passed to {@code setTaxon} unchecked)
 * @return the newly created sequence
 */
private BioSequence createMinimalBioSequence(Taxon taxon) {
    BioSequence minimal = BioSequence.Factory.newInstance();
    minimal.setTaxon(taxon);
    minimal.setPolymerType(PolymerType.DNA);
    minimal.setType(SequenceType.DNA);
    return minimal;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence)

Example 74 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class AffyProbeReader method parseOneLine.

/**
 * Parses one tab-delimited probe line into a {@link Reporter} and files it under its probe set in
 * {@code reporterMap} (declared outside this method).
 * <p>
 * Column layout depends on {@code sequenceField}: when it is 4, fields 1-3 hold x, y and the start position;
 * otherwise field 1 holds a probe index, fields 2-3 hold x and y, and the start position is the field just
 * before the sequence. The reporter is named {@code probeSetId[#index]:x:y}.
 *
 * @param line a single tab-delimited line
 * @return all reporters collected so far for the line's probe set, or null if the line is empty, its first field
 * starts with "Probe", or its coordinates or start position cannot be parsed
 * @throws IllegalArgumentException if the line yields no fields or has fewer than {@code sequenceField + 1} fields
 */
@Override
public Collection<Reporter> parseOneLine(String line) {
    if (StringUtils.isEmpty(line)) {
        return null;
    }

    String[] fields = line.split("\t");
    if (fields.length == 0) {
        throw new IllegalArgumentException("Line format is not valid (not tab-delimited or no fields found)");
    }

    String probeSetId = fields[0];
    if (probeSetId.startsWith("Probe")) {
        return null;
    }

    int minimumFields = sequenceField + 1;
    if (fields.length < minimumFields) {
        throw new IllegalArgumentException("Too few fields in line, expected at least " + minimumFields + " but got " + fields.length);
    }

    String probeSequence = fields[sequenceField];
    if (StringUtils.isBlank(probeSequence)) {
        // Only reported; processing of the line continues regardless.
        log.warn("No sequence");
    }

    // With the sequence in column 4 there is no probe-index column, so everything sits one column earlier.
    boolean hasIndexColumn = sequenceField != 4;
    String index = hasIndexColumn ? fields[1] : null;
    String xcoord = hasIndexColumn ? fields[2] : fields[1];
    String ycoord = hasIndexColumn ? fields[3] : fields[2];
    String startInSequence = hasIndexColumn ? fields[sequenceField - 1] : fields[3];

    Reporter reporter = Reporter.Factory.newInstance();
    try {
        reporter.setRow(Integer.parseInt(xcoord));
        reporter.setCol(Integer.parseInt(ycoord));
    } catch (NumberFormatException e) {
        log.warn("Invalid row: could not parse coordinates: " + xcoord + ", " + ycoord);
        return null;
    }

    try {
        reporter.setStartInBioChar(Long.parseLong(startInSequence));
    } catch (NumberFormatException e) {
        if ("---".equals(startInSequence)) {
            /*
             * Controls have no start/end information. We really have to bail on these.
             */
            log.debug("Control sequence");
        } else {
            log.warn("Invalid row: could not parse start in sequence: " + startInSequence);
        }
        return null;
    }

    String indexPart = (index == null) ? "" : "#" + index;
    reporter.setName(probeSetId + indexPart + ":" + xcoord + ":" + ycoord);

    BioSequence immobilized = BioSequence.Factory.newInstance();
    immobilized.setSequence(probeSequence);
    immobilized.setIsApproximateLength(false);
    immobilized.setLength((long) probeSequence.length());
    immobilized.setType(SequenceType.AFFY_PROBE);
    immobilized.setPolymerType(PolymerType.DNA);
    reporter.setImmobilizedCharacteristic(immobilized);

    CompositeSequence probeSet = CompositeSequence.Factory.newInstance();
    probeSet.setName(probeSetId);
    if (!reporterMap.containsKey(probeSet)) {
        reporterMap.put(probeSet, new HashSet<Reporter>());
    }
    reporter.setCompositeSequence(probeSet);

    Collection<Reporter> probeSetReporters = reporterMap.get(probeSet);
    probeSetReporters.add(reporter);
    return probeSetReporters;
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) CompositeSequence(ubic.gemma.model.expression.designElement.CompositeSequence)

Example 75 with BioSequence

use of ubic.gemma.model.genome.biosequence.BioSequence in project Gemma by PavlidisLab.

the class ArrayDesignMergeServiceImpl method merge.

/**
 * Merges {@code arrayDesign} with the given other array designs.
 * <p>
 * A map of biosequence to design elements is accumulated across all the designs via {@code makeBioSeqMap}, and
 * the actual merged design is then produced by {@code createMerged}.
 *
 * @param arrayDesign          the first design; must not itself have been merged into another design
 * @param otherArrayDesigns    designs to merge with it; must not be empty, and none may already have been merged
 *                             into another design. An entry equal to {@code arrayDesign} is skipped.
 * @param nameOfNewDesign      passed through to {@code createMerged}
 * @param shortNameOfNewDesign passed through to {@code createMerged}
 * @param add                  passed through to {@code createMerged}; only allowed when {@code arrayDesign}
 *                             already has mergees (i.e. is itself a merged design)
 * @return the result of {@code createMerged}
 * @throws IllegalArgumentException if {@code otherArrayDesigns} is empty, if any involved design is already a
 *                                  mergee, or if {@code add} is requested for a design that has no mergees
 */
@Override
public ArrayDesign merge(ArrayDesign arrayDesign, Collection<ArrayDesign> otherArrayDesigns, String nameOfNewDesign, String shortNameOfNewDesign, boolean add) {
    if (otherArrayDesigns.isEmpty()) {
        throw new IllegalArgumentException("Must merge at least one array design");
    }
    /*
     * We allow merging of, or into, an already merged design, but array designs can't be merged into more than one.
     */
    if (arrayDesign.getMergedInto() != null) {
        throw new IllegalArgumentException("Sorry, can't merge an array design that is already a mergee (" + arrayDesign + ")");
    }
    // 'add' only makes sense for a design that already has mergees, i.e. one that is itself a merged design.
    if (add && arrayDesign.getMergees().isEmpty()) {
        throw new IllegalArgumentException("Can't use 'add' when arrayDesign isn't already a merged design (" + arrayDesign + ")");
    }
    // make map of biosequence -> design elements for all the array designs. But watch out for biosequences that
    // appear more than once per array design.
    Map<BioSequence, Collection<CompositeSequence>> globalBsMap = new HashMap<>();
    ArrayDesign thawed = this.makeBioSeqMap(globalBsMap, arrayDesign);
    ArrayDesignMergeServiceImpl.log.info(globalBsMap.size() + " sequences in first array design.");
    // Now check the other designs, add slots for additional probes if necessary.
    Collection<ArrayDesign> thawedOthers = new HashSet<>();
    for (ArrayDesign otherArrayDesign : otherArrayDesigns) {
        if (otherArrayDesign.getMergedInto() != null) {
            throw new IllegalArgumentException("Sorry, can't merge an array design that is already a mergee (" + otherArrayDesign + ")");
        }
        if (arrayDesign.equals(otherArrayDesign)) {
            // defensive: never merge a design with itself.
            continue;
        }
        ArrayDesignMergeServiceImpl.log.info("Examining " + otherArrayDesign);
        thawedOthers.add(this.makeBioSeqMap(globalBsMap, otherArrayDesign));
        ArrayDesignMergeServiceImpl.log.info(globalBsMap.size() + " unique sequences encountered in total so far");
    }
    return this.createMerged(thawed, thawedOthers, globalBsMap, nameOfNewDesign, shortNameOfNewDesign, add);
}
Also used : BioSequence(ubic.gemma.model.genome.biosequence.BioSequence) HashMap(java.util.HashMap) ArrayDesign(ubic.gemma.model.expression.arrayDesign.ArrayDesign) Collection(java.util.Collection) HashSet(java.util.HashSet)

Aggregations

BioSequence (ubic.gemma.model.genome.biosequence.BioSequence)105 CompositeSequence (ubic.gemma.model.expression.designElement.CompositeSequence)40 ArrayDesign (ubic.gemma.model.expression.arrayDesign.ArrayDesign)24 Test (org.junit.Test)18 HashSet (java.util.HashSet)17 Taxon (ubic.gemma.model.genome.Taxon)15 BlatResult (ubic.gemma.model.genome.sequenceAnalysis.BlatResult)12 InputStream (java.io.InputStream)11 Collection (java.util.Collection)11 HashMap (java.util.HashMap)10 BaseSpringContextTest (ubic.gemma.core.testing.BaseSpringContextTest)10 GZIPInputStream (java.util.zip.GZIPInputStream)7 Gene (ubic.gemma.model.genome.Gene)7 GeoPlatform (ubic.gemma.core.loader.expression.geo.model.GeoPlatform)6 DatabaseEntry (ubic.gemma.model.common.description.DatabaseEntry)6 StopWatch (org.apache.commons.lang3.time.StopWatch)5 GeneProduct (ubic.gemma.model.genome.gene.GeneProduct)5 BioSequenceValueObject (ubic.gemma.model.genome.sequenceAnalysis.BioSequenceValueObject)5 BlatAssociation (ubic.gemma.model.genome.sequenceAnalysis.BlatAssociation)5 AbstractGeoServiceTest (ubic.gemma.core.loader.expression.geo.AbstractGeoServiceTest)4