Search in sources :

Example 1 with FileFormatException

use of ubic.gemma.core.loader.util.parser.FileFormatException in project Gemma by PavlidisLab.

the class BiomartEnsembleNcbiParser method parseOneLine.

/**
 * Turns a single raw BioMart line into a value object. Ensembl ids and Entrez gene ids are related many-to-many,
 * so the same Ensembl id can legitimately appear on more than one line.
 *
 * @param line one raw line read from the BioMart file
 * @return the value object produced by {@code createBioMartEnsembleNcbi}, or null when the line is the header,
 *         is empty, or does not contain the expected number of fields
 */
@Override
public Ensembl2NcbiValueObject parseOneLine(String line) {
    final int expectedFieldCount = this.getBioMartFieldsPerRow();

    // The header row is recognised by the name of its first column; it and empty lines carry no data.
    final boolean headerRow = line.startsWith(this.bioMartHeaderFields[0]);
    if (headerRow || line.isEmpty()) {
        return null;
    }

    // Keep empty tokens so that the column positions stay aligned with the header.
    final String[] tokens = StringUtils.splitPreserveAllTokens(line, BiomartEnsembleNcbiParser.FIELD_DELIM);
    if (tokens.length != expectedFieldCount) {
        // Rows with an unexpected shape are skipped quietly; earlier versions threw an exception here.
        return null;
    }

    final Ensembl2NcbiValueObject parsed;
    try {
        parsed = this.createBioMartEnsembleNcbi(tokens);
    } catch (NumberFormatException e) {
        throw new FileFormatException(e);
    } catch (FileFormatException e) {
        throw new RuntimeException(e);
    }
    return parsed;
}
Also used : FileFormatException(ubic.gemma.core.loader.util.parser.FileFormatException)

Example 2 with FileFormatException

use of ubic.gemma.core.loader.util.parser.FileFormatException in project Gemma by PavlidisLab.

the class BiomartEnsembleNcbiParser method createBioMartEnsembleNcbi.

/**
 * Builds a value object from one BioMart line that has already been split into fields, and registers the Entrez
 * gene against the line's Ensembl peptide id.
 * <p>
 * The first time a peptide id is seen, the new value object is stored in the results under that id. When the
 * peptide id has been seen before, the line is treated as an additional Entrez gene mapping: the Ensembl gene id
 * must agree with the one stored earlier, and the Entrez gene is added to the previously stored object. Note that
 * the object returned is always the one created for the current line.
 *
 * @param fields the line split on the delimiter; read as gene id, transcript id, entrez gene, peptide id and,
 *               when a fifth header field is configured, HGNC id
 * @return the value object created for this line, or null when the peptide id or the entrez gene is blank
 * @throws NumberFormatException declared for callers; parsing a number that is not one
 * @throws FileFormatException   when a previously seen peptide id arrives with a different Ensembl gene id
 */
// Possible external use
@SuppressWarnings({ "unused", "WeakerAccess" })
public Ensembl2NcbiValueObject createBioMartEnsembleNcbi(String[] fields) throws NumberFormatException, FileFormatException {
    Ensembl2NcbiValueObject valueObject = new Ensembl2NcbiValueObject();
    String ncbiGene = fields[2].trim();
    String peptideId = fields[3].trim();

    // Lines lacking either side of the mapping are of no use and are skipped.
    if (StringUtils.isBlank(peptideId)) {
        if (log.isDebugEnabled()) {
            log.debug("Blank protein id for line: " + StringUtils.join(fields, " "));
        }
        return null;
    }
    if (StringUtils.isBlank(ncbiGene)) {
        log.debug(peptideId + " has no entrez gene mapping");
        return null;
    }

    String geneId = fields[0].trim();
    valueObject.setNcbiTaxonId(taxon.getNcbiId());
    valueObject.setEnsemblGeneId(geneId);
    valueObject.setEnsemblTranscriptId(fields[1]);
    valueObject.setEnsemblPeptideId(peptideId);

    // The fifth column is only configured for some taxa (humans); an empty header name means it is absent.
    boolean hgncColumnConfigured = !bioMartHeaderFields[4].isEmpty();
    if (hgncColumnConfigured && fields[4] != null) {
        valueObject.setHgnc_id(fields[4]);
    }

    // One peptide id may map to several entrez genes, so the genes are accumulated per peptide id.
    if (this.containsKey(peptideId)) {
        Ensembl2NcbiValueObject existing = this.get(peptideId);
        // Sanity check: a repeated peptide id has to belong to the same Ensembl gene as before.
        if (!geneId.equals(existing.getEnsemblGeneId())) {
            throw new FileFormatException("A duplicate ensemblProteinId has been found: " + peptideId + " but it does not match with the exisiting objects gene id " + geneId + ", it was " + existing.getEnsemblGeneId() + ", line was:\n" + StringUtils.join(fields, " "));
        }
        existing.getEntrezgenes().add(ncbiGene);
        if (log.isDebugEnabled()) {
            log.debug(peptideId + "added gene to duplicate  ");
        }
    } else {
        valueObject.getEntrezgenes().add(ncbiGene);
        results.put(peptideId, valueObject);
        if (log.isDebugEnabled()) {
            log.debug(peptideId + " has no existing  entrez gene mapping");
        }
    }
    return valueObject;
}
Also used : Ensembl2NcbiValueObject(ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject) FileFormatException(ubic.gemma.core.loader.util.parser.FileFormatException)

Example 3 with FileFormatException

use of ubic.gemma.core.loader.util.parser.FileFormatException in project Gemma by PavlidisLab.

the class NcbiGeneInfoParser method parseOneLine.

/**
 * Parses one tab-delimited line of an NCBI gene_info file into an {@link NCBIGeneInfo}.
 * <p>
 * Column layout (see ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/README), by zero-based index:
 * 0 tax_id, 1 GeneID, 2 Symbol, 3 LocusTag, 4 Synonyms, 5 dbXrefs (separated by |), 6 chromosome,
 * 7 map_location, 8 description, 9 type_of_gene, 10 Symbol_from_nomenclature_authority,
 * 11 Full_name_from_nomenclature_authority, 12 Nomenclature_status, 13 Other_designations,
 * 14 Modification_date, 15 Feature type. Only columns 0 through 12 are read.
 *
 * @param line one raw line of the gene_info file
 * @return the populated gene info, or null when taxon filtering is enabled and the line's taxon is not among
 *         the configured NCBI taxon ids
 * @throws FileFormatException when the line has an unexpected number of fields, the taxon id is not an
 *                             integer, or a dbXrefs entry cannot be split into database and identifier
 */
@Override
public NCBIGeneInfo parseOneLine(String line) {
    String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');

    // NCBI keeps adding columns at the end. Older layouts with 13 to 15 columns are still accepted for
    // backwards compatibility, since only the first 13 columns are needed.
    int fieldCount = fields.length;
    boolean olderLayout = fieldCount >= 13 && fieldCount <= 15;
    if (fieldCount != NcbiGeneInfoParser.NCBI_GENEINFO_FIELDS_PER_ROW && !olderLayout) {
        throw new FileFormatException("Line '" + line + "' is not in the right format: has " + fieldCount + " fields, expected " + NcbiGeneInfoParser.NCBI_GENEINFO_FIELDS_PER_ROW);
    }

    NCBIGeneInfo geneInfo = new NCBIGeneInfo();
    try {
        // Skip taxa that we don't support.
        int taxonId = Integer.parseInt(fields[0]);
        if (filter && ncbiTaxonIds != null && !ncbiTaxonIds.contains(taxonId)) {
            return null;
        }

        geneInfo.setTaxId(taxonId);
        geneInfo.setGeneId(fields[1]);
        geneInfo.setDefaultSymbol(fields[2]);
        geneInfo.setLocusTag(fields[3]);

        // "-" is the placeholder for "no value" and is not a real synonym.
        for (String synonym : StringUtils.splitPreserveAllTokens(fields[4], '|')) {
            if (synonym.equals("-")) {
                continue;
            }
            geneInfo.addToSynonyms(synonym);
        }

        if (!fields[5].equals("-")) {
            for (String dbXr : StringUtils.splitPreserveAllTokens(fields[5], '|')) {
                String[] dbF = StringUtils.split(dbXr, ':');
                String identifier;
                if (dbF.length == 2) {
                    identifier = dbF[1];
                } else if (dbF.length == 3 && (dbF[1].equals("HGNC") || dbF[1].equals("MGI"))) {
                    /*
                     * HGNC identifiers now have the format HGNC:X where X is an integer, and MGI identifiers
                     * follow the same scheme, so the identifier itself contains the separator. Re-join the
                     * two trailing pieces for this special case.
                     */
                    identifier = dbF[1] + ":" + dbF[2];
                } else {
                    // we're very stringent to avoid data corruption.
                    throw new FileFormatException("Expected 2 fields, got " + dbF.length + " from '" + dbXr + "'");
                }
                geneInfo.addToDbXRefs(dbF[0], identifier);
            }
        }

        geneInfo.setChromosome(fields[6]);
        geneInfo.setMapLocation(fields[7]);
        geneInfo.setDescription(fields[8]);
        geneInfo.setGeneType(NCBIGeneInfo.typeStringToGeneType(fields[9]));
        geneInfo.setSymbolIsFromAuthority(!fields[10].equals("-"));
        geneInfo.setNameIsFromAuthority(!fields[11].equals("-"));

        // The status lives in column 12. The "O" test must read that same column; column 11 holds the
        // full name from the nomenclature authority and would practically never equal "O".
        String statusCode = fields[12];
        NomenclatureStatus nomenclatureStatus;
        if (statusCode.equals("-")) {
            nomenclatureStatus = NomenclatureStatus.UNKNOWN;
        } else if (statusCode.equals("O")) {
            nomenclatureStatus = NomenclatureStatus.OFFICIAL;
        } else {
            nomenclatureStatus = NomenclatureStatus.INTERIM;
        }
        geneInfo.setNomenclatureStatus(nomenclatureStatus);
        // ignore 14th field for now - it stores alternate protein names
        // ignore 15th, modification date
    } catch (NumberFormatException e) {
        throw new FileFormatException(e);
    }
    return geneInfo;
}
Also used : NCBIGeneInfo(ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo) FileFormatException(ubic.gemma.core.loader.util.parser.FileFormatException)

Example 4 with FileFormatException

use of ubic.gemma.core.loader.util.parser.FileFormatException in project Gemma by PavlidisLab.

the class StringProteinProteinInteractionFileParser method createStringProteinProteinInteraction.

/**
 * Creates a StringProteinProteinInteraction value object from one line of a STRING file that has already been
 * split on the delimiter. A typical line has the following format:
 * <pre>
 * 882.DVU0001 882.DVU0002 707 0 0 0 0 0 172 742
 * </pre>
 * 882.DVU0001 and 882.DVU0002 refer to protein 1 and protein 2. The 882 prefix is the NCBI taxon id, the other
 * part is an external id (ensembl). The remaining fields are read, in order, as the neighborhood, gene fusion,
 * co-occurrence, co-expression, experimental, database and text mining scores, followed by the combined score.
 *
 * @param fields Line split on delimiter
 * @return StringProteinProteinInteraction value object, or null when {@code fields} is null, either protein
 *         identifier is null or empty, or the taxon is not among the valid NCBI taxa
 * @throws FileFormatException   when fewer than two fields are supplied, the two proteins carry different taxon
 *                               ids, a score field is not numeric, or fewer than ten fields are supplied
 * @throws NumberFormatException when the taxon prefix of a protein identifier is not an integer
 */
public StringProteinProteinInteraction createStringProteinProteinInteraction(String[] fields) {
    // validate
    if (fields == null) {
        return null;
    }
    if (fields.length < 2) {
        // Report the malformed line as a format problem rather than failing on an array index.
        throw new FileFormatException("Expected two protein identifiers but the line has only " + fields.length + " field(s)");
    }
    if (fields[0] == null || fields[1] == null || fields[0].isEmpty() || fields[1].isEmpty()) {
        return null;
    }
    String[] protein1AndTaxa = StringUtils.split(fields[0], ".");
    int taxonIdProtein1 = Integer.parseInt(protein1AndTaxa[0]);
    String[] protein2AndTaxa = StringUtils.split(fields[1], ".");
    int taxonIdProtein2 = Integer.parseInt(protein2AndTaxa[0]);
    // Check that the two proteins taxa match that is the taxon appended to protein name match
    if (taxonIdProtein1 != taxonIdProtein2) {
        throw new FileFormatException("Protein 1 " + fields[0] + " protein 2  " + fields[1] + " do not contain matching taxons");
    }
    // taxon not supported skip it
    if (!(this.getNcbiValidTaxon()).contains(taxonIdProtein1)) {
        return null;
    }
    // Always store the two proteins in a consistent order, independent of the order in the file; this makes
    // matching much easier later and the hashcode/equality of the value object relies on it. The hash code
    // is used as the primary key because the identifiers are mixed alphanumeric codes.
    int order = Integer.compare(fields[0].hashCode(), fields[1].hashCode());
    if (order == 0) {
        // Distinct identifiers can share a hash code; fall back to the identifiers themselves so that
        // (A, B) and (B, A) still end up in the same order.
        order = fields[0].compareTo(fields[1]);
    }
    StringProteinProteinInteraction stringProteinProteinInteraction;
    if (order < 0) {
        stringProteinProteinInteraction = new StringProteinProteinInteraction(fields[0], fields[1]);
    } else {
        stringProteinProteinInteraction = new StringProteinProteinInteraction(fields[1], fields[0]);
    }
    stringProteinProteinInteraction.setNcbiTaxonId(taxonIdProtein1);
    // validate the line make sure these fields are numeric
    for (int i = 2; i < fields.length; i++) {
        if (!StringUtils.isNumeric(fields[i])) {
            throw new FileFormatException("This line does not contain valid number ");
        }
    }
    // Two identifiers, seven evidence scores and the combined score.
    final int requiredFieldCount = 10;
    if (fields.length < requiredFieldCount) {
        throw new FileFormatException("Expected " + requiredFieldCount + " fields but the line has only " + fields.length);
    }
    stringProteinProteinInteraction.addEvidenceCodeScoreToMap(StringProteinInteractionEvidenceCodeEnum.NEIGHBORHOOD, Integer.valueOf(fields[2]));
    stringProteinProteinInteraction.addEvidenceCodeScoreToMap(StringProteinInteractionEvidenceCodeEnum.GENEFUSION, Integer.valueOf(fields[3]));
    stringProteinProteinInteraction.addEvidenceCodeScoreToMap(StringProteinInteractionEvidenceCodeEnum.COOCCURENCE, Integer.valueOf(fields[4]));
    stringProteinProteinInteraction.addEvidenceCodeScoreToMap(StringProteinInteractionEvidenceCodeEnum.COEXPRESSION, Integer.valueOf(fields[5]));
    stringProteinProteinInteraction.addEvidenceCodeScoreToMap(StringProteinInteractionEvidenceCodeEnum.EXPERIMENTAL, Integer.valueOf(fields[6]));
    stringProteinProteinInteraction.addEvidenceCodeScoreToMap(StringProteinInteractionEvidenceCodeEnum.DATABASE, Integer.valueOf(fields[7]));
    stringProteinProteinInteraction.addEvidenceCodeScoreToMap(StringProteinInteractionEvidenceCodeEnum.TEXTMINING, Integer.valueOf(fields[8]));
    stringProteinProteinInteraction.setCombined_score(Double.valueOf(fields[9]));
    return stringProteinProteinInteraction;
}
Also used : FileFormatException(ubic.gemma.core.loader.util.parser.FileFormatException) StringProteinProteinInteraction(ubic.gemma.core.loader.protein.string.model.StringProteinProteinInteraction)

Aggregations

FileFormatException (ubic.gemma.core.loader.util.parser.FileFormatException)4 NCBIGeneInfo (ubic.gemma.core.loader.genome.gene.ncbi.model.NCBIGeneInfo)1 Ensembl2NcbiValueObject (ubic.gemma.core.loader.protein.biomart.model.Ensembl2NcbiValueObject)1 StringProteinProteinInteraction (ubic.gemma.core.loader.protein.string.model.StringProteinProteinInteraction)1