use of ubic.gemma.core.loader.util.parser.FileFormatException in project Gemma by PavlidisLab.
the class BiomartEnsembleNcbiParser method parseOneLine.
/**
 * Parses a single BioMart line into a value object. There is a many-to-many relationship between Ensembl ids and
 * Entrez gene ids, so several lines may contribute to the same record.
 * <p>
 * The line is ignored (null is returned) when it starts with the first configured BioMart header field, when it
 * is empty, or when the number of delimited tokens differs from {@code getBioMartFieldsPerRow()}. The previous
 * behaviour for a wrong token count was to throw; it is now skipped on purpose.
 *
 * @param line raw line from the BioMart file
 * @return the object built by {@code createBioMartEnsembleNcbi}, or null when the line is skipped (that method may
 *         itself return null)
 * @throws FileFormatException when building the object raises a NumberFormatException
 * @throws RuntimeException    wrapping any FileFormatException raised while building the object
 */
@Override
public Ensembl2NcbiValueObject parseOneLine(String line) {
    final int expectedColumns = this.getBioMartFieldsPerRow();

    // Header rows (recognised by the first BioMart header field) and empty rows carry no data.
    final boolean looksLikeHeader = line.startsWith(this.bioMartHeaderFields[0]);
    if (looksLikeHeader || line.isEmpty()) {
        return null;
    }

    // Break the row into its attributes, keeping empty tokens so the column count stays meaningful.
    final String[] columns = StringUtils.splitPreserveAllTokens(line, BiomartEnsembleNcbiParser.FIELD_DELIM);

    if (columns.length == expectedColumns) {
        try {
            return this.createBioMartEnsembleNcbi(columns);
        } catch (NumberFormatException e) {
            throw new FileFormatException(e);
        } catch (FileFormatException e) {
            throw new RuntimeException(e);
        }
    }

    // Unexpected column count: carry on with the next line rather than failing the whole parse.
    return null;
}
use of ubic.gemma.core.loader.util.parser.FileFormatException in project Gemma by PavlidisLab.
the class BiomartEnsembleNcbiParser method createBioMartEnsembleNcbi.
/**
 * Given the tokens of one BioMart line, creates an {@link Ensembl2NcbiValueObject} and records it in the results
 * keyed on the Ensembl peptide id, with some validation.
 * <p>
 * Expected token positions: 0 = Ensembl gene id, 1 = Ensembl transcript id, 2 = Entrez gene id, 3 = Ensembl
 * peptide id, 4 = HGNC id (only read when the fifth configured header field is non-empty).
 * <p>
 * If a record with the same peptide id already exists, the peptide maps to more than one Entrez gene. As a sanity
 * check the existing record and the current line must share the same Ensembl gene id; if they do, the Entrez gene
 * is added to the existing record's collection.
 * <p>
 * NOTE(review): in that duplicate case the object returned is the newly built one, to which no Entrez gene has
 * been added here, rather than the stored record - confirm this is what callers expect.
 *
 * @param fields Parsed line split on delimiter
 * @return the value object built for this line, or null when the peptide id or the Entrez gene id is blank
 * @throws NumberFormatException declared for callers; nothing in this method body parses a number itself
 * @throws FileFormatException   when a record with the same peptide id exists but has a different Ensembl gene id
 */
// Possible external use
@SuppressWarnings({ "unused", "WeakerAccess" })
public Ensembl2NcbiValueObject createBioMartEnsembleNcbi(String[] fields) throws NumberFormatException, FileFormatException {
    Ensembl2NcbiValueObject bioMartEnsembleNcbi = new Ensembl2NcbiValueObject();
    String entrezGene = fields[2].trim();
    String ensemblProteinId = fields[3].trim();

    if (StringUtils.isBlank(ensemblProteinId)) {
        if (log.isDebugEnabled()) {
            log.debug("Blank protein id for line: " + StringUtils.join(fields, " "));
        }
        return null;
    }

    // if there is no entrezgene skip as that is what we want
    if (StringUtils.isBlank(entrezGene)) {
        if (log.isDebugEnabled()) {
            log.debug(ensemblProteinId + " has no entrez gene mapping");
        }
        return null;
    }

    String ensemblGeneID = fields[0].trim();
    bioMartEnsembleNcbi.setNcbiTaxonId(taxon.getNcbiId());
    bioMartEnsembleNcbi.setEnsemblGeneId(ensemblGeneID);
    bioMartEnsembleNcbi.setEnsemblTranscriptId(fields[1]);
    bioMartEnsembleNcbi.setEnsemblPeptideId(ensemblProteinId);

    if (!bioMartHeaderFields[4].isEmpty() && fields[4] != null) {
        // only humans should have this field
        bioMartEnsembleNcbi.setHgnc_id(fields[4]);
    }

    // Ensembl ids can map to multiple entrez genes so we maintain a collection of entrezgenes
    if (!this.containsKey(ensemblProteinId)) {
        bioMartEnsembleNcbi.getEntrezgenes().add(entrezGene);
        results.put(ensemblProteinId, bioMartEnsembleNcbi);
        if (log.isDebugEnabled()) {
            log.debug(ensemblProteinId + " has no existing entrez gene mapping");
        }
    } else {
        Ensembl2NcbiValueObject bioMartEnsembleNcbiDup = this.get(ensemblProteinId);
        // check that this duplicate record also has the same ensembl gene id
        if (ensemblGeneID.equals(bioMartEnsembleNcbiDup.getEnsemblGeneId())) {
            // reuse the record fetched above instead of looking it up a second time
            bioMartEnsembleNcbiDup.getEntrezgenes().add(entrezGene);
            if (log.isDebugEnabled()) {
                log.debug(ensemblProteinId + " added gene to duplicate");
            }
        } else {
            throw new FileFormatException("A duplicate ensemblProteinId has been found: " + ensemblProteinId + " but it does not match with the exisiting objects gene id " + ensemblGeneID + ", it was " + bioMartEnsembleNcbiDup.getEnsemblGeneId() + ", line was:\n" + StringUtils.join(fields, " "));
        }
    }
    return bioMartEnsembleNcbi;
}
use of ubic.gemma.core.loader.util.parser.FileFormatException in project Gemma by PavlidisLab.
the class NcbiGeneInfoParser method parseOneLine.
/**
 * Parses one tab-delimited line of an NCBI gene_info file into an {@link NCBIGeneInfo}.
 * <p>
 * Rows are accepted when they have exactly {@code NCBI_GENEINFO_FIELDS_PER_ROW} columns, or 13 to 15 columns
 * (older layouts; only the first 13 columns are read). When filtering is enabled and a set of taxon ids has been
 * configured, lines for other taxa are skipped.
 *
 * @param line one line of the gene_info file
 * @return the populated gene info, or null when the line's taxon is filtered out
 * @throws FileFormatException when the column count is not accepted, the tax id is not an integer, or a dbXref
 *                             entry is not of the form DB:ID (with the HGNC / MGI special case described below)
 */
@Override
public NCBIGeneInfo parseOneLine(String line) {
    String[] fields = StringUtils.splitPreserveAllTokens(line, '\t');

    // Backwards compatibility: they keep adding fields at the end, and we only need the first few,
    // so the older 13-, 14- and 15-column layouts are accepted alongside the current one.
    boolean isOlderLayout = fields.length == 13 || fields.length == 14 || fields.length == 15;
    if (fields.length != NcbiGeneInfoParser.NCBI_GENEINFO_FIELDS_PER_ROW && !isOlderLayout) {
        throw new FileFormatException("Line " + line + " is not in the right format: has " + fields.length + " fields, expected " + NcbiGeneInfoParser.NCBI_GENEINFO_FIELDS_PER_ROW);
    }

    NCBIGeneInfo geneInfo = new NCBIGeneInfo();
    try {
        // Skip taxa that we don't support.
        int taxonId = Integer.parseInt(fields[0]);
        if (filter && ncbiTaxonIds != null) {
            if (!ncbiTaxonIds.contains(taxonId)) {
                return null;
            }
        }

        // See ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/README
        // #Format (zero-based column index):
        // 0 tax_id
        // 1 GeneID
        // 2 Symbol
        // 3 LocusTag
        // 4 Synonyms
        // 5 dbXrefs, separated by |
        // 6 chromosome
        // 7 map_location
        // 8 description
        // 9 type_of_gene
        // 10 Symbol_from_nomenclature_authority
        // 11 Full_name_from_nomenclature_authority
        // 12 Nomenclature_status
        // 13 Other_designations
        // 14 Modification_date
        // 15 Feature type
        geneInfo.setTaxId(taxonId);
        geneInfo.setGeneId(fields[1]);
        geneInfo.setDefaultSymbol(fields[2]);
        geneInfo.setLocusTag(fields[3]);

        String[] synonyms = StringUtils.splitPreserveAllTokens(fields[4], '|');
        for (String synonym : synonyms) {
            if (synonym.equals("-")) {
                continue;
            }
            geneInfo.addToSynonyms(synonym);
        }

        if (!fields[5].equals("-")) {
            String[] dbXRefs = StringUtils.splitPreserveAllTokens(fields[5], '|');
            for (String dbXr : dbXRefs) {
                String[] dbF = StringUtils.split(dbXr, ':');
                if (dbF.length != 2) {
                    /*
                     * Annoyingly, HGNC identifiers now have the format HGNC:X where X is an integer. This is
                     * apparent from downloading files from HGNC (http://www.genenames.org/cgi-bin/statistics). Same
                     * situation for MGI.
                     *
                     * Therefore we have a special case: DB:HGNC:X / DB:MGI:X keeps "HGNC:X" / "MGI:X" as the id.
                     */
                    if (dbF.length == 3 && (dbF[1].equals("HGNC") || dbF[1].equals("MGI"))) {
                        dbF[1] = dbF[1] + ":" + dbF[2];
                    } else {
                        // we're very stringent to avoid data corruption.
                        throw new FileFormatException("Expected 2 fields, got " + dbF.length + " from '" + dbXr + "'");
                    }
                }
                geneInfo.addToDbXRefs(dbF[0], dbF[1]);
            }
        }

        geneInfo.setChromosome(fields[6]);
        geneInfo.setMapLocation(fields[7]);
        geneInfo.setDescription(fields[8]);
        geneInfo.setGeneType(NCBIGeneInfo.typeStringToGeneType(fields[9]));
        geneInfo.setSymbolIsFromAuthority(!fields[10].equals("-"));
        geneInfo.setNameIsFromAuthority(!fields[11].equals("-"));

        // Nomenclature_status lives in column 12: "-" means none, "O" official, anything else is treated as
        // interim. (The status must be read from column 12, not from the full-name column 11.)
        String nomenclatureStatus = fields[12];
        geneInfo.setNomenclatureStatus(nomenclatureStatus.equals("-") ? NomenclatureStatus.UNKNOWN : nomenclatureStatus.equals("O") ? NomenclatureStatus.OFFICIAL : NomenclatureStatus.INTERIM);

        // ignore 14th field for now - it stores alternate protein names
        // ignore 15th, modification date
    } catch (NumberFormatException e) {
        throw new FileFormatException(e);
    }
    return geneInfo;
}
use of ubic.gemma.core.loader.util.parser.FileFormatException in project Gemma by PavlidisLab.
the class StringProteinProteinInteractionFileParser method createStringProteinProteinInteraction.
/**
 * Turns the tokens of one STRING interaction line into a {@link StringProteinProteinInteraction}.
 * <p>
 * A typical line looks like:
 * <pre>
 * 882.DVU0001 882.DVU0002 707 0 0 0 0 0 172 742
 * </pre>
 * The first two tokens identify protein 1 and protein 2; the part before the first dot (882) is the NCBI taxon id
 * and the remainder is an external (Ensembl) id. Tokens 2 to 8 are the per-evidence scores, in the order
 * neighborhood, gene fusion, co-occurrence, co-expression, experimental, database, text mining; token 9 is the
 * combined score.
 *
 * @param fields Line split on delimiter
 * @return the interaction value object, or null when {@code fields} is null, either protein token is null or
 *         empty, or the taxon is not among the valid taxa of this parser
 * @throws FileFormatException when the two proteins carry different taxon ids, or any token from index 2 onwards
 *                             is not numeric
 */
public StringProteinProteinInteraction createStringProteinProteinInteraction(String[] fields) {
    if (fields == null) {
        return null;
    }

    // Both protein identifiers must be present and non-empty, otherwise there is nothing to build.
    boolean bothProteinsPresent = fields[0] != null && fields[1] != null && !fields[0].isEmpty() && !fields[1].isEmpty();
    if (!bothProteinsPresent) {
        return null;
    }

    // The taxon id is whatever precedes the first dot of each protein identifier.
    int firstTaxon = Integer.parseInt(StringUtils.split(fields[0], ".")[0]);
    int secondTaxon = Integer.parseInt(StringUtils.split(fields[1], ".")[0]);

    // An interaction is only meaningful within one taxon.
    if (firstTaxon != secondTaxon) {
        throw new FileFormatException("Protein 1 " + fields[0] + " protein 2 " + fields[1] + " do not contain matching taxons");
    }

    // Unsupported taxon: skip the line.
    if (!(this.getNcbiValidTaxon()).contains(firstTaxon)) {
        return null;
    }

    // Store the pair in a consistent order so that the same two proteins always compare equal later on,
    // whichever way round they appear in the file. The identifiers are mixed alphanumeric codes, so their
    // hash codes are used as the ordering key.
    boolean fileOrderIsCanonical = Integer.compare(fields[0].hashCode(), fields[1].hashCode()) < 0;
    StringProteinProteinInteraction interaction = fileOrderIsCanonical
            ? new StringProteinProteinInteraction(fields[0], fields[1])
            : new StringProteinProteinInteraction(fields[1], fields[0]);
    interaction.setNcbiTaxonId(firstTaxon);

    // Every token after the two protein identifiers has to be numeric.
    int column = 2;
    while (column < fields.length) {
        if (!StringUtils.isNumeric(fields[column])) {
            throw new FileFormatException("This line does not contain valid number ");
        }
        column++;
    }

    // Evidence codes in the order their scores appear in the line, starting at token index 2.
    StringProteinInteractionEvidenceCodeEnum[] evidenceInColumnOrder = {
            StringProteinInteractionEvidenceCodeEnum.NEIGHBORHOOD,
            StringProteinInteractionEvidenceCodeEnum.GENEFUSION,
            StringProteinInteractionEvidenceCodeEnum.COOCCURENCE,
            StringProteinInteractionEvidenceCodeEnum.COEXPRESSION,
            StringProteinInteractionEvidenceCodeEnum.EXPERIMENTAL,
            StringProteinInteractionEvidenceCodeEnum.DATABASE,
            StringProteinInteractionEvidenceCodeEnum.TEXTMINING
    };
    for (int offset = 0; offset < evidenceInColumnOrder.length; offset++) {
        interaction.addEvidenceCodeScoreToMap(evidenceInColumnOrder[offset], Integer.valueOf(fields[offset + 2]));
    }

    interaction.setCombined_score(Double.valueOf(fields[9]));
    return interaction;
}
Aggregations