use of ubic.gemma.core.loader.protein.string.model.StringProteinProteinInteraction in project Gemma by PavlidisLab.
the class StringProteinProteinInteractionObjectGenerator method generate.
/**
 * Main method to generate StringProteinProteinInteraction objects.
 * <p>
 * When no local STRING file has been set, the remote fetch is triggered first. The parser is then invoked once per
 * taxon (each time with a single-element collection) instead of once for all taxa, because processing everything in
 * one go ran out of heap space.
 *
 * @param validTaxa taxa to generate StringProteinProteinInteraction objects for (STRING covers many taxa).
 * @return map keyed on taxon; each value is the collection of StringProteinProteinInteraction objects returned by the
 *         parser for that taxon.
 */
public Map<Taxon, Collection<StringProteinProteinInteraction>> generate(Collection<Taxon> validTaxa) {
    log.debug("Starting to get StringProteinProteinInteraction data");
    if (stringProteinInteractionFileLocal == null) {
        log.info("stringProteinInteractionFile is remote file fetching remote site");
        fetchProteinStringFileFromRemoteSiteUnArchived();
    }
    Map<Taxon, Collection<StringProteinProteinInteraction>> map = new HashMap<>();
    // One taxon per parser call: doing it in one big go gave java.lang.OutOfMemoryError: Java heap space
    for (Taxon taxon : validTaxa) {
        log.info("calling taxon " + taxon);
        Collection<Taxon> taxa = new ArrayList<>();
        taxa.add(taxon);
        map.put(taxon, this.parseProteinStringFileInteraction(taxa));
    }
    // The original logged the "Starting..." message a second time here, which made the log misleading
    log.debug("Finished getting StringProteinProteinInteraction data");
    return map;
}
use of ubic.gemma.core.loader.protein.string.model.StringProteinProteinInteraction in project Gemma by PavlidisLab.
the class StringBiomartProteinConverterTest method setUp.
/**
 * Prepares the fixtures shared by the tests: a mouse taxon (NCBI id 10090), a converter built from the short mouse
 * biomart file, and three StringProteinProteinInteraction objects added to the stringProteinProteinInteractions
 * collection.
 */
@Before
public void setUp() {
    String fileNameBiomartmouse = "/data/loader/protein/biomart/biomartmmusculusShort.txt";
    URL fileNameBiomartmouseURL = this.getClass().getResource(fileNameBiomartmouse);
    if (fileNameBiomartmouseURL == null) {
        // getResource returns null for a missing resource; report that instead of a bare NullPointerException
        throw new AssertionError("Test resource not found on classpath: " + fileNameBiomartmouse);
    }
    File taxonBiomartFile = new File(fileNameBiomartmouseURL.getFile());

    Taxon taxon = Taxon.Factory.newInstance();
    taxon.setIsGenesUsable(true);
    taxon.setNcbiId(10090);
    taxon.setScientificName("Mus musculus");
    taxon.setIsSpecies(true);
    taxa.add(taxon);

    try {
        BiomartEnsemblNcbiObjectGenerator biomartEnsemblNcbiObjectGenerator = new BiomartEnsemblNcbiObjectGenerator();
        biomartEnsemblNcbiObjectGenerator.setBioMartFileName(taxonBiomartFile);
        Map<String, Ensembl2NcbiValueObject> map = biomartEnsemblNcbiObjectGenerator.generate(taxa);
        stringBiomartProteinConverter = new StringProteinProteinInteractionConverter(map);
    } catch (Exception e) {
        // Same failure type as fail(), but the original exception travels along as the cause
        throw new AssertionError("Could not build the converter from " + fileNameBiomartmouse, e);
    }

    stringProteinProteinInteractionOne = new StringProteinProteinInteraction("ENSMUSP00000111623", "ENSMUSP00000100396");
    StringProteinProteinInteraction stringProteinProteinInteractionTwo = new StringProteinProteinInteraction("ENSMUSP00000100395", "ENSMUSP00000100396");
    StringProteinProteinInteraction stringProteinProteinInteractionThree = new StringProteinProteinInteraction("ENSMUSP00000100407", "ENSMUSP00000100395");
    // add them to the shared collection
    stringProteinProteinInteractions.add(stringProteinProteinInteractionOne);
    stringProteinProteinInteractions.add(stringProteinProteinInteractionTwo);
    stringProteinProteinInteractions.add(stringProteinProteinInteractionThree);
}
use of ubic.gemma.core.loader.protein.string.model.StringProteinProteinInteraction in project Gemma by PavlidisLab.
the class StringProteinLinksDetailedParserTest method testParseFileContainingOneTaxon.
/*
 * Test to ensure that a small file whose lines all belong to one taxon can be parsed correctly. The file contains
 * duplicate interactions, e.g.
 *   10090.ENSMUSP00000000001 10090.ENSMUSP00000000153 0 0 0 0 0 900 27 902
 *   10090.ENSMUSP00000000153 10090.ENSMUSP00000000001 0 0 0 0 0 900 27 902
 * are effectively the same and should be treated as one. After collapsing such duplicates the parser is expected to
 * hold 23 interactions for protein.links.detailed.txt, every one of them between 10090.ENSMUSP proteins.
 */
@Test
public void testParseFileContainingOneTaxon() {
    String fileName = "/data/loader/protein/string/protein.links.detailed.txt";
    URL myurl = this.getClass().getResource(fileName);
    if (myurl == null) {
        // getResource returns null for a missing resource; say so rather than failing without a reason
        throw new AssertionError("Test resource not found on classpath: " + fileName);
    }
    try {
        parser.parse(new File(myurl.getFile()));
        Collection<StringProteinProteinInteraction> items = parser.getResults();
        assertEquals(23, items.size());
        for (StringProteinProteinInteraction item : items) {
            assertTrue(item.getProtein1().startsWith("10090.ENSMUSP"));
            assertTrue(item.getProtein2().startsWith("10090.ENSMUSP"));
        }
    } catch (RuntimeException | IOException e) {
        // Same failure type as fail(), but the message and cause show why parsing broke
        throw new AssertionError("Parsing " + fileName + " failed", e);
    }
}
use of ubic.gemma.core.loader.protein.string.model.StringProteinProteinInteraction in project Gemma by PavlidisLab.
the class StringProteinLinksDetailedParserTest method testParseOneValidLine.
/*
 * Test to make sure that a line can be parsed correctly to its constituent values. The line lists protein
 * 10090.ENSMUSP00000000201 before 10090.ENSMUSP00000000153, yet the parsed interaction is expected to hold ...153 as
 * protein 1 and ...201 as protein 2, i.e. the alphabetically first identifier ends up in protein 1 whatever the order
 * on the line. The seven scores "707 0 10 2 3 0 0" are expected to come back as the evidence vector
 * { 1, 0, 1, 1, 1, 0, 0 } (1 where the score is non-zero) and the last column, 222, as the combined score.
 * Test method for
 * {@link ubic.gemma.core.loader.protein.string.StringProteinProteinInteractionFileParser#parseOneLine(java.lang.String)}
 */
@Test
public void testParseOneValidLine() {
    String line = "10090.ENSMUSP00000000201 10090.ENSMUSP00000000153 707 0 10 2 3 0 0 222";
    StringProteinProteinInteraction stringProteinProteinInteraction = parser.parseOneLine(line);
    // assertEquals reports expected vs. actual on failure, unlike assertTrue(a.equals(b))
    assertEquals(Integer.valueOf(10090), stringProteinProteinInteraction.getNcbiTaxonId());
    assertEquals("10090.ENSMUSP00000000153", stringProteinProteinInteraction.getProtein1());
    assertEquals("10090.ENSMUSP00000000201", stringProteinProteinInteraction.getProtein2());
    byte[] arrayStored = stringProteinProteinInteraction.getEvidenceVector();
    byte[] array = new byte[] { 1, 0, 1, 1, 1, 0, 0 };
    assertArrayEquals("Compare bit vector", array, arrayStored);
    // Double.valueOf replaces the deprecated Double(double) constructor; the boxed value is equal
    assertEquals(Double.valueOf(222), stringProteinProteinInteraction.getCombined_score());
}
use of ubic.gemma.core.loader.protein.string.model.StringProteinProteinInteraction in project Gemma by PavlidisLab.
the class StringProteinInteractionLoader method load.
/**
 * Entry point for loading STRING protein-protein interactions. The STRING data may come from a local file or from a
 * remote download. The steps performed here are: validate the arguments, generate the STRING interaction value
 * objects grouped by taxon, obtain the ENSEMBL to NCBI id mappings, and finally hand the interactions of each taxon,
 * together with those mappings, to {@code loadOneTaxonAtATime}.
 *
 * @param stringProteinFileNameLocal The name of the string file on the local system
 * @param stringProteinFileNameRemote The name of the string file on the remote system (just in case the string name
 *        proves to be too variable) - can be null
 * @param localEnsembl2EntrezMappingFile The name of the local biomart file - can be null?
 * @param taxa taxa to load data for
 * @throws IOException io problems
 */
public void load(File stringProteinFileNameLocal, String stringProteinFileNameRemote, File localEnsembl2EntrezMappingFile, Collection<Taxon> taxa) throws IOException {
    // basic sanity check of the arguments before any real work starts
    this.validateLoadParameters(stringProteinFileNameLocal, taxa);

    // STRING protein-protein interactions, grouped by taxon
    StringProteinProteinInteractionObjectGenerator generator = new StringProteinProteinInteractionObjectGenerator(stringProteinFileNameLocal, stringProteinFileNameRemote);
    Map<Taxon, Collection<StringProteinProteinInteraction>> interactionsByTaxon = generator.generate(taxa);

    // ENSEMBL to NCBI id mappings, needed to store the STRING interactions
    Map<String, Ensembl2NcbiValueObject> ensemblToNcbi = this.getIdMappings(localEnsembl2EntrezMappingFile, taxa);

    // Do one taxon at a time to reduce memory use
    for (Map.Entry<Taxon, Collection<StringProteinProteinInteraction>> entry : interactionsByTaxon.entrySet()) {
        Taxon taxon = entry.getKey();
        StringProteinInteractionLoader.log.debug("Loading for taxon " + taxon);
        Collection<StringProteinProteinInteraction> interactions = entry.getValue();
        StringProteinInteractionLoader.log.info("Found " + interactions.size() + " STRING interactions for: " + taxon);
        this.loadOneTaxonAtATime(ensemblToNcbi, interactions);
    }
}
Aggregations