use of ubic.gemma.core.loader.expression.geo.model.GeoRecord in project Gemma by PavlidisLab.
the class GeoBrowser method getGeoRecordsBySearchTerm.
/**
 * Performs an E-utilities query of the GEO database with the given searchTerms. Returns at most pageSize records
 * (if found) starting at record #start.
 * <p>
 * Two requests are made. The first (ESEARCH, with {@code usehistory=y}) yields the hit count plus the
 * {@code QueryKey} and {@code WebEnv} values; the second (EFETCH) uses those values to retrieve the document
 * summaries from which the GeoRecords are built.
 *
 * @param searchTerms search terms; appended verbatim to the ESEARCH URL (NOTE(review): no URL-encoding is applied
 *                    here, so callers presumably pass already-encoded terms - confirm)
 * @param start       index of the first record to retrieve (sent as {@code retstart})
 * @param pageSize    maximum number of records to retrieve (sent as {@code retmax})
 * @return list of GeoRecords, one per GSE item found in the fetched summaries
 * @throws IOException      if a connection fails, the search reports zero hits, or either response lacks or
 *                          contains unparseable values
 * @throws RuntimeException if the search response cannot be parsed as XML
 */
public List<GeoRecord> getGeoRecordsBySearchTerm(String searchTerms, int start, int pageSize) throws IOException, RuntimeException {
    List<GeoRecord> records = new ArrayList<>();

    // Step 1: run the search and keep its result set on the server side (usehistory=y).
    URL searchUrl = new URL(GeoBrowser.ESEARCH + searchTerms + "&retstart=" + start + "&retmax=" + pageSize + "&usehistory=y");
    Document searchDocument;
    URLConnection conn = searchUrl.openConnection();
    conn.connect();
    try (InputStream is = conn.getInputStream()) {
        GeoBrowser.docFactory.setIgnoringComments(true);
        GeoBrowser.docFactory.setValidating(false);
        DocumentBuilder builder = GeoBrowser.docFactory.newDocumentBuilder();
        searchDocument = builder.parse(is);
    } catch (ParserConfigurationException | SAXException e) {
        throw new RuntimeException(e);
    }

    // A response without a Count element is reported the same way as an unparseable count.
    Node countEl = searchDocument.getElementsByTagName("Count").item(0);
    if (countEl == null) {
        throw new IOException("Could not parse count from: " + searchUrl);
    }
    int count;
    try {
        count = Integer.parseInt(XMLUtils.getTextValue((Element) countEl));
    } catch (NumberFormatException e) {
        // Keep the cause so the offending text is visible in the stack trace.
        throw new IOException("Could not parse count from: " + searchUrl, e);
    }
    if (count == 0) {
        throw new IOException("Got no records from: " + searchUrl);
    }

    // The fetch request is meaningless without both history identifiers, so fail early if either is absent.
    Element queryIdEl = (Element) searchDocument.getElementsByTagName("QueryKey").item(0);
    Element cookieEl = (Element) searchDocument.getElementsByTagName("WebEnv").item(0);
    if (queryIdEl == null || cookieEl == null) {
        throw new IOException("No QueryKey/WebEnv in response from: " + searchUrl);
    }
    String queryId = XMLUtils.getTextValue(queryIdEl);
    String cookie = XMLUtils.getTextValue(cookieEl);

    // Step 2: fetch the summaries for the stored result set.
    URL fetchUrl = new URL(GeoBrowser.EFETCH + "&mode=mode.text" + "&query_key=" + queryId + "&retstart=" + start + "&retmax=" + pageSize + "&WebEnv=" + cookie);
    conn = fetchUrl.openConnection();
    conn.connect();
    Document summaryDocument;
    try (InputStream is = conn.getInputStream()) {
        DocumentBuilder builder = GeoBrowser.docFactory.newDocumentBuilder();
        summaryDocument = builder.parse(is);
        XPathFactory xFactory = XPathFactory.newInstance();
        XPath xpath = xFactory.newXPath();

        // Get relevant data from the XML file
        XPathExpression xaccession = xpath.compile("//DocSum/Item[@Name='GSE']");
        XPathExpression xtitle = xpath.compile("//DocSum/Item[@Name='title']");
        XPathExpression xnumSamples = xpath.compile("//DocSum/Item[@Name='n_samples']");
        XPathExpression xreleaseDate = xpath.compile("//DocSum/Item[@Name='PDAT']");
        XPathExpression xorganisms = xpath.compile("//DocSum/Item[@Name='taxon']");
        NodeList accNodes = (NodeList) xaccession.evaluate(summaryDocument, XPathConstants.NODESET);
        NodeList titleNodes = (NodeList) xtitle.evaluate(summaryDocument, XPathConstants.NODESET);
        NodeList sampleNodes = (NodeList) xnumSamples.evaluate(summaryDocument, XPathConstants.NODESET);
        NodeList dateNodes = (NodeList) xreleaseDate.evaluate(summaryDocument, XPathConstants.NODESET);
        NodeList orgnNodes = (NodeList) xorganisms.evaluate(summaryDocument, XPathConstants.NODESET);

        // Create GeoRecords using information parsed from XML file.
        // NOTE(review): the five node lists are matched up by position only; this assumes every DocSum carries
        // all five items - confirm against the service's response format.
        for (int i = 0; i < accNodes.getLength(); i++) {
            GeoRecord record = new GeoRecord();
            record.setGeoAccession("GSE" + accNodes.item(i).getTextContent());
            record.setTitle(titleNodes.item(i).getTextContent());
            record.setNumSamples(Integer.parseInt(sampleNodes.item(i).getTextContent()));
            Date date = DateUtil.convertStringToDate("yyyy/MM/dd", dateNodes.item(i).getTextContent());
            record.setReleaseDate(date);
            record.setOrganisms(this.getTaxonCollection(orgnNodes.item(i).getTextContent()));
            records.add(record);
        }
        if (records.isEmpty()) {
            GeoBrowser.log.warn("No records obtained");
        }
    } catch (ParserConfigurationException | ParseException | XPathExpressionException | SAXException | NumberFormatException e) {
        // Report the URL that was actually being parsed here (the fetch, not the search); a malformed sample
        // count is treated like any other unparseable value in the response.
        throw new IOException("Could not parse data: " + fetchUrl, e);
    }
    return records;
}
use of ubic.gemma.core.loader.expression.geo.model.GeoRecord in project Gemma by PavlidisLab.
the class GeoBrowserServiceImpl method initLocalRecord.
/**
 * Ensures that localInfo holds an entry for the given accession. When the key is not yet present, a new
 * GeoRecord carrying that accession is stored under it; an existing entry is left untouched.
 *
 * @param accession key to look up in localInfo, also used as the new record's GEO accession
 */
private void initLocalRecord(String accession) {
    assert localInfo != null;
    if (localInfo.containsKey(accession)) {
        return;
    }
    GeoRecord fresh = new GeoRecord();
    localInfo.put(accession, fresh);
    fresh.setGeoAccession(accession);
}
use of ubic.gemma.core.loader.expression.geo.model.GeoRecord in project Gemma by PavlidisLab.
the class GeoGrabberCli method doWork.
/**
 * Repeatedly requests recent GEO records and prints, as one tab-separated line on standard output, each record
 * that has not been printed already and that the ExpressionExperimentService finds neither by short name nor by
 * accession.
 * <p>
 * The loop only ends after more than 10 empty responses have been received (the counter is never reset), when
 * the thread is interrupted while waiting after an empty response, or when an exception is raised.
 *
 * @param args command line arguments, handed to processCommandLine
 * @return the exception that stopped processing, or null when the loop gave up after too many empty responses
 */
@Override
protected Exception doWork(String[] args) {
    Exception e = super.processCommandLine(args);
    if (e != null) {
        return e;
    }
    Set<String> seen = new HashSet<>();
    GeoBrowserService gbs = this.getBean(GeoBrowserService.class);
    ExpressionExperimentService ees = this.getBean(ExpressionExperimentService.class);
    try {
        // NOTE(review): start advances by one per request while chunksize is 100 - presumably start is a page
        // index rather than a record offset; confirm against getRecentGeoRecords.
        int start = 0;
        int numfails = 0;
        int chunksize = 100;
        while (true) {
            List<GeoRecord> recs = gbs.getRecentGeoRecords(start, chunksize);
            if (recs.isEmpty()) {
                AbstractCLI.log.info("No records received for start=" + start);
                numfails++;
                if (numfails > 10) {
                    AbstractCLI.log.info("Giving up");
                    break;
                }
                try {
                    Thread.sleep(500);
                } catch (InterruptedException interrupted) {
                    // Do not swallow the interruption: restore the flag for callers and stop working.
                    Thread.currentThread().interrupt();
                    return interrupted;
                }
                start++;
                continue;
            }
            start++;
            for (GeoRecord geoRecord : recs) {
                String accession = geoRecord.getGeoAccession();
                if (seen.contains(accession)) {
                    continue;
                }
                if (ees.findByShortName(accession) != null) {
                    continue;
                }
                if (!ees.findByAccession(accession).isEmpty()) {
                    continue;
                }
                // NOTE(review): iterator().next() fails if the record has no organisms - confirm that every
                // record returned by the service carries at least one.
                System.out.println(accession + "\t" + geoRecord.getOrganisms().iterator().next() + "\t" + geoRecord.getNumSamples() + "\t" + geoRecord.getTitle() + "\t" + StringUtils.join(geoRecord.getCorrespondingExperiments(), ",") + "\t" + geoRecord.getSeriesType());
                seen.add(accession);
            }
        }
    } catch (IOException | ParseException exception) {
        return exception;
    }
    return null;
}
Aggregations