Search in sources :

Example 6 with GeoRecord

use of ubic.gemma.core.loader.expression.geo.model.GeoRecord in project Gemma by PavlidisLab.

the class GeoBrowser method getGeoRecordsBySearchTerm.

/**
 * Performs an E-utilities query of the GEO database with the given searchTerms. Returns at most pageSize records
 * (if found) starting at record #start.
 * <p>
 * Two requests are issued: a search request (built from {@code GeoBrowser.ESEARCH}, with {@code usehistory=y})
 * whose response supplies the hit count together with the {@code QueryKey} and {@code WebEnv} values, and a fetch
 * request (built from {@code GeoBrowser.EFETCH}) that uses those values to retrieve the document summaries from
 * which the records are built.
 *
 * @param searchTerms search terms; concatenated as-is onto the search URL
 * @param start       value sent as {@code retstart} in both requests
 * @param pageSize    value sent as {@code retmax} in both requests
 * @return list of GeoRecords built from the fetched summaries; may be empty (a warning is logged in that case)
 * @throws IOException      if a connection cannot be read, the search reports zero records, the count,
 *                          {@code QueryKey} or {@code WebEnv} is missing or unparseable, or the fetched
 *                          summaries cannot be parsed
 * @throws RuntimeException if the search response cannot be parsed as XML
 */
public List<GeoRecord> getGeoRecordsBySearchTerm(String searchTerms, int start, int pageSize) throws IOException, RuntimeException {
    List<GeoRecord> records = new ArrayList<>();
    // NOTE(review): searchTerms is not URL-encoded here; presumably callers pass an already URL-safe string - confirm.
    URL searchUrl = new URL(GeoBrowser.ESEARCH + searchTerms + "&retstart=" + start + "&retmax=" + pageSize + "&usehistory=y");
    Document searchDocument;
    URLConnection conn = searchUrl.openConnection();
    conn.connect();
    try (InputStream is = conn.getInputStream()) {
        GeoBrowser.docFactory.setIgnoringComments(true);
        GeoBrowser.docFactory.setValidating(false);
        DocumentBuilder builder = GeoBrowser.docFactory.newDocumentBuilder();
        searchDocument = builder.parse(is);
    } catch (ParserConfigurationException | SAXException e) {
        throw new RuntimeException(e);
    }

    // The first <Count> element holds the number of hits for the query.
    Node countEl = searchDocument.getElementsByTagName("Count").item(0);
    if (countEl == null) {
        throw new IOException("Could not parse count from: " + searchUrl);
    }
    int count;
    try {
        count = Integer.parseInt(XMLUtils.getTextValue((Element) countEl));
    } catch (NumberFormatException e) {
        // Keep the cause so the offending text is visible in the stack trace.
        throw new IOException("Could not parse count from: " + searchUrl, e);
    }
    if (count == 0) {
        throw new IOException("Got no records from: " + searchUrl);
    }

    // QueryKey and WebEnv identify the stored search result for the follow-up fetch request.
    Element queryIdEl = (Element) searchDocument.getElementsByTagName("QueryKey").item(0);
    Element cookieEl = (Element) searchDocument.getElementsByTagName("WebEnv").item(0);
    if (queryIdEl == null || cookieEl == null) {
        throw new IOException("Could not find QueryKey/WebEnv in response from: " + searchUrl);
    }
    String queryId = XMLUtils.getTextValue(queryIdEl);
    String cookie = XMLUtils.getTextValue(cookieEl);

    URL fetchUrl = new URL(GeoBrowser.EFETCH + "&mode=mode.text" + "&query_key=" + queryId + "&retstart=" + start + "&retmax=" + pageSize + "&WebEnv=" + cookie);
    conn = fetchUrl.openConnection();
    conn.connect();
    Document summaryDocument;
    try (InputStream is = conn.getInputStream()) {
        DocumentBuilder builder = GeoBrowser.docFactory.newDocumentBuilder();
        summaryDocument = builder.parse(is);
        XPathFactory xFactory = XPathFactory.newInstance();
        XPath xpath = xFactory.newXPath();
        // Get relevant data from the XML file
        XPathExpression xaccession = xpath.compile("//DocSum/Item[@Name='GSE']");
        XPathExpression xtitle = xpath.compile("//DocSum/Item[@Name='title']");
        XPathExpression xnumSamples = xpath.compile("//DocSum/Item[@Name='n_samples']");
        XPathExpression xreleaseDate = xpath.compile("//DocSum/Item[@Name='PDAT']");
        XPathExpression xorganisms = xpath.compile("//DocSum/Item[@Name='taxon']");
        NodeList accNodes = (NodeList) xaccession.evaluate(summaryDocument, XPathConstants.NODESET);
        NodeList titleNodes = (NodeList) xtitle.evaluate(summaryDocument, XPathConstants.NODESET);
        NodeList sampleNodes = (NodeList) xnumSamples.evaluate(summaryDocument, XPathConstants.NODESET);
        NodeList dateNodes = (NodeList) xreleaseDate.evaluate(summaryDocument, XPathConstants.NODESET);
        NodeList orgnNodes = (NodeList) xorganisms.evaluate(summaryDocument, XPathConstants.NODESET);
        // Create GeoRecords using information parsed from XML file.
        // NOTE(review): the five node lists are read in parallel by index, which assumes every DocSum carries
        // all five items - confirm against the service's response format.
        for (int i = 0; i < accNodes.getLength(); i++) {
            GeoRecord record = new GeoRecord();
            record.setGeoAccession("GSE" + accNodes.item(i).getTextContent());
            record.setTitle(titleNodes.item(i).getTextContent());
            record.setNumSamples(Integer.parseInt(sampleNodes.item(i).getTextContent()));
            Date date = DateUtil.convertStringToDate("yyyy/MM/dd", dateNodes.item(i).getTextContent());
            record.setReleaseDate(date);
            record.setOrganisms(this.getTaxonCollection(orgnNodes.item(i).getTextContent()));
            records.add(record);
        }
        if (records.isEmpty()) {
            GeoBrowser.log.warn("No records obtained");
        }
    } catch (ParserConfigurationException | ParseException | XPathExpressionException | SAXException | NumberFormatException e) {
        // Report the URL that was actually being read (the fetch request, not the search request).
        throw new IOException("Could not parse data: " + fetchUrl, e);
    }
    return records;
}
Also used : Node(org.w3c.dom.Node) Element(org.w3c.dom.Element) Document(org.w3c.dom.Document) URL(java.net.URL) SAXException(org.xml.sax.SAXException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) InputStream(java.io.InputStream) NodeList(org.w3c.dom.NodeList) IOException(java.io.IOException) URLConnection(java.net.URLConnection) GeoRecord(ubic.gemma.core.loader.expression.geo.model.GeoRecord) DocumentBuilder(javax.xml.parsers.DocumentBuilder) ParseException(java.text.ParseException)

Example 7 with GeoRecord

use of ubic.gemma.core.loader.expression.geo.model.GeoRecord in project Gemma by PavlidisLab.

the class GeoBrowserServiceImpl method initLocalRecord.

/**
 * Ensures {@code localInfo} holds an entry for the given accession, creating a new {@link GeoRecord} with that
 * accession when the key is absent. An existing entry is left untouched.
 *
 * @param accession key to look up in {@code localInfo}; also set as the accession of a newly created record
 */
private void initLocalRecord(String accession) {
    assert localInfo != null;
    // Nothing to do when the accession is already tracked.
    if (localInfo.containsKey(accession)) {
        return;
    }
    localInfo.put(accession, new GeoRecord());
    localInfo.get(accession).setGeoAccession(accession);
}
Also used : GeoRecord(ubic.gemma.core.loader.expression.geo.model.GeoRecord)

Example 8 with GeoRecord

use of ubic.gemma.core.loader.expression.geo.model.GeoRecord in project Gemma by PavlidisLab.

the class GeoGrabberCli method doWork.

/**
 * Pages through the records returned by {@code GeoBrowserService.getRecentGeoRecords} and prints one
 * tab-separated line to standard output for each record that has not been printed earlier in this run and for
 * which {@code ExpressionExperimentService} finds nothing by short name or by accession.
 * <p>
 * The loop only ends once more than 10 empty responses have been received in total (the failure counter is never
 * reset), when the thread is interrupted while waiting between retries, or when an exception is raised.
 *
 * @param args command-line arguments, passed to {@code processCommandLine}
 * @return the exception from command-line processing, or the {@link IOException}/{@link ParseException} raised
 *         while fetching records; {@code null} otherwise
 */
@Override
protected Exception doWork(String[] args) {
    Exception e = super.processCommandLine(args);
    if (e != null)
        return e;
    Set<String> seen = new HashSet<>();
    GeoBrowserService gbs = this.getBean(GeoBrowserService.class);
    ExpressionExperimentService ees = this.getBean(ExpressionExperimentService.class);
    try {
        int start = 0;
        int numfails = 0;
        int chunksize = 100;
        while (true) {
            List<GeoRecord> recs = gbs.getRecentGeoRecords(start, chunksize);
            if (recs.isEmpty()) {
                AbstractCLI.log.info("No records received for start=" + start);
                numfails++;
                if (numfails > 10) {
                    AbstractCLI.log.info("Giving up");
                    break;
                }
                try {
                    // Brief pause before asking for the next chunk.
                    Thread.sleep(500);
                } catch (InterruptedException interrupted) {
                    // Restore the interrupt flag for callers and stop polling instead of ignoring the request.
                    Thread.currentThread().interrupt();
                    AbstractCLI.log.info("Interrupted, giving up");
                    break;
                }
                start++;
                continue;
            }
            start++;
            for (GeoRecord geoRecord : recs) {
                // Skip records already printed during this run.
                if (seen.contains(geoRecord.getGeoAccession())) {
                    continue;
                }
                // Skip records for which an experiment is found by short name or by accession.
                if (ees.findByShortName(geoRecord.getGeoAccession()) != null) {
                    continue;
                }
                if (!ees.findByAccession(geoRecord.getGeoAccession()).isEmpty()) {
                    continue;
                }
                // NOTE(review): iterator().next() assumes every record has at least one organism - confirm.
                System.out.println(geoRecord.getGeoAccession() + "\t" + geoRecord.getOrganisms().iterator().next() + "\t" + geoRecord.getNumSamples() + "\t" + geoRecord.getTitle() + "\t" + StringUtils.join(geoRecord.getCorrespondingExperiments(), ",") + "\t" + geoRecord.getSeriesType());
                seen.add(geoRecord.getGeoAccession());
            }
        }
    } catch (IOException | ParseException exception) {
        return exception;
    }
    return null;
}
Also used : IOException(java.io.IOException) ParseException(java.text.ParseException) GeoRecord(ubic.gemma.core.loader.expression.geo.model.GeoRecord) GeoBrowserService(ubic.gemma.core.loader.expression.geo.service.GeoBrowserService) ExpressionExperimentService(ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService) HashSet(java.util.HashSet)

Aggregations

GeoRecord (ubic.gemma.core.loader.expression.geo.model.GeoRecord)8 IOException (java.io.IOException)4 Test (org.junit.Test)3 InputStream (java.io.InputStream)2 URL (java.net.URL)2 URLConnection (java.net.URLConnection)2 ParseException (java.text.ParseException)2 GeoBrowser (ubic.gemma.core.loader.expression.geo.service.GeoBrowser)2 BufferedReader (java.io.BufferedReader)1 InputStreamReader (java.io.InputStreamReader)1 MalformedURLException (java.net.MalformedURLException)1 UnknownHostException (java.net.UnknownHostException)1 HashSet (java.util.HashSet)1 DocumentBuilder (javax.xml.parsers.DocumentBuilder)1 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)1 Document (org.w3c.dom.Document)1 Element (org.w3c.dom.Element)1 Node (org.w3c.dom.Node)1 NodeList (org.w3c.dom.NodeList)1 SAXException (org.xml.sax.SAXException)1