Search in sources :

Example 21 with StringUtils.isBlank

use of org.apache.commons.lang3.StringUtils.isBlank in project Gemma by PavlidisLab.

the class SearchServiceImpl method characteristicSearchTerm.

/**
 * Finds entities annotated with characteristics that match a query. The query need not be a single word; a phrase
 * such as "parkinson's disease" is acceptable.
 * <p>
 * Characteristics are collected in up to three stages, and a later stage only runs while fewer than
 * {@code MAX_CHARACTERISTIC_SEARCH_RESULTS} characteristics have been collected:
 * <ol>
 * <li>characteristics whose URI is that of an ontology individual matching the query;</li>
 * <li>characteristics whose value matches the query (with any {@code *} removed);</li>
 * <li>characteristics annotated to ontology terms matching the query, including their child terms.</li>
 * </ol>
 * Steps that take longer than one second are reported at INFO level.
 *
 * @param classes forwarded unchanged to every characteristic lookup (presumably the entity types of interest)
 * @param query   the text to search for
 * @return the result of {@code getAnnotatedEntities} for the collected characteristics
 */
private Collection<SearchResult> characteristicSearchTerm(Collection<Class<?>> classes, String query) {
    if (SearchServiceImpl.log.isDebugEnabled()) {
        SearchServiceImpl.log.debug("Starting search for " + query);
    }

    StopWatch timer = this.startTiming();
    Collection<Characteristic> found = new HashSet<>();

    // Stage 1: ontology individuals matching the query, looked up by URI in batches of 10.
    Collection<OntologyIndividual> matchedIndividuals = ontologyService.findIndividuals(query);
    for (Collection<OntologyIndividual> batch : BatchIterator.batches(matchedIndividuals, 10)) {
        Collection<String> batchUris = new HashSet<>();
        for (OntologyIndividual member : batch) {
            batchUris.add(member.getUri());
        }

        Collection<SearchResult> batchHits = this.dbHitsToSearchResult(characteristicService.findByUri(classes, batchUris), null);
        for (SearchResult hit : batchHits) {
            found.add((Characteristic) hit.getResultObject());
        }

        if (found.size() >= SearchServiceImpl.MAX_CHARACTERISTIC_SEARCH_RESULTS) {
            break;
        }
    }
    if (!matchedIndividuals.isEmpty() && timer.getTime() > 1000) {
        SearchServiceImpl.log.info("Found " + matchedIndividuals.size() + " individuals matching '" + query + "' in " + timer.getTime() + "ms");
    }

    // Stage 2: characteristics whose value matches the query. This pulls in free-text items that are not
    // associated with an ontology term. It is done at this point so the query logic can be applied to the matches.
    if (found.size() < SearchServiceImpl.MAX_CHARACTERISTIC_SEARCH_RESULTS) {
        // The value lookup is given the query without any '*' characters.
        String valueQuery = query.replaceAll("\\*", "");
        Collection<Characteristic> byValue = characteristicService.findByValue(classes, valueQuery);
        if (byValue != null && !byValue.isEmpty()) {
            found.addAll(byValue);
            if (timer.getTime() > 1000) {
                SearchServiceImpl.log.info("Found " + byValue.size() + " characteristics matching value '" + query + "' in " + timer.getTime() + "ms");
            }
            timer.reset();
            timer.start();
        }
    }

    // Stage 3: ontology terms matching the query, plus their child terms.
    if (found.size() < SearchServiceImpl.MAX_CHARACTERISTIC_SEARCH_RESULTS) {
        Collection<OntologyTerm> directTerms = ontologyService.findTerms(query);
        if (timer.getTime() > 1000) {
            SearchServiceImpl.log.info("Found " + directTerms.size() + " ontology classes matching '" + query + "' in " + timer.getTime() + "ms");
        }

        if (!directTerms.isEmpty()) {
            for (OntologyTerm directTerm : directTerms) {
                // Each term here matched the query directly; terms without a URI are skipped.
                if (StringUtils.isBlank(directTerm.getUri())) {
                    continue;
                }

                int countBefore = found.size();
                this.getCharacteristicsAnnotatedToChildren(classes, directTerm, found);
                int added = found.size() - countBefore;
                if (SearchServiceImpl.log.isDebugEnabled() && added > 0) {
                    SearchServiceImpl.log.debug(added + " characteristics matching children term of " + directTerm);
                }

                if (found.size() >= SearchServiceImpl.MAX_CHARACTERISTIC_SEARCH_RESULTS) {
                    break;
                }
            }
            if (timer.getTime() > 1000) {
                SearchServiceImpl.log.info("Found " + found.size() + " characteristics for '" + query + "' including child terms in " + timer.getTime() + "ms");
            }
            timer.reset();
            timer.start();
        }
    }

    // Finally, retrieve the objects that own the collected characteristics, timed on its own.
    timer.reset();
    timer.start();
    Collection<SearchResult> owners = this.getAnnotatedEntities(classes, found);
    if (timer.getTime() > 1000) {
        SearchServiceImpl.log.info("Retrieved " + owners.size() + " entities via characteristics for '" + query + "' in " + timer.getTime() + "ms");
    }

    if (SearchServiceImpl.log.isDebugEnabled()) {
        SearchServiceImpl.log.debug("End search for " + query);
    }
    return owners;
}
Also used : Characteristic(ubic.gemma.model.common.description.Characteristic) VocabCharacteristic(ubic.gemma.model.common.description.VocabCharacteristic) OntologyIndividual(ubic.basecode.ontology.model.OntologyIndividual) OntologyTerm(ubic.basecode.ontology.model.OntologyTerm) StopWatch(org.apache.commons.lang3.time.StopWatch)

Example 22 with StringUtils.isBlank

use of org.apache.commons.lang3.StringUtils.isBlank in project Gemma by PavlidisLab.

the class SearchServiceImpl method databaseGeneSearch.

/**
 * Searches the database for genes matching the query held in {@code settings}, consulting the gene, gene product
 * and biosequence services.
 * <p>
 * The lookup runs in stages and stops at the first stage that finds something:
 * <ol>
 * <li>NCBI id (only when the query parses as an integer), otherwise accession;</li>
 * <li>official symbol, with or without a trailing {@code %} wildcard depending on the query length and on whether
 * the query ended with {@code *};</li>
 * <li>alias, gene product name and NCBI id, biosequence accession and name, and Ensembl id.</li>
 * </ol>
 *
 * @param settings supplies the query and the use-database flag; it is also handed to {@code filterByTaxon}
 * @return the genes found, as search results that have been passed through {@code filterByTaxon}; an empty set when
 *         the database search is disabled or the (unescaped) query is blank
 */
private Collection<SearchResult> databaseGeneSearch(SearchSettings settings) {
    if (!settings.getUseDatabase()) {
        return new HashSet<>();
    }
    StopWatch watch = this.startTiming();
    String searchString = StringEscapeUtils.unescapeJava(settings.getQuery());
    if (StringUtils.isBlank(searchString)) {
        return new HashSet<>();
    }
    Collection<SearchResult> results = new HashSet<>();

    /*
     * First search by identifier: NCBI id if the query is numeric, otherwise accession. If we find it, stop.
     */
    Gene result = null;
    try {
        result = geneService.findByNCBIId(Integer.parseInt(searchString));
    } catch (NumberFormatException ignored) {
        // The query is not an integer, so it cannot be an NCBI id; fall through to the accession lookup.
    }
    if (result == null) {
        result = geneService.findByAccession(searchString, null);
    }
    if (result != null) {
        results.add(this.dbHitToSearchResult(result));
    }
    if (!results.isEmpty()) {
        this.filterByTaxon(settings, results, true);
        watch.stop();
        if (watch.getTime() > 1000) {
            SearchServiceImpl.log.info("Gene DB search for " + searchString + " took " + watch.getTime() + " ms and found " + results.size() + " genes");
        }
        return results;
    }

    // Replace a '*' at the end of the query with '%' for an inexact symbol search. The result only carries a
    // wildcard if the user asked for one.
    String inexactString = searchString.replaceAll("\\*$", "%");
    // The same query with every '%' removed, used wherever no wildcard is wanted.
    String exactString = inexactString.replaceAll("%", "");

    Collection<Gene> geneSet = new HashSet<>();
    if (searchString.length() <= 2) {
        // case 0: very short query (1-2 characters). No wildcard is passed, because wildcards on very short
        // queries give too many results.
        geneSet.addAll(geneService.findByOfficialSymbolInexact(exactString));
    } else if (inexactString.endsWith("%")) {
        // case 1: the user explicitly asked for a wildcard. Allowed on strings of length 3 or more.
        geneSet.addAll(geneService.findByOfficialSymbolInexact(inexactString));
    } else if (searchString.length() > 3) {
        // case 2: 4 or more characters and no wildcard requested; append one anyway, which gives better
        // behavior in 'live search' situations. (No wildcard can be present here: case 1 handled that.)
        geneSet.addAll(geneService.findByOfficialSymbolInexact(inexactString + "%"));
    } else {
        // case 3: exactly 3 characters and no wildcard requested; look up the symbol as given.
        geneSet.addAll(geneService.findByOfficialSymbol(exactString));
    }

    /*
     * If we found a match using the official symbol, don't bother with the remaining lookups.
     */
    if (geneSet.isEmpty()) {
        geneSet.addAll(geneService.findByAlias(exactString));
        geneSet.addAll(geneProductService.getGenesByName(exactString));
        geneSet.addAll(geneProductService.getGenesByNcbiId(exactString));
        geneSet.addAll(bioSequenceService.getGenesByAccession(exactString));
        geneSet.addAll(bioSequenceService.getGenesByName(exactString));
        // This finder returns a single gene rather than a collection. Guard against a null result (the other
        // single-gene finders used above are null-checked too) so that null never ends up in the set.
        Gene ensemblMatch = geneService.findByEnsemblId(exactString);
        if (ensemblMatch != null) {
            geneSet.add(ensemblMatch);
        }
    }

    watch.stop();
    if (watch.getTime() > 1000) {
        SearchServiceImpl.log.info("Gene DB search for " + searchString + " took " + watch.getTime() + " ms and found " + geneSet.size() + " genes");
    }
    results = this.dbHitsToSearchResult(geneSet, null);
    this.filterByTaxon(settings, results, true);
    return results;
}
Also used : Pattern(java.util.regex.Pattern) Gene(ubic.gemma.model.genome.Gene) Matcher(java.util.regex.Matcher) StopWatch(org.apache.commons.lang3.time.StopWatch)

Example 23 with StringUtils.isBlank

use of org.apache.commons.lang3.StringUtils.isBlank in project Gemma by PavlidisLab.

the class SearchServiceImpl method performSearch.

/**
 * Executes the query from {@code settings} through the given Compass session and converts the hits into search
 * results. Runs inside a Compass transaction.
 * <p>
 * Nothing is searched when the trimmed query is blank, shorter than
 * {@code MINIMUM_STRING_LENGTH_FOR_FREE_TEXT_SEARCH}, or just {@code "*"}.
 *
 * @param settings supplies the query text and the highlighting flag (it is cast to {@code SearchSettingsImpl})
 * @param session  the Compass session used to build and run the query
 * @return the converted hits, or an empty list when the query is rejected
 */
private Collection<SearchResult> performSearch(SearchSettings settings, CompassSession session) {
    StopWatch timer = this.startTiming();
    String queryText = settings.getQuery().trim();

    // noinspection ConstantConditions // Not obvious to me why that would have to be false.
    if (StringUtils.isBlank(queryText)
            || queryText.length() < SearchServiceImpl.MINIMUM_STRING_LENGTH_FOR_FREE_TEXT_SEARCH
            || queryText.equals("*")) {
        return new ArrayList<>();
    }

    CompassQuery parsed = session.queryBuilder().queryString(queryText).toQuery();
    SearchServiceImpl.log.debug("Parsed query: " + parsed);
    CompassHits hits = parsed.hits();

    // Highlighting needs the root mappings, which are only reachable through the internal session type
    // (expected to always be the case; there should be only one root mapping).
    if (((SearchSettingsImpl) settings).getDoHighlighting() && session instanceof InternalCompassSession) {
        ResourceMapping[] roots = ((InternalCompassSession) session).getMapping().getRootMappings();
        this.process(roots, hits);
    }

    timer.stop();
    long elapsedMs = timer.getTime();
    if (elapsedMs > 100) {
        SearchServiceImpl.log.info("Getting " + hits.getLength() + " lucene hits for " + queryText + " took " + elapsedMs + " ms");
    }
    if (elapsedMs > 5000) {
        SearchServiceImpl.log.info("*****Extremely long Lucene Index Search!  " + hits.getLength() + " lucene hits for " + queryText + " took " + elapsedMs + " ms");
    }

    return this.getSearchResults(hits);
}
Also used : SearchSettingsImpl(ubic.gemma.model.common.search.SearchSettingsImpl) InternalCompassSession(org.compass.core.spi.InternalCompassSession) CompassMapping(org.compass.core.mapping.CompassMapping) ResourceMapping(org.compass.core.mapping.ResourceMapping) StopWatch(org.apache.commons.lang3.time.StopWatch)

Example 24 with StringUtils.isBlank

use of org.apache.commons.lang3.StringUtils.isBlank in project Gemma by PavlidisLab.

the class GeneOntologyServiceImpl method findTerm.

/**
 * Searches every index in {@code indices} for the query and resolves the matches to ontology terms.
 * <p>
 * The query is trimmed and each run of whitespace is replaced by {@code " AND "} before searching. Matches with a
 * blank URI are skipped, and matches for which no term can be resolved are skipped with a warning. Either phase is
 * reported at INFO level when it takes more than 100 ms.
 *
 * @param queryString the text to search for
 * @return the resolved terms; an empty set when the service is not ready
 */
@Override
public Collection<OntologyTerm> findTerm(String queryString) {
    if (!this.isReady()) {
        return new HashSet<>();
    }

    if (GeneOntologyServiceImpl.log.isDebugEnabled()) {
        GeneOntologyServiceImpl.log.debug("Searching Gene Ontology for '" + queryString + "'");
    }

    // Make sure we are all-inclusive: join the words of the query with AND.
    String conjunctiveQuery = queryString.trim().replaceAll("\\s+", " AND ");

    StopWatch stopwatch = new StopWatch();
    stopwatch.start();

    Collection<OntologyResource> indexHits = new HashSet<>();
    for (SearchIndex searchIndex : this.indices) {
        indexHits.addAll(OntologySearch.matchIndividuals(model, searchIndex, conjunctiveQuery));
    }
    if (stopwatch.getTime() > 100) {
        GeneOntologyServiceImpl.log.info("Find " + indexHits.size() + " raw go terms from " + conjunctiveQuery + ": " + stopwatch.getTime() + " ms");
    }

    stopwatch.reset();
    stopwatch.start();

    // Swap each raw hit for the term registered under its URI; required to make sure the descriptions are
    // filled in.
    Collection<OntologyTerm> resolved = new HashSet<>();
    for (OntologyResource hit : indexHits) {
        String uri = hit.getUri();
        if (StringUtils.isBlank(uri)) {
            continue;
        }

        OntologyTerm term = GeneOntologyServiceImpl.getTermForURI(uri);
        if (term != null) {
            resolved.add(term);
        } else {
            GeneOntologyServiceImpl.log.warn("No term for : " + hit);
        }
    }
    if (stopwatch.getTime() > 100) {
        GeneOntologyServiceImpl.log.info("Convert " + indexHits.size() + " raw go terms to terms: " + stopwatch.getTime() + " ms");
    }

    return resolved;
}
Also used : SearchIndex(ubic.basecode.ontology.search.SearchIndex) OntologyTerm(ubic.basecode.ontology.model.OntologyTerm) OntologyResource(ubic.basecode.ontology.model.OntologyResource) StopWatch(org.apache.commons.lang3.time.StopWatch)

Example 25 with StringUtils.isBlank

use of org.apache.commons.lang3.StringUtils.isBlank in project Gemma by PavlidisLab.

the class GeoFamilyParser method doParse.

/**
 * Resets the per-parse state, then feeds every non-blank line from the reader to {@code parseLine} and finishes
 * with {@code tidyUp}.
 * <p>
 * Every 20000 parsed lines the current thread's interrupt status is checked; if it is set, the reader is closed and
 * parsing is abandoned with a {@link java.util.concurrent.CancellationException}, which (like any other exception
 * raised while reading or parsing) is logged and rethrown wrapped in a {@link RuntimeException}.
 *
 * @param dis the reader to consume; must not be null
 * @throws RuntimeException if {@code dis} is null, or wrapping whatever went wrong during reading or parsing
 */
private void doParse(BufferedReader dis) {
    if (dis == null) {
        throw new RuntimeException("Null reader");
    }

    // Start from a clean slate: warning counters, header flags and "already warned" flags.
    this.numWarnings = 0;
    haveReadPlatformHeader = false;
    haveReadSampleDataHeader = false;
    alreadyWarnedAboutClobbering = false;
    alreadyWarnedAboutInconsistentColumnOrder = false;
    alreadyWarnedAboutDuplicateColumnName = false;
    String line;
    parsedLines = 0;
    processedDesignElements.clear();

    StopWatch timer = new StopWatch();
    timer.start();
    try {
        while ((line = dis.readLine()) != null) {
            if (StringUtils.isBlank(line)) {
                continue;
            }
            this.parseLine(line);
            // Only look at the interrupt flag once every 20000 lines.
            if (++parsedLines % 20000 == 0 && Thread.currentThread().isInterrupted()) {
                // clean up
                dis.close();
                throw new java.util.concurrent.CancellationException("Thread was terminated during parsing. " + this.getClass());
            }
        }
        this.tidyUp();
    } catch (Exception e) {
        GeoFamilyParser.log.error("Parsing failed (Cancelled?) :" + e.getMessage());
        /*
         * This happens if there was a cancellation. The original exception is kept as the cause.
         */
        throw new RuntimeException(e);
    }
    timer.stop();
    if (timer.getTime() > 10000) {
        // 10 s. Use %f rather than %g: with precision 2, %g keeps only two significant digits and switches to
        // scientific notation from 100 upwards, which is unreadable for a duration in seconds.
        GeoFamilyParser.log.info("Parsed total of " + parsedLines + " lines in " + String.format("%.2fs", timer.getTime() / 1000.0));
    }
    GeoFamilyParser.log.debug(this.platformLines + " platform  lines");
    // NOTE(review): the next two counters are locals that are never incremented, so these messages always
    // report 0.
    int seriesDataLines = 0;
    GeoFamilyParser.log.debug(seriesDataLines + " series data lines");
    int dataSetDataLines = 0;
    GeoFamilyParser.log.debug(dataSetDataLines + " data set data lines");
    GeoFamilyParser.log.debug(this.sampleDataLines + " sample data lines");
}
Also used : java.util(java.util) InvocationTargetException(java.lang.reflect.InvocationTargetException) StopWatch(org.apache.commons.lang3.time.StopWatch)

Aggregations

StringUtils (org.apache.commons.lang3.StringUtils)54 List (java.util.List)33 Collectors (java.util.stream.Collectors)29 Map (java.util.Map)28 Set (java.util.Set)27 ArrayList (java.util.ArrayList)23 Optional (java.util.Optional)22 Collections (java.util.Collections)19 Logger (org.slf4j.Logger)19 LoggerFactory (org.slf4j.LoggerFactory)19 IOException (java.io.IOException)18 HashSet (java.util.HashSet)18 Collection (java.util.Collection)16 HashMap (java.util.HashMap)16 StopWatch (org.apache.commons.lang3.time.StopWatch)13 Autowired (org.springframework.beans.factory.annotation.Autowired)11 Slf4j (lombok.extern.slf4j.Slf4j)10 InputStream (java.io.InputStream)9 Inject (javax.inject.Inject)8 RegisteredTemplate (com.thinkbiganalytics.feedmgr.rest.model.RegisteredTemplate)7