Use of org.apache.commons.lang3.StringUtils.isBlank in project Gemma by PavlidisLab.
Class SearchServiceImpl, method characteristicSearchTerm.
/**
 * Searches characteristics for a query and returns the entities annotated with them. The query need not be
 * a single word; it may be a phrase such as "parkinson's disease".
 * <p>
 * Characteristics are collected from three sources in order: ontology individuals matching the query (looked
 * up by URI), characteristics whose value matches the query with asterisks removed, and ontology terms
 * matching the query together with their children. The second and third sources are only consulted while
 * fewer than {@code MAX_CHARACTERISTIC_SEARCH_RESULTS} characteristics have been collected.
 *
 * @param classes passed through to the characteristic lookups and to the annotated-entity lookup
 * @param query   the search text
 * @return the search results produced by {@code getAnnotatedEntities} for the collected characteristics
 */
private Collection<SearchResult> characteristicSearchTerm(Collection<Class<?>> classes, String query) {
    if (SearchServiceImpl.log.isDebugEnabled()) {
        SearchServiceImpl.log.debug("Starting search for " + query);
    }

    StopWatch watch = this.startTiming();
    Collection<Characteristic> hits = new HashSet<>();

    // Source 1: ontology individuals matching the query, looked up by URI ten at a time.
    Collection<OntologyIndividual> individuals = ontologyService.findIndividuals(query);
    for (Collection<OntologyIndividual> batch : BatchIterator.batches(individuals, 10)) {
        Collection<String> batchUris = new HashSet<>();
        for (OntologyIndividual member : batch) {
            batchUris.add(member.getUri());
        }

        Collection<SearchResult> batchResults = this
                .dbHitsToSearchResult(characteristicService.findByUri(classes, batchUris), null);
        for (SearchResult batchResult : batchResults) {
            hits.add((Characteristic) batchResult.getResultObject());
        }

        if (hits.size() >= SearchServiceImpl.MAX_CHARACTERISTIC_SEARCH_RESULTS) {
            break;
        }
    }

    if (!individuals.isEmpty() && watch.getTime() > 1000) {
        SearchServiceImpl.log.info("Found " + individuals.size() + " individuals matching '" + query + "' in "
                + watch.getTime() + "ms");
    }

    // Source 2: characteristics whose value matches the query; this pulls in free-text items that are not
    // associated with ontology terms. Asterisks are stripped from the query before the value lookup.
    if (hits.size() < SearchServiceImpl.MAX_CHARACTERISTIC_SEARCH_RESULTS) {
        String valueQuery = query.replace("*", "");
        Collection<Characteristic> byValue = characteristicService.findByValue(classes, valueQuery);
        boolean foundByValue = byValue != null && !byValue.isEmpty();
        if (foundByValue) {
            hits.addAll(byValue);
            if (watch.getTime() > 1000) {
                SearchServiceImpl.log.info("Found " + byValue.size() + " characteristics matching value '"
                        + query + "' in " + watch.getTime() + "ms");
            }
            watch.reset();
            watch.start();
        }
    }

    // Source 3: ontology terms matching the query, plus characteristics annotated to their children.
    if (hits.size() < SearchServiceImpl.MAX_CHARACTERISTIC_SEARCH_RESULTS) {
        Collection<OntologyTerm> directTerms = ontologyService.findTerms(query);
        if (watch.getTime() > 1000) {
            SearchServiceImpl.log.info("Found " + directTerms.size() + " ontology classes matching '" + query
                    + "' in " + watch.getTime() + "ms");
        }

        if (!directTerms.isEmpty()) {
            for (OntologyTerm directTerm : directTerms) {
                // Each term here matched the query directly; terms without a URI are skipped.
                if (StringUtils.isBlank(directTerm.getUri())) {
                    continue;
                }

                int countBefore = hits.size();
                this.getCharacteristicsAnnotatedToChildren(classes, directTerm, hits);
                int added = hits.size() - countBefore;
                if (added > 0 && SearchServiceImpl.log.isDebugEnabled()) {
                    SearchServiceImpl.log
                            .debug(added + " characteristics matching children term of " + directTerm);
                }

                if (hits.size() >= SearchServiceImpl.MAX_CHARACTERISTIC_SEARCH_RESULTS) {
                    break;
                }
            }

            if (watch.getTime() > 1000) {
                SearchServiceImpl.log.info("Found " + hits.size() + " characteristics for '" + query
                        + "' including child terms in " + watch.getTime() + "ms");
            }
            watch.reset();
            watch.start();
        }
    }

    // Finally, resolve the collected characteristics to the objects that own them.
    watch.reset();
    watch.start();
    Collection<SearchResult> owners = this.getAnnotatedEntities(classes, hits);
    if (watch.getTime() > 1000) {
        SearchServiceImpl.log.info("Retrieved " + owners.size() + " entities via characteristics for '" + query
                + "' in " + watch.getTime() + "ms");
    }

    if (SearchServiceImpl.log.isDebugEnabled()) {
        SearchServiceImpl.log.debug("End search for " + query);
    }
    return owners;
}
Use of org.apache.commons.lang3.StringUtils.isBlank in project Gemma by PavlidisLab.
Class SearchServiceImpl, method databaseGeneSearch.
/**
 * Searches the database for genes matching the query held in the given settings, consulting the gene,
 * gene product and biosequence services.
 * <p>
 * The lookup proceeds in stages and stops at the first stage that yields anything:
 * <ol>
 * <li>NCBI id (only if the query parses as an integer), then accession;</li>
 * <li>official symbol, with or without a trailing wildcard depending on the query length (see the inline
 * comments);</li>
 * <li>alias, gene product name / NCBI id, biosequence accession / name, and Ensembl id.</li>
 * </ol>
 * The results are passed through {@code filterByTaxon} before being returned.
 *
 * @param settings the search settings; supplies the query and the use-database flag
 * @return the matching genes wrapped as search results; an empty set if database searching is disabled or the
 *         query is blank
 */
private Collection<SearchResult> databaseGeneSearch(SearchSettings settings) {
    if (!settings.getUseDatabase()) {
        return new HashSet<>();
    }

    StopWatch watch = this.startTiming();
    String searchString = StringEscapeUtils.unescapeJava(settings.getQuery());
    if (StringUtils.isBlank(searchString)) {
        return new HashSet<>();
    }

    Collection<SearchResult> results = new HashSet<>();

    /*
     * First search by NCBI id and accession. If we find it, stop.
     */
    Gene result = null;
    try {
        result = geneService.findByNCBIId(Integer.parseInt(searchString));
    } catch (NumberFormatException ignored) {
        // The query is not an integer, so it cannot be an NCBI id; fall through to the accession lookup.
    }
    if (result == null) {
        result = geneService.findByAccession(searchString, null);
    }
    if (result != null) {
        results.add(this.dbHitToSearchResult(result));
    }

    if (!results.isEmpty()) {
        this.filterByTaxon(settings, results, true);
        watch.stop();
        if (watch.getTime() > 1000) {
            SearchServiceImpl.log.info("Gene DB search for " + searchString + " took " + watch.getTime()
                    + " ms and found " + results.size() + " genes");
        }
        return results;
    }

    // Replace a '*' at the end of the query with '%' for the inexact symbol search. At this point the
    // inexact string only carries a wildcard if the user asked for one.
    Pattern trailingStar = Pattern.compile("\\*$");
    Matcher match = trailingStar.matcher(searchString);
    String inexactString = match.replaceAll("%");

    // The same query with every '%' removed.
    String exactString = inexactString.replaceAll("%", "");

    // Wildcards on very short queries would return too many results; on longer queries a wildcard is always
    // used, which gives better behavior in 'live search' situations.
    Collection<Gene> geneSet = new HashSet<>();
    if (searchString.length() <= 2) {
        // case 0: very short query (one or two characters, counting any trailing '*'). No wildcard is used:
        // the inexact finder is called with the wildcard-free string.
        geneSet.addAll(geneService.findByOfficialSymbolInexact(exactString));
    } else if (inexactString.endsWith("%")) {
        // case 1: the user explicitly asked for a wildcard. This is allowed on queries of length 3 or more.
        geneSet.addAll(geneService.findByOfficialSymbolInexact(inexactString));
    } else if (searchString.length() > 3) {
        // case 2: the user did not ask for a wildcard, but the query is longer than 3 characters, so one is
        // appended anyway. (The string cannot already end in '%' here; case 1 would have handled it.)
        inexactString = inexactString + "%";
        geneSet.addAll(geneService.findByOfficialSymbolInexact(inexactString));
    } else {
        // case 3: only reached for queries of exactly 3 characters without a wildcard; look up the symbol
        // as given.
        geneSet.addAll(geneService.findByOfficialSymbol(exactString));
    }

    /*
     * If we found a match using official symbol, don't bother with the remaining lookups.
     */
    if (geneSet.isEmpty()) {
        geneSet.addAll(geneService.findByAlias(exactString));
        geneSet.addAll(geneProductService.getGenesByName(exactString));
        geneSet.addAll(geneProductService.getGenesByNcbiId(exactString));
        geneSet.addAll(bioSequenceService.getGenesByAccession(exactString));
        geneSet.addAll(bioSequenceService.getGenesByName(exactString));

        // findByEnsemblId yields a single gene rather than a collection; guard against a miss so that a null
        // element is never counted as a hit or handed on for conversion.
        Gene ensemblGene = geneService.findByEnsemblId(exactString);
        if (ensemblGene != null) {
            geneSet.add(ensemblGene);
        }
    }

    watch.stop();
    if (watch.getTime() > 1000) {
        SearchServiceImpl.log.info("Gene DB search for " + searchString + " took " + watch.getTime()
                + " ms and found " + geneSet.size() + " genes");
    }

    results = this.dbHitsToSearchResult(geneSet, null);
    this.filterByTaxon(settings, results, true);
    return results;
}
Use of org.apache.commons.lang3.StringUtils.isBlank in project Gemma by PavlidisLab.
Class SearchServiceImpl, method performSearch.
/**
 * Runs the query from the given settings against the Compass index and converts the hits to search results.
 * Runs inside a Compass transaction.
 * <p>
 * No search is performed, and an empty list is returned, when the trimmed query is blank (including a null
 * query), shorter than {@code MINIMUM_STRING_LENGTH_FOR_FREE_TEXT_SEARCH}, or consists solely of "*".
 *
 * @param settings the search settings; supplies the query and the highlighting flag
 * @param session  the Compass session used to build and run the query
 * @return the search results built from the index hits, or an empty list if the query was rejected
 */
private Collection<SearchResult> performSearch(SearchSettings settings, CompassSession session) {
    StopWatch watch = this.startTiming();

    // Null-safe trim: a missing query is treated like an empty one instead of raising a
    // NullPointerException.
    String enhancedQuery = StringUtils.trimToEmpty(settings.getQuery());
    if (StringUtils.isBlank(enhancedQuery)
            || enhancedQuery.length() < SearchServiceImpl.MINIMUM_STRING_LENGTH_FOR_FREE_TEXT_SEARCH
            || enhancedQuery.equals("*")) {
        return new ArrayList<>();
    }

    CompassQuery compassQuery = session.queryBuilder().queryString(enhancedQuery).toQuery();
    if (SearchServiceImpl.log.isDebugEnabled()) {
        SearchServiceImpl.log.debug("Parsed query: " + compassQuery);
    }
    CompassHits hits = compassQuery.hits();

    // Highlighting needs the root mappings, which are only reachable through the internal session type.
    if (((SearchSettingsImpl) settings).getDoHighlighting() && session instanceof InternalCompassSession) {
        CompassMapping mapping = ((InternalCompassSession) session).getMapping();
        ResourceMapping[] rootMappings = mapping.getRootMappings();
        // should only be one rootMapping.
        this.process(rootMappings, hits);
    }

    watch.stop();
    if (watch.getTime() > 100) {
        SearchServiceImpl.log.info("Getting " + hits.getLength() + " lucene hits for " + enhancedQuery
                + " took " + watch.getTime() + " ms");
    }
    if (watch.getTime() > 5000) {
        SearchServiceImpl.log.info("*****Extremely long Lucene Index Search! " + hits.getLength()
                + " lucene hits for " + enhancedQuery + " took " + watch.getTime() + " ms");
    }

    return this.getSearchResults(hits);
}
Use of org.apache.commons.lang3.StringUtils.isBlank in project Gemma by PavlidisLab.
Class GeneOntologyServiceImpl, method findTerm.
/**
 * Finds Gene Ontology terms matching a query. The query is trimmed and its whitespace-separated words are
 * joined with " AND " before every search index is queried; each raw match that has a non-blank URI is then
 * resolved to a term through {@code getTermForURI}.
 *
 * @param queryString the text to search for
 * @return the resolved terms; an empty set if the service is not ready
 */
@Override
public Collection<OntologyTerm> findTerm(String queryString) {
    if (!this.isReady()) {
        return new HashSet<>();
    }

    if (GeneOntologyServiceImpl.log.isDebugEnabled()) {
        GeneOntologyServiceImpl.log.debug("Searching Gene Ontology for '" + queryString + "'");
    }

    // Require every word of the query to match.
    String conjunctiveQuery = queryString.trim().replaceAll("\\s+", " AND ");

    StopWatch timer = new StopWatch();
    timer.start();

    Collection<OntologyResource> rawMatches = new HashSet<>();
    for (SearchIndex searchIndex : this.indices) {
        Collection<? extends OntologyResource> found = OntologySearch
                .matchIndividuals(model, searchIndex, conjunctiveQuery);
        rawMatches.addAll(found);
    }

    if (timer.getTime() > 100) {
        GeneOntologyServiceImpl.log.info("Find " + rawMatches.size() + " raw go terms from " + conjunctiveQuery
                + ": " + timer.getTime() + " ms");
    }

    timer.reset();
    timer.start();

    // Resolve each raw match to its term; per the original author this is required to make sure the
    // descriptions are filled in.
    Collection<OntologyTerm> resolved = new HashSet<>();
    for (OntologyResource raw : rawMatches) {
        String uri = raw.getUri();
        if (StringUtils.isBlank(uri)) {
            continue;
        }

        OntologyTerm term = GeneOntologyServiceImpl.getTermForURI(uri);
        if (term != null) {
            resolved.add(term);
        } else {
            GeneOntologyServiceImpl.log.warn("No term for : " + raw);
        }
    }

    if (timer.getTime() > 100) {
        GeneOntologyServiceImpl.log
                .info("Convert " + rawMatches.size() + " raw go terms to terms: " + timer.getTime() + " ms");
    }

    return resolved;
}
Use of org.apache.commons.lang3.StringUtils.isBlank in project Gemma by PavlidisLab.
Class GeoFamilyParser, method doParse.
/**
 * Reads the given reader line by line, resets the parser's warning flags and counters, and hands every
 * non-blank line to {@code parseLine}; {@code tidyUp} is called once the input is exhausted.
 * <p>
 * After every 20000 parsed lines the current thread's interrupt flag is checked; if it is set, the reader is
 * closed and parsing is aborted.
 *
 * @param dis the reader to parse from; must not be null
 * @throws RuntimeException if {@code dis} is null, or wrapping any exception raised while reading or parsing
 *                          (including the {@link java.util.concurrent.CancellationException} raised on
 *                          interruption)
 */
private void doParse(BufferedReader dis) {
    if (dis == null) {
        throw new RuntimeException("Null reader");
    }

    // Reset per-run state.
    this.numWarnings = 0;
    haveReadPlatformHeader = false;
    haveReadSampleDataHeader = false;
    alreadyWarnedAboutClobbering = false;
    alreadyWarnedAboutInconsistentColumnOrder = false;
    alreadyWarnedAboutDuplicateColumnName = false;
    parsedLines = 0;
    processedDesignElements.clear();

    StopWatch timer = new StopWatch();
    timer.start();
    try {
        String line;
        while ((line = dis.readLine()) != null) {
            if (StringUtils.isBlank(line)) {
                continue;
            }
            this.parseLine(line);

            // Only look at the interrupt flag every 20000 parsed lines.
            if (++parsedLines % 20000 == 0 && Thread.currentThread().isInterrupted()) {
                // clean up
                dis.close();
                throw new java.util.concurrent.CancellationException(
                        "Thread was terminated during parsing. " + this.getClass());
            }
        }
        this.tidyUp();
    } catch (Exception e) {
        GeoFamilyParser.log.error("Parsing failed (Cancelled?) :" + e.getMessage());
        /*
         * This happens if there was a cancellation.
         */
        throw new RuntimeException(e);
    }

    timer.stop();
    if (timer.getTime() > 10000) {
        // 10 s. Fixed-point seconds: "%.2g" would print only two significant digits and switch to
        // scientific notation (e.g. "1.2e+02s") from 100 seconds upwards.
        GeoFamilyParser.log.info("Parsed total of " + parsedLines + " lines in "
                + String.format("%.2fs", timer.getTime() / 1000.0));
    }

    GeoFamilyParser.log.debug(this.platformLines + " platform lines");
    // NOTE(review): the next two counters are locals that are never incremented, so these messages always
    // report 0 - confirm whether they should be tracked or the messages dropped.
    int seriesDataLines = 0;
    GeoFamilyParser.log.debug(seriesDataLines + " series data lines");
    int dataSetDataLines = 0;
    GeoFamilyParser.log.debug(dataSetDataLines + " data set data lines");
    GeoFamilyParser.log.debug(this.sampleDataLines + " sample data lines");
}
Aggregations