Use of org.apache.solr.client.solrj.response.QueryResponse in project ORCID-Source by ORCID.
The class FundingSubtypeSolrDaoImpl, method getFundingTypes.
@Override
public List<OrgDefinedFundingTypeSolrDocument> getFundingTypes(String searchTerm, int firstResult, int maxResult) {
    SolrQuery query = new SolrQuery();
    query.setQuery("{!edismax qf='org-defined-funding-type^50.0 text^1.0' pf='org-defined-funding-type^50.0' mm=1 sort='score desc'}" + searchTerm + "*").setFields("*");
    try {
        QueryResponse queryResponse = solrServerReadOnly.query(query);
        return queryResponse.getBeans(OrgDefinedFundingTypeSolrDocument.class);
    } catch (SolrServerException se) {
        String errorMessage = MessageFormat.format("Error when attempting to search for orgs, with search term {0}", new Object[] { searchTerm });
        throw new NonTransientDataAccessResourceException(errorMessage, se);
    }
}
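The call to queryResponse.getBeans(...) only works if the target class is annotated for SolrJ's document-to-bean binding. The actual ORCID bean is not part of this listing; below is a minimal hedged sketch of what it might look like, with the Solr field name assumed from the edismax query above.

import org.apache.solr.client.solrj.beans.Field;

// Hedged sketch only: the real ORCID bean is not shown in this listing.
// getBeans(...) maps each result document onto such a bean via SolrJ's @Field
// annotations; the field name below is an assumption taken from the query above.
public class OrgDefinedFundingTypeSolrDocument {

    @Field("org-defined-funding-type")
    private String orgDefinedFundingType;

    public String getOrgDefinedFundingType() {
        return orgDefinedFundingType;
    }

    public void setOrgDefinedFundingType(String orgDefinedFundingType) {
        this.orgDefinedFundingType = orgDefinedFundingType;
    }
}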
Use of org.apache.solr.client.solrj.response.QueryResponse in project ORCID-Source by ORCID.
The class SolrDaoImpl, method findByOrcidAsReader.
@Override
public Reader findByOrcidAsReader(String orcid) {
    SolrQuery query = new SolrQuery();
    query.setQuery(ORCID + ":\"" + orcid + "\"").setFields(SCORE, ORCID, PUBLIC_PROFILE);
    query.add("wt", "orcidProfile");
    try {
        QueryResponse queryResponse = solrServerForStreaming.query(query);
        InputStream inputStream = (InputStream) queryResponse.getResponse().get("stream");
        return new InputStreamReader(inputStream, "UTF-8");
    } catch (SolrServerException | SolrException e) {
        String errorMessage = MessageFormat.format("Error when attempting to retrieve stream for orcid {0}", new Object[] { orcid });
        throw new NonTransientDataAccessResourceException(errorMessage, e);
    } catch (UnsupportedEncodingException e) {
        throw new RuntimeException(e);
    }
}
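The returned Reader wraps a raw stream pulled out of the Solr response (the custom "orcidProfile" response writer), so the caller owns the stream and should close it. A minimal hedged sketch of a caller, assuming a SolrDao interface implied by the SolrDaoImpl name; the ORCID iD value is illustrative only.

import java.io.BufferedReader;
import java.io.IOException;

// Hedged sketch: SolrDao is the assumed interface behind SolrDaoImpl shown above.
public String readPublicProfile(SolrDao solrDao, String orcid) throws IOException {
    StringBuilder profile = new StringBuilder();
    // try-with-resources closes the underlying stream returned by findByOrcidAsReader.
    try (BufferedReader reader = new BufferedReader(solrDao.findByOrcidAsReader(orcid))) {
        String line;
        while ((line = reader.readLine()) != null) {
            profile.append(line).append('\n');
        }
    }
    return profile.toString();
}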
Use of org.apache.solr.client.solrj.response.QueryResponse in project ORCID-Source by ORCID.
The class SolrDaoImpl, method retrieveLastModified.
@Override
public Date retrieveLastModified(String orcid) {
    SolrQuery query = new SolrQuery();
    query.setQuery(ORCID + ":\"" + orcid + "\"");
    query.setFields(PROFILE_LAST_MODIFIED_DATE);
    try {
        QueryResponse response = solrServer.query(query);
        List<SolrDocument> results = response.getResults();
        if (results.isEmpty()) {
            return null;
        } else {
            return (Date) results.get(0).getFieldValue(PROFILE_LAST_MODIFIED_DATE);
        }
    } catch (SolrServerException e) {
        throw new NonTransientDataAccessResourceException("Error retrieving last modified date from SOLR Server", e);
    }
}
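Callers must expect null when the ORCID iD has no document in the index. A minimal hedged usage sketch; the solrDao reference and the sample iD are illustrative assumptions, not values from the ORCID codebase.

// Hedged sketch: 'solrDao' is an assumed reference to the DAO above; the iD is a sample value.
Date lastModified = solrDao.retrieveLastModified("0000-0002-1825-0097");
if (lastModified == null) {
    // No Solr document exists for this ORCID iD.
} else {
    // Compare against the time of the last indexing run to decide whether the record is stale.
}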
Use of org.apache.solr.client.solrj.response.QueryResponse in project Xponents by OpenSextant.
The class GazetteerMatcher, method searchAdvanced.
/**
 * This is a variation on SolrGazetteer.search(), except that it creates ScoredPlace
 * objects, which are immediately usable for scoring and ranking matches. The score for
 * a ScoredPlace is created when it is added to a PlaceCandidate: a default score is
 * assigned to the place.
 *
 * <pre>
 * Usage:
 * pc = PlaceCandidate();
 * list = gaz.searchAdvanced("name:Boston", true) // Solr fielded query used as-is.
 * for ScoredPlace p: list:
 *     pc.addPlace( p )
 * </pre>
 *
 * @param place
 *            the place string or text, or a Solr query
 * @param as_solr
 *            true to use the place argument as a fielded Solr query as-is;
 *            false to treat it as a keyword phrase, which will be quoted
 * @param maxLen
 *            maximum length of gazetteer place names; 0 or less disables the length filter
 * @return places List of scoreable place entries
 * @throws SolrServerException
 *             on Solr server error
 */
public List<ScoredPlace> searchAdvanced(String place, boolean as_solr, int maxLen) throws SolrServerException {
    if (as_solr) {
        params.set("q", place);
    } else {
        // Bare keyword query needs to be quoted as "word word word"
        params.set("q", "\"" + place + "\"");
    }
    QueryResponse response = solr.getInternalSolrServer().query(params, SolrRequest.METHOD.GET);
    List<ScoredPlace> places = new ArrayList<>();
    for (SolrDocument solrDoc : response.getResults()) {
        /*
         * Length Filter. Alternative: store name as string in solr, vice full text field
         */
        if (maxLen > 0) {
            String nm = SolrProxy.getString(solrDoc, "name");
            if (nm.length() > maxLen) {
                continue;
            }
        }
        places.add(createPlace(solrDoc));
    }
    return places;
}
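The <pre> block in the Javadoc above is pseudocode. Written as plain Java against the signature shown here, it might look like the sketch below; 'gaz' is an assumed GazetteerMatcher instance, and maxLen=0 simply disables the length filter.

// Sketch of the Javadoc usage example as compilable Java; 'gaz' is an assumed instance.
PlaceCandidate pc = new PlaceCandidate();
List<ScoredPlace> list = gaz.searchAdvanced("name:Boston", true, 0); // fielded Solr query used as-is
for (ScoredPlace p : list) {
    pc.addPlace(p); // a default score is assigned to the place here
}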
Use of org.apache.solr.client.solrj.response.QueryResponse in project Xponents by OpenSextant.
The class GazetteerMatcher, method tagText.
/**
 * Geotag a document, returning PlaceCandidates for the mentions in the
 * document. Optionally just return the PlaceCandidates with the matched name
 * only and no Place objects attached. Continent names are passed back as
 * matches along with their geo data, but they are marked as filtered out by
 * default.
 *
 * @param buffer
 *            text
 * @param docid
 *            identity of the text
 * @param tagOnly
 *            true if you wish to get the matched phrases only; false if you
 *            want the full list of place candidates
 * @param fld
 *            gazetteer field to use for tagging
 * @param langid
 *            ISO language ID
 * @return place_candidates List of place candidates
 * @throws ExtractionException
 *             on error
 */
public List<PlaceCandidate> tagText(String buffer, String docid, boolean tagOnly, String fld, String langid) throws ExtractionException {
    // Example tagger response:
    // "tagsCount":10, "tags":[{ "ids":[35], "endOffset":40, "startOffset":38},
    //   { "ids":[750308, 2769912, 2770041, 10413973, 10417546], "endOffset":49, "startOffset":41},
    //   ...
    // "matchingDocs":{"numFound":75, "start":0, "docs":[ {
    //   "place_id":"USGS1992921", "name":"Monterrey", "cc":"PR"}, {
    //   "place_id":"USGS1991763", "name":"Monterrey", "cc":"PR"}, ]
    // Reset counts.
    this.defaultFilterCount = 0;
    this.userFilterCount = 0;
    // During post-processing of tags we may have to distinguish between tagging/tokenizing
    // general vs. CJK vs. Arabic text. But not yet.
    // boolean useGeneralMode = DEFAULT_TAG_FIELD.equals(fld);
    long t0 = System.currentTimeMillis();
    log.debug("TEXT SIZE = {}", buffer.length());
    int[] textMetrics = TextUtils.measureCase(buffer);
    boolean isUpperCase = TextUtils.isUpperCaseDocument(textMetrics);
    boolean isLowerCase = TextUtils.isLowerCaseDocument(textMetrics);
    params.set("field", fld);
    Map<Integer, Object> beanMap = new HashMap<Integer, Object>(100);
    QueryResponse response = tagTextCallSolrTagger(buffer, docid, beanMap);
    @SuppressWarnings("unchecked")
    List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");
    this.tagNamesTime = response.getQTime();
    long t1 = t0 + tagNamesTime;
    long t2 = System.currentTimeMillis();
    boolean geocode = !tagOnly;
    /*
     * Retrieve all offsets into a long list. These offsets will report a
     * text span and all the gazetteer record IDs that are associated to
     * that span. The text could either be a name, a code or some other
     * abbreviation.
     *
     * For practical reasons the default behavior is to filter trivial spans
     * given the gazetteer data that is returned for them.
     *
     * WARNING: lots of optimizations occur here due to the potentially
     * large volume of tags and gazetteer data that is involved. And this is
     * relatively early in the pipeline.
     */
    log.debug("DOC={} TAGS SIZE={}", docid, tags.size());
    TreeMap<Integer, PlaceCandidate> candidates = new TreeMap<Integer, PlaceCandidate>();
    // namesMatched is used only for debugging, currently.
    Set<String> namesMatched = new HashSet<>();
    tagLoop: for (NamedList<?> tag : tags) {
        int x1 = (Integer) tag.get("startOffset");
        int x2 = (Integer) tag.get("endOffset");
        int len = x2 - x1;
        if (len == 1) {
            // Ignore place names whose length is less than 2 chars.
            ++this.defaultFilterCount;
            continue;
        }
        // +1 char after last matched.
        // Could have enabled the "matchText" option from the tagger to get
        // this, but since we already have the content as a String then
        // we might as well not make the tagger do any more work.
        String matchText = (String) tag.get("matchText");
        // Get char immediately following match, for light NLP rules.
        char postChar = 0;
        char preChar = 0;
        if (x2 < buffer.length()) {
            postChar = buffer.charAt(x2);
        }
        if (x1 > 0) {
            preChar = buffer.charAt(x1 - 1);
            if (assessApostrophe(preChar, matchText)) {
                ++this.defaultFilterCount;
                continue;
            }
        }
        // Unless lowercase abbreviations are allowed, short ASCII matches must be all
        // upper case to be allowed. If lowercase abbreviations are allowed, then all matches are passed.
        if (len < 3) {
            if (!allowLowercaseAbbrev) {
                if (TextUtils.isASCII(matchText) && !StringUtils.isAllUpperCase(matchText)) {
                    ++this.defaultFilterCount;
                    continue;
                }
            }
        }
        if (TextUtils.countFormattingSpace(matchText) > 1) {
            // Phrases with words broken across more than one line are not
            // valid matches. A phrase with a single TAB is okay.
            ++this.defaultFilterCount;
            continue;
        }
        // Eliminate any newlines and extra whitespace in match.
        matchText = TextUtils.squeeze_whitespace(matchText);
        /*
         * Filter out trivial tags. Due to normalization, we tend to get
         * lots of false positives that can be eliminated early. This is
         * testing matches against the most general set of stop words.
         */
        if (filter.filterOut(matchText)) {
            ++this.defaultFilterCount;
            continue;
        }
        PlaceCandidate pc = new PlaceCandidate();
        pc.start = x1;
        pc.end = x2;
        pc.setText(matchText);
        /*
         * Filter out tags that the user determined ahead of time are not places
         * for their context.
         */
        if (userfilter != null) {
            if (userfilter.filterOut(pc.getTextnorm())) {
                log.debug("User Filter:{}", matchText);
                ++this.userFilterCount;
                continue;
            }
        }
        /*
         * Continent filter is needed, as many mentions of continents confuse
         * real geotagging/geocoding.
         */
        if (continents.filterOut(pc.getTextnorm())) {
            pc.isContinent = true;
            pc.setFilteredOut(true);
            candidates.put(pc.start, pc);
            continue;
        }
        /*
         * Further testing is done if a lang ID is provided AND if we have a stop list
         * for that language. Otherwise, short terms are filtered out if they appear in any
         * language's stop list.
         * NOTE: internally, TagFilter here checks only languages other than English, Spanish and Vietnamese.
         */
        if (filter.filterOut(pc, langid, isUpperCase, isLowerCase)) {
            ++this.defaultFilterCount;
            log.debug("STOPWORD {} {}", langid, pc.getText());
            continue;
        }
        /*
         * Found UPPER CASE text in a mixed-case document.
         * Conservatively, this is likely an acronym or some heading,
         * but possibly still a valid place name.
         * HEURISTIC: acronyms are relatively short.
         * HEURISTIC: region codes can be acronyms and are valid places.
         *
         * Using such place candidates, you may score short acronym matches lower than
         * fully named ones when inferring boundaries (states, provinces, etc.).
         */
        if (!isUpperCase && pc.isUpper() && len < 5) {
            pc.isAcronym = true;
        }
        pc.hasDiacritics = TextUtils.hasDiacritics(pc.getText());
        pc.setSurroundingTokens(buffer);
        @SuppressWarnings("unchecked")
        List<Integer> placeRecordIds = (List<Integer>) tag.get("ids");
        /*
         * This assertion is helpful in debugging: assert
         * placeRecordIds.size() == new
         * HashSet<Integer>(placeRecordIds).size() : "ids should be unique";
         */
        // assert !placeRecordIds.isEmpty();
        namesMatched.clear();
        // double maxNameBias = 0.0;
        for (Integer solrId : placeRecordIds) {
            log.debug("{} = {}", pc.getText(), beanMap.get(solrId));
            // Yes, we must cast here.
            // As long as createTag generates the correct type stored in
            // beanMap we are fine.
            ScoredPlace pGeo = (ScoredPlace) beanMap.get(solrId);
            // Note: a lowercase match on an abbreviation skips the entire tag
            // (continue tagLoop), not just this gazetteer entry. Should it be
            // the inner loop instead of tagLoop?
            if (!allowLowercaseAbbrev && pGeo.isAbbreviation() && pc.isLower()) {
                log.debug("Ignore lower case term={}", pc.getText());
                continue tagLoop;
            }
            /*
             * If the text match contains "." and it matches any abbreviation,
             * mark the candidate as an abbreviation. TODO: possibly best to confirm
             * this by sentence detection as well. However, this pertains
             * to text spans that contain "." within the bounds, and not
             * likely an ending. E.g., "U.S." or "U.S" are trivial examples;
             * "US" is more ambiguous, as we need to know if the document is
             * upper case.
             *
             * Any place abbreviation will trigger isAbbreviation = true.
             *
             * In "IF YOU FIND US HERE" the term 'US' is ambiguous, so
             * it is not classified as an abbreviation. Otherwise you may have
             * "My organization YAK", which happens to coincide with a place named Yak,
             * but we first must determine if 'YAK' is a valid abbreviation for an actual place.
             * HEURISTIC: place abbreviations are relatively short, e.g. one word (len=7 or less).
             */
            if (len < 8 && !pc.isAbbreviation) {
                assessAbbreviation(pc, pGeo, postChar, isUpperCase);
            }
            if (log.isDebugEnabled()) {
                namesMatched.add(pGeo.getName());
            }
            /*
             * Country names are the only names you can reasonably set ahead
             * of time. All other names need to be assessed in context.
             * Negate country names, e.g., "Georgia", by exception.
             */
            if (pGeo.isCountry()) {
                pc.isCountry = true;
            }
            if (geocode) {
                pGeo.defaultHierarchicalPath();
                // Default score for geo will be calculated in PlaceCandidate.
                pc.addPlace(pGeo);
            }
        }
        // If geocoding, drop candidates that ended up with no places attached (e.g., due to filtering).
        if (geocode && !pc.hasPlaces()) {
            log.debug("Place has no places={}", pc.getText());
            continue;
        } else {
            if (log.isDebugEnabled()) {
                log.debug("Text {} matched {}", pc.getText(), namesMatched);
            }
        }
        candidates.put(pc.start, pc);
    } // for tag
    long t3 = System.currentTimeMillis();
    // this.tagNamesTime = (int) (t1 - t0);
    this.getNamesTime = (int) (t2 - t1);
    this.totalTime = (int) (t3 - t0);
    if (log.isDebugEnabled()) {
        summarizeExtraction(candidates.values(), docid);
    }
    this.filteredTotal += this.defaultFilterCount + this.userFilterCount;
    this.matchedTotal += candidates.size();
    return new ArrayList<PlaceCandidate>(candidates.values());
}
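Typical usage ties the two GazetteerMatcher methods in this listing together: tag free text, then inspect or score the returned candidates. A minimal hedged sketch follows; the matcher instance, the gazetteer field name, and the isFilteredOut() accessor (assumed to pair with the setFilteredOut(...) call seen above) are assumptions, and the text and docid values are illustrative.

// Sketch only: 'matcher' is an assumed GazetteerMatcher instance; the field and
// language values mirror the tagText(...) parameters shown above.
public void demo(GazetteerMatcher matcher) throws ExtractionException {
    List<PlaceCandidate> candidates = matcher.tagText(
            "Flights from Boston to Monterrey were delayed.", // buffer: text to geotag
            "doc-001",   // docid: identity of the text
            false,       // tagOnly=false: attach gazetteer Place objects
            "name_tag",  // fld: assumed gazetteer tag field name
            "en");       // langid: ISO language ID
    for (PlaceCandidate pc : candidates) {
        if (pc.isFilteredOut()) {  // assumed accessor pairing with setFilteredOut(...)
            continue;              // e.g. continent names come back marked as filtered out
        }
        System.out.println(pc.getText() + " @ " + pc.start + "-" + pc.end);
    }
}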