Example use of org.apache.solr.client.solrj.response.QueryResponse in project qi4j-sdk by Qi4j.
From the class SolrEntityQueryMixin, method findEntities.
@Override
public Iterable<EntityReference> findEntities(Class<?> resultType, @Optional Specification<Composite> whereClause, @Optional OrderBy[] orderBySegments, @Optional Integer firstResult, @Optional Integer maxResults, Map<String, Object> variables) throws EntityFinderException {
try {
QuerySpecification expr = (QuerySpecification) whereClause;
SolrServer server = solr.solrServer();
NamedList<Object> list = new NamedList<Object>();
list.add("q", expr.query());
list.add("rows", maxResults != 0 ? maxResults : 10000);
list.add("start", firstResult);
if (orderBySegments != null && orderBySegments.length > 0) {
for (OrderBy orderBySegment : orderBySegments) {
String propName = ((Member) orderBySegment.property().accessor()).getName() + "_for_sort";
String order = orderBySegment.order() == OrderBy.Order.ASCENDING ? "asc" : "desc";
list.add("sort", propName + " " + order);
}
}
SolrParams solrParams = SolrParams.toSolrParams(list);
logger.debug("Search:" + list.toString());
QueryResponse query = server.query(solrParams);
SolrDocumentList results = query.getResults();
List<EntityReference> references = new ArrayList<EntityReference>(results.size());
for (SolrDocument result : results) {
references.add(EntityReference.parseEntityReference(result.getFirstValue("id").toString()));
}
return references;
} catch (SolrServerException e) {
throw new EntityFinderException(e);
}
}
Example use of org.apache.solr.client.solrj.response.QueryResponse in project Xponents by OpenSextant.
From the class TaxonMatcher, method extractorImpl.
/**
* Implementation details -- use with or without the formal ID/buffer
* pairing.
*
* @param id
* doc id
* @param buf
* input text
* @return list of matches
* @throws ExtractionException
*/
private List<TextMatch> extractorImpl(String id, String buf) throws ExtractionException {
List<TextMatch> matches = new ArrayList<TextMatch>();
String docid = (id != null ? id : NO_DOC_ID);
Map<Integer, Object> beanMap = new HashMap<Integer, Object>(100);
QueryResponse response = tagTextCallSolrTagger(buf, docid, beanMap);
@SuppressWarnings("unchecked") List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");
log.debug("TAGS SIZE = {}", tags.size());
/*
* Retrieve all offsets into a long list.
*/
TaxonMatch m = null;
// int x1 = -1, x2 = -1;
int tag_count = 0;
String id_prefix = docid + "#";
for (NamedList<?> tag : tags) {
m = new TaxonMatch();
m.start = ((Integer) tag.get("startOffset")).intValue();
// +1 char after
m.end = ((Integer) tag.get("endOffset")).intValue();
// last matched
// m.pattern_id = "taxtag";
++tag_count;
m.match_id = id_prefix + tag_count;
// m.setText((String) tag.get("matchText")); // Not reliable.
// matchText can be null.
m.setText(buf.substring(m.start, m.end));
if (TextUtils.countFormattingSpace(m.getText()) > 1) {
// Phrase with a single TAB is okay
continue;
}
@SuppressWarnings("unchecked") List<Integer> taxonIDs = (List<Integer>) tag.get("ids");
for (Integer solrId : taxonIDs) {
Object refData = beanMap.get(solrId);
if (refData == null) {
continue;
}
/*
* Filter out non-Acronyms. e.g., 'who' is not a match for 'WHO'
*/
Taxon tx = (Taxon) refData;
if (this.filterNonAcronyms) {
if (tx.isAcronym && !m.isUpper()) {
continue;
}
}
m.addTaxon(tx);
}
//
if (m.hasTaxons()) {
matches.add(m);
}
}
log.debug("FOUND LABELS count={}", matches.size());
return matches;
}
Example use of org.apache.solr.client.solrj.response.QueryResponse in project Xponents by OpenSextant.
From the class SolrProxy, method searchGazetteer.
/**
* Search an OpenSextant solr gazetteer.
*
* @param index solr server handle
* @param qparams search parameters
* @return list of places
* @throws SolrServerException on err
*/
public static List<Place> searchGazetteer(SolrServer index, SolrParams qparams) throws SolrServerException {
QueryResponse response = index.query(qparams, SolrRequest.METHOD.GET);
List<Place> places = new ArrayList<>();
SolrDocumentList docList = response.getResults();
for (SolrDocument solrDoc : docList) {
places.add(SolrProxy.createPlace(solrDoc));
}
return places;
}
Example use of org.apache.solr.client.solrj.response.QueryResponse in project Xponents by OpenSextant.
From the class GazetteerMatcher, method searchAdvanced.
/**
* This is a variation on SolrGazetteer.search(), just this creates ScoredPlace which is
* immediately usable with scoring and ranking matches. The score for a ScoredPlace is
* created when added to PlaceCandidate: a default score is created for the place.
*
* <pre>
* Usage:
* pc = PlaceCandidate();
* list = gaz.searchAdvanced("name:Boston", true) // solr fielded query used as-is.
* for ScoredPlace p: list:
* pc.addPlace( p )
* </pre>
*
* @param place
* the place string or text; or a Solr query
* @param as_solr
* the as_solr
* @param maxLen
* max length of gazetteer place names.
* @return places List of scoreable place entries
* @throws SolrServerException
* the solr server exception
*/
public List<ScoredPlace> searchAdvanced(String place, boolean as_solr, int maxLen) throws SolrServerException {
if (as_solr) {
params.set("q", place);
} else {
// Bare keyword query needs to be quoted as "word word word"
params.set("q", "\"" + place + "\"");
}
QueryResponse response = solr.getInternalSolrServer().query(params, SolrRequest.METHOD.GET);
List<ScoredPlace> places = new ArrayList<>();
for (SolrDocument solrDoc : response.getResults()) {
/*
* Length Filter. Alternative: store name as string in solr, vice full text field
*/
if (maxLen > 0) {
String nm = SolrProxy.getString(solrDoc, "name");
if (nm.length() > maxLen) {
continue;
}
}
places.add(createPlace(solrDoc));
}
return places;
}
Example use of org.apache.solr.client.solrj.response.QueryResponse in project Xponents by OpenSextant.
From the class GazetteerMatcher, method tagText.
/**
* Geotag a document, returning PlaceCandidates for the mentions in
* document. Optionally just return the PlaceCandidates with name only and
* no Place objects attached. Names of contients are passed back as matches,
* with geo matches. Continents are filtered out by default.
*
* @param buffer
* text
* @param docid
* identity of the text
* @param tagOnly
* True if you wish to get the matched phrases only. False if you
* want the full list of Place Candidates.
* @param fld
* gazetteer field to use for tagging
* @param langid
* ISO lang ID
* @return place_candidates List of place candidates
* @throws ExtractionException
* on err
*/
public List<PlaceCandidate> tagText(String buffer, String docid, boolean tagOnly, String fld, String langid) throws ExtractionException {
// "tagsCount":10, "tags":[{ "ids":[35], "endOffset":40,
// "startOffset":38},
// { "ids":[750308, 2769912, 2770041, 10413973, 10417546],
// "endOffset":49,
// "startOffset":41},
// ...
// "matchingDocs":{"numFound":75, "start":0, "docs":[ {
// "place_id":"USGS1992921", "name":"Monterrey", "cc":"PR"}, {
// "place_id":"USGS1991763", "name":"Monterrey", "cc":"PR"}, ]
// Reset counts.
this.defaultFilterCount = 0;
this.userFilterCount = 0;
// during post-processing tags we may have to distinguish between tagging/tokenizing
// general vs. cjk vs. ar. But not yet though.
// boolean useGeneralMode = DEFAULT_TAG_FIELD.equals(fld);
long t0 = System.currentTimeMillis();
log.debug("TEXT SIZE = {}", buffer.length());
int[] textMetrics = TextUtils.measureCase(buffer);
boolean isUpperCase = TextUtils.isUpperCaseDocument(textMetrics);
boolean isLowerCase = TextUtils.isLowerCaseDocument(textMetrics);
params.set("field", fld);
Map<Integer, Object> beanMap = new HashMap<Integer, Object>(100);
QueryResponse response = tagTextCallSolrTagger(buffer, docid, beanMap);
@SuppressWarnings("unchecked") List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");
this.tagNamesTime = response.getQTime();
long t1 = t0 + tagNamesTime;
long t2 = System.currentTimeMillis();
boolean geocode = !tagOnly;
/*
* Retrieve all offsets into a long list. These offsets will report a
* text span and all the gazetteer record IDs that are associated to
* that span. The text could either be a name, a code or some other
* abbreviation.
*
* For practical reasons the default behavior is to filter trivial spans
* given the gazetteer data that is returned for them.
*
* WARNING: lots of optimizations occur here due to the potentially
* large volume of tags and gazetteer data that is involved. And this is
* relatively early in the pipline.
*/
log.debug("DOC={} TAGS SIZE={}", docid, tags.size());
TreeMap<Integer, PlaceCandidate> candidates = new TreeMap<Integer, PlaceCandidate>();
// names matched is used only for debugging, currently.
Set<String> namesMatched = new HashSet<>();
tagLoop: for (NamedList<?> tag : tags) {
int x1 = (Integer) tag.get("startOffset");
int x2 = (Integer) tag.get("endOffset");
int len = x2 - x1;
if (len == 1) {
// Ignoring place names whose length is less than 2 chars
++this.defaultFilterCount;
continue;
}
// +1 char after last matched
// Could have enabled the "matchText" option from the tagger to get
// this, but since we already have the content as a String then
// we might as well not make the tagger do any more work.
String matchText = (String) tag.get("matchText");
// Get char immediately following match, for light NLP rules.
char postChar = 0;
char preChar = 0;
if (x2 < buffer.length()) {
postChar = buffer.charAt(x2);
}
if (x1 > 0) {
preChar = buffer.charAt(x1 - 1);
if (assessApostrophe(preChar, matchText)) {
++this.defaultFilterCount;
continue;
}
}
// be allowed. If lowercase abbreviations are allowed, then all matches are passed.
if (len < 3) {
if (!allowLowercaseAbbrev) {
if (TextUtils.isASCII(matchText) && !StringUtils.isAllUpperCase(matchText)) {
++this.defaultFilterCount;
continue;
}
}
}
if (TextUtils.countFormattingSpace(matchText) > 1) {
// Phrases with words broken across more than one line are not
// valid matches.
// Phrase with a single TAB is okay
++this.defaultFilterCount;
continue;
}
// Eliminate any newlines and extra whitespace in match
matchText = TextUtils.squeeze_whitespace(matchText);
/**
* Filter out trivial tags. Due to normalization, we tend to get
* lots of false positives that can be eliminated early. This is
* testing matches against the most general set of stop words.
*/
if (filter.filterOut(matchText)) {
++this.defaultFilterCount;
continue;
}
PlaceCandidate pc = new PlaceCandidate();
pc.start = x1;
pc.end = x2;
pc.setText(matchText);
/*
* Filter out tags that user determined ahead of time as not-places
* for their context.
*
*/
if (userfilter != null) {
if (userfilter.filterOut(pc.getTextnorm())) {
log.debug("User Filter:{}", matchText);
++this.userFilterCount;
continue;
}
}
/*
* Continent filter is needed, as many mentions of contients confuse
* real geotagging/geocoding.
*
*/
if (continents.filterOut(pc.getTextnorm())) {
pc.isContinent = true;
pc.setFilteredOut(true);
candidates.put(pc.start, pc);
continue;
}
/**
* Further testing is done if lang ID is provided AND if we have a stop list
* for that language. Otherwise, short terms are filtered out if they appear in any lang stop list.
* NOTE: internally TagFilter here checks only languages other than English, Spanish and Vietnamese.
*/
if (filter.filterOut(pc, langid, isUpperCase, isLowerCase)) {
++this.defaultFilterCount;
log.debug("STOPWORD {} {}", langid, pc.getText());
continue;
}
/*
* Found UPPER CASE text in a mixed-cased document.
* Conservatively, this is likely an acronym or some heading.
* But possibly still a valid place name.
* HEURISTIC: acronyms are relatively short.
* HEURISTIC: region codes can be acronyms and are valid places
*
* using such place candidates you may score short acronym matches lower than fully named ones.
* when inferring boundaries (states, provinces, etc)
*/
if (!isUpperCase && pc.isUpper() && len < 5) {
pc.isAcronym = true;
}
pc.hasDiacritics = TextUtils.hasDiacritics(pc.getText());
pc.setSurroundingTokens(buffer);
@SuppressWarnings("unchecked") List<Integer> placeRecordIds = (List<Integer>) tag.get("ids");
/*
* This assertion is helpful in debugging: assert
* placeRecordIds.size() == new
* HashSet<Integer>(placeRecordIds).size() : "ids should be unique";
*/
// assert!placeRecordIds.isEmpty();
namesMatched.clear();
//double maxNameBias = 0.0;
for (Integer solrId : placeRecordIds) {
log.debug("{} = {}", pc.getText(), beanMap.get(solrId));
// Yes, we must cast here.
// As long as createTag generates the correct type stored in
// beanMap we are fine.
ScoredPlace pGeo = (ScoredPlace) beanMap.get(solrId);
//
if (!allowLowercaseAbbrev && pGeo.isAbbreviation() && pc.isLower()) {
log.debug("Ignore lower case term={}", pc.getText());
// loop and not tagLoop?
continue tagLoop;
}
/*
* If text match contains "." and it matches any abbreviation,
* mark the candidate as an abbrev. TODO: Possibly best confirm
* this by sentence detection, as well. However, this pertains
* to text spans that contain "." within the bounds, and not
* likely an ending. E.g., "U.S." or "U.S" are trivial examples;
* "US" is more ambiguous, as we need to know if document is
* upperCase.
*
* Any place abbreviation will trigger isAbbreviation = true
*
* "IF YOU FIND US HERE" the term 'US' is ambiguous here, so
* it is not classified as an abbreviation. Otherwise if you have
* "My organization YAK happens to coincide with a place named Yak.
* But we first must determine if 'YAK' is a valid abbreviation for an actual place.
* HEURISTIC: place abbreviations are relatively short, e.g. one word(len=7 or less)
*/
if (len < 8 && !pc.isAbbreviation) {
assessAbbreviation(pc, pGeo, postChar, isUpperCase);
}
if (log.isDebugEnabled()) {
namesMatched.add(pGeo.getName());
}
/**
* Country names are the only names you can reasonably set ahead
* of time. All other names need to be assessed in context.
* Negate country names, e.g., "Georgia", by exception.
*/
if (pGeo.isCountry()) {
pc.isCountry = true;
}
if (geocode) {
pGeo.defaultHierarchicalPath();
// Default score for geo will be calculated in PlaceCandidate
pc.addPlace(pGeo);
}
}
// to filtering)
if (geocode && !pc.hasPlaces()) {
log.debug("Place has no places={}", pc.getText());
continue;
} else {
if (log.isDebugEnabled()) {
log.debug("Text {} matched {}", pc.getText(), namesMatched);
}
}
candidates.put(pc.start, pc);
}
// for tag
long t3 = System.currentTimeMillis();
// this.tagNamesTime = (int)(t1 - t0);
this.getNamesTime = (int) (t2 - t1);
this.totalTime = (int) (t3 - t0);
if (log.isDebugEnabled()) {
summarizeExtraction(candidates.values(), docid);
}
this.filteredTotal += this.defaultFilterCount + this.userFilterCount;
this.matchedTotal += candidates.size();
return new ArrayList<PlaceCandidate>(candidates.values());
}
Aggregations