use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.
the class ContextualOrganizationRule method evaluate.
/**
 * Re-admits place candidates that were filtered out because their name appeared inside an
 * organization name, when the surrounding document gives geographic support for them.
 *
 * <p>Pass 1: for every candidate carrying {@code PersonNameFilter.NAME_IN_ORG_RULE} that is
 * currently filtered out, look for one of its candidate locations whose hierarchical path is a
 * key of {@code boundaryObserver.placeMentionCount()}. On the first such location the candidate
 * is un-filtered, tagged with the rule {@code "ContextualOrg"}, and its normalized text is
 * recorded in {@code reEval}.
 *
 * <p>Pass 2: any candidate that is still filtered out but whose normalized text was recorded in
 * {@code reEval} is un-filtered as well and tagged {@code "ContextualOrg.Relation"}.
 *
 * <p>Does nothing when {@code isRelevant()} returns false.
 *
 * @param names place candidates to examine; entries are modified in place
 */
@Override
public void evaluate(List<PlaceCandidate> names) {
    if (!isRelevant()) {
        return;
    }

    for (PlaceCandidate name : names) {
        if (!name.hasRule(PersonNameFilter.NAME_IN_ORG_RULE)) {
            continue;
        }
        log.debug(" City Name in Org Name? {}", name);
        // Only candidates that were actually filtered out need rescuing.
        if (!name.isFilteredOut()) {
            continue;
        }

        // Is any location for this name mentioned elsewhere in the document?
        for (Place geo : name.getPlaces()) {
            if (boundaryObserver.placeMentionCount().containsKey(geo.getHierarchicalPath())) {
                name.setFilteredOut(false);
                name.addRule("ContextualOrg");
                reEval.add(name.getTextnorm());
                // One supporting location is enough. Stopping here avoids re-adding the same
                // rule and text for every further matching location (the original trailing
                // 'continue' had no effect).
                break;
            }
        }
    }

    /*
     * Re-evaluate items that may have been filtered because the name appeared in an organization
     * name where the org name was not necessarily geographically relevant until now.
     */
    for (PlaceCandidate name : names) {
        if (name.isFilteredOut() && reEval.contains(name.getTextnorm())) {
            name.setFilteredOut(false);
            name.addRule("ContextualOrg.Relation");
        }
    }
}
use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.
the class Transforms method parseAnnotation.
/**
 * Builds a Xponents {@link TextMatch} from the JSON form of a single annotation.
 * JSON/REST representations carry far less information than the Java API objects produced by
 * the processing routines themselves, so the resulting match is only partially populated.
 *
 * <p>The {@code "type"} key selects the concrete match:
 * <ul>
 * <li>{@code "place"} and {@code "country"} yield a {@code PlaceCandidate} with a chosen
 * {@code Place};</li>
 * <li>{@code "coordinate"} yields a {@code GeocoordMatch};</li>
 * <li>{@code "person"}, {@code "org"} and {@code "taxon"} yield a {@code TaxonMatch}.</li>
 * </ul>
 * In every case the match type, start offset ({@code "offset"}) and end offset
 * ({@code "offset" + "length"}) are set from the annotation.
 *
 * @param data the annotation; expected to be a {@code JSONObject}
 * @return the parsed match, or {@code null} when {@code data} is not a {@code JSONObject}
 * @throws JSONException when the annotation type is not one of those listed above, or when
 *                       raised by the underlying JSON accessors
 */
public static TextMatch parseAnnotation(Object data) throws JSONException {
    if (!(data instanceof JSONObject)) {
        return null;
    }

    final JSONObject a = (JSONObject) data;
    final String typ = a.getString("type");
    final String text = a.getString("matchtext");

    final TextMatch m;
    switch (typ) {
        case "place": {
            PlaceCandidate placeMatch = new PlaceCandidate();
            Place geo = new Place();
            placeMatch.setText(text);
            Transforms.parseGeocoding(geo, a);
            placeMatch.setConfidence(a.optInt("confidence", -1));
            placeMatch.choose(geo);
            m = placeMatch;
            break;
        }
        case "coordinate": {
            GeocoordMatch coord = new GeocoordMatch();
            Place coordLoc = new Place();
            coord.setText(text);
            // The coordinate is read indirectly through a Place; there is no direct parser here.
            Transforms.parseGeocoding(coordLoc, a);
            coord.setLatLon(coordLoc);
            coord.setMethod(coordLoc.getMethod());
            /*
             * TODO: GeocoordMatch needs to support setters for Geocoding here.
             * Reverse-geocoding info (cc, adm1) is currently not carried over.
             */
            m = coord;
            break;
        }
        case "country": {
            PlaceCandidate countryMatch = new PlaceCandidate();
            Place cc = new Place();
            countryMatch.setText(text);
            cc.setName(text);
            countryMatch.setConfidence(a.optInt("confidence", -1));
            cc.setCountryCode(a.getString("cc"));
            countryMatch.isCountry = true;
            countryMatch.choose(cc);
            m = countryMatch;
            break;
        }
        case "person":
        case "org":
        case "taxon": {
            // The three taxon flavours differ only in the label handed to parseTaxon,
            // which is the annotation type itself.
            TaxonMatch taxonMatch = new TaxonMatch();
            Transforms.parseTaxon(taxonMatch, typ, a);
            m = taxonMatch;
            break;
        }
        default:
            throw new JSONException("Unknown Annotation " + typ);
    }

    m.setType(typ);
    m.start = a.getInt("offset");
    m.end = m.start + a.getInt("length");
    return m;
}
use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.
the class XponentsGeotagger method format.
/**
 * Renders tagger output as a JSON representation.
 *
 * <p>The resulting document has two members: {@code "response"}, holding {@code "status"} and
 * {@code "numfound"}, and {@code "annotations"}, an array with one node per emitted match.
 * {@code "numfound"} is the number of nodes actually placed in that array.
 *
 * <p>Per match:
 * <ul>
 * <li>{@code TaxonMatch}: emitted only when {@code jobParams.output_taxons} is set; a single
 * node is written using the first taxon, typed {@code "org"}, {@code "person"} or
 * {@code "taxon"} according to the taxon name prefix.</li>
 * <li>Filtered-out matches and matches that are neither {@code PlaceCandidate} nor
 * {@code GeocoordMatch} are skipped.</li>
 * <li>{@code GeocoordMatch}: emitted as type {@code "coordinate"}.</li>
 * <li>{@code PlaceCandidate}: emitted as {@code "country"} or {@code "place"}; skipped when no
 * location was chosen for it. Places with confidence of 10 or less are still emitted but
 * flagged {@code "filtered-out": true}.</li>
 * </ul>
 *
 * @param matches   all matches found in the text
 * @param jobParams request options; only {@code output_taxons} is consulted here
 * @return UTF-8 JSON representation of the annotations
 * @throws JSONException if building the JSON structure fails
 */
private Representation format(List<TextMatch> matches, RequestParameters jobParams) throws JSONException {
    int tagCount = 0;

    JSONObject resultContent = new JSONObject();
    JSONObject resultMeta = new JSONObject();
    resultMeta.put("status", "ok");
    resultMeta.put("numfound", 0);
    JSONArray resultArray = new JSONArray();

    /*
     * Super loop: Iterate through all found entities. Record Taxons as
     * person or orgs; record Geo tags as country, place, or geo. geo =
     * geocoded place or parsed coordinate (MGRS, DMS, etc)
     */
    for (TextMatch name : matches) {

        /*
         * ==========================
         * ANNOTATIONS: non-geographic entities that are filtered out, but worth tracking
         * ==========================
         */
        if (name instanceof TaxonMatch) {
            if (jobParams.output_taxons) {
                TaxonMatch match = (TaxonMatch) name;
                // Only the first taxon is reported for a match (note the break below).
                for (Taxon n : match.getTaxons()) {
                    JSONObject node = populateMatch(name);
                    String t = "taxon";
                    String taxon_name = n.name.toLowerCase();
                    if (taxon_name.startsWith("org.")) {
                        t = "org";
                    } else if (taxon_name.startsWith("person.")) {
                        t = "person";
                    }
                    node.put("type", t);
                    // Name of taxon
                    node.put("taxon", n.name);
                    // Name of catalog or source
                    node.put("catalog", n.catalog);
                    resultArray.put(node);
                    // Count only what is actually emitted, so "numfound" stays in step with
                    // the annotations array even when a match has no taxons.
                    ++tagCount;
                    break;
                }
            }
            continue;
        }

        // Ignore filtered-out tags and non-place tags. This single check covers every
        // remaining match kind, so no later filtered-out test is needed.
        if (name.isFilteredOut() || !(name instanceof PlaceCandidate || name instanceof GeocoordMatch)) {
            continue;
        }

        /*
         * ==========================
         * ANNOTATIONS: coordinates
         * ==========================
         */
        if (name instanceof GeocoordMatch) {
            JSONObject node = populateMatch(name);
            GeocoordMatch geo = (GeocoordMatch) name;
            node.put("type", "coordinate");
            Transforms.createGeocoding(geo, node);
            resultArray.put(node);
            ++tagCount;
            continue;
        }

        /*
         * ==========================
         * ANNOTATIONS: countries, places, etc.
         * ==========================
         */
        PlaceCandidate place = (PlaceCandidate) name;
        Place resolvedPlace = place.getChosen();
        if (resolvedPlace == null) {
            // A candidate may remain unresolved (e.g. ambiguous name) without being marked
            // filtered out; there is nothing to geocode, so skip it rather than fail.
            debug("No location chosen for " + name.getText());
            continue;
        }

        JSONObject node = populateMatch(name);
        if (place.isCountry) {
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "country");
            node.put("cc", resolvedPlace.getCountryCode());
            node.put("confidence", place.getConfidence());
        } else {
            Transforms.createGeocoding(resolvedPlace, node);
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "place");
            node.put("confidence", place.getConfidence());
            // NOTE(review): an earlier comment here mentioned a confidence threshold of 20;
            // the code flags confidence <= 10 - confirm which is intended.
            if (place.getConfidence() <= 10) {
                node.put("filtered-out", true);
            }
        }
        resultArray.put(node);
        ++tagCount;
    }

    resultMeta.put("numfound", tagCount);
    resultContent.put("response", resultMeta);
    resultContent.put("annotations", resultArray);

    Representation result = new JsonRepresentation(resultContent.toString(2));
    result.setCharacterSet(CharacterSet.UTF_8);
    return result;
}
use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.
the class LocationChooserRule method evaluate.
/**
 * Walks the entire candidate list and attempts to choose a location for each name that is
 * still unresolved.
 *
 * <p>Inputs gathered before the walk: the country mention counts from {@code countryObserver}
 * and the place mention counts from {@code boundaryObserver}, stored in {@code countryContext}
 * and {@code boundaryContext}. When debug logging is enabled, {@code debuggingHistograms} is
 * invoked on the list first.
 *
 * <p>Candidates that are filtered out, are countries, or already have a chosen place are left
 * untouched. For the rest, every candidate location is evaluated, a choice is requested, and
 * on success the confidence is assessed and the choice is recorded in
 * {@code documentResolvedLocations} under the candidate's normalized text. Candidates for which
 * no choice results are logged as ambiguous.
 *
 * @param names place candidates to resolve; entries are modified in place
 */
public void evaluate(List<PlaceCandidate> names) {
    // Document-level context: histograms of country and boundary mentions.
    countryContext = countryObserver.countryMentionCount();
    boundaryContext = boundaryObserver.placeMentionCount();

    /*
     * TODO: DEBUG through location chooser using histograms
     * of found and resolved place metadata.
     */
    if (log.isDebugEnabled()) {
        debuggingHistograms(names);
    }

    for (PlaceCandidate candidate : names) {
        // Nothing to do for filtered names, countries, or names that are already resolved.
        if (candidate.isFilteredOut() || candidate.isCountry || candidate.getChosen() != null) {
            continue;
        }

        for (Place location : candidate.getPlaces()) {
            evaluate(candidate, location);
        }
        candidate.choose();

        if (candidate.getChosen() == null) {
            log.info("Place name is ambiguous: {} in N={} places", candidate.getText(),
                    candidate.distinctLocationCount());
            continue;
        }

        this.assessConfidence(candidate);
        documentResolvedLocations.put(candidate.getTextnorm(), candidate.getChosen());
    }
}
use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.
the class NameCodeRule method evaluate.
/**
 * Looks for the pattern {@code NAME, CODE} (e.g. "Boston, Ma.") in adjacent place candidates and,
 * where the code resolves to an administrative place that contains or shares a country with
 * the name, adds evidence to the name and raises the score of its matching locations.
 *
 * <p>Requirement: the list of place candidates is a linked list; adjacent entries are treated
 * as neighbouring mentions. The list is therefore traversed once, sequentially, rather than
 * by index.
 *
 * @param names place candidates to examine; entries are modified in place
 */
@Override
public void evaluate(final List<PlaceCandidate> names) {
    PlaceCandidate previous = null;
    for (PlaceCandidate current : names) {
        if (previous != null) {
            evaluateNameCodePair(previous, current);
        }
        previous = current;
    }
}

/**
 * Applies the NAME, CODE rule to one pair of adjacent candidates.
 *
 * @param name the candidate that may be a place name
 * @param code the candidate immediately following {@code name}, which may be a code,
 *             abbreviation or name of an administrative place or country
 */
private void evaluateNameCodePair(final PlaceCandidate name, final PlaceCandidate code) {
    if (name.isFilteredOut() || code.isFilteredOut()) {
        return;
    }

    /*
     * COUNTRY, STATE is not supported under this rule.
     * E.g., Uruguay, Argentina ... This looks like a list of countries
     * However Uruguay is a district in Argentina; Just as Georgia is a state in US
     * and also a country name.
     */
    if (name.isCountry) {
        return;
    }

    /*
     * Test if SOMENAME, CODE is the case. a1.....a2.b1.., where b1 > a2
     * > a1, but distance is minimal from end of name to start of code.
     */
    if ((code.start - name.end) > MAX_CHAR_DIST) {
        return;
    }

    /*
     * Not supporting lowercase codes/abbreviations. 'la', 'is', 'un', etc.
     */
    if (code.isLower() && code.getText().length() < 4) {
        return;
    }

    // Proximity is one factor, but conventional format ("NAME, CODE") should weigh more.
    // Guard against an empty token array as well as a missing one.
    boolean comma = name.getPostmatchTokens() != null
            && name.getPostmatchTokens().length > 0
            && ",".equals(name.getPostmatchTokens()[0]);

    /*
     * by this point a place name tag should be marked as a name or
     * code/abbrev. Match the abbreviation with a geographic location
     * that is a state, county, district, etc.
     */
    Place country = code.isCountry ? code.getChosen() : null;
    log.debug("{} name, code: {} in {}?", NAME, name.getText(), code.getText());

    for (Place geo : code.getPlaces()) {
        // Provinces, states, districts, etc. Only.
        if (!geo.isAdministrative() || geo.getCountryCode() == null) {
            continue;
        }

        // Make sure you can match a province name or code with the gazetteer entries found:
        //   Boston, Ma. ==== for 'Ma', resolve to an abbreviation for Massachusetts
        //   Ignore places called 'Ma'
        //
        // Place ('Ma') will have gazetteer metadata indicating if this is a valid
        // abbreviated code for a place.
        // PlaceCandidate('Ma.') will have textual metadata from given text indicating if it
        // is a code, MA, or abbrev. 'Ma.'
        //
        // These two situations must match here. We ignore geo locations that do not fit
        // this profile.
        boolean lexicalMatch = ((code.isAbbreviation && geo.isAbbreviation())
                || (!code.isAbbreviation && !geo.isAbbreviation()));
        if (!lexicalMatch) {
            continue;
        }

        String adm1 = geo.getHierarchicalPath();
        if (adm1 == null && !code.isCountry) {
            log.debug("ADM1 hierarchical path should not be null");
            continue;
        }

        // Quick determination if these two places have a containment or geopolitical connection
        boolean contains = name.presentInHierarchy(adm1)
                || (country != null && name.presentInCountry(country.getCountryCode()));
        if (!contains) {
            continue;
        }

        /*
         * CITY, STATE
         * CITY, COUNTRY
         *
         * Associate the CODE to the NAME that precedes it.
         */
        PlaceEvidence ev = new PlaceEvidence();
        ev.setCountryCode(geo.getCountryCode());
        ev.setAdmin1(geo.getAdmin1());
        // Shunt. Evaluate this rule here.
        ev.setEvaluated(true);

        // A separating comma adds 2 to the base weight; an abbreviation/acronym matched
        // against an abbreviated gazetteer entry adds 1 more.
        int wt = weight + (comma ? 2 : 0);
        if (geo.isAbbreviation() && (code.isAbbreviation || code.isAcronym)) {
            ev.setRule(NAME_ADMCODE_RULE);
            ev.setWeight(wt + 1);
        } else {
            ev.setRule(NAME_ADMNAME_RULE);
            ev.setWeight(wt);
        }
        name.addEvidence(ev);

        if (boundaryObserver != null) {
            boundaryObserver.boundaryLevel1InScope(geo);
        }

        // Boost each location of the name that lies in the same boundary or the same country.
        for (Place nameGeo : name.getPlaces()) {
            if (!(nameGeo.isPopulated() || nameGeo.isAdministrative() || nameGeo.isSpot())) {
                continue;
            }
            boolean sameBoundary = adm1 != null && adm1.equals(nameGeo.getHierarchicalPath());
            if (sameBoundary || sameCountry(nameGeo, country)) {
                name.incrementPlaceScore(nameGeo, ev.getWeight());
            }
        }
    }
}
Aggregations