Use of org.opensextant.extractors.geo.PlaceEvidence in project Xponents by OpenSextant.
Class MajorPlaceRule, method evaluate.
/**
 * Attaches "major place" evidence to a candidate. The location is tested, in order, as a
 * national capital ({@code isNationalCapital()}), an ADM1 region ({@code isAdmin1()}), or a
 * populated place whose geohash cell has a recorded population above {@code POP_MIN}.
 * The first test that applies yields at most one piece of evidence; that evidence is
 * flagged as evaluated, added to the candidate, and one tenth of its weight is added to
 * the score of this location.
 *
 * @param name candidate that receives the evidence and the score increment
 * @param geo  gazetteer location being assessed for that candidate
 */
@Override
public void evaluate(final PlaceCandidate name, final Place geo) {
    PlaceEvidence evidence = null;

    if (geo.isNationalCapital()) {
        // When no country is mentioned outright, a capital is a good proxy for its country.
        inferCountry(geo);
        evidence = new PlaceEvidence(geo, CAPITAL, weight(weight + 2, geo));
    } else if (geo.isAdmin1()) {
        evidence = new PlaceEvidence(geo, ADMIN, weight(weight, geo));
        inferBoundary(geo);
    } else if (popStats != null && geo.isPopulated()) {
        evidence = evidenceFromPopulation(geo);
    }

    if (evidence == null) {
        // None of the major-place tests applied; leave the candidate untouched.
        return;
    }

    evidence.setEvaluated(true);
    name.addEvidence(evidence);
    name.incrementPlaceScore(geo, evidence.getWeight() * 0.1);
}

/**
 * Builds population-based evidence for a location from the statistics in {@code popStats}.
 * The location's geohash is stored on the location, and its first
 * {@code GEOHASH_RESOLUTION} characters are used as the lookup key.
 *
 * <p>The raw weight is {@code (int) (ln(population) - 10)} integer-divided by 3, a
 * deliberately slow curve so that population alone cannot overtake the other rules:
 * roughly 0 below e^13 (about 442,000), 1 from there up to e^16 (about 8.9 million),
 * and so on.
 *
 * @param geo populated location to look up; its geohash is always set, and its
 *            population is set when the cell's figure exceeds {@code POP_MIN}
 * @return the evidence, or {@code null} when the cell has no statistics or its
 *         population does not exceed {@code POP_MIN}
 */
private PlaceEvidence evidenceFromPopulation(final Place geo) {
    final String cellHash = geohash(geo);
    geo.setGeohash(cellHash);

    final String cellKey = cellHash.substring(0, GEOHASH_RESOLUTION);
    if (!popStats.containsKey(cellKey)) {
        return null;
    }

    final int cellPopulation = popStats.get(cellKey);
    if (!(cellPopulation > POP_MIN)) {
        return null;
    }
    geo.setPopulation(cellPopulation);

    // Truncate the log offset first, then integer-divide, to flatten the curve further.
    final int logOffset = (int) (Math.log(geo.getPopulation()) - 10);
    final int populationWeight = logOffset / 3;
    return new PlaceEvidence(geo, POP, weight(populationWeight, geo));
}
Use of org.opensextant.extractors.geo.PlaceEvidence in project Xponents by OpenSextant.
Class NameCodeRule, method evaluate.
/**
 * Looks at each adjacent pair of candidates as a possible "NAME, CODE" expression
 * (e.g. "Boston, Ma.") and, when the second candidate resolves to an administrative
 * place that is geopolitically connected to the first, attaches evidence to the first
 * candidate and raises the score of its matching locations.
 *
 * <p>Requirement (from the original contract): the list of place candidates is a linked
 * list. NOTE(review): the pairing relies on {@code code.start - name.end}, so the list is
 * presumably in document order; indexed {@code get()} on a {@code java.util.LinkedList}
 * is linear per call - confirm the list type if this loop shows up in profiles.
 *
 * @param names place candidates to examine pairwise; each element at index {@code x} is
 *              treated as the NAME and the element at {@code x + 1} as the CODE
 */
@Override
public void evaluate(final List<PlaceCandidate> names) {
    for (int x = 0; x < names.size() - 1; ++x) {
        PlaceCandidate name = names.get(x);
        PlaceCandidate code = names.get(x + 1);

        if (name.isFilteredOut() || code.isFilteredOut()) {
            continue;
        }

        /*
         * COUNTRY, STATE is not supported under this rule.
         * E.g., "Uruguay, Argentina" looks like a list of countries, yet Uruguay is
         * also a district in Argentina; just as Georgia is both a US state and a
         * country name.
         */
        if (name.isCountry) {
            continue;
        }

        /*
         * Test if SOMENAME, CODE is the case: the code must start within
         * MAX_CHAR_DIST characters of the end of the name.
         */
        if ((code.start - name.end) > MAX_CHAR_DIST) {
            continue;
        }

        /*
         * Not supporting short lowercase codes/abbreviations: 'la', 'is', 'un', etc.
         */
        if (code.isLower() && code.getText().length() < 4) {
            continue;
        }

        // Proximity is one factor, but the conventional "NAME, CODE" format should weigh
        // more. The length check prevents an ArrayIndexOutOfBoundsException when the
        // name has a non-null but empty post-match token array.
        boolean comma = name.getPostmatchTokens() != null
                && name.getPostmatchTokens().length > 0
                && ",".equals(name.getPostmatchTokens()[0]);

        /*
         * By this point a place name tag should be marked as a name or
         * code/abbrev. Match the abbreviation with a geographic location
         * that is a state, county, district, etc.
         */
        Place country = code.isCountry ? code.getChosen() : null;

        log.debug("{} name, code: {} in {}?", NAME, name.getText(), code.getText());

        for (Place geo : code.getPlaces()) {
            // Provinces, states, districts, etc. only, and only with a known country code.
            if (!geo.isAdministrative() || geo.getCountryCode() == null) {
                continue;
            }

            // Make sure a province name or code can be matched with the gazetteer entries found:
            //   Boston, Ma.  ==== for 'Ma', resolve to an abbreviation for Massachusetts;
            //   ignore places that are merely called 'Ma'.
            //
            // Place ('Ma') carries gazetteer metadata saying whether it is an abbreviated code.
            // PlaceCandidate ('Ma.') carries textual metadata saying whether the text is a code
            // or abbreviation. Both must agree; locations that do not fit are ignored.
            boolean lexicalMatch = ((code.isAbbreviation && geo.isAbbreviation())
                    || (!code.isAbbreviation && !geo.isAbbreviation()));
            if (!lexicalMatch) {
                continue;
            }

            String adm1 = geo.getHierarchicalPath();
            if (adm1 == null && !code.isCountry) {
                log.debug("ADM1 hierarchical path should not be null");
                continue;
            }

            // Quick determination if these two places have a containment or
            // geopolitical connection.
            boolean contains = name.presentInHierarchy(adm1)
                    || (country != null ? name.presentInCountry(country.getCountryCode()) : false);
            if (!contains) {
                continue;
            }

            /*
             * CITY, STATE
             * CITY, COUNTRY
             * Associate the CODE to the NAME that precedes it.
             */
            PlaceEvidence ev = new PlaceEvidence();
            ev.setCountryCode(geo.getCountryCode());
            ev.setAdmin1(geo.getAdmin1());
            // Shunt. This rule is evaluated right here, so flag the evidence accordingly.
            ev.setEvaluated(true);

            // A separating comma adds 2; a matching abbreviation/acronym adds 1 more.
            int wt = weight + (comma ? 2 : 0);
            if (geo.isAbbreviation() && (code.isAbbreviation || code.isAcronym)) {
                ev.setRule(NAME_ADMCODE_RULE);
                ev.setWeight(wt + 1);
            } else {
                ev.setRule(NAME_ADMNAME_RULE);
                ev.setWeight(wt);
            }
            name.addEvidence(ev);

            if (boundaryObserver != null) {
                boundaryObserver.boundaryLevel1InScope(geo);
            }

            // Boost each location of the NAME that shares the code's hierarchical path,
            // or failing that, the code's country.
            for (Place nameGeo : name.getPlaces()) {
                if (!(nameGeo.isPopulated() || nameGeo.isAdministrative() || nameGeo.isSpot())) {
                    continue;
                }
                boolean samePath = adm1 != null && adm1.equals(nameGeo.getHierarchicalPath());
                if (samePath || sameCountry(nameGeo, country)) {
                    name.incrementPlaceScore(nameGeo, ev.getWeight());
                }
            }
        }
    }
}
Use of org.opensextant.extractors.geo.PlaceEvidence in project Xponents by OpenSextant.
Class LocationChooserRule, method evaluate.
/**
 * Scores a yet unchosen location for a candidate.
 * Document-wide context is considered first (boundaries and countries collected in
 * {@code boundaryContext} and {@code countryContext}), then each piece of not-yet-evaluated
 * evidence on the candidate is compared against the location, thereby amplifying the
 * differences between candidate locations.
 *
 * <p>Nothing is done when both context maps are empty. Comparisons against the location's
 * hierarchical path and country code are null-safe: a location missing either value
 * simply earns no points for that comparison.
 *
 * @param name candidate whose evidence is consulted and whose location scores are raised
 * @param geo  one possible location for that candidate
 */
@Override
public void evaluate(PlaceCandidate name, Place geo) {
    if (boundaryContext.isEmpty() && countryContext.isEmpty()) {
        return;
    }

    // Scale context points by how prominent this location's country is in the document.
    double countryScalar = 1.0;
    CountryCount ccnt = countryContext.get(geo.getCountryCode());
    if (ccnt != null) {
        countryScalar = GLOBAL_POINTS * ccnt.getRatio();
    }

    final String geoPath = geo.getHierarchicalPath();

    // This is inferred stuff from the document at large.
    if (geoPath != null && boundaryContext.containsKey(geoPath)) {
        name.incrementPlaceScore(geo, countryScalar * ADMIN_CONTAINS_PLACE_WT);
    } else if (countryContext.containsKey(geo.getCountryCode())) {
        name.incrementPlaceScore(geo, countryScalar * COUNTRY_CONTAINS_PLACE_WT);
    }

    for (PlaceEvidence ev : name.getEvidence()) {
        if (ev.wasEvaluated()) {
            continue;
        }
        ev.defaultHierarchicalPath();

        // Evaluate evidence: compare at ADM1 level when both sides carry an ADM1,
        // otherwise fall back to a country-level comparison.
        if ((ev.getAdmin1() != null && geo.getAdmin1() != null)) {
            // The path may be null even when ADM1 is set; guard instead of throwing.
            if (geoPath != null && geoPath.equals(ev.getHierarchicalPath())) {
                name.incrementPlaceScore(geo, ADMIN_CONTAINS_PLACE_WT);
            }
        } else {
            // A location without a country code cannot match any evidence country.
            if (geo.getCountryCode() != null && geo.getCountryCode().equals(ev.getCountryCode())) {
                name.incrementPlaceScore(geo, COUNTRY_CONTAINS_PLACE_WT);
            }
        }

        // NOTE(review): the evidence is flagged as evaluated after being compared with
        // this single location, so later calls for the same candidate will skip it -
        // confirm this is the intended behavior.
        ev.setEvaluated(true);
        log.debug("\tEvidence: {} {}", ev, ev.getAdmin1());
    }
}
Aggregations