use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.
the class PersonNameFilter method evaluateNamedEntities.
/**
 * Filters out place candidates that are really a person name, an organization
 * name, or a part of one.
 * <p>
 * Known person names are used to distinguish well-known persons whose names
 * may or may not overlap with place names in the text and the namespace.
 *
 * <pre>
 * Hillary Clinton visited New York state today.
 * </pre>
 *
 * Here "Clinton" is part of a well-known person's name and does not refer to
 * Clinton, NY, a town upstate. All such person names are identified, and any
 * overlaps and co-references that coincide with tagged place names are marked
 * as filtered out.
 * <p>
 * Candidates that are already filtered out, or that are countries, are left
 * untouched.
 *
 * @param placeNames
 *                   places to negate; matching candidates are modified in place
 * @param persons
 *                   named persons in doc
 * @param orgs
 *                   named orgs in doc
 */
public void evaluateNamedEntities(final List<PlaceCandidate> placeNames, final List<TaxonMatch> persons, final List<TaxonMatch> orgs) {
    for (PlaceCandidate pc : placeNames) {
        if (pc.isFilteredOut() || pc.isCountry) {
            continue;
        }
        final String norm = pc.getTextnorm();

        // Co-reference: this normalized text was already resolved as a person/celebrity.
        if (resolvedPersons.containsKey(norm)) {
            pc.setFilteredOut(true);
            pc.addRule("ResolvedPerson");
            continue;
        }
        // Co-reference: this normalized text was already resolved as an organization.
        if (resolvedOrgs.containsKey(norm)) {
            pc.setFilteredOut(true);
            pc.addRule("ResolvedOrg");
            continue;
        }

        // Place name lies within a person name: record it so later mentions resolve too.
        for (TaxonMatch person : persons) {
            if (!pc.isWithin(person)) {
                continue;
            }
            pc.setFilteredOut(true);
            resolvedPersons.put(norm, person.getText());
            pc.addRule("ResolvedPerson");
        }

        for (TaxonMatch org : orgs) {
            if (pc.isSameMatch(org)) {
                // Org is 'name', where name is a city.
                pc.setFilteredOut(true);
                resolvedOrgs.put(norm, org.getText());
                pc.addRule("ResolvedOrg");
                continue;
            }
            // Org is 'text name text', where name is a city and 'name' is a single word.
            if (pc.isWithin(org) && !pc.isCountry && !norm.contains(" ")) {
                pc.setFilteredOut(true);
                // Such instances are deliberately NOT recorded in resolvedOrgs: if the name
                // occurs on its own, it is likely the locality/city in which that
                // organization exists.
                //   "Detroit City Council" -- an org. Filter out just this instance.
                //   "Detroit" -- mentioned later in the same doc, not an org.
                pc.addRule(NAME_IN_ORG_RULE);
            }
        }
    }
}
use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.
the class ContextualOrganizationRule method evaluate.
/**
 * Restores place candidates that were filtered out because their name appeared inside an
 * organization name, when one of their candidate locations is mentioned elsewhere in the
 * document.
 * <p>
 * First pass: for every candidate that carries {@link PersonNameFilter#NAME_IN_ORG_RULE} and
 * is currently filtered out, each candidate location's hierarchical path is looked up in
 * {@code boundaryObserver.placeMentionCount()}. On the first hit the candidate is un-filtered,
 * tagged "ContextualOrg", and its normalized text is added to {@code reEval}.
 * <p>
 * Second pass: any candidate that is still filtered out and whose normalized text is in
 * {@code reEval} is un-filtered and tagged "ContextualOrg.Relation".
 * <p>
 * Does nothing when {@code isRelevant()} returns false.
 *
 * @param names place candidates to evaluate; modified in place
 */
@Override
public void evaluate(List<PlaceCandidate> names) {
    if (!isRelevant()) {
        return;
    }
    for (PlaceCandidate name : names) {
        if (!name.hasRule(PersonNameFilter.NAME_IN_ORG_RULE)) {
            continue;
        }
        log.debug("    City Name in Org Name? {}", name);
        if (!name.isFilteredOut()) {
            continue;
        }
        // Is any location for this name mentioned elsewhere in the document?
        for (Place geo : name.getPlaces()) {
            if (boundaryObserver.placeMentionCount().containsKey(geo.getHierarchicalPath())) {
                name.setFilteredOut(false);
                name.addRule("ContextualOrg");
                reEval.add(name.getTextnorm());
                // One supporting location is enough; stop scanning so the rule and the
                // re-evaluation entry are not recorded again for every further match.
                break;
            }
        }
    }
    /*
     * Re-evaluate items that may have been filtered because the name appeared in an
     * organization name where the org name was not necessarily geographically relevant
     * until now.
     */
    for (PlaceCandidate name : names) {
        if (name.isFilteredOut() && reEval.contains(name.getTextnorm())) {
            name.setFilteredOut(false);
            name.addRule("ContextualOrg.Relation");
        }
    }
}
use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.
the class LocationChooserRule method debuggingHistograms.
/**
 * Assembles and logs document-level statistics for debugging.
 * <p>
 * What can we learn from assembling better stats at the document level?
 * Evidence breaks down into concrete locations vs. inferred.
 * <p>
 * Updates {@code namespace} with one entry per distinct normalized name among the
 * candidates that are not filtered out, logs every entry of {@code countryContext} and
 * {@code boundaryContext}, and updates {@code inferredCountries} with one entry per country
 * code seen on the boundary places.
 *
 * @param names place candidates found in the document
 */
private void debuggingHistograms(List<PlaceCandidate> names) {
    /*
     * TODO: Is this histogram helpful?
     *
     * Uniqueness or popularity of a given name.
     */
    for (PlaceCandidate candidate : names) {
        if (candidate.isFilteredOut()) {
            continue;
        }
        final String key = candidate.getTextnorm();
        PlaceCount tally = namespace.get(key);
        if (tally != null) {
            ++tally.count;
            continue;
        }
        // First sighting of this name: register it with the overall candidate count.
        tally = new PlaceCount();
        tally.place = new Place(key, key);
        tally.total = names.size();
        namespace.put(key, tally);
    }

    for (CountryCount countryTally : countryContext.values()) {
        log.debug("Country: {}", countryTally);
    }

    for (PlaceCount boundary : boundaryContext.values()) {
        log.debug("Boundary: {}", boundary);
        final String cc = boundary.place.getCountryCode();
        CountryCount inferred = inferredCountries.get(cc);
        if (inferred != null) {
            ++inferred.count;
            continue;
        }
        // First boundary seen for this country code.
        inferred = new CountryCount();
        inferred.country = new Country(cc, cc);
        inferredCountries.put(cc, inferred);
    }

    log.debug("Places: {}/{}", namespace.size(), namespace);
}
use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.
the class NameRule method evaluate.
/**
 * Adds score to candidate locations whose feature type agrees with a descriptive word in
 * the matched name.
 * <p>
 * The normalized name is split on single spaces. Its first word is tested against
 * {@code P_prefixes} and its last word against {@code A1_suffixes} and {@code A2_suffixes}.
 * For each location not rejected by {@code filterOutBySize}, the first agreeing case adds
 * its rule (CITY, ADM1 or ADM2) and increments that location's score by 1.0.
 * <p>
 * Candidates that are filtered out, already have a chosen place, or have a normalized name
 * shorter than 10 characters are skipped.
 *
 * @param names place candidates to evaluate; modified in place
 */
public void evaluate(List<PlaceCandidate> names) {
    for (PlaceCandidate name : names) {
        // Filtered out or resolved already, so ignore.
        if (name.isFilteredOut() || name.getChosen() != null) {
            continue;
        }
        final String norm = name.getTextnorm();
        if (norm.length() < 10) {
            continue;
        }

        final String[] tokens = norm.split(" ");
        final String leading = tokens[0];
        final String trailing = tokens[tokens.length - 1];

        final boolean placePrefix = P_prefixes.contains(leading);
        final boolean admin1Suffix = A1_suffixes.contains(trailing);
        final boolean admin2Suffix = A2_suffixes.contains(trailing);
        if (!(placePrefix || admin1Suffix || admin2Suffix)) {
            // Rule does not apply.
            continue;
        }

        for (Place geo : name.getPlaces()) {
            if (filterOutBySize(name, geo)) {
                continue;
            }
            if (placePrefix && geo.isPopulated()) {
                name.addRule(CITY);
                name.incrementPlaceScore(geo, 1.0);
            } else if (admin1Suffix && geo.isAdmin1()) {
                name.addRule(ADM1);
                name.incrementPlaceScore(geo, 1.0);
            } else if (admin2Suffix && geo.isAdministrative()) {
                name.addRule(ADM2);
                name.incrementPlaceScore(geo, 1.0);
            }
        }
    }
}
use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.
the class NonsenseFilter method evaluate.
/**
 * Evaluate each name in the list and filter out those that look like nonsense.
 *
 * <pre>
 * doo doo - FAIL
 * St. Paul - PASS
 * south" bend - FAIL
 * </pre>
 *
 * Only matches no longer than {@code MAX_NONSENSE_PHRASE_LEN} are examined.
 *
 * @param names place candidates to evaluate; modified in place
 */
@Override
public void evaluate(List<PlaceCandidate> names) {
    for (PlaceCandidate p : names) {
        /*
         * Is it nonsense? For phrases up to MAX chars long:
         *   + does it contain irregular punctuation?
         *       "...in the south. Bend it backwards..."; South Bend is not intended there.
         *   + does it contain a repeated syllable or word?
         *       "doo doo", "bah bah", "to to"
         */
        if (p.getLength() > MAX_NONSENSE_PHRASE_LEN) {
            continue;
        }

        // Short words, with numerics. Approximately one word.
        if (p.getLength() < GENERIC_ONE_WORD && trivialNumerics.matcher(p.getText()).matches()) {
            p.setFilteredOut(true);
            p.addRule("Nonsense,Numbers");
            continue;
        }

        if (irregularPunctPatterns(p.getText())) {
            p.setFilteredOut(true);
            p.addRule("Nonsense,Punct");
            continue;
        }

        if (p.isLower()) {
            // A lower-case phrase that repeats one of its own words is nonsense.
            final HashSet<String> seen = new HashSet<>();
            for (String word : tokenizer.split(p.getTextnorm())) {
                if (!seen.add(word)) {
                    p.setFilteredOut(true);
                    p.addRule("Nonsense,Repeated,Lower");
                    break;
                }
            }
            // No 'continue' here: the diacritic check below still runs.
        }

        /*
         * Still here? Check for short obscure matches where diacritics mismatch.
         * Cannot eliminate a candidate based on a single location, but filter those for which
         * no location is a plausible match.
         * NOTE: Score on each geo location is accounted for in default score, i.e. the edit
         * distance between text match and geo name.
         */
        if (p.getLength() > GENERIC_ONE_WORD) {
            continue;
        }

        final String normText = p.getTextnorm();
        final String phonetic = phoneticRedux(normText);
        log.debug("Testing phrase {} phonetic:{}", normText, phonetic);

        boolean plausible = false;
        String matchRule = null;
        for (Place geo : p.getPlaces()) {
            final String geoNorm = geo.getNamenorm();
            log.debug("\tPLACE={}, {}", geo, geoNorm);

            final boolean geoAccented = TextUtils.hasDiacritics(geo.getPlaceName());
            if (geoAccented == p.hasDiacritics) {
                // Both accented, or both plain; the plain/plain case is not worth tracking.
                plausible = true;
                if (geoAccented) {
                    matchRule = "Matched-Diacritics";
                }
                break;
            }
            /*
             * Pattern: official name has accent/emphasis markings on the name, such as:
             *   `NAME or NAME`
             * where NAME is some Latin transliteration of non-Latin script.
             */
            if (geoNorm.contains(normText)) {
                plausible = true;
                matchRule = "Location-Contains-Name";
                break;
            }
            if (isPhoneticMatch(phonetic, geoNorm)) {
                plausible = true;
                matchRule = "Matched-Phonetic";
                break;
            }
            log.debug("\t{} != {}", normText, geoNorm);
        }

        if (!plausible) {
            p.setFilteredOut(true);
            p.addRule("Nonsense,Mismatched,Diacritic");
        } else if (matchRule != null) {
            p.addRule(matchRule);
        }
    }
}
Aggregations