use of org.opensextant.extractors.xtax.TaxonMatch in project Xponents by OpenSextant.
the class Transforms method parseAnnotation.
/**
 * Builds a Xponents {@link TextMatch} from the JSON form of a single annotation.
 * A JSON/REST representation carries far less detail than the objects produced by
 * calling the Java processing routines directly, so the result is a reduced match.
 *
 * @param data candidate annotation; anything other than a {@code JSONObject} is rejected
 * @return the parsed match with its type, start and end set, or {@code null} when
 *         {@code data} is not a {@code JSONObject}
 * @throws JSONException if the "type" value is not one of the handled annotation types;
 *         may also propagate from the JSON accessors used to read the annotation keys
 */
public static TextMatch parseAnnotation(Object data) throws JSONException {
    if (!(data instanceof JSONObject)) {
        return null;
    }

    final JSONObject json = (JSONObject) data;
    final String label = json.getString("type");
    final String matchText = json.getString("matchtext");

    final TextMatch parsed;
    switch (label) {
    case "place": {
        PlaceCandidate candidate = new PlaceCandidate();
        Place location = new Place();
        candidate.setText(matchText);
        Transforms.parseGeocoding(location, json);
        candidate.setConfidence(json.optInt("confidence", -1));
        candidate.choose(location);
        parsed = candidate;
        break;
    }
    case "coordinate": {
        GeocoordMatch coordinate = new GeocoordMatch();
        Place point = new Place();
        coordinate.setText(matchText);
        // Workaround: the coordinate is read through the generic geocoding parser
        // instead of being parsed as a coordinate directly.
        Transforms.parseGeocoding(point, json);
        coordinate.setLatLon(point);
        coordinate.setMethod(point.getMethod());
        /*
         * TODO: GeocoordMatch needs to support setters for Geocoding here.
         * Reverse geocoding info (cc, adm1) is not carried over.
         */
        parsed = coordinate;
        break;
    }
    case "country": {
        PlaceCandidate candidate = new PlaceCandidate();
        Place country = new Place();
        candidate.setText(matchText);
        country.setName(matchText);
        candidate.setConfidence(json.optInt("confidence", -1));
        country.setCountryCode(json.getString("cc"));
        candidate.isCountry = true;
        candidate.choose(country);
        parsed = candidate;
        break;
    }
    case "person": {
        TaxonMatch taxonMatch = new TaxonMatch();
        Transforms.parseTaxon(taxonMatch, "person", json);
        parsed = taxonMatch;
        break;
    }
    case "org": {
        TaxonMatch taxonMatch = new TaxonMatch();
        Transforms.parseTaxon(taxonMatch, "org", json);
        parsed = taxonMatch;
        break;
    }
    case "taxon": {
        TaxonMatch taxonMatch = new TaxonMatch();
        Transforms.parseTaxon(taxonMatch, "taxon", json);
        parsed = taxonMatch;
        break;
    }
    default:
        throw new JSONException("Unknown Annotation " + label);
    }

    // Fields common to every annotation type.
    parsed.setType(label);
    parsed.start = json.getInt("offset");
    parsed.end = parsed.start + json.getInt("length");
    return parsed;
}
use of org.opensextant.extractors.xtax.TaxonMatch in project Xponents by OpenSextant.
the class XponentsGeotagger method format.
/**
 * Renders the extracted matches as a JSON document with two members: "response"
 * (status and the number of tags found) and "annotations" (one node per emitted match).
 *
 * <ul>
 * <li>Taxon matches are emitted only when {@code jobParams.output_taxons} is set, using
 * the first taxon of the match; the node type is "org", "person" or "taxon" depending on
 * the taxon name prefix.</li>
 * <li>Filtered-out matches and matches that are neither a {@code PlaceCandidate} nor a
 * {@code GeocoordMatch} are skipped.</li>
 * <li>Coordinates are emitted as "coordinate"; place candidates as "country" or "place".
 * A place whose confidence is 10 or lower is still emitted, flagged "filtered-out".</li>
 * </ul>
 *
 * @param matches   matches produced by the extraction step
 * @param jobParams request options; only {@code output_taxons} is consulted here
 * @return a UTF-8 JSON representation of the results
 * @throws JSONException if a JSON node cannot be populated
 */
private Representation format(List<TextMatch> matches, RequestParameters jobParams) throws JSONException {
    int tagCount = 0;
    JSONArray resultArray = new JSONArray();

    for (TextMatch name : matches) {
        /*
         * ANNOTATIONS: non-geographic entities that are filtered out, but worth tracking.
         */
        if (name instanceof TaxonMatch) {
            if (jobParams.output_taxons) {
                TaxonMatch match = (TaxonMatch) name;
                // Counted once per taxon match, before looking at its taxons.
                ++tagCount;
                for (Taxon n : match.getTaxons()) {
                    JSONObject node = populateMatch(name);
                    String t = "taxon";
                    String taxon_name = n.name.toLowerCase();
                    if (taxon_name.startsWith("org.")) {
                        t = "org";
                    } else if (taxon_name.startsWith("person.")) {
                        t = "person";
                    }
                    node.put("type", t);
                    // Name of taxon
                    node.put("taxon", n.name);
                    // Name of catalog or source
                    node.put("catalog", n.catalog);
                    resultArray.put(node);
                    // Only the first taxon of a match is reported.
                    break;
                }
            }
            continue;
        }

        // Ignore filtered-out tags and non-place tags. Because filtered-out matches
        // are dropped here, no later check for isFilteredOut() is needed.
        if (name.isFilteredOut() || !(name instanceof PlaceCandidate || name instanceof GeocoordMatch)) {
            continue;
        }

        JSONObject node = populateMatch(name);

        /*
         * ANNOTATIONS: coordinates
         */
        if (name instanceof GeocoordMatch) {
            ++tagCount;
            GeocoordMatch geo = (GeocoordMatch) name;
            node.put("type", "coordinate");
            Transforms.createGeocoding(geo, node);
            resultArray.put(node);
            continue;
        }

        /*
         * ANNOTATIONS: countries, places, etc.
         */
        PlaceCandidate place = (PlaceCandidate) name;
        Place resolvedPlace = place.getChosen();
        ++tagCount;
        if (place.isCountry) {
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "country");
            node.put("cc", resolvedPlace.getCountryCode());
            node.put("confidence", place.getConfidence());
        } else {
            Transforms.createGeocoding(resolvedPlace, node);
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "place");
            node.put("confidence", place.getConfidence());
            // Low-confidence places are reported but flagged for the client.
            if (place.getConfidence() <= 10) {
                node.put("filtered-out", true);
            }
        }
        resultArray.put(node);
    }

    JSONObject resultMeta = new JSONObject();
    resultMeta.put("status", "ok");
    resultMeta.put("numfound", tagCount);

    JSONObject resultContent = new JSONObject();
    resultContent.put("response", resultMeta);
    resultContent.put("annotations", resultArray);

    Representation result = new JsonRepresentation(resultContent.toString(2));
    result.setCharacterSet(CharacterSet.UTF_8);
    return result;
}
use of org.opensextant.extractors.xtax.TaxonMatch in project Xponents by OpenSextant.
the class PersonNameFilter method evaluateNamedEntities.
/**
 * Filters out place candidates that are really (part of) a person or organization name,
 * using both the names tagged in this document and the names resolved earlier.
 *
 * <pre>
 * Hillary Clinton visited New York state today.
 * </pre>
 *
 * Here "Clinton" is part of a well-known person name and does not refer to Clinton, NY,
 * a town upstate. Candidates that are already filtered out, or that are countries, are
 * left untouched.
 *
 * @param placeNames
 *            places to negate
 * @param persons
 *            named persons in doc
 * @param orgs
 *            named orgs in doc
 */
public void evaluateNamedEntities(final List<PlaceCandidate> placeNames, final List<TaxonMatch> persons, final List<TaxonMatch> orgs) {
    for (PlaceCandidate place : placeNames) {
        if (place.isFilteredOut() || place.isCountry) {
            continue;
        }

        // Previously resolved person/celebrity name.
        final boolean knownPerson = resolvedPersons.containsKey(place.getTextnorm());
        if (knownPerson) {
            place.setFilteredOut(true);
            place.addRule("ResolvedPerson");
            continue;
        }

        // Previously resolved organization name.
        final boolean knownOrg = resolvedOrgs.containsKey(place.getTextnorm());
        if (knownOrg) {
            place.setFilteredOut(true);
            place.addRule("ResolvedOrg");
            continue;
        }

        // The place name lies within a tagged person name.
        for (TaxonMatch person : persons) {
            if (!place.isWithin(person)) {
                continue;
            }
            place.setFilteredOut(true);
            resolvedPersons.put(place.getTextnorm(), person.getText());
            place.addRule("ResolvedPerson");
        }

        for (TaxonMatch org : orgs) {
            if (place.isSameMatch(org)) {
                // Org is 'name', where name is a city.
                place.setFilteredOut(true);
                resolvedOrgs.put(place.getTextnorm(), org.getText());
                place.addRule("ResolvedOrg");
            } else if (place.isWithin(org) && !place.isCountry && !place.getTextnorm().contains(" ")) {
                // Org is 'text name text', where name is a single-word city.
                // Such instances are deliberately not recorded in resolvedOrgs: when the
                // name occurs on its own it is likely the locality in which the org exists.
                //   "Detroit City Council" -- an org; filter out just this instance.
                //   "Detroit" -- mentioned later in the same doc, not an org.
                place.setFilteredOut(true);
                place.addRule(NAME_IN_ORG_RULE);
            }
        }
    }
}
use of org.opensextant.extractors.xtax.TaxonMatch in project Xponents by OpenSextant.
the class PlaceGeocoder method parseKnownNonPlaces.
/**
 * Tags person, organization and nationality names (a poor man's named-entity
 * extraction) and uses them to negate place candidates. This runs even when no geo
 * matches were found, provided person name matching is enabled.
 *
 * <p>A failure of the underlying tagger is logged and treated as "nothing found";
 * no exception is raised to the caller.
 *
 * @param input      text to tag; only {@code input.buffer} is read
 * @param candidates place candidates to evaluate against the tagged names
 * @param matches    output list; tagged person and org matches are appended to it
 */
private void parseKnownNonPlaces(TextInput input, List<PlaceCandidate> candidates, List<TextMatch> matches) {
    if (!isPersonNameMatchingEnabled()) {
        return;
    }

    // If this step fails miserably, do not raise an error. Log it (with the stack
    // trace) and return nothing found.
    List<TextMatch> nonPlaces;
    try {
        nonPlaces = personMatcher.extract(input.buffer);
    } catch (Exception err) {
        log.error("Person/org name tagging failed; skipping non-place evaluation", err);
        return;
    }
    if (nonPlaces == null || nonPlaces.isEmpty()) {
        return;
    }

    List<TaxonMatch> persons = new ArrayList<>();
    List<TaxonMatch> orgs = new ArrayList<>();
    log.debug("Matched {}", nonPlaces.size());

    for (TextMatch tm : nonPlaces) {
        if (!(tm instanceof TaxonMatch)) {
            continue;
        }
        TaxonMatch tag = (TaxonMatch) tm;

        //
        // For the purposes of geocoding/geoparsing filter out ALL
        // TaxonMatches. Any place names should reside back in
        // gazetteer. If XTax does have place or location data, that would be new.
        //
        tm.setFilteredOut(true);

        // The nationality branch keeps scanning further taxons, so remember whether
        // this tag is already listed as a person to avoid adding it more than once.
        boolean listedAsPerson = false;

        for (Taxon taxon : tag.getTaxons()) {
            String node = taxon.name.toLowerCase();
            // Name spans that are not places.
            if (node.startsWith("person.")) {
                if (!listedAsPerson) {
                    persons.add(tag);
                }
                break;
            } else if (node.startsWith("org.")) {
                // An acronym taxon only counts when the matched text is upper case.
                if (taxon.isAcronym && !tm.isUpper()) {
                    continue;
                }
                orgs.add(tag);
                break;
            } else if (node.startsWith("nationality.")) {
                if (!listedAsPerson) {
                    persons.add(tag);
                    listedAsPerson = true;
                }
                // The tag may be absent as some ethnicities may be mixed in and indicate no country.
                if (taxon.tagset == null) {
                    continue;
                }
                for (String t : taxon.tagset) {
                    int x = t.indexOf("cc+");
                    if (x >= 0) {
                        String isocode = t.substring(x + 3);
                        this.countryInScope(isocode);
                        nationalities.put(tag.getText(), isocode);
                    }
                }
            }
        }
    }

    personNameRule.evaluateNamedEntities(candidates, persons, orgs);
    matches.addAll(persons);
    matches.addAll(orgs);
}
use of org.opensextant.extractors.xtax.TaxonMatch in project Xponents by OpenSextant.
the class KeywordTaggerMapper method match2JSON.
/**
 * Converts a TextMatch (Place, Taxon, Pattern, etc.) to its JSON form. For a
 * {@code TaxonMatch}, the "name", "cat" and "type" keys are filled in from the first
 * taxon of the match; any further taxons are ignored.
 *
 * @param tm the match to convert
 * @return the JSON object prepared for {@code tm}, enriched with taxon details when
 *         applicable
 */
public static final JSONObject match2JSON(TextMatch tm) {
    final JSONObject json = prepareOutput(tm);
    if (!(tm instanceof TaxonMatch)) {
        return json;
    }

    final TaxonMatch taxonMatch = (TaxonMatch) tm;
    for (Taxon first : taxonMatch.getTaxons()) {
        json.put("name", first.name);
        json.put("cat", first.catalog);
        json.put("type", getTypeLabel(first));
        /* Demo: we only capture the first Taxon Match */
        return json;
    }
    return json;
}
Aggregations