use of org.opensextant.data.Taxon in project Xponents by OpenSextant.
the class PlaceGeocoder method parseKnownNonPlaces.
/**
 * Tags known non-place entities (persons, organizations, nationalities) found in the
 * input text. A poor-man's named-entity extraction that runs only when person-name
 * matching is enabled.
 *
 * <p>Every {@link TaxonMatch} returned by the matcher is marked filtered-out, sorted into
 * person or organization lists, handed to the person-name rule together with the place
 * candidates, and finally appended to {@code matches}. Nationality taxons additionally
 * register any country code found in their tag set.
 *
 * <p>A failure in the underlying matcher is logged and treated as "nothing found";
 * no exception is propagated to the caller.
 *
 * @param input      text input; its buffer is scanned
 * @param candidates place candidates passed to the person-name rule
 * @param matches    output list; person and organization matches are appended to it
 */
private void parseKnownNonPlaces(TextInput input, List<PlaceCandidate> candidates, List<TextMatch> matches) {
    if (!isPersonNameMatchingEnabled()) {
        return;
    }

    // If this step fails miserably, do not raise an error. Log it and report nothing found.
    List<TextMatch> nonPlaces;
    try {
        nonPlaces = personMatcher.extract(input.buffer);
    } catch (Exception err) {
        // Pass the exception itself so the stack trace is not lost.
        log.error("Person/org name matching failed; no non-place tags produced", err);
        return;
    }
    if (nonPlaces == null || nonPlaces.isEmpty()) {
        return;
    }

    List<TaxonMatch> persons = new ArrayList<>();
    List<TaxonMatch> orgs = new ArrayList<>();
    log.debug("Matched {}", nonPlaces.size());

    for (TextMatch tm : nonPlaces) {
        if (!(tm instanceof TaxonMatch)) {
            continue;
        }
        TaxonMatch tag = (TaxonMatch) tm;

        //
        // For the purposes of geocoding/geoparsing filter out ALL
        // TaxonMatches. Any place names should reside back in
        // gazetteer. If XTax does have place or location data, that would be new.
        //
        tm.setFilteredOut(true);

        // Guards against listing the same span twice, e.g. when it carries several
        // nationality taxons or a nationality taxon followed by a person taxon.
        boolean addedAsPerson = false;

        for (Taxon taxon : tag.getTaxons()) {
            // Locale.ROOT keeps the prefix tests stable regardless of the JVM default
            // locale (e.g. Turkish dotless-i lowercasing of "NATIONALITY").
            String node = taxon.name.toLowerCase(java.util.Locale.ROOT);

            // name spans that are not places.
            if (node.startsWith("person.")) {
                if (!addedAsPerson) {
                    persons.add(tag);
                }
                break;
            } else if (node.startsWith("org.")) {
                // An acronym taxon only counts when the matched text is upper case.
                if (taxon.isAcronym && !tm.isUpper()) {
                    continue;
                }
                orgs.add(tag);
                break;
            } else if (node.startsWith("nationality.")) {
                if (!addedAsPerson) {
                    persons.add(tag);
                    addedAsPerson = true;
                }
                // The tag may be absent as some ethnicities may be mixed in and indicate
                // no country. NOTE(review): tagset is presumably left null in that case.
                if (taxon.tagset == null) {
                    continue;
                }
                for (String t : taxon.tagset) {
                    int x = t.indexOf("cc+");
                    if (x >= 0) {
                        // Everything after the "cc+" marker is taken as the country code.
                        String isocode = t.substring(x + 3);
                        this.countryInScope(isocode);
                        nationalities.put(tag.getText(), isocode);
                    }
                }
            }
        }
    }

    personNameRule.evaluateNamedEntities(candidates, persons, orgs);
    matches.addAll(persons);
    matches.addAll(orgs);
}
use of org.opensextant.data.Taxon in project Xponents by OpenSextant.
the class XponentsGeotagger method format.
/**
 * Renders the extracted matches as a JSON representation with two top-level members:
 * {@code response} (status and number of tags found) and {@code annotations} (one JSON
 * object per reported tag).
 *
 * <p>Taxon matches are reported as {@code org}, {@code person} or {@code taxon} when
 * {@code jobParams.output_taxons} is set. Coordinates are reported as {@code coordinate},
 * and place candidates as {@code country} or {@code place}. Filtered-out matches and any
 * other match types are skipped.
 *
 * @param matches   all matches produced for the request
 * @param jobParams request parameters; only {@code output_taxons} is consulted here
 * @return UTF-8 JSON representation of the results
 * @throws JSONException if a JSON value cannot be written
 */
private Representation format(List<TextMatch> matches, RequestParameters jobParams) throws JSONException {
    int tagCount = 0;

    JSONObject resultContent = new JSONObject();
    JSONObject resultMeta = new JSONObject();
    resultMeta.put("status", "ok");
    resultMeta.put("numfound", 0);
    JSONArray resultArray = new JSONArray();

    /*
     * Super loop: Iterate through all found entities. Record Taxons as
     * person or orgs; record Geo tags as country, place, or geo. geo =
     * geocoded place or parsed coordinate (MGRS, DMS, etc)
     */
    for (TextMatch name : matches) {

        /*
         * ==========================
         * ANNOTATIONS: non-geographic entities that are filtered out, but worth tracking
         * ==========================
         */
        if (name instanceof TaxonMatch) {
            if (jobParams.output_taxons) {
                TaxonMatch match = (TaxonMatch) name;
                // Only the first taxon of a match is reported, hence the unconditional break.
                for (Taxon n : match.getTaxons()) {
                    JSONObject node = populateMatch(name);
                    String t = "taxon";
                    // Locale.ROOT keeps the prefix tests independent of the JVM default locale.
                    String taxon_name = n.name.toLowerCase(java.util.Locale.ROOT);
                    if (taxon_name.startsWith("org.")) {
                        t = "org";
                    } else if (taxon_name.startsWith("person.")) {
                        t = "person";
                    }
                    node.put("type", t);
                    // Name of taxon
                    node.put("taxon", n.name);
                    // Name of catalog or source
                    node.put("catalog", n.catalog);
                    resultArray.put(node);
                    // Count only what is actually emitted so "numfound" agrees with the array.
                    ++tagCount;
                    break;
                }
            }
            continue;
        }

        // Ignore filtered-out tags and anything that is neither a place nor a coordinate.
        if (name.isFilteredOut() || !(name instanceof PlaceCandidate || name instanceof GeocoordMatch)) {
            continue;
        }

        JSONObject node = populateMatch(name);

        /*
         * ==========================
         * ANNOTATIONS: coordinates
         * ==========================
         */
        if (name instanceof GeocoordMatch) {
            ++tagCount;
            GeocoordMatch geo = (GeocoordMatch) name;
            node.put("type", "coordinate");
            Transforms.createGeocoding(geo, node);
            resultArray.put(node);
            continue;
        }

        PlaceCandidate place = (PlaceCandidate) name;
        Place resolvedPlace = place.getChosen();
        if (resolvedPlace == null) {
            // NOTE(review): presumably no location was chosen for this candidate; skipping it
            // avoids a NullPointerException that would abort the whole response.
            debug("No chosen place for " + name.getText());
            continue;
        }

        /*
         * ==========================
         * ANNOTATIONS: countries, places, etc.
         * ==========================
         */
        ++tagCount;
        if (place.isCountry) {
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "country");
            node.put("cc", resolvedPlace.getCountryCode());
            node.put("confidence", place.getConfidence());
        } else {
            Transforms.createGeocoding(resolvedPlace, node);
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "place");
            node.put("confidence", place.getConfidence());
            // Low-confidence places are still emitted, but flagged for the client.
            if (place.getConfidence() <= 10) {
                node.put("filtered-out", true);
            }
        }
        resultArray.put(node);
    }

    resultMeta.put("numfound", tagCount);
    resultContent.put("response", resultMeta);
    resultContent.put("annotations", resultArray);

    Representation result = new JsonRepresentation(resultContent.toString(2));
    result.setCharacterSet(CharacterSet.UTF_8);
    return result;
}
use of org.opensextant.data.Taxon in project Xponents by OpenSextant.
the class Transforms method parseTaxon.
/**
 * Populates a taxon match from a JSON/REST annotation.
 *
 * <p>The match text is read from {@code "matchtext"}. When the annotation has a
 * {@code "taxon"} entry, a {@link Taxon} is created from it and attached to the match;
 * its catalog is set only if the annotation also has a {@code "catalog"} entry.
 *
 * @param x the taxon match to populate
 * @param t type of taxon, set on the match as its type
 * @param a JSON annotation to read from
 */
public static void parseTaxon(TaxonMatch x, String t, JSONObject a) {
    x.setText(a.getString("matchtext"));
    if (a.has("taxon")) {
        Taxon tx = new Taxon();
        tx.setName(a.getString("taxon"));
        // The catalog entry may be missing (e.g. the producer had no catalog value);
        // do not fail the whole annotation for it.
        if (a.has("catalog")) {
            tx.catalog = a.getString("catalog");
        }
        x.addTaxon(tx);
    }
    x.setType(t);
}
use of org.opensextant.data.Taxon in project Xponents by OpenSextant.
the class TaxonMatcher method extractorImpl.
/**
 * Implementation details -- use with or without the formal ID/buffer
 * pairing. Sends the text to the tagger, then builds one {@link TaxonMatch} per
 * returned tag, attaching the taxons referenced by that tag.
 *
 * <p>Tags whose text contains more than one formatting space, and tags that end up
 * with no taxons, are dropped.
 *
 * @param id
 *            doc id; {@code NO_DOC_ID} is used when null
 * @param buf
 *            input text
 * @return list of matches, possibly empty, never null
 * @throws ExtractionException
 *             presumably propagated from the tagger call
 */
private List<TextMatch> extractorImpl(String id, String buf) throws ExtractionException {
    List<TextMatch> matches = new ArrayList<TextMatch>();
    String docid = (id != null ? id : NO_DOC_ID);

    Map<Integer, Object> beanMap = new HashMap<Integer, Object>(100);
    QueryResponse response = tagTextCallSolrTagger(buf, docid, beanMap);

    @SuppressWarnings("unchecked")
    List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");
    if (tags == null) {
        // A response without a "tags" entry means nothing was tagged.
        log.debug("No tags in tagger response");
        return matches;
    }
    log.debug("TAGS SIZE = {}", tags.size());

    int tag_count = 0;
    String id_prefix = docid + "#";

    for (NamedList<?> tag : tags) {
        TaxonMatch m = new TaxonMatch();
        m.start = ((Integer) tag.get("startOffset")).intValue();
        // endOffset is +1 char after the last matched char.
        m.end = ((Integer) tag.get("endOffset")).intValue();

        // The counter advances for every tag, including ones dropped below.
        ++tag_count;
        m.match_id = id_prefix + tag_count;

        // The tagger's own "matchText" is not reliable (can be null); slice the input instead.
        m.setText(buf.substring(m.start, m.end));
        if (TextUtils.countFormattingSpace(m.getText()) > 1) {
            // Phrase with a single TAB is okay
            continue;
        }

        @SuppressWarnings("unchecked")
        List<Integer> taxonIDs = (List<Integer>) tag.get("ids");
        if (taxonIDs == null) {
            // No reference ids on this tag, so there is nothing to attach.
            continue;
        }

        for (Integer solrId : taxonIDs) {
            Object refData = beanMap.get(solrId);
            if (refData == null) {
                continue;
            }

            /*
             * Filter out non-Acronyms. e.g., 'who' is not a match for 'WHO'
             */
            Taxon tx = (Taxon) refData;
            if (this.filterNonAcronyms && tx.isAcronym && !m.isUpper()) {
                continue;
            }
            m.addTaxon(tx);
        }

        if (m.hasTaxons()) {
            matches.add(m);
        }
    }

    log.debug("FOUND LABELS count={}", matches.size());
    return matches;
}
use of org.opensextant.data.Taxon in project Xponents by OpenSextant.
the class TaxonMatcher method createTaxon.
/**
 * Builds a {@link Taxon} from a Solr reference document.
 *
 * <p>Fields read: {@code taxnode} (taxon name), {@code name_type} (the value
 * {@code "A"} marks an acronym), {@code catalog}, {@code phrase} (added as a term)
 * and {@code tag} (all values added as tags).
 *
 * @param refData
 *            solr doc
 * @return taxon obj
 */
public static Taxon createTaxon(SolrDocument refData) {
    Taxon taxon = new Taxon();

    taxon.name = SolrProxy.getString(refData, "taxnode");

    String nameType = SolrProxy.getString(refData, "name_type");
    taxon.isAcronym = "A".equals(nameType);

    taxon.catalog = SolrProxy.getString(refData, "catalog");

    String phrase = SolrProxy.getString(refData, "phrase");
    taxon.addTerm(phrase);

    taxon.addTags(refData.getFieldValues("tag"));

    return taxon;
}
Aggregations