Search in sources :

Example 1 with PlaceCandidate

use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.

the class ContextualOrganizationRule method evaluate.

/**
 * Restores place candidates that were filtered out because their name appeared inside an
 * organization name, when the document offers supporting evidence for the place.
 *
 * <p>First pass: for each candidate that carries {@code PersonNameFilter.NAME_IN_ORG_RULE}
 * and is currently filtered out, look for one of its places whose hierarchical path is a key
 * of {@code boundaryObserver.placeMentionCount()}. On the first hit the candidate is
 * un-filtered, tagged with the rule "ContextualOrg", and its normalized text is recorded in
 * {@code reEval}.
 *
 * <p>Second pass: every candidate that is still filtered out and whose normalized text was
 * recorded in {@code reEval} is un-filtered and tagged "ContextualOrg.Relation".
 *
 * <p>Does nothing when {@code isRelevant()} returns false.
 *
 * @param names the place candidates to inspect; candidates are modified in place
 */
@Override
public void evaluate(List<PlaceCandidate> names) {
    if (!isRelevant()) {
        return;
    }
    for (PlaceCandidate name : names) {
        if (!name.hasRule(PersonNameFilter.NAME_IN_ORG_RULE)) {
            continue;
        }
        log.debug(" City Name in Org Name? {}", name);
        if (!name.isFilteredOut()) {
            continue;
        }
        // Is any location for this name mentioned elsewhere in document?
        for (Place geo : name.getPlaces()) {
            if (boundaryObserver.placeMentionCount().containsKey(geo.getHierarchicalPath())) {
                name.setFilteredOut(false);
                name.addRule("ContextualOrg");
                reEval.add(name.getTextnorm());
                // One supporting location is enough; scanning the remaining places would
                // only repeat the same three updates on this candidate.
                break;
            }
        }
    }
    /* Re-evaluate items that may have been filtered because the name appeared in an organization
     * name where the org name was not necessarily geographically relevant until now.
     */
    for (PlaceCandidate name : names) {
        if (name.isFilteredOut() && reEval.contains(name.getTextnorm())) {
            name.setFilteredOut(false);
            name.addRule("ContextualOrg.Relation");
        }
    }
}
Also used : Place(org.opensextant.data.Place) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 2 with PlaceCandidate

use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.

the class Transforms method parseAnnotation.

/**
 * Builds a Xponents TextMatch from the JSON form of a single annotation.
 * Objects rebuilt from JSON/REST output carry far less detail than those produced by
 * calling the Java processing routines directly.
 *
 * <p>The annotation's "type" selects what is built: "place" and "country" yield a
 * PlaceCandidate, "coordinate" a GeocoordMatch, and "person", "org" and "taxon" a
 * TaxonMatch. The returned match has its type, start and end set from the "type",
 * "offset" and "length" keys.
 *
 * @param data the annotation to convert; expected to be a JSONObject
 * @return the reconstructed match, or null when data is not a JSONObject
 * @throws JSONException when the "type" value is not recognized, or as propagated from
 *         the JSON accessors used to read the annotation
 */
public static TextMatch parseAnnotation(Object data) throws JSONException {
    if (!(data instanceof JSONObject)) {
        return null;
    }
    final JSONObject annot = (JSONObject) data;
    final String typ = annot.getString("type");
    final String text = annot.getString("matchtext");
    final TextMatch match;
    switch (typ) {
        case "place": {
            final PlaceCandidate candidate = new PlaceCandidate();
            final Place location = new Place();
            candidate.setText(text);
            Transforms.parseGeocoding(location, annot);
            candidate.setConfidence(annot.optInt("confidence", -1));
            candidate.choose(location);
            match = candidate;
            break;
        }
        case "coordinate": {
            final GeocoordMatch coordinate = new GeocoordMatch();
            final Place coordinateLocation = new Place();
            coordinate.setText(text);
            // How awful:.... need to parse Coord directly
            Transforms.parseGeocoding(coordinateLocation, annot);
            coordinate.setLatLon(coordinateLocation);
            coordinate.setMethod(coordinateLocation.getMethod());
            /* TODO: GeocoordMatch needs to support setters for Geocoding here.
             * missing reverse geo info
             *
             *  cc, adm1
             */
            match = coordinate;
            break;
        }
        case "country": {
            final PlaceCandidate candidate = new PlaceCandidate();
            final Place countryPlace = new Place();
            candidate.setText(text);
            countryPlace.setName(text);
            candidate.setConfidence(annot.optInt("confidence", -1));
            countryPlace.setCountryCode(annot.getString("cc"));
            candidate.isCountry = true;
            candidate.choose(countryPlace);
            match = candidate;
            break;
        }
        case "person":
        case "org":
        case "taxon": {
            // These three types are handled identically; the label handed to parseTaxon
            // is the type string itself.
            final TaxonMatch taxonMatch = new TaxonMatch();
            Transforms.parseTaxon(taxonMatch, typ, annot);
            match = taxonMatch;
            break;
        }
        default:
            throw new JSONException("Unknown Annotation " + typ);
    }
    match.setType(typ);
    match.start = annot.getInt("offset");
    match.end = match.start + annot.getInt("length");
    return match;
}
Also used : GeocoordMatch(org.opensextant.extractors.xcoord.GeocoordMatch) JSONObject(org.json.JSONObject) JSONException(org.json.JSONException) TextMatch(org.opensextant.extraction.TextMatch) TaxonMatch(org.opensextant.extractors.xtax.TaxonMatch) Place(org.opensextant.data.Place) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 3 with PlaceCandidate

use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.

the class XponentsGeotagger method format.

/**
 * Renders the extracted matches as the JSON response body.
 *
 * <p>The resulting document has two keys: "response", holding "status" and "numfound",
 * and "annotations", an array with one node per reported match. Taxon matches are
 * reported only when {@code jobParams.output_taxons} is set, and only their first taxon is
 * written. Matches that are neither PlaceCandidate nor GeocoordMatch, matches that are
 * filtered out, and place candidates with no chosen location are skipped.
 *
 * @param matches   the matches to report
 * @param jobParams request options; only {@code output_taxons} is read here
 * @return a UTF-8 JSON representation, pretty-printed with an indent of 2
 * @throws JSONException if building the JSON content fails
 */
private Representation format(List<TextMatch> matches, RequestParameters jobParams) throws JSONException {
    int tagCount = 0;
    JSONObject resultContent = new JSONObject();
    JSONObject resultMeta = new JSONObject();
    resultMeta.put("status", "ok");
    resultMeta.put("numfound", 0);
    JSONArray resultArray = new JSONArray();
    /*
     * Super loop: Iterate through all found entities. record Taxons as
     * person or orgs record Geo tags as country, place, or geo. geo =
     * geocoded place or parsed coordinate (MGRS, DMS, etc)
     */
    for (TextMatch name : matches) {
        /*
         * ==========================
         * ANNOTATIONS: non-geographic entities that are filtered out, but worth tracking
         * ==========================
         */
        if (name instanceof TaxonMatch) {
            if (jobParams.output_taxons) {
                TaxonMatch match = (TaxonMatch) name;
                ++tagCount;
                // Only the first taxon is reported; the loop exits after one node.
                for (Taxon n : match.getTaxons()) {
                    JSONObject node = populateMatch(name);
                    String t = "taxon";
                    String taxon_name = n.name.toLowerCase();
                    if (taxon_name.startsWith("org.")) {
                        t = "org";
                    } else if (taxon_name.startsWith("person.")) {
                        t = "person";
                    }
                    node.put("type", t);
                    // Name of taxon
                    node.put("taxon", n.name);
                    // Name of catalog or source
                    node.put("catalog", n.catalog);
                    // node.put("filtered-out", true);
                    resultArray.put(node);
                    break;
                }
            }
            continue;
        }
        // Ignore non-place tags
        if (!(name instanceof PlaceCandidate || name instanceof GeocoordMatch)) {
            continue;
        }
        // Filtered-out tags are skipped here, in one place, so the debug trace is
        // actually emitted (a later duplicate check could never be reached).
        if (name.isFilteredOut()) {
            debug("Filtered out " + name.getText());
            continue;
        }
        JSONObject node = populateMatch(name);
        /*
         * ==========================
         * ANNOTATIONS: coordinates
         * ==========================
         */
        if (name instanceof GeocoordMatch) {
            ++tagCount;
            GeocoordMatch geo = (GeocoordMatch) name;
            node.put("type", "coordinate");
            Transforms.createGeocoding(geo, node);
            resultArray.put(node);
            continue;
        }
        PlaceCandidate place = (PlaceCandidate) name;
        Place resolvedPlace = place.getChosen();
        if (resolvedPlace == null) {
            // A candidate can be left without a chosen location; skip it rather than
            // fail the whole response on a null dereference below.
            debug("No location chosen for " + name.getText());
            continue;
        }
        /*
         * ==========================
         * ANNOTATIONS: countries, places, etc.
         * ==========================
         */
        ++tagCount;
        if (place.isCountry) {
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "country");
            node.put("cc", resolvedPlace.getCountryCode());
            node.put("confidence", place.getConfidence());
        } else {
            /*
             * Conf = 20 or greater to be geocoded.
             * NOTE(review): the check below flags confidence <= 10, which does not
             * match the threshold stated above -- confirm which value is intended.
             */
            Transforms.createGeocoding(resolvedPlace, node);
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "place");
            node.put("confidence", place.getConfidence());
            if (place.getConfidence() <= 10) {
                node.put("filtered-out", true);
            }
        }
        resultArray.put(node);
    }
    resultMeta.put("numfound", tagCount);
    resultContent.put("response", resultMeta);
    resultContent.put("annotations", resultArray);
    Representation result = new JsonRepresentation(resultContent.toString(2));
    result.setCharacterSet(CharacterSet.UTF_8);
    return result;
}
Also used : GeocoordMatch(org.opensextant.extractors.xcoord.GeocoordMatch) JSONObject(org.json.JSONObject) Taxon(org.opensextant.data.Taxon) JSONArray(org.json.JSONArray) Representation(org.restlet.representation.Representation) JsonRepresentation(org.restlet.ext.json.JsonRepresentation) TextMatch(org.opensextant.extraction.TextMatch) TaxonMatch(org.opensextant.extractors.xtax.TaxonMatch) JsonRepresentation(org.restlet.ext.json.JsonRepresentation) Place(org.opensextant.data.Place) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 4 with PlaceCandidate

use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.

the class LocationChooserRule method evaluate.

/**
 * Walks the entire list and tries to settle each unresolved candidate on one location.
 *
 * <p>Candidates that are filtered out, flagged as countries, or already have a chosen
 * place are left untouched. For every other candidate each of its places is passed to
 * {@code evaluate(candidate, place)}, then {@code choose()} is invoked. When a place was
 * chosen, confidence is assessed and the choice is stored in
 * {@code documentResolvedLocations} under the candidate's normalized text; otherwise the
 * name is logged as ambiguous.
 *
 * @param names the place candidates to resolve; candidates are modified in place
 */
public void evaluate(List<PlaceCandidate> names) {
    // INPUTS:
    //    histogram of country mentions
    //    resolved/relevant provinces (PlaceEvidence)
    //    resolved/relevant locations attached to places (PlaceEvidence)
    //
    // MEASURES:
    //    # of distinct countries == density, focus.  Is this document about one or two
    //    countries, or is it a world news report on everything.
    //
    countryContext = countryObserver.countryMentionCount();
    boundaryContext = boundaryObserver.placeMentionCount();

    /* TODO:  DEBUG through location chooser using histograms
     * of found and resolved place metadata.
     */
    if (log.isDebugEnabled()) {
        debuggingHistograms(names);
    }

    for (PlaceCandidate candidate : names) {
        // Nothing to do for filtered names, countries, or names that are already resolved.
        final boolean skip = candidate.isFilteredOut() || candidate.isCountry || candidate.getChosen() != null;
        if (skip) {
            continue;
        }

        for (Place location : candidate.getPlaces()) {
            evaluate(candidate, location);
        }
        candidate.choose();

        if (candidate.getChosen() == null) {
            log.info("Place name is ambiguous: {} in N={} places", candidate.getText(), candidate.distinctLocationCount());
            continue;
        }
        this.assessConfidence(candidate);
        documentResolvedLocations.put(candidate.getTextnorm(), candidate.getChosen());
    }
}
Also used : Place(org.opensextant.data.Place) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 5 with PlaceCandidate

use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.

the class NameCodeRule method evaluate.

/**
 * Looks for "NAME, CODE" pairs among adjacent candidates and, when the code names an
 * administrative area that contains the name, records that as evidence on the name.
 *
 * <p>Each candidate is paired with the one that immediately follows it in the list. A pair
 * is ignored when either member is filtered out, when the name is a country, when the gap
 * between the end of the name and the start of the code exceeds {@code MAX_CHAR_DIST}, or
 * when the code is lowercase and shorter than four characters. For every administrative
 * place of the code that matches lexically and is connected to the name, a PlaceEvidence is
 * added to the name and the scores of the name's matching places are incremented.
 *
 * <p>Requirement from the original author: the list of place candidates is a linked list.
 * Pairs are therefore walked with the list's iterator instead of indexed {@code get}
 * calls, which would rescan a linked list for every pair.
 *
 * @param names the candidates to inspect, presumably in document order (confirm against the
 *              caller); candidates are modified in place
 */
@Override
public void evaluate(final List<PlaceCandidate> names) {
    PlaceCandidate previous = null;
    boolean havePrevious = false;
    for (PlaceCandidate code : names) {
        final PlaceCandidate name = previous;
        previous = code;
        if (!havePrevious) {
            // The first element has no predecessor to pair with.
            havePrevious = true;
            continue;
        }
        if (name.isFilteredOut() || code.isFilteredOut()) {
            continue;
        }
        /*
         * COUNTRY, STATE is not supported under this rule.
         * E.g., Uruguay, Argentina ... This looks like a list of countries
         * However Uruguay is a district in Argentina; Just as Georgia is a state in US
         * and also a country name.
         */
        if (name.isCountry) {
            continue;
        }
        /*
         * Test if SOMENAME, CODE is the case. a1.....a2.b1.., where b1 > a2
         * > a1, but distance is minimal from end of name to start of code.
         */
        if ((code.start - name.end) > MAX_CHAR_DIST) {
            continue;
        }
        /*
         * Not supporting lowercase codes/abbreviations.  'la', 'is', 'un', etc.
         */
        if (code.isLower() && code.getText().length() < 4) {
            continue;
        }
        // Proximity is one factor, but conventional format should weigh more.
        // The length check keeps an empty token array from raising an index error.
        boolean comma = name.getPostmatchTokens() != null
                && name.getPostmatchTokens().length > 0
                && ",".equals(name.getPostmatchTokens()[0]);
        /*
         * by this point a place name tag should be marked as a name or
         * code/abbrev. Match the abbreviation with a geographic location
         * that is a state, county, district, etc.
         */
        Place country = code.isCountry ? code.getChosen() : null;
        log.debug("{} name, code: {} in {}?", NAME, name.getText(), code.getText());
        for (Place geo : code.getPlaces()) {
            // Provinces, states, districts, etc. Only.
            if (!geo.isAdministrative() || geo.getCountryCode() == null) {
                continue;
            }
            // Make sure you can match a province name or code with the gazetteer entries found:
            //   Boston, Ma.  ==== for 'Ma', resolve to an abbreviation for Massachusetts
            //                     Ignore places called 'Ma'
            //
            // Place ('Ma') == will have gazetteer metadata indicating if this is a valid abbreviated code for a place.
            // PlaceCandidate('Ma.') will have textual metadata from given text indicating if it is a code, MA, or abbrev. 'Ma.'
            //
            // These two situations must match here.   We ignore geo locations that do not fit this profile.
            boolean lexicalMatch = ((code.isAbbreviation && geo.isAbbreviation()) || (!code.isAbbreviation && !geo.isAbbreviation()));
            if (!lexicalMatch) {
                continue;
            }
            String adm1 = geo.getHierarchicalPath();
            if (adm1 == null && !code.isCountry) {
                log.debug("ADM1 hierarchical path should not be null");
                continue;
            }
            // Quick determination if these two places have a containment or geopolitical connection
            boolean contains = name.presentInHierarchy(adm1)
                    || (country != null && name.presentInCountry(country.getCountryCode()));
            if (!contains) {
                continue;
            }
            /*   CITY, STATE
             *   CITY, COUNTRY
             */
            // Associate the CODE to the NAME that precedes it.
            PlaceEvidence ev = new PlaceEvidence();
            ev.setCountryCode(geo.getCountryCode());
            ev.setAdmin1(geo.getAdmin1());
            // Shunt. Evaluate this rule here.
            ev.setEvaluated(true);
            // A comma right after the name adds 2 to the weight; an abbreviated or acronym
            // code that matches an abbreviated gazetteer entry adds 1 more.
            int wt = weight + (comma ? 2 : 0);
            if (geo.isAbbreviation() && (code.isAbbreviation || code.isAcronym)) {
                ev.setRule(NAME_ADMCODE_RULE);
                ev.setWeight(wt + 1);
            } else {
                ev.setRule(NAME_ADMNAME_RULE);
                ev.setWeight(wt);
            }
            name.addEvidence(ev);
            if (boundaryObserver != null) {
                boundaryObserver.boundaryLevel1InScope(geo);
            }
            // Boost each of the name's places that shares the code's hierarchical path or country.
            for (Place nameGeo : name.getPlaces()) {
                if (!(nameGeo.isPopulated() || nameGeo.isAdministrative() || nameGeo.isSpot())) {
                    continue;
                }
                if ((adm1 != null && adm1.equals(nameGeo.getHierarchicalPath())) || sameCountry(nameGeo, country)) {
                    name.incrementPlaceScore(nameGeo, ev.getWeight());
                }
            }
        }
    }
}
Also used : Place(org.opensextant.data.Place) PlaceEvidence(org.opensextant.extractors.geo.PlaceEvidence) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Aggregations

PlaceCandidate (org.opensextant.extractors.geo.PlaceCandidate): 13, Place (org.opensextant.data.Place): 8, GeocoordMatch (org.opensextant.extractors.xcoord.GeocoordMatch): 4, TextMatch (org.opensextant.extraction.TextMatch): 3, TaxonMatch (org.opensextant.extractors.xtax.TaxonMatch): 3, URL (java.net.URL): 2, JSONObject (org.json.JSONObject): 2, HashSet (java.util.HashSet): 1, TreeSet (java.util.TreeSet): 1, JSONObject (net.sf.json.JSONObject): 1, JSONArray (org.json.JSONArray): 1, JSONException (org.json.JSONException): 1, Test (org.junit.Test): 1, ConfigException (org.opensextant.ConfigException): 1, Country (org.opensextant.data.Country): 1, Taxon (org.opensextant.data.Taxon): 1, MatchFilter (org.opensextant.extraction.MatchFilter): 1, CountryCount (org.opensextant.extractors.geo.CountryCount): 1, GazetteerMatcher (org.opensextant.extractors.geo.GazetteerMatcher): 1, PlaceCount (org.opensextant.extractors.geo.PlaceCount): 1