Search in sources :

Example 1 with Place

use of org.opensextant.data.Place in project Xponents by OpenSextant.

The method parseAnnotation of the class Transforms.

/**
 * Builds a Xponents {@link TextMatch} from the JSON form of a single annotation.
 * <p>
 * The annotation's {@code "type"} selects the concrete match class:
 * {@code "place"} and {@code "country"} yield a {@link PlaceCandidate},
 * {@code "coordinate"} yields a {@link GeocoordMatch}, and {@code "person"},
 * {@code "org"} and {@code "taxon"} yield a {@link TaxonMatch}. Data parsed from
 * JSON/REST representations is far more limited than what the Java API
 * processing routines provide directly.
 *
 * @param data the annotation; anything other than a {@link JSONObject} is ignored
 * @return the populated match with its type, start and end offsets set, or
 *         {@code null} when {@code data} is not a {@link JSONObject}
 * @throws JSONException if the annotation type is not one of the supported values,
 *         or if reading one of the keys "type", "matchtext", "offset", "length"
 *         (and "cc" for countries) fails
 */
public static TextMatch parseAnnotation(Object data) throws JSONException {
    if (!(data instanceof JSONObject)) {
        return null;
    }
    final JSONObject json = (JSONObject) data;
    final String kind = json.getString("type");
    final String matchText = json.getString("matchtext");
    final TextMatch result;
    switch (kind) {
        case "place": {
            PlaceCandidate candidate = new PlaceCandidate();
            Place location = new Place();
            candidate.setText(matchText);
            Transforms.parseGeocoding(location, json);
            candidate.setConfidence(json.optInt("confidence", -1));
            candidate.choose(location);
            result = candidate;
            break;
        }
        case "coordinate": {
            GeocoordMatch coordinate = new GeocoordMatch();
            Place parsedLocation = new Place();
            coordinate.setText(matchText);
            // The coordinate is not parsed directly; the generic geocoding parser
            // fills a Place, which is then copied onto the match.
            Transforms.parseGeocoding(parsedLocation, json);
            coordinate.setLatLon(parsedLocation);
            coordinate.setMethod(parsedLocation.getMethod());
            // TODO: GeocoordMatch needs setters for Geocoding here; the reverse
            // geocoding info (cc, adm1) is not carried over.
            result = coordinate;
            break;
        }
        case "country": {
            PlaceCandidate candidate = new PlaceCandidate();
            Place country = new Place();
            candidate.setText(matchText);
            country.setName(matchText);
            candidate.setConfidence(json.optInt("confidence", -1));
            country.setCountryCode(json.getString("cc"));
            candidate.isCountry = true;
            candidate.choose(country);
            result = candidate;
            break;
        }
        case "person": {
            TaxonMatch taxonMatch = new TaxonMatch();
            Transforms.parseTaxon(taxonMatch, "person", json);
            result = taxonMatch;
            break;
        }
        case "org": {
            TaxonMatch taxonMatch = new TaxonMatch();
            Transforms.parseTaxon(taxonMatch, "org", json);
            result = taxonMatch;
            break;
        }
        case "taxon": {
            TaxonMatch taxonMatch = new TaxonMatch();
            Transforms.parseTaxon(taxonMatch, "taxon", json);
            result = taxonMatch;
            break;
        }
        default:
            throw new JSONException("Unknown Annotation " + kind);
    }
    // Fields common to every annotation type.
    result.setType(kind);
    result.start = json.getInt("offset");
    result.end = result.start + json.getInt("length");
    return result;
}
Also used : GeocoordMatch(org.opensextant.extractors.xcoord.GeocoordMatch) JSONObject(org.json.JSONObject) JSONException(org.json.JSONException) TextMatch(org.opensextant.extraction.TextMatch) TaxonMatch(org.opensextant.extractors.xtax.TaxonMatch) Place(org.opensextant.data.Place) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 2 with Place

use of org.opensextant.data.Place in project Xponents by OpenSextant.

The method format of the class XponentsGeotagger.

/**
 * Serializes extraction results into the JSON response body.
 * <p>
 * The response has two members: {@code "response"} (status and the number of
 * tags found) and {@code "annotations"} (one JSON node per reported match).
 * Taxon matches are reported as "taxon", "org" or "person"; geographic matches
 * are reported as "coordinate", "country" or "place".
 *
 * @param matches   all matches found in the text
 * @param jobParams request options; {@code output_taxons} controls whether taxon
 *                  matches are emitted
 * @return a UTF-8 JSON representation of the results
 * @throws JSONException if building the JSON content fails
 */
private Representation format(List<TextMatch> matches, RequestParameters jobParams) throws JSONException {
    int tagCount = 0;
    JSONObject resultContent = new JSONObject();
    JSONObject resultMeta = new JSONObject();
    resultMeta.put("status", "ok");
    resultMeta.put("numfound", 0);
    JSONArray resultArray = new JSONArray();
    /*
     * Super loop: iterate through all found entities. Record taxons as
     * person or org; record geo tags as country, place, or coordinate
     * (geocoded place or parsed coordinate such as MGRS, DMS, etc.)
     */
    for (TextMatch name : matches) {
        /*
         * ==========================
         * ANNOTATIONS: non-geographic entities that are filtered out, but worth tracking
         * ==========================
         */
        if (name instanceof TaxonMatch) {
            if (jobParams.output_taxons) {
                TaxonMatch match = (TaxonMatch) name;
                ++tagCount;
                // Only the first taxon of the match is reported (note the break).
                for (Taxon n : match.getTaxons()) {
                    JSONObject node = populateMatch(name);
                    String t = "taxon";
                    String taxon_name = n.name.toLowerCase();
                    if (taxon_name.startsWith("org.")) {
                        t = "org";
                    } else if (taxon_name.startsWith("person.")) {
                        t = "person";
                    }
                    node.put("type", t);
                    // Name of taxon
                    node.put("taxon", n.name);
                    // Name of catalog or source
                    node.put("catalog", n.catalog);
                    resultArray.put(node);
                    break;
                }
            }
            continue;
        }
        // Ignore filtered-out matches and anything that is not a place or coordinate.
        if (name.isFilteredOut() || !(name instanceof PlaceCandidate || name instanceof GeocoordMatch)) {
            continue;
        }
        JSONObject node = populateMatch(name);
        /*
         * ==========================
         * ANNOTATIONS: coordinates
         * ==========================
         */
        if (name instanceof GeocoordMatch) {
            ++tagCount;
            GeocoordMatch geo = (GeocoordMatch) name;
            node.put("type", "coordinate");
            Transforms.createGeocoding(geo, node);
            resultArray.put(node);
            continue;
        }
        /*
         * ==========================
         * ANNOTATIONS: countries, places, etc.
         * ==========================
         */
        PlaceCandidate place = (PlaceCandidate) name;
        Place resolvedPlace = place.getChosen();
        if (resolvedPlace == null) {
            // A candidate may have no chosen location (e.g. an ambiguous name);
            // there is nothing to geocode, so skip it rather than fail the response.
            debug("No location chosen for " + name.getText());
            continue;
        }
        ++tagCount;
        if (place.isCountry) {
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "country");
            node.put("cc", resolvedPlace.getCountryCode());
            node.put("confidence", place.getConfidence());
        } else {
            Transforms.createGeocoding(resolvedPlace, node);
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "place");
            node.put("confidence", place.getConfidence());
            // Low-confidence places are still emitted, but flagged.
            // NOTE(review): an earlier comment here said confidence 20 or greater
            // is required for geocoding, while the code flags <= 10 -- confirm
            // the intended threshold.
            if (place.getConfidence() <= 10) {
                node.put("filtered-out", true);
            }
        }
        resultArray.put(node);
    }
    resultMeta.put("numfound", tagCount);
    resultContent.put("response", resultMeta);
    resultContent.put("annotations", resultArray);
    Representation result = new JsonRepresentation(resultContent.toString(2));
    result.setCharacterSet(CharacterSet.UTF_8);
    return result;
}
Also used : GeocoordMatch(org.opensextant.extractors.xcoord.GeocoordMatch) JSONObject(org.json.JSONObject) Taxon(org.opensextant.data.Taxon) JSONArray(org.json.JSONArray) Representation(org.restlet.representation.Representation) JsonRepresentation(org.restlet.ext.json.JsonRepresentation) TextMatch(org.opensextant.extraction.TextMatch) TaxonMatch(org.opensextant.extractors.xtax.TaxonMatch) JsonRepresentation(org.restlet.ext.json.JsonRepresentation) Place(org.opensextant.data.Place) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 3 with Place

use of org.opensextant.data.Place in project Xponents by OpenSextant.

The method evaluate of the class LocationChooserRule.

/**
 * Walks the entire list and tries to choose a location for each place name
 * that is not filtered out, is not a country, and has no location chosen yet.
 * <p>
 * Names that end up with a chosen location get a confidence assessment and are
 * recorded in {@code documentResolvedLocations} under their normalized text;
 * names that remain without a chosen location are logged as ambiguous.
 *
 * @param names the place name candidates to resolve
 */
public void evaluate(List<PlaceCandidate> names) {
    // INPUTS:
    //    histogram of country mentions
    //    resolved/relevant provinces (PlaceEvidence)
    //    resolved/relevant locations attached to places (PlaceEvidence)
    //
    // MEASURES:
    //    number of distinct countries == density, focus. Is this document about
    //    one or two countries, or is it a world news report on everything?
    //
    countryContext = countryObserver.countryMentionCount();
    boundaryContext = boundaryObserver.placeMentionCount();

    // TODO: DEBUG through location chooser using histograms of found and
    // resolved place metadata.
    if (log.isDebugEnabled()) {
        debuggingHistograms(names);
    }

    for (PlaceCandidate candidate : names) {
        // Nothing to do for filtered-out names, countries, or names already resolved.
        boolean skip = candidate.isFilteredOut() || candidate.isCountry || candidate.getChosen() != null;
        if (skip) {
            continue;
        }

        // Score every possible location, then let the candidate pick one.
        for (Place location : candidate.getPlaces()) {
            evaluate(candidate, location);
        }
        candidate.choose();

        if (candidate.getChosen() == null) {
            log.info("Place name is ambiguous: {} in N={} places", candidate.getText(), candidate.distinctLocationCount());
            continue;
        }
        this.assessConfidence(candidate);
        documentResolvedLocations.put(candidate.getTextnorm(), candidate.getChosen());
    }
}
Also used : Place(org.opensextant.data.Place) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 4 with Place

use of org.opensextant.data.Place in project Xponents by OpenSextant.

The method evaluate of the class NameCodeRule.

/**
 * Looks for "NAME, CODE" patterns among adjacent place candidates and, where the
 * code resolves to an administrative place that contains the name, adds evidence
 * to the name and raises the score of its matching locations.
 * <p>
 * Requirement: the list of place candidates is in sequence, so that element
 * {@code x + 1} is the candidate that follows element {@code x} in the text.
 *
 * @param names the place candidates, evaluated as consecutive (name, code) pairs
 */
@Override
public void evaluate(final List<PlaceCandidate> names) {
    for (int x = 0; x < names.size() - 1; ++x) {
        PlaceCandidate name = names.get(x);
        PlaceCandidate code = names.get(x + 1);
        if (name.isFilteredOut() || code.isFilteredOut()) {
            continue;
        }
        /*
         * COUNTRY, STATE is not supported under this rule.
         * E.g., Uruguay, Argentina ... This looks like a list of countries.
         * However Uruguay is a district in Argentina; just as Georgia is a state in US
         * and also a country name.
         */
        if (name.isCountry) {
            continue;
        }
        /*
         * Test if SOMENAME, CODE is the case. a1.....a2.b1.., where b1 > a2
         * > a1, but distance is minimal from end of name to start of code.
         */
        if ((code.start - name.end) > MAX_CHAR_DIST) {
            continue;
        }
        /*
         * Not supporting lowercase codes/abbreviations. 'la', 'is', 'un', etc.
         */
        if (code.isLower() && code.getText().length() < 4) {
            continue;
        }
        // Proximity is one factor, but conventional format ("NAME, CODE") should weigh more.
        // Guard against an empty token array as well as a missing one.
        String[] postTokens = name.getPostmatchTokens();
        boolean comma = postTokens != null && postTokens.length > 0 && ",".equals(postTokens[0]);
        /*
         * By this point a place name tag should be marked as a name or
         * code/abbrev. Match the abbreviation with a geographic location
         * that is a state, county, district, etc.
         */
        Place country = code.isCountry ? code.getChosen() : null;
        log.debug("{} name, code: {} in {}?", NAME, name.getText(), code.getText());
        for (Place geo : code.getPlaces()) {
            // Provinces, states, districts, etc. only.
            if (!geo.isAdministrative() || geo.getCountryCode() == null) {
                continue;
            }
            // Make sure a province name or code matches the gazetteer entries found:
            //   Boston, Ma.  ==== for 'Ma', resolve to an abbreviation for Massachusetts;
            //                     ignore places called 'Ma'.
            //
            // Place ('Ma') has gazetteer metadata indicating if this is a valid abbreviated code for a place.
            // PlaceCandidate('Ma.') has textual metadata from the given text indicating if it is a code, MA, or abbrev. 'Ma.'
            //
            // These two situations must agree; geo locations that do not fit this profile are ignored.
            boolean lexicalMatch = code.isAbbreviation == geo.isAbbreviation();
            if (!lexicalMatch) {
                continue;
            }
            String adm1 = geo.getHierarchicalPath();
            if (adm1 == null && !code.isCountry) {
                log.debug("ADM1 hierarchical path should not be null");
                continue;
            }
            // Quick determination if these two places have a containment or geopolitical connection.
            boolean contains = name.presentInHierarchy(adm1)
                    || (country != null && name.presentInCountry(country.getCountryCode()));
            if (!contains) {
                continue;
            }
            /*   CITY, STATE
             *   CITY, COUNTRY
             */
            // Associate the CODE to the NAME that precedes it.
            PlaceEvidence ev = new PlaceEvidence();
            ev.setCountryCode(geo.getCountryCode());
            ev.setAdmin1(geo.getAdmin1());
            // Shunt. Evaluate this rule here.
            ev.setEvaluated(true);
            // A comma between name and code adds to the base rule weight.
            int wt = weight + (comma ? 2 : 0);
            if (geo.isAbbreviation() && (code.isAbbreviation || code.isAcronym)) {
                ev.setRule(NAME_ADMCODE_RULE);
                ev.setWeight(wt + 1);
            } else {
                ev.setRule(NAME_ADMNAME_RULE);
                ev.setWeight(wt);
            }
            name.addEvidence(ev);
            if (boundaryObserver != null) {
                boundaryObserver.boundaryLevel1InScope(geo);
            }
            // Boost each candidate location of NAME that lies in the same
            // ADM1 hierarchy or the same country as the code.
            for (Place nameGeo : name.getPlaces()) {
                if (!(nameGeo.isPopulated() || nameGeo.isAdministrative() || nameGeo.isSpot())) {
                    continue;
                }
                if (adm1 != null && adm1.equals(nameGeo.getHierarchicalPath())) {
                    name.incrementPlaceScore(nameGeo, ev.getWeight());
                } else if (sameCountry(nameGeo, country)) {
                    name.incrementPlaceScore(nameGeo, ev.getWeight());
                }
            }
        }
    }
}
Also used : Place(org.opensextant.data.Place) PlaceEvidence(org.opensextant.extractors.geo.PlaceEvidence) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 5 with Place

use of org.opensextant.data.Place in project Xponents by OpenSextant.

The method main of the class TestGazetteer.

/**
 * Do a basic test -- this main program makes use of the default JVM argument for Solr:
 * -Dopensextant.solr=/path/to/solr
 * <p>
 * Lists the gazetteer's countries, reports which country names also occur as
 * names of non-country places, then runs a few proximity searches. Any exception
 * is caught and its stack trace printed; the gazetteer is shut down and the JVM
 * exits in all cases.
 *
 * @param args the arguments (not used)
 */
public static void main(String[] args) {
    try {
        GeonamesUtility geodataUtil = new GeonamesUtility();
        Country aus = geodataUtil.getCountry("AUS");
        System.out.println("Got Australia..." + aus);
        gaz = new SolrGazetteer();
        // Try to get countries
        Map<String, Country> countries = gaz.getCountries();
        for (Country c : countries.values()) {
            System.out.println(c.getKey() + " = " + c.name + "\t  Aliases: " + c.getAliases().toString());
        }
        /*
         * This test organizes country names to see if there are any country names
         * that are unique.
         */
        // Keyed by the lower-cased, diacritic-free country name; the value says
        // whether a non-country place with the same name was found.
        Map<String, Boolean> done = new TreeMap<>();
        for (Country C : geodataUtil.getCountries()) {
            String q = String.format("name:%s AND -feat_code:PCL* AND -feat_code:TERR", C.getName());
            List<Place> country_name_matches = gaz.search(q, true);
            String cname = TextUtils.removeDiacritics(C.getName()).toLowerCase();
            done.put(cname, false);
            for (Place p : country_name_matches) {
                String pname = TextUtils.removeDiacritics(p.getName());
                if (pname.equalsIgnoreCase(cname)) {
                    // Record the hit under the same normalized key that was
                    // registered above, so names with diacritics update the
                    // country's entry instead of creating a second one.
                    done.put(cname, true);
                    break;
                }
            }
        }
        for (Map.Entry<String, Boolean> entry : done.entrySet()) {
            System.out.println(String.format("\"%s\", Has Duplicates:", entry.getKey()) + entry.getValue());
        }
        // US
        testPlacesAt(44, -118, 25 /*km*/, "P");
        // East Asia
        testPlacesAt(44, 118, 100 /*km*/, "A");
        // Europe
        testPlacesAt(44, 0, 250 /*km*/, "A");
        // Europe
        testPlacesAt(44, 0, 10 /*km*/, "P");
    } catch (Exception err) {
        err.printStackTrace();
    } finally {
        // gaz is still unset if setup failed before the gazetteer was created;
        // an exception thrown here would also prevent the exit below.
        if (gaz != null) {
            gaz.shutdown();
        }
        System.exit(0);
    }
}
Also used : SolrGazetteer(org.opensextant.extractors.geo.SolrGazetteer) GeonamesUtility(org.opensextant.util.GeonamesUtility) ArrayList(java.util.ArrayList) TreeMap(java.util.TreeMap) SolrServerException(org.apache.solr.client.solrj.SolrServerException) Country(org.opensextant.data.Country) Place(org.opensextant.data.Place)

Aggregations

Place (org.opensextant.data.Place)25 PlaceCandidate (org.opensextant.extractors.geo.PlaceCandidate)8 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 Test (org.junit.Test)3 LatLon (org.opensextant.data.LatLon)3 GeocoordMatch (org.opensextant.extractors.xcoord.GeocoordMatch)3 BufferedReader (java.io.BufferedReader)2 InputStreamReader (java.io.InputStreamReader)2 ParseException (java.text.ParseException)2 HashMap (java.util.HashMap)2 SolrServerException (org.apache.solr.client.solrj.SolrServerException)2 JSONObject (org.json.JSONObject)2 Country (org.opensextant.data.Country)2 TextMatch (org.opensextant.extraction.TextMatch)2 PlaceCount (org.opensextant.extractors.geo.PlaceCount)2 TaxonMatch (org.opensextant.extractors.xtax.TaxonMatch)2 GeonamesUtility (org.opensextant.util.GeonamesUtility)2 Reader (java.io.Reader)1 Date (java.util.Date)1