Search in sources :

Example 6 with PlaceCandidate

use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.

the class PersonNameFilter method evaluateNamedEntities.

/**
 * Negate place candidates that are really mentions of persons or
 * organizations found in the same document.
 *
 * <pre>
 * Hillary Clinton visited New York state today.
 * </pre>
 *
 * Here "Clinton" is part of a person name and does not refer to Clinton, NY.
 * Candidates that are already filtered out, or that are countries, are left
 * untouched. Every other candidate is checked in this order:
 * <ol>
 * <li>its normalized text is already a key of {@code resolvedPersons} or
 * {@code resolvedOrgs}: filter it and stop;</li>
 * <li>it lies within a person match: filter it and remember the
 * normalized text in {@code resolvedPersons};</li>
 * <li>it is the same match as an org: filter it and remember the
 * normalized text in {@code resolvedOrgs};</li>
 * <li>it is a single word lying within an org match: filter this one
 * instance only and tag it with {@code NAME_IN_ORG_RULE}.</li>
 * </ol>
 *
 * @param placeNames
 *            places to negate
 * @param persons
 *            named persons in doc
 * @param orgs
 *            named orgs in doc
 */
public void evaluateNamedEntities(final List<PlaceCandidate> placeNames, final List<TaxonMatch> persons, final List<TaxonMatch> orgs) {
    for (final PlaceCandidate candidate : placeNames) {
        final boolean skip = candidate.isFilteredOut() || candidate.isCountry;
        if (skip) {
            continue;
        }

        // Name already resolved earlier as a person/celebrity or as an org?
        final String resolvedRule;
        if (resolvedPersons.containsKey(candidate.getTextnorm())) {
            resolvedRule = "ResolvedPerson";
        } else if (resolvedOrgs.containsKey(candidate.getTextnorm())) {
            resolvedRule = "ResolvedOrg";
        } else {
            resolvedRule = null;
        }
        if (resolvedRule != null) {
            candidate.setFilteredOut(true);
            candidate.addRule(resolvedRule);
            continue;
        }

        // Place name that sits inside a person name.
        for (final TaxonMatch person : persons) {
            if (!candidate.isWithin(person)) {
                continue;
            }
            candidate.setFilteredOut(true);
            resolvedPersons.put(candidate.getTextnorm(), person.getText());
            candidate.addRule("ResolvedPerson");
        }

        for (final TaxonMatch org : orgs) {
            if (candidate.isSameMatch(org)) {
                // The org match is exactly this name, where the name is also a city.
                candidate.setFilteredOut(true);
                resolvedOrgs.put(candidate.getTextnorm(), org.getText());
                candidate.addRule("ResolvedOrg");
                continue;
            }

            final boolean insideOrg = candidate.isWithin(org) && !candidate.isCountry;
            if (insideOrg && !candidate.getTextnorm().contains(" ")) {
                // Org is 'text name text' where 'name' is a single-word city.
                // Filter out just this instance and deliberately do NOT record
                // it in resolvedOrgs: if the name occurs on its own later, it
                // is likely the locality in which that organization exists.
                //   "Detroit City Council" -- an org; filter this hit only.
                //   "Detroit"              -- later in the same doc, not an org.
                candidate.setFilteredOut(true);
                candidate.addRule(NAME_IN_ORG_RULE);
            }
        }
    }
}
Also used : TaxonMatch(org.opensextant.extractors.xtax.TaxonMatch) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 7 with PlaceCandidate

use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.

the class ContextualOrganizationRule method evaluate.

/**
 * Restore place candidates that were filtered out because their name
 * appeared inside an organization name, when one of their candidate
 * locations is mentioned elsewhere in the document.
 *
 * Does nothing when {@code isRelevant()} is false. Only candidates carrying
 * {@code PersonNameFilter.NAME_IN_ORG_RULE} that are currently filtered out
 * are considered in the first pass. In the second pass, any candidate that is
 * still filtered out and shares its normalized text with a restored candidate
 * is restored as well.
 *
 * @param names
 *            place candidates to re-evaluate; modified in place
 */
@Override
public void evaluate(List<PlaceCandidate> names) {
    if (!isRelevant()) {
        return;
    }
    for (PlaceCandidate name : names) {
        if (!name.hasRule(PersonNameFilter.NAME_IN_ORG_RULE)) {
            continue;
        }
        log.debug(" City Name in Org Name? {}", name);
        if (!name.isFilteredOut()) {
            continue;
        }
        // Restore the candidate if any of its locations is one that is
        // mentioned elsewhere in the document.
        for (Place geo : name.getPlaces()) {
            if (boundaryObserver.placeMentionCount().containsKey(geo.getHierarchicalPath())) {
                name.setFilteredOut(false);
                name.addRule("ContextualOrg");
                reEval.add(name.getTextnorm());
                // One matching location is enough. The previous trailing
                // 'continue' was a no-op that kept scanning and repeated the
                // same updates for every further matching location.
                break;
            }
        }
    }
    /*
     * Re-evaluate items that may have been filtered because the name appeared
     * in an organization name where the org name was not necessarily
     * geographically relevant until now.
     */
    for (PlaceCandidate name : names) {
        if (name.isFilteredOut() && reEval.contains(name.getTextnorm())) {
            name.setFilteredOut(false);
            name.addRule("ContextualOrg.Relation");
        }
    }
}
Also used : Place(org.opensextant.data.Place) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 8 with PlaceCandidate

use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.

the class LocationChooserRule method debuggingHistograms.

/**
 * Assemble and log document-level tallies for debugging. Evidence breaks
 * down into concrete locations vs. inferred ones.
 *
 * Three things happen, in order:
 * <ul>
 * <li>every candidate that is not filtered out is tallied by normalized
 * name in {@code namespace};</li>
 * <li>each entry of {@code countryContext} is logged at debug level;</li>
 * <li>each entry of {@code boundaryContext} is logged and its country code
 * is tallied in {@code inferredCountries}.</li>
 * </ul>
 *
 * @param names
 *            place candidates for the document
 */
private void debuggingHistograms(List<PlaceCandidate> names) {
    /*
     * TODO: Is this histogram helpful?
     * It tracks the uniqueness or popularity of a given name.
     */
    for (PlaceCandidate candidate : names) {
        if (!candidate.isFilteredOut()) {
            final String key = candidate.getTextnorm();
            PlaceCount tally = namespace.get(key);
            if (tally != null) {
                // Seen before: bump the existing tally.
                ++tally.count;
                continue;
            }
            // First sighting: create the entry; its count is not incremented here.
            tally = new PlaceCount();
            tally.place = new Place(key, key);
            tally.total = names.size();
            namespace.put(key, tally);
        }
    }

    for (CountryCount countryTally : countryContext.values()) {
        log.debug("Country: {}", countryTally);
    }

    for (PlaceCount boundary : boundaryContext.values()) {
        log.debug("Boundary: {}", boundary);
        final String countryCode = boundary.place.getCountryCode();
        CountryCount inferred = inferredCountries.get(countryCode);
        if (inferred != null) {
            ++inferred.count;
            continue;
        }
        inferred = new CountryCount();
        inferred.country = new Country(countryCode, countryCode);
        inferredCountries.put(countryCode, inferred);
    }

    log.debug("Places: {}/{}", namespace.size(), namespace);
}
Also used : PlaceCount(org.opensextant.extractors.geo.PlaceCount) CountryCount(org.opensextant.extractors.geo.CountryCount) Country(org.opensextant.data.Country) Place(org.opensextant.data.Place) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 9 with PlaceCandidate

use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.

the class NameRule method evaluate.

/**
 * Boost the score of locations whose feature type agrees with a descriptive
 * word in the matched name.
 *
 * A candidate is skipped when it is filtered out, already has a chosen
 * place, or its normalized text is shorter than 10 characters. The first
 * word is looked up in {@code P_prefixes}; the last word is looked up in
 * {@code A1_suffixes} and {@code A2_suffixes}. For every location not
 * rejected by {@code filterOutBySize}, the first applicable rule
 * ({@code CITY}, {@code ADM1}, {@code ADM2}) is recorded and that
 * location's score is incremented by 1.0.
 *
 * @param names
 *            place candidates to evaluate; modified in place
 */
public void evaluate(List<PlaceCandidate> names) {
    for (PlaceCandidate candidate : names) {
        // Ignore anything already filtered or resolved, and names too short
        // to carry a descriptive prefix or suffix.
        if (candidate.isFilteredOut() || candidate.getChosen() != null || candidate.getTextnorm().length() < 10) {
            continue;
        }

        final String[] tokens = candidate.getTextnorm().split(" ");
        final String firstWord = tokens[0];
        final String lastWord = tokens[tokens.length - 1];

        final boolean isPlace = P_prefixes.contains(firstWord);
        final boolean isAdmin1 = A1_suffixes.contains(lastWord);
        final boolean isAdmin2 = A2_suffixes.contains(lastWord);
        if (!(isPlace || isAdmin1 || isAdmin2)) {
            // Rule does not apply.
            continue;
        }

        for (Place geo : candidate.getPlaces()) {
            if (filterOutBySize(candidate, geo)) {
                continue;
            }
            if (isPlace && geo.isPopulated()) {
                candidate.addRule(CITY);
            } else if (isAdmin1 && geo.isAdmin1()) {
                candidate.addRule(ADM1);
            } else if (isAdmin2 && geo.isAdministrative()) {
                candidate.addRule(ADM2);
            } else {
                // No agreement between the name's descriptor and this location.
                continue;
            }
            candidate.incrementPlaceScore(geo, 1.0);
        }
    }
}
Also used : Place(org.opensextant.data.Place) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 10 with PlaceCandidate

use of org.opensextant.extractors.geo.PlaceCandidate in project Xponents by OpenSextant.

the class NonsenseFilter method evaluate.

/**
 * Evaluate each candidate name and filter out those that look like nonsense.
 *
 * <pre>
 * doo doo      - FAIL
 * St. Paul     - PASS
 * south"  bend - FAIL
 * </pre>
 *
 * Candidates longer than MAX_NONSENSE_PHRASE_LEN are not examined. For the
 * rest, the checks run in this order and mark the candidate as filtered out
 * with the named rule:
 * <ul>
 * <li>"Nonsense,Numbers" - shorter than GENERIC_ONE_WORD and the text
 * matches the trivialNumerics pattern;</li>
 * <li>"Nonsense,Punct" - irregularPunctPatterns() reports a hit;</li>
 * <li>"Nonsense,Repeated,Lower" - lower-case text with a repeated token;</li>
 * <li>"Nonsense,Mismatched,Diacritic" - no longer than GENERIC_ONE_WORD and
 * none of its locations passes the diacritic/containment/phonetic tests.</li>
 * </ul>
 *
 * @param names
 *            place candidates to evaluate; modified in place
 */
@Override
public void evaluate(List<PlaceCandidate> names) {
    for (PlaceCandidate p : names) {
        /*
         * Is it nonsense?
         * For phrases up to MAX chars long:
         * + does it contain irregular punctuation?
         *   //  "...in the south. Bend it backwards...";
         *   // South Bend is not intended there.
         *
         * + does it contain a repeated syllable or word?
         *   // "doo doo", "bah bah" "to to"
         */
        if (p.getLength() > MAX_NONSENSE_PHRASE_LEN) {
            continue;
        }
        /*
         * Short words, with numerics. Approximately one word.
         * NOTE(review): this test is strict (<) while the diacritic test
         * further down uses <= GENERIC_ONE_WORD -- confirm that is intended.
         */
        if (p.getLength() < GENERIC_ONE_WORD) {
            if (trivialNumerics.matcher(p.getText()).matches()) {
                p.setFilteredOut(true);
                p.addRule("Nonsense,Numbers");
                continue;
            }
        }
        // Irregular punctuation inside the matched text.
        if (irregularPunctPatterns(p.getText())) {
            p.setFilteredOut(true);
            p.addRule("Nonsense,Punct");
            continue;
        }
        // Lower-case phrase: flag it as soon as any token repeats.
        if (p.isLower()) {
            String[] wds = tokenizer.split(p.getTextnorm());
            HashSet<String> set = new HashSet<>();
            for (String w : wds) {
                if (set.contains(w)) {
                    p.setFilteredOut(true);
                    p.addRule("Nonsense,Repeated,Lower");
                    break;
                }
                set.add(w);
            }
        // No 'continue' here (it was disabled): a candidate flagged for a
        // repeated word still falls through to the diacritic check below.
        }
        /*
         * Still here? Check for short obscure matches where diacritics mismatch.
         * A candidate is not eliminated based on a single location: the first
         * location that passes any test below validates the candidate.
         * NOTE(review): the original note spoke of reducing the score for severe
         * mismatches, but the code below filters the candidate out when no
         * location passes; it does not adjust any score here.
         * NOTE: Score on each geo location is accounted for in default score,
         * i.e., edit distance between text match and geo name.
         */
        if (p.getLength() <= GENERIC_ONE_WORD) {
            boolean hasValidGeo = false;
            // Reduced form of the candidate text, passed to isPhoneticMatch() below.
            String ph1 = phoneticRedux(p.getTextnorm());
            // Rule recorded on success; stays null when nothing is worth tracking.
            String diacriticRule = null;
            log.debug("Testing phrase {} phonetic:{}", p.getTextnorm(), ph1);
            for (Place geo : p.getPlaces()) {
                log.debug("\tPLACE={}, {}", geo, geo.getNamenorm());
                boolean geoDiacritics = TextUtils.hasDiacritics(geo.getPlaceName());
                // Both the location name and the candidate carry diacritics.
                if (geoDiacritics && p.hasDiacritics) {
                    hasValidGeo = true;
                    diacriticRule = "Matched-Diacritics";
                    break;
                }
                // Neither carries diacritics.
                if (!geoDiacritics && !p.hasDiacritics) {
                    hasValidGeo = true;
                    // both ASCII? not worth tracking.
                    break;
                }
                /*
                 * Only one side has diacritics from here on.
                 * Pattern: Official name has accented/emphasis markings on the
                 * name, such as:
                 *     `NAME   or NAME`
                 * where NAME is some Latin transliteration of non-Latin script.
                 */
                if (geo.getNamenorm().contains(p.getTextnorm())) {
                    hasValidGeo = true;
                    diacriticRule = "Location-Contains-Name";
                    break;
                }
                // Last resort: compare the reduced candidate text with the
                // location's normalized name.
                if (isPhoneticMatch(ph1, geo.getNamenorm())) {
                    hasValidGeo = true;
                    diacriticRule = "Matched-Phonetic";
                    break;
                }
                log.debug("\t{} != {}", p.getTextnorm(), geo.getNamenorm());
            }
            // No location validated the name (also the case when there are no
            // locations at all): filter it out. Otherwise record why it passed.
            if (!hasValidGeo) {
                p.setFilteredOut(true);
                p.addRule("Nonsense,Mismatched,Diacritic");
            } else if (diacriticRule != null) {
                p.addRule(diacriticRule);
            }
        }
    }
}
Also used : Place(org.opensextant.data.Place) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate) HashSet(java.util.HashSet)

Aggregations

PlaceCandidate (org.opensextant.extractors.geo.PlaceCandidate)13 Place (org.opensextant.data.Place)8 GeocoordMatch (org.opensextant.extractors.xcoord.GeocoordMatch)4 TextMatch (org.opensextant.extraction.TextMatch)3 TaxonMatch (org.opensextant.extractors.xtax.TaxonMatch)3 URL (java.net.URL)2 JSONObject (org.json.JSONObject)2 HashSet (java.util.HashSet)1 TreeSet (java.util.TreeSet)1 JSONObject (net.sf.json.JSONObject)1 JSONArray (org.json.JSONArray)1 JSONException (org.json.JSONException)1 Test (org.junit.Test)1 ConfigException (org.opensextant.ConfigException)1 Country (org.opensextant.data.Country)1 Taxon (org.opensextant.data.Taxon)1 MatchFilter (org.opensextant.extraction.MatchFilter)1 CountryCount (org.opensextant.extractors.geo.CountryCount)1 GazetteerMatcher (org.opensextant.extractors.geo.GazetteerMatcher)1 PlaceCount (org.opensextant.extractors.geo.PlaceCount)1