Search in sources :

Example 1 with Taxon

use of org.opensextant.data.Taxon in project Xponents by OpenSextant.

the class PlaceGeocoder method parseKnownNonPlaces.

/**
 * Tags known non-place entities (persons, organizations, nationalities) in the input.
 * Poor-man's named-entity extraction.
 * <p>
 * Does nothing when person name matching is disabled. If the underlying matcher fails,
 * the error is logged and the method returns as if nothing was found; no exception
 * propagates to the caller.
 * <p>
 * Every TaxonMatch returned by the matcher is marked filtered-out. Person, organization
 * and nationality matches are handed to the person-name rule together with the place
 * candidates and are then appended to {@code matches}.
 *
 * @param input      text input; its buffer is passed to the person/taxon matcher
 * @param candidates place candidates passed to the person-name rule
 * @param matches    output list; person and organization matches found are appended
 */
private void parseKnownNonPlaces(TextInput input, List<PlaceCandidate> candidates, List<TextMatch> matches) {
    if (!isPersonNameMatchingEnabled()) {
        return;
    }
    // If this step fails miserably, do not raise error. Log the error and return nothing found.
    List<TextMatch> nonPlaces;
    try {
        nonPlaces = personMatcher.extract(input.buffer);
    } catch (Exception err) {
        // Pass the exception itself so the stack trace is kept; getMessage() alone may be null.
        log.error("Failed to extract known non-places", err);
        return;
    }
    if (nonPlaces == null || nonPlaces.isEmpty()) {
        return;
    }
    List<TaxonMatch> persons = new ArrayList<>();
    List<TaxonMatch> orgs = new ArrayList<>();
    log.debug("Matched {}", nonPlaces.size());
    for (TextMatch tm : nonPlaces) {
        if (!(tm instanceof TaxonMatch)) {
            continue;
        }
        TaxonMatch tag = (TaxonMatch) tm;
        //
        // For the purposes of geocoding/geoparsing filter out ALL
        // TaxonMatches. Any place names should reside back in
        // gazetteer. If XTax does have place or location data, that would be new.
        //
        tm.setFilteredOut(true);
        for (Taxon taxon : tag.getTaxons()) {
            // Locale-independent lower-casing: under a Turkish default locale "I" would
            // become a dotless i and the "nationality." prefix test below would fail.
            String node = taxon.name.toLowerCase(java.util.Locale.ROOT);
            // name spans that are not places.
            if (node.startsWith("person.")) {
                persons.add(tag);
                break;
            } else if (node.startsWith("org.")) {
                // An acronym taxon only counts when the matched text is upper case.
                if (taxon.isAcronym && !tm.isUpper()) {
                    continue;
                }
                orgs.add(tag);
                break;
            } else if (node.startsWith("nationality.")) {
                // NOTE(review): no break here, so remaining taxons of this tag are still
                // examined and the tag may be added again — confirm this is intended.
                persons.add(tag);
                // The tag may be absent as some ethnicities may be mixed in and indicate no country.
                if (taxon.tagset == null) {
                    continue;
                }
                for (String t : taxon.tagset) {
                    // Tags of the form "...cc+XX" carry a country code after the "cc+" marker.
                    int x = t.indexOf("cc+");
                    if (x >= 0) {
                        String isocode = t.substring(x + 3);
                        this.countryInScope(isocode);
                        nationalities.put(tag.getText(), isocode);
                    }
                }
            }
        }
    }
    personNameRule.evaluateNamedEntities(candidates, persons, orgs);
    matches.addAll(persons);
    matches.addAll(orgs);
}
Also used : Taxon(org.opensextant.data.Taxon) ArrayList(java.util.ArrayList) TextMatch(org.opensextant.extraction.TextMatch) TaxonMatch(org.opensextant.extractors.xtax.TaxonMatch) SolrServerException(org.apache.solr.client.solrj.SolrServerException) ExtractionException(org.opensextant.extraction.ExtractionException) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException)

Example 2 with Taxon

use of org.opensextant.data.Taxon in project Xponents by OpenSextant.

the class XponentsGeotagger method format.

/**
 * Formats extraction results as a JSON representation holding a "response" metadata
 * object (status, numfound) and an "annotations" array with one entry per reported match.
 * <p>
 * Taxon matches are reported only when {@code jobParams.output_taxons} is set, using the
 * first taxon of each match. Filtered-out matches and matches that are neither place
 * candidates nor coordinates are skipped.
 *
 * @param matches   all matches found for the request
 * @param jobParams request parameters; only {@code output_taxons} is read here
 * @return UTF-8 JSON representation of the annotations
 * @throws JSONException if a JSON value cannot be stored
 */
private Representation format(List<TextMatch> matches, RequestParameters jobParams) throws JSONException {
    int tagCount = 0;
    JSONObject resultContent = new JSONObject();
    JSONObject resultMeta = new JSONObject();
    resultMeta.put("status", "ok");
    resultMeta.put("numfound", 0);
    JSONArray resultArray = new JSONArray();
    /*
     * Super loop: Iterate through all found entities. record Taxons as
     * person or orgs record Geo tags as country, place, or geo. geo =
     * geocoded place or parsed coordinate (MGRS, DMS, etc)
     */
    for (TextMatch name : matches) {
        /*
         * ==========================
         * ANNOTATIONS: non-geographic entities that are filtered out, but worth tracking
         * ==========================
         */
        if (name instanceof TaxonMatch) {
            if (jobParams.output_taxons) {
                TaxonMatch match = (TaxonMatch) name;
                ++tagCount;
                // Only the first taxon of a match is reported; the loop exits after one pass.
                for (Taxon n : match.getTaxons()) {
                    JSONObject node = populateMatch(name);
                    String t = "taxon";
                    String taxon_name = n.name.toLowerCase();
                    if (taxon_name.startsWith("org.")) {
                        t = "org";
                    } else if (taxon_name.startsWith("person.")) {
                        t = "person";
                    }
                    node.put("type", t);
                    // Name of taxon
                    node.put("taxon", n.name);
                    // Name of catalog or source
                    node.put("catalog", n.catalog);
                    resultArray.put(node);
                    break;
                }
            }
            continue;
        }
        // Ignore filtered-out tags and non-place tags. Because filtered-out matches are
        // dropped here, no later filtered-out check is needed.
        if (name.isFilteredOut() || !(name instanceof PlaceCandidate || name instanceof GeocoordMatch)) {
            continue;
        }
        JSONObject node = populateMatch(name);
        /*
         * ==========================
         * ANNOTATIONS: coordinates
         * ==========================
         */
        if (name instanceof GeocoordMatch) {
            ++tagCount;
            GeocoordMatch geo = (GeocoordMatch) name;
            node.put("type", "coordinate");
            Transforms.createGeocoding(geo, node);
            resultArray.put(node);
            continue;
        }
        PlaceCandidate place = (PlaceCandidate) name;
        Place resolvedPlace = place.getChosen();
        if (resolvedPlace == null) {
            // Without a chosen place there is nothing to report; skip rather than fail the request.
            debug("No chosen place for " + name.getText());
            continue;
        }
        /*
         * ==========================
         * ANNOTATIONS: countries, places, etc.
         * ==========================
         */
        /*
         * Accept all country names as potential geotags. Otherwise it is a
         * valid place name to consider.
         */
        ++tagCount;
        if (place.isCountry) {
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "country");
            node.put("cc", resolvedPlace.getCountryCode());
            node.put("confidence", place.getConfidence());
        } else {
            Transforms.createGeocoding(resolvedPlace, node);
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "place");
            node.put("confidence", place.getConfidence());
            // Low-confidence places are still reported, but flagged as filtered out.
            if (place.getConfidence() <= 10) {
                node.put("filtered-out", true);
            }
        }
        resultArray.put(node);
    }
    resultMeta.put("numfound", tagCount);
    resultContent.put("response", resultMeta);
    resultContent.put("annotations", resultArray);
    Representation result = new JsonRepresentation(resultContent.toString(2));
    result.setCharacterSet(CharacterSet.UTF_8);
    return result;
}
Also used : GeocoordMatch(org.opensextant.extractors.xcoord.GeocoordMatch) JSONObject(org.json.JSONObject) Taxon(org.opensextant.data.Taxon) JSONArray(org.json.JSONArray) Representation(org.restlet.representation.Representation) JsonRepresentation(org.restlet.ext.json.JsonRepresentation) TextMatch(org.opensextant.extraction.TextMatch) TaxonMatch(org.opensextant.extractors.xtax.TaxonMatch) JsonRepresentation(org.restlet.ext.json.JsonRepresentation) Place(org.opensextant.data.Place) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 3 with Taxon

use of org.opensextant.data.Taxon in project Xponents by OpenSextant.

the class Transforms method parseTaxon.

/**
 * Parse out a taxon from JSON/REST and copy it onto the given match.
 * <p>
 * The match text is always set from the required "matchtext" field. A Taxon is added only
 * when the annotation has a "taxon" field; its catalog is taken from the optional
 * "catalog" field and left null when that field is absent.
 *
 * @param x the taxon match to populate
 * @param t type of taxon, set as the match type
 * @param a JSON annotation to read from
 */
public static void parseTaxon(TaxonMatch x, String t, JSONObject a) {
    x.setText(a.getString("matchtext"));
    if (a.has("taxon")) {
        Taxon tx = new Taxon();
        tx.setName(a.getString("taxon"));
        // "catalog" is optional: a JSON object written with a null catalog has no such key,
        // and getString() would throw for it. optString() falls back to null instead.
        tx.catalog = a.optString("catalog", null);
        x.addTaxon(tx);
    }
    x.setType(t);
}
Also used : Taxon(org.opensextant.data.Taxon)

Example 4 with Taxon

use of org.opensextant.data.Taxon in project Xponents by OpenSextant.

the class TaxonMatcher method extractorImpl.

/**
 * Implementation details -- usable with or without the formal ID/buffer pairing.
 * Sends the text to the Solr tagger, turns each returned tag into a TaxonMatch and
 * keeps the matches that end up with at least one taxon.
 *
 * @param id
 *            doc id; NO_DOC_ID is used when null
 * @param buf
 *            input text
 * @return list of matches
 * @throws ExtractionException
 *             declared by this method; presumably raised by the tagger call
 */
private List<TextMatch> extractorImpl(String id, String buf) throws ExtractionException {
    final String docid = (id == null) ? NO_DOC_ID : id;
    final Map<Integer, Object> beanMap = new HashMap<>(100);
    final List<TextMatch> matches = new ArrayList<>();

    final QueryResponse response = tagTextCallSolrTagger(buf, docid, beanMap);
    @SuppressWarnings("unchecked") List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");
    log.debug("TAGS SIZE = {}", tags.size());

    /*
     * Walk every tag; match IDs are numbered by tag position, including tags
     * that are later discarded.
     */
    final String idPrefix = docid + "#";
    int tagCount = 0;
    for (NamedList<?> tag : tags) {
        final TaxonMatch match = new TaxonMatch();
        match.start = ((Integer) tag.get("startOffset")).intValue();
        // endOffset is +1 char after the last matched char
        match.end = ((Integer) tag.get("endOffset")).intValue();
        tagCount++;
        match.match_id = idPrefix + tagCount;

        // The tagger's own matchText is not reliable (can be null); slice the buffer instead.
        match.setText(buf.substring(match.start, match.end));

        // Phrase with a single TAB is okay; more formatting space than that is rejected.
        final boolean tooMuchFormatting = TextUtils.countFormattingSpace(match.getText()) > 1;
        if (tooMuchFormatting) {
            continue;
        }

        @SuppressWarnings("unchecked") List<Integer> taxonIDs = (List<Integer>) tag.get("ids");
        for (Integer solrId : taxonIDs) {
            final Object refData = beanMap.get(solrId);
            if (refData == null) {
                continue;
            }
            final Taxon tx = (Taxon) refData;
            /*
             * Filter out non-Acronyms. e.g., 'who' is not a match for 'WHO'
             */
            if (this.filterNonAcronyms && tx.isAcronym && !match.isUpper()) {
                continue;
            }
            match.addTaxon(tx);
        }

        if (match.hasTaxons()) {
            matches.add(match);
        }
    }
    log.debug("FOUND LABELS count={}", matches.size());
    return matches;
}
Also used : HashMap(java.util.HashMap) NamedList(org.apache.solr.common.util.NamedList) Taxon(org.opensextant.data.Taxon) ArrayList(java.util.ArrayList) TextMatch(org.opensextant.extraction.TextMatch) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) SolrDocumentList(org.apache.solr.common.SolrDocumentList) ArrayList(java.util.ArrayList) NamedList(org.apache.solr.common.util.NamedList) List(java.util.List)

Example 5 with Taxon

use of org.opensextant.data.Taxon in project Xponents by OpenSextant.

the class TaxonMatcher method createTaxon.

/**
 * Build a Taxon from the taxon reference data held in a solr doc.
 * Reads the "taxnode", "name_type", "catalog", "phrase" and "tag" fields.
 *
 * @param refData
 *            solr doc
 * @return taxon obj
 */
public static Taxon createTaxon(SolrDocument refData) {
    final Taxon taxon = new Taxon();

    // Pull the scalar fields out of the document first.
    final String taxNode = SolrProxy.getString(refData, "taxnode");
    final String nameType = SolrProxy.getString(refData, "name_type");
    final String catalogName = SolrProxy.getString(refData, "catalog");
    final String phrase = SolrProxy.getString(refData, "phrase");

    taxon.name = taxNode;
    // A name_type of "A" marks the entry as an acronym.
    taxon.isAcronym = "A".equals(nameType);
    taxon.catalog = catalogName;
    taxon.addTerm(phrase);
    taxon.addTags(refData.getFieldValues("tag"));
    return taxon;
}
Also used : Taxon(org.opensextant.data.Taxon)

Aggregations

Taxon (org.opensextant.data.Taxon)8 TextMatch (org.opensextant.extraction.TextMatch)4 ArrayList (java.util.ArrayList)3 TaxonMatch (org.opensextant.extractors.xtax.TaxonMatch)3 QueryResponse (org.apache.solr.client.solrj.response.QueryResponse)2 SolrDocumentList (org.apache.solr.common.SolrDocumentList)2 File (java.io.File)1 IOException (java.io.IOException)1 HashMap (java.util.HashMap)1 List (java.util.List)1 JSONObject (net.sf.json.JSONObject)1 SolrServerException (org.apache.solr.client.solrj.SolrServerException)1 SolrDocument (org.apache.solr.common.SolrDocument)1 NamedList (org.apache.solr.common.util.NamedList)1 JSONArray (org.json.JSONArray)1 JSONObject (org.json.JSONObject)1 ConfigException (org.opensextant.ConfigException)1 Place (org.opensextant.data.Place)1 ExtractionException (org.opensextant.extraction.ExtractionException)1 PlaceCandidate (org.opensextant.extractors.geo.PlaceCandidate)1