Search in sources :

Example 1 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class Transforms method parseAnnotation.

/**
 * Builds a Xponents {@link TextMatch} from a single JSON annotation.
 *
 * <p>The annotation's {@code "type"} selects the concrete match class:
 * {@code "place"} and {@code "country"} yield a {@link PlaceCandidate},
 * {@code "coordinate"} yields a {@link GeocoordMatch}, and {@code "person"},
 * {@code "org"} and {@code "taxon"} yield a {@link TaxonMatch}. The keys
 * {@code "type"}, {@code "matchtext"}, {@code "offset"} and {@code "length"}
 * are read for every annotation; {@code "cc"} is additionally read for
 * countries, and {@code "confidence"} is optional (defaults to -1).
 *
 * <p>Data parsed from the JSON/REST representation is far less complete than
 * what the Java processing API produces directly.
 *
 * @param data the annotation; anything that is not a {@link JSONObject} is rejected
 * @return the populated match with type, text span start and end set, or
 *         {@code null} when {@code data} is not a {@link JSONObject}
 * @throws JSONException if the annotation type is not one of the types listed
 *         above, or if reading a key from the JSON object fails
 */
public static TextMatch parseAnnotation(Object data) throws JSONException {
    if (!(data instanceof JSONObject)) {
        return null;
    }
    final JSONObject annot = (JSONObject) data;
    final String typ = annot.getString("type");
    final String matchText = annot.getString("matchtext");

    final TextMatch parsed;
    switch (typ) {
        case "place": {
            PlaceCandidate candidate = new PlaceCandidate();
            Place location = new Place();
            candidate.setText(matchText);
            Transforms.parseGeocoding(location, annot);
            candidate.setConfidence(annot.optInt("confidence", -1));
            candidate.choose(location);
            parsed = candidate;
            break;
        }
        case "coordinate": {
            GeocoordMatch coordinate = new GeocoordMatch();
            Place location = new Place();
            coordinate.setText(matchText);
            // Indirect route: the geocoding is parsed into a Place first and
            // then copied over, rather than parsing the coordinate directly.
            Transforms.parseGeocoding(location, annot);
            coordinate.setLatLon(location);
            coordinate.setMethod(location.getMethod());
            // TODO: GeocoordMatch needs setters for the remaining Geocoding
            // fields; reverse-geocoding info (cc, adm1) is not carried over.
            parsed = coordinate;
            break;
        }
        case "country": {
            PlaceCandidate candidate = new PlaceCandidate();
            Place country = new Place();
            candidate.setText(matchText);
            country.setName(matchText);
            candidate.setConfidence(annot.optInt("confidence", -1));
            country.setCountryCode(annot.getString("cc"));
            candidate.isCountry = true;
            candidate.choose(country);
            parsed = candidate;
            break;
        }
        case "person":
        case "org":
        case "taxon": {
            // All taxon-like annotations are parsed the same way; the
            // annotation type itself is passed through as the taxon kind.
            TaxonMatch taxonMatch = new TaxonMatch();
            Transforms.parseTaxon(taxonMatch, typ, annot);
            parsed = taxonMatch;
            break;
        }
        default:
            throw new JSONException("Unknown Annotation " + typ);
    }

    parsed.setType(typ);
    parsed.start = annot.getInt("offset");
    parsed.end = parsed.start + annot.getInt("length");
    return parsed;
}
Also used : GeocoordMatch(org.opensextant.extractors.xcoord.GeocoordMatch) JSONObject(org.json.JSONObject) JSONException(org.json.JSONException) TextMatch(org.opensextant.extraction.TextMatch) TaxonMatch(org.opensextant.extractors.xtax.TaxonMatch) Place(org.opensextant.data.Place) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 2 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class XponentsGeotagger method format.

/**
 * Renders extraction results as the JSON response body.
 *
 * <p>The response object has two members: {@code "response"} (with
 * {@code "status"} and {@code "numfound"}) and {@code "annotations"}, an array
 * with one node per reported match. Node {@code "type"} is one of
 * {@code "taxon"}, {@code "org"}, {@code "person"}, {@code "coordinate"},
 * {@code "country"} or {@code "place"}.
 *
 * @param matches   the matches to report
 * @param jobParams request options; {@code output_taxons} controls whether
 *                  taxon matches are reported
 * @return a UTF-8 JSON representation of the response
 * @throws JSONException if building the JSON content fails
 */
private Representation format(List<TextMatch> matches, RequestParameters jobParams) throws JSONException {
    int tagCount = 0;
    JSONObject resultContent = new JSONObject();
    JSONObject resultMeta = new JSONObject();
    resultMeta.put("status", "ok");
    resultMeta.put("numfound", 0);
    JSONArray resultArray = new JSONArray();
    /*
     * Super loop: iterate through all found entities. Record taxons as
     * taxon, person or org; record geo tags as country, place, or coordinate
     * (geocoded place or parsed coordinate such as MGRS, DMS, etc.)
     */
    for (TextMatch name : matches) {
        /*
         * ==========================
         * ANNOTATIONS: non-geographic entities that are filtered out, but worth tracking
         * ==========================
         */
        if (name instanceof TaxonMatch) {
            if (jobParams.output_taxons) {
                TaxonMatch match = (TaxonMatch) name;
                // Counted before the loop, so the match is tallied even if it
                // carries no taxons and therefore emits no node.
                ++tagCount;
                for (Taxon n : match.getTaxons()) {
                    JSONObject node = populateMatch(name);
                    String t = "taxon";
                    String taxon_name = n.name.toLowerCase();
                    if (taxon_name.startsWith("org.")) {
                        t = "org";
                    } else if (taxon_name.startsWith("person.")) {
                        t = "person";
                    }
                    node.put("type", t);
                    // Name of taxon
                    node.put("taxon", n.name);
                    // Name of catalog or source
                    node.put("catalog", n.catalog);
                    resultArray.put(node);
                    // Only the first taxon of a match is reported.
                    break;
                }
            }
            continue;
        }
        // Ignore non-place tags
        if (!(name instanceof PlaceCandidate || name instanceof GeocoordMatch)) {
            continue;
        }
        // Filtered-out places and coordinates are never reported. This check
        // used to be folded into the type test above, which left the later
        // logging branch unreachable; it is kept separate so the skip is logged.
        if (name.isFilteredOut()) {
            debug("Filtered out " + name.getText());
            continue;
        }
        JSONObject node = populateMatch(name);
        /*
         * ==========================
         * ANNOTATIONS: coordinates
         * ==========================
         */
        if (name instanceof GeocoordMatch) {
            ++tagCount;
            GeocoordMatch geo = (GeocoordMatch) name;
            node.put("type", "coordinate");
            Transforms.createGeocoding(geo, node);
            resultArray.put(node);
            continue;
        }
        /*
         * ==========================
         * ANNOTATIONS: countries, places, etc.
         * ==========================
         */
        PlaceCandidate place = (PlaceCandidate) name;
        // NOTE(review): presumably getChosen() is non-null for every
        // unfiltered candidate -- confirm, since it is dereferenced below.
        Place resolvedPlace = place.getChosen();
        ++tagCount;
        if (place.isCountry) {
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "country");
            node.put("cc", resolvedPlace.getCountryCode());
            node.put("confidence", place.getConfidence());
        } else {
            Transforms.createGeocoding(resolvedPlace, node);
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "place");
            node.put("confidence", place.getConfidence());
            // Low-confidence places are still reported, but flagged.
            // NOTE(review): an earlier comment mentioned a threshold of 20;
            // the code flags confidence <= 10 -- confirm which is intended.
            if (place.getConfidence() <= 10) {
                node.put("filtered-out", true);
            }
        }
        resultArray.put(node);
    }
    resultMeta.put("numfound", tagCount);
    resultContent.put("response", resultMeta);
    resultContent.put("annotations", resultArray);
    Representation result = new JsonRepresentation(resultContent.toString(2));
    result.setCharacterSet(CharacterSet.UTF_8);
    return result;
}
Also used : GeocoordMatch(org.opensextant.extractors.xcoord.GeocoordMatch) JSONObject(org.json.JSONObject) Taxon(org.opensextant.data.Taxon) JSONArray(org.json.JSONArray) Representation(org.restlet.representation.Representation) JsonRepresentation(org.restlet.ext.json.JsonRepresentation) TextMatch(org.opensextant.extraction.TextMatch) TaxonMatch(org.opensextant.extractors.xtax.TaxonMatch) JsonRepresentation(org.restlet.ext.json.JsonRepresentation) Place(org.opensextant.data.Place) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 3 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class XponentsGeotagger method process.

/**
 * Runs geotagging over one document and returns the JSON response.
 *
 * <p>Every call with usable text increments the request counter. Outside
 * production mode no extraction is performed and a {@code "TEST"} status is
 * returned instead. Any exception raised during extraction or formatting is
 * logged and reported as a {@code "FAIL"} status rather than propagated.
 *
 * @param input     the document to process; a {@code null} input or a
 *                  {@code null} text buffer yields a {@code "FAIL"} status
 * @param jobParams request options forwarded to the response formatter
 * @return the formatted matches, or a status representation
 */
public Representation process(TextInput input, RequestParameters jobParams) {
    final boolean hasText = input != null && input.buffer != null;
    if (!hasText) {
        return status("FAIL", "No text");
    }

    debug("Processing plain text doc");
    ++requestCount;

    if (!prodMode) {
        return status("TEST", "nothing done in test with doc=" + input.id);
    }

    try {
        final PlaceGeocoder geocoder = (PlaceGeocoder) getExtractor();
        final List<TextMatch> found = geocoder.extract(input);
        // Formulate matches as JSON output.
        return format(found, jobParams);
    } catch (Exception processingErr) {
        error("Failure on doc " + input.id, processingErr);
        final String detail = processingErr.getMessage() + "; requests=" + requestCount;
        return status("FAIL", detail);
    }
}
Also used : PlaceGeocoder(org.opensextant.extractors.geo.PlaceGeocoder) TextMatch(org.opensextant.extraction.TextMatch) JSONException(org.json.JSONException)

Example 4 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class XlayerClientTest method main.

/**
 * Command-line demo of {@link XlayerClient}: sends the contents of a text file
 * to an Xlayer service and prints each returned match to the console.
 *
 * <p>Usage: {@code XlayerClientTest <service-url> <text-file>}. The file path
 * doubles as the document id. When fewer than two arguments are given, a usage
 * message is printed and nothing else is done.
 *
 * @param args {@code args[0]} is the service URL, {@code args[1]} the path of
 *             the text file to process
 */
public static void main(String[] args) {
    // Both arguments are required; fail with a usage hint instead of an
    // uncaught ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 2) {
        System.err.println("Usage: XlayerClientTest <service-url> <text-file>");
        return;
    }
    URL url;
    try {
        url = new URL(args[0]);
        /*
         * Create client.
         */
        XlayerClient c = new XlayerClient(url);
        try {
            /*
             * Prepare request. Text must be UTF-8 encoded.
             * Note -- readFile() here assumes the file is unicode content.
             */
            String text = FileUtility.readFile(args[1]);
            String docid = args[1];
            /*
             * Process the text and print results to console.
             * Result is a list of TextMatch objects. For each particular
             * TextMatch (Xponents Basic API), you have some common fields related to the
             * text found, and then class-specific fields and objects you need to evaluate yourself.
             *
             * The XlayerClient process() method makes use of the Transforms helper class to
             * digest JSON annotations into Java API TextMatch objects of various flavors.
             */
            List<TextMatch> results = c.process(docid, text);
            for (TextMatch m : results) {
                System.out.println(String.format("Found %s %s @ (%d:%d)", m.getType(), m.getText(), m.start, m.end));
            }
        } catch (Exception parseErr) {
            // Reading the file or processing the text failed.
            parseErr.printStackTrace();
        }
    } catch (MalformedURLException e) {
        // args[0] is not a valid URL.
        e.printStackTrace();
    } catch (ConfigException e) {
        // The client could not be created for the given URL.
        e.printStackTrace();
    }
}
Also used : MalformedURLException(java.net.MalformedURLException) XlayerClient(org.opensextant.xlayer.XlayerClient) ConfigException(org.opensextant.ConfigException) TextMatch(org.opensextant.extraction.TextMatch) URL(java.net.URL) MalformedURLException(java.net.MalformedURLException) ConfigException(org.opensextant.ConfigException)

Example 5 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class XCoord method extract_coordinates.

/**
 * Extracts coordinates from text, optionally limited to one family of
 * coordinate patterns. Diagnostic messages are set on the result only when
 * debug is on.
 *
 * <p>Every enabled pattern of the requested family is run over the text. Each
 * regex hit is normalized, filtered and given a precision before it is added
 * to the result; alternate interpretations of a hit are added alongside it.
 * Progress is reported after each pattern completes.
 *
 * @param text
 *            text to match
 * @param text_id
 *            id for text; stored as the result id
 * @param family
 *            pattern family or XConstants.ALL_PATTERNS
 * @return the TextMatchResult for this text. If input is null, the result is
 *         null
 */
public TextMatchResult extract_coordinates(String text, String text_id, int family) {
    if (text == null) {
        return null;
    }
    int bufsize = text.length();
    TextMatchResult results = new TextMatchResult();
    results.result_id = text_id;
    results.matches = new ArrayList<TextMatch>();
    int patternsComplete = 0;
    int found = 0;
    for (RegexPattern repat : patterns.get_patterns()) {
        log.debug("pattern={}", repat.id);
        if (!repat.enabled) {
            log.debug("CFG pattern={} not enabled", repat.id);
            continue;
        }
        GeocoordPattern pat = (GeocoordPattern) repat;
        // To limit multiple use enable_XXXX()
        if (family != XConstants.ALL_PATTERNS && pat.cce_family_id != family) {
            log.debug("CFG pattern={} not requested", pat.id);
            continue;
        }
        Matcher match = pat.regex.matcher(text);
        results.evaluated = true;
        while (match.find()) {
            ++found;
            GeocoordMatch coord = new GeocoordMatch();
            // MATCH METHOD aka Pattern ID aka CCE instance
            coord.pattern_id = pat.id;
            coord.cce_family_id = pat.cce_family_id;
            coord.cce_variant = pat.cce_variant;
            coord.start = match.start();
            coord.end = match.end();
            coord.setText(match.group());
            if ((RUNTIME_FLAGS & XConstants.CONTEXT_FILTERS_ON) > 0) {
                if (this.filterOutContext(text, coord.start)) {
                    log.debug("Filtered out noisy match, {} found by {}", coord.getText(), pat.id);
                    continue;
                }
            }
            // Normalize
            try {
                GeocoordNormalization.normalize_coordinate(coord, patterns.group_matches(pat, match));
            } catch (NormalizationException normErr) {
                // A hit that cannot be normalized is dropped; the failure is
                // only recorded and logged when debug is on.
                if (debug) {
                    results.message = "Parse error with '" + coord.getText() + "'";
                    log.error(results.message, normErr);
                }
                continue;
            }
            // Drop hits rejected by the normalization filter.
            if (GeocoordNormalization.filter_out(coord)) {
                if (debug) {
                    results.message = "Filtered out coordinate pattern=" + pat.id + " value='" + coord.getText() + "'";
                    log.info("Normalization Filter fired, MSG=" + results.message);
                }
                continue;
            }
            // Establish precision
            GeocoordNormalization.set_precision(coord);
            /*
             * Caller may want to disable getContext operation here for
             * short texts.... or for any use case. This is more helpful for
             * longer texts with many annotations.
             */
            if ((XCoord.RUNTIME_FLAGS & XConstants.FLAG_EXTRACT_CONTEXT) > 0) {
                // returns indices for two windows before and after match
                int[] slices = TextUtils.get_text_window(coord.start, coord.getLength(), bufsize, match_width);
                // This sets the context window before/after:
                // left = [slices[0], slices[1]), right = [slices[2], slices[3])
                coord.setContext(
                        TextUtils.delete_eol(text.substring(slices[0], slices[1])),
                        TextUtils.delete_eol(text.substring(slices[2], slices[3])));
            }
            set_match_id(coord, found);
            results.matches.add(coord);
            // Alternate readings of the same text are reported as well.
            if (coord.hasOtherIterpretations()) {
                for (GeocoordMatch m2 : coord.getOtherInterpretations()) {
                    // Other interpretations may have different coord text;
                    // copyMetadata carries over the primary match's metadata.
                    m2.copyMetadata(coord);
                    results.matches.add(m2);
                }
            }
        }
        patternsComplete++;
        // The "+ 1" belongs to the denominator. Without the parentheses the
        // expression evaluated as (done / total) + 1, which is always above 1.
        // NOTE(review): assumes updateProgress expects a fraction below 1 -- confirm.
        updateProgress(patternsComplete / ((double) patterns.get_patterns().size() + 1));
    }
    // "pass" is the wrong idea. If no data was found
    // because there was no data, then it still passes.
    //
    results.pass = !results.matches.isEmpty();
    PatternManager.reduce_matches(results.matches);
    return results;
}
Also used : NormalizationException(org.opensextant.extraction.NormalizationException) RegexPattern(org.opensextant.extractors.flexpat.RegexPattern) Matcher(java.util.regex.Matcher) TextMatch(org.opensextant.extraction.TextMatch) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult)

Aggregations

TextMatch (org.opensextant.extraction.TextMatch)26 IOException (java.io.IOException)9 ConfigException (org.opensextant.ConfigException)8 TextMatchResult (org.opensextant.extractors.flexpat.TextMatchResult)6 ArrayList (java.util.ArrayList)5 GeocoordMatch (org.opensextant.extractors.xcoord.GeocoordMatch)5 Taxon (org.opensextant.data.Taxon)4 TextInput (org.opensextant.data.TextInput)4 Matcher (java.util.regex.Matcher)3 JSONObject (org.json.JSONObject)3 ExtractionException (org.opensextant.extraction.ExtractionException)3 RegexPattern (org.opensextant.extractors.flexpat.RegexPattern)3 PlaceCandidate (org.opensextant.extractors.geo.PlaceCandidate)3 TaxonMatch (org.opensextant.extractors.xtax.TaxonMatch)3 File (java.io.File)2 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 JSONObject (net.sf.json.JSONObject)2 Text (org.apache.hadoop.io.Text)2 JSONArray (org.json.JSONArray)2