Search in sources :

Example 16 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class PlaceGeocoder method parseKnownNonPlaces.

/**
     * Tags person, organization and nationality names in the input so they can be weighed
     * against place candidates. Poor-man's named-entity extraction.
     * <p>
     * Does nothing when person name matching is disabled. A failure in the name tagger is
     * logged and treated as "nothing found"; it is never propagated to the caller.
     * Every TaxonMatch seen here is marked filtered-out: for geocoding purposes any real
     * place names are expected to come from the gazetteer, not from the taxonomy tagger.
     *
     * @param input
     *            text input; only its buffer is tagged here
     * @param candidates
     *            place candidates handed to the person-name rule along with the names found
     * @param matches
     *            output list; person (including nationality) and org matches are appended
     */
private void parseKnownNonPlaces(TextInput input, List<PlaceCandidate> candidates, List<TextMatch> matches) {
    if (!isPersonNameMatchingEnabled()) {
        return;
    }
    // If this step fails miserably, do not raise error. Log the error and return nothing found.
    //
    List<TextMatch> nonPlaces;
    try {
        nonPlaces = personMatcher.extract(input.buffer);
    } catch (Exception err) {
        // Log the exception itself, not just its message, so the stack trace survives
        // (getMessage() may even be null, e.g. for a NullPointerException).
        log.error("Person/org name tagging failed; continuing without it", err);
        return;
    }
    if (nonPlaces == null || nonPlaces.isEmpty()) {
        return;
    }
    List<TaxonMatch> persons = new ArrayList<>();
    List<TaxonMatch> orgs = new ArrayList<>();
    log.debug("Matched {}", nonPlaces.size());
    for (TextMatch tm : nonPlaces) {
        if (!(tm instanceof TaxonMatch)) {
            continue;
        }
        TaxonMatch tag = (TaxonMatch) tm;
        //
        // For the purposes of geocoding/geoparsing filter out ALL
        // TaxonMatches. Any place names should reside back in
        // gazetteer. If XTax does have place or location data, that would be new.
        //
        tm.setFilteredOut(true);
        for (Taxon taxon : tag.getTaxons()) {
            String node = taxon.name.toLowerCase();
            // name spans that are not places.
            if (node.startsWith("person.")) {
                persons.add(tag);
                break;
            } else if (node.startsWith("org.")) {
                // An acronym taxon only counts when the matched text is itself upper case;
                // otherwise keep looking at the remaining taxons.
                if (taxon.isAcronym && !tm.isUpper()) {
                    continue;
                }
                orgs.add(tag);
                break;
            } else if (node.startsWith("nationality.")) {
                persons.add(tag);
                // The tag may be absent as some ethnicities may be mixed in and indicate no country.
                // NOTE(review): guarding against a null tagset on that assumption -- confirm
                // whether Taxon.tagset can actually be null.
                if (taxon.tagset == null) {
                    continue;
                }
                for (String t : taxon.tagset) {
                    // Country codes are carried in tags containing "cc+<code>".
                    int x = t.indexOf("cc+");
                    if (x >= 0) {
                        String isocode = t.substring(x + 3);
                        this.countryInScope(isocode);
                        nationalities.put(tag.getText(), isocode);
                    }
                }
            }
        }
    }
    personNameRule.evaluateNamedEntities(candidates, persons, orgs);
    matches.addAll(persons);
    matches.addAll(orgs);
}
Also used : Taxon(org.opensextant.data.Taxon) ArrayList(java.util.ArrayList) TextMatch(org.opensextant.extraction.TextMatch) TaxonMatch(org.opensextant.extractors.xtax.TaxonMatch) SolrServerException(org.apache.solr.client.solrj.SolrServerException) ExtractionException(org.opensextant.extraction.ExtractionException) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException)

Example 17 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class PlaceGeocoder method extract.

/**
     * Unfinished Beta; ready for experimentation and improvement on rules.
     *
     * Geotags and geocodes a single input. The steps run in this order:
     * <ol>
     * <li>tag place name candidates in the text (language-specific tagging for Arabic,
     * generic tagging for a null or any other langID);</li>
     * <li>parse geographic coordinates and add any found to the result;</li>
     * <li>evaluate the country rule and the name-with-admin-boundary rule;</li>
     * <li>tag known non-places (person/org names), which may append to the result;</li>
     * <li>evaluate the remaining configured rules, then the location chooser;</li>
     * <li>append every place candidate, with its evidence, to the result.</li>
     * </ol>
     *
     * LangID can be set on TextInput input.langid. Only lowercase langIDs please.
     * Null and other values are treated as generic as of v2.8.
     *
     * <pre>
     * Use TextMatch.getType()
     * to determine how to interpret TextMatch / Geocoding results:
     *
     * Given TextMatch match
     *
     *    Place tag:   ((PlaceCandidate)match).getGeocoding()
     *    Coord tag:   (Geocoding)match
     *
     * Both methods yield a geocoding.
     * </pre>
     *
     * @param input
     *            input buffer, doc ID, and optional langID.
     * @return coordinate matches (if any), then whatever the non-place tagging step
     *         appended, then all place candidates.
     * @throws ExtractionException
     *             on err
     */
@Override
public List<TextMatch> extract(TextInput input) throws ExtractionException {
    final long startedAt = System.currentTimeMillis();
    reset();
    final List<TextMatch> results = new ArrayList<>();

    // Step 0. GEOTAG raw text. With tag-only = false extra work is done for geocoding.
    // (A CJK-specific tagging branch is currently disabled.)
    final List<PlaceCandidate> places;
    if (input.langid == null) {
        places = tagText(input.buffer, input.id, tagOnly);
    } else if (TextUtils.arabicLang.equals(input.langid)) {
        places = this.tagArabicText(input.buffer, input.id, tagOnly);
    } else {
        // Any other language falls back to generic tagging.
        log.debug("Default Language {}. Treating as Generic.", input.langid);
        places = tagText(input, tagOnly);
    }

    // Step 1. COORDINATES. Parse lat/lon from the text; found coordinates fire rules that
    // resolve them to a Province/Country if possible.
    final List<TextMatch> coords = parseGeoCoordinates(input);
    if (coords != null) {
        results.addAll(coords);
    }

    /*
     * Step 3. RULE EVALUATION: accumulate the evidence from everything found so far.
     *
     * Observables come first, then associations between candidates and finer tuning:
     *   - Country: a named country weighs heavily.
     *   - Place, Boundary: a city or location qualified by a geopolitical boundary
     *     name or code, e.g. Paris, France; Paris, Texas.
     *   - Coordinate: coordinates emit Province ID and Country ID if possible.
     *   - Person name: filters out heavily, using JRC Names and custom TaxCat data.
     *   - Major Places: well-known large cities, capitals or provinces.
     *   - Province association: weight geos falling in positively identified Provinces.
     *   - Location Chooser: assemble all evidence and account for weights.
     */
    countryRule.evaluate(places);
    nameWithAdminRule.evaluate(places);

    // Step 2. NON-PLACE ID. Tag person and org names to negate celebrity names or well-known
    // individuals who share a city name. "Tom Jackson", "Bill Clinton"
    parseKnownNonPlaces(input, places, results);

    // Tagging duration is measured up to this point.
    this.taggingTimes.addTimeSince(startedAt);

    for (GeocodeRule rule : rules) {
        rule.evaluate(places);
    }

    // Final rule: score, choose, add confidence.
    chooser.evaluate(places);

    // All candidates and their evidence go to the caller; downstream recipients must know
    // how to parse through evaluated place candidates. Non-geocoded matches will appear
    // in non-GIS formats.
    results.addAll(places);

    // Full processing duration and volume for this document.
    this.matcherTotalTimes.addBytes(input.buffer.length());
    this.matcherTotalTimes.addTimeSince(startedAt);
    return results;
}
Also used : ArrayList(java.util.ArrayList) TextMatch(org.opensextant.extraction.TextMatch) GeocodeRule(org.opensextant.extractors.geo.rules.GeocodeRule)

Example 18 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class XtractorGroup method process.

/**
     * Process one input: run every configured extractor over it and compile all matches
     * into a single list. Use this if you have no need for formatting output at this time,
     * or if you have complex ExtractionResults to which you want to add meta attributes.
     * <p>
     * An ExtractionException from one extractor is logged and recorded in the current
     * error list; the remaining extractors still run.
     *
     * @param input
     *            the text input handed to each extractor
     * @return all matches from all extractors, in extractor order; empty if none were found
     */
public List<TextMatch> process(TextInput input) {
    List<TextMatch> oneResultSet = new ArrayList<TextMatch>();
    progressMonitor.setNumberOfSteps(extractors.size());
    /**
         * Process all extraction and compile on a single list.
         */
    for (Extractor x : extractors) {
        try {
            List<TextMatch> results = x.extract(input);
            if (results != null && !results.isEmpty()) {
                oneResultSet.addAll(results);
            }
        } catch (ExtractionException loopErr) {
            // Separate the extractor name from "on Input=" so the log line stays readable.
            log.error("Extractor=" + x.getName() + " on Input=" + input.id, loopErr);
            currErrors.add("Extractor=" + x.getName() + " ERR=" + loopErr.getMessage());
        }
    }
    progressMonitor.completeDocument();
    return oneResultSet;
}
Also used : ExtractionException(org.opensextant.extraction.ExtractionException) ArrayList(java.util.ArrayList) TextMatch(org.opensextant.extraction.TextMatch) Extractor(org.opensextant.extraction.Extractor)

Example 19 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class RegexPatternManager method reduce_matches.

/**
     * Compares every pair of matches in the list and flags, in place, how their spans relate:
     * an identical span marks the later match as a duplicate and the earlier one as overlapping;
     * a span contained in another is marked a submatch and its container overlapping;
     * partially intersecting spans are both marked overlapping.
     * Pairs whose spans do not touch are left unflagged.
     *
     * @param matches  a list of related matches from a single text
     */
public static void reduce_matches(List<TextMatch> matches) {
    final int total = matches.size();
    for (int outer = 0; outer < total; ++outer) {
        final TextMatch current = matches.get(outer);
        final long curStart = current.start;
        final long curEnd = current.end;
        // Only look forward; earlier pairs were handled on previous passes.
        for (int inner = outer + 1; inner < total; ++inner) {
            final TextMatch other = matches.get(inner);
            final long otherStart = other.start;
            final long otherEnd = other.end;

            // One span lies entirely before or entirely after the other: nothing to flag.
            final boolean disjoint = curEnd < otherStart || curStart > otherEnd;
            if (disjoint) {
                continue;
            }

            if (otherStart == curStart && otherEnd == curEnd) {
                // Same span: the later match is the duplicate.
                other.is_duplicate = true;
                current.is_overlap = true;
            } else if (otherStart <= curStart && curEnd <= otherEnd) {
                // Current span sits inside the other one.
                current.is_submatch = true;
                other.is_overlap = true;
            } else if (otherStart >= curStart && curEnd >= otherEnd) {
                // Other span sits inside the current one.
                current.is_overlap = true;
                other.is_submatch = true;
            } else {
                // Spans intersect without containment.
                current.is_overlap = true;
                other.is_overlap = true;
            }
        }
    }
}
Also used : TextMatch(org.opensextant.extraction.TextMatch)

Example 20 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class GeoTaggerMapper method map.

/**
     * Geotags one text record and writes each distinct, unfiltered match as a JSON string
     * keyed by the record's ID.
     * <p>
     * Records that cannot be prepared as input, or that yield no matches, produce no output.
     * Any exception raised while tagging or writing is logged and the record is abandoned.
     *
     * @param key
     *            record key; only used in trace logging
     * @param textRecord
     *            the raw text record to prepare and geotag
     * @param context
     *            mapper context that receives (record ID, match JSON) pairs
     */
@Override
public void map(BytesWritable key, Text textRecord, Context context) throws IOException, InterruptedException {
    ++counter;
    TextInput textObj = null;
    try {
        textObj = prepareInput(null, textRecord);
    } catch (java.lang.NullPointerException npe) {
        // Log at most the first 50 characters; the record may be shorter than that, or null,
        // and the error handler itself must not throw.
        String raw = String.valueOf(textRecord);
        log.error("Failed on record {}", raw.substring(0, Math.min(50, raw.length())), npe);
    }
    if (textObj == null) {
        return;
    }
    /* LANG ID = 'ENGLISH',
         * If this is not true, then you need to add LangID to your metadata or detect it live
         */
    textObj.langid = "en";
    HashSet<String> dedup = new HashSet<>();
    try {
        List<TextMatch> matches = geocoder.extract(textObj);
        if (matches.isEmpty()) {
            return;
        }
        Text oid = new Text(textObj.id);
        /* NORMALIZE findings.
             * Reduce all matches, minimizing duplicates, removing whitespace, etc.
             *
             */
        int filtered = 0, duplicates = 0;
        for (TextMatch tm : matches) {
            /* DEDUPLICATE: only the first match for a given text is emitted. */
            if (dedup.contains(tm.getText())) {
                duplicates += 1;
                continue;
            }
            /* FILTER OUT NOISE */
            if (filterOutMatch(tm)) {
                // Count it, so the trace statistics below are meaningful.
                filtered += 1;
                continue;
            }
            /* FORMAT */
            JSONObject o = match2JSON(tm);
            dedup.add(tm.getText());
            Text matchOutput = new Text(o.toString());
            /* SERIALIZE GEOCODING */
            context.write(oid, matchOutput);
        }
        if (log.isTraceEnabled()) {
            log.trace("For key {}, found={}, junk filtered={}, duplicates={}", key.toString(), matches.size(), filtered, duplicates);
        }
    } catch (Exception err) {
        log.error("Error running geotagger", err);
    }
}
Also used : JSONObject(net.sf.json.JSONObject) Text(org.apache.hadoop.io.Text) TextMatch(org.opensextant.extraction.TextMatch) TextInput(org.opensextant.data.TextInput) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException) HashSet(java.util.HashSet)

Aggregations

TextMatch (org.opensextant.extraction.TextMatch)26 IOException (java.io.IOException)9 ConfigException (org.opensextant.ConfigException)8 TextMatchResult (org.opensextant.extractors.flexpat.TextMatchResult)6 ArrayList (java.util.ArrayList)5 GeocoordMatch (org.opensextant.extractors.xcoord.GeocoordMatch)5 Taxon (org.opensextant.data.Taxon)4 TextInput (org.opensextant.data.TextInput)4 Matcher (java.util.regex.Matcher)3 JSONObject (org.json.JSONObject)3 ExtractionException (org.opensextant.extraction.ExtractionException)3 RegexPattern (org.opensextant.extractors.flexpat.RegexPattern)3 PlaceCandidate (org.opensextant.extractors.geo.PlaceCandidate)3 TaxonMatch (org.opensextant.extractors.xtax.TaxonMatch)3 File (java.io.File)2 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 JSONObject (net.sf.json.JSONObject)2 Text (org.apache.hadoop.io.Text)2 JSONArray (org.json.JSONArray)2