Search in sources :

Example 11 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class TaxonMatcher method extractorImpl.

/**
 * Tags the input text via {@code tagTextCallSolrTagger} and converts each
 * returned tag into a {@link TaxonMatch}. Usable with or without a formal
 * document ID; when {@code id} is null, {@code NO_DOC_ID} is used instead.
 *
 * <p>A tag is kept only if at least one of its Solr IDs resolves to a taxon
 * that survives filtering. Match IDs have the form {@code docid#N}, where N
 * is the 1-based position of the tag in the tagger response (tags that are
 * later dropped still consume a number).
 *
 * @param id
 *            doc id; may be null
 * @param buf
 *            input text
 * @return list of matches; empty when the tagger response carries no tags
 * @throws ExtractionException
 *             declared for the tagging call; not thrown directly here
 */
private List<TextMatch> extractorImpl(String id, String buf) throws ExtractionException {
    List<TextMatch> matches = new ArrayList<TextMatch>();
    String docid = (id != null ? id : NO_DOC_ID);
    // Filled as a side effect of the tagging call; read below by Solr ID.
    // NOTE(review): presumably holds Taxon records keyed by Solr doc ID -- confirm in tagTextCallSolrTagger.
    Map<Integer, Object> beanMap = new HashMap<Integer, Object>(100);
    QueryResponse response = tagTextCallSolrTagger(buf, docid, beanMap);
    @SuppressWarnings("unchecked") List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");
    if (tags == null) {
        // NamedList.get() yields null for a missing key; treat that as "nothing tagged".
        log.debug("No tags element in tagger response for doc {}", docid);
        return matches;
    }
    log.debug("TAGS SIZE = {}", tags.size());

    int tagCount = 0;
    String idPrefix = docid + "#";
    for (NamedList<?> tag : tags) {
        TaxonMatch m = new TaxonMatch();
        m.start = ((Integer) tag.get("startOffset")).intValue();
        // End offset is one char after the last matched char.
        m.end = ((Integer) tag.get("endOffset")).intValue();
        ++tagCount;
        m.match_id = idPrefix + tagCount;
        // The tagger's "matchText" is not reliable (it can be null), so the
        // text is always taken from the input buffer.
        m.setText(buf.substring(m.start, m.end));
        if (TextUtils.countFormattingSpace(m.getText()) > 1) {
            // Phrase with a single TAB is okay; more formatting space than that is rejected.
            continue;
        }
        @SuppressWarnings("unchecked") List<Integer> taxonIDs = (List<Integer>) tag.get("ids");
        if (taxonIDs == null) {
            // A tag without an "ids" entry cannot resolve to any taxon.
            continue;
        }
        for (Integer solrId : taxonIDs) {
            Object refData = beanMap.get(solrId);
            if (refData == null) {
                continue;
            }
            Taxon tx = (Taxon) refData;
            // Filter out non-Acronyms. e.g., 'who' is not a match for 'WHO'
            if (this.filterNonAcronyms && tx.isAcronym && !m.isUpper()) {
                continue;
            }
            m.addTaxon(tx);
        }
        if (m.hasTaxons()) {
            matches.add(m);
        }
    }
    log.debug("FOUND LABELS count={}", matches.size());
    return matches;
}
Also used : HashMap(java.util.HashMap) NamedList(org.apache.solr.common.util.NamedList) Taxon(org.opensextant.data.Taxon) ArrayList(java.util.ArrayList) TextMatch(org.opensextant.extraction.TextMatch) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) SolrDocumentList(org.apache.solr.common.SolrDocumentList) ArrayList(java.util.ArrayList) NamedList(org.apache.solr.common.util.NamedList) List(java.util.List)

Example 12 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class CSVFormatter method writeGeocodingResult.

/**
 * Writes one output row per match in the given extraction result.
 *
 * <p>A single map is reused for every row and cleared at the start of each
 * iteration. The file path column is filled only when the configured field
 * set contains it; the remaining columns are delegated to {@code buildRow}.
 * A failure while writing a row is logged and the loop moves on to the next
 * match, so one bad row does not abort the whole result.
 *
 * @param rowdata
 *            extraction result whose matches are written
 */
@Override
public void writeGeocodingResult(ExtractionResult rowdata) {
    HashMap<String, String> values = new HashMap<String, String>();
    for (TextMatch m : rowdata.matches) {
        values.clear();
        if (fieldSet.contains(OpenSextantSchema.FILEPATH.getName())) {
            values.put(OpenSextantSchema.FILEPATH.getName(), rowdata.recordFile);
        }
        buildRow(values, m);
        try {
            writer.write(values, header, outputSchema);
        } catch (Exception err) {
            // Best-effort output: keep going, but pass the exception itself so
            // the log retains its type and stack trace, not just the message.
            log.error("Delayed error ERR:" + err.getLocalizedMessage(), err);
        }
    }
}
Also used : HashMap(java.util.HashMap) TextMatch(org.opensextant.extraction.TextMatch) ConfigException(org.opensextant.ConfigException) ProcessingException(org.opensextant.processing.ProcessingException)

Example 13 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class TestGazMatcher method summarizeFindings.

/**
 * Prints a console summary of the given matches: per-match details first,
 * then the distinct (sorted) place names, country names and coordinate texts.
 *
 * <p>Filtered-out matches are reported but never counted in any of the three
 * summary sets.
 *
 * @param matches
 *            matches to report on
 */
public static void summarizeFindings(List<TextMatch> matches) {
    // TreeSets give de-duplicated, alphabetically ordered output.
    final Set<String> places = new TreeSet<>();
    final Set<String> countries = new TreeSet<>();
    final Set<String> coords = new TreeSet<>();

    System.out.println("MENTIONS ALL == " + matches.size());

    for (TextMatch match : matches) {
        printGeoTags(match);

        if (match instanceof PlaceCandidate) {
            PlaceCandidate candidate = (PlaceCandidate) match;
            if (match.isFilteredOut()) {
                print("Filtered Out.  Rules = " + candidate.getRules());
                continue;
            }
            if (!candidate.getRules().isEmpty()) {
                print("Rules = " + candidate.getRules());
            }

            if (candidate.isCountry) {
                countries.add(candidate.getText());
            } else {
                // Any non-country candidate counts as a place; geocoding
                // details are printed only when a location was chosen.
                if (candidate.getChosen() != null) {
                    print(String.format("\tgeocoded @ %s with conf=%d", candidate.getChosen(), candidate.getConfidence()));
                    ScoredPlace runnerUp = candidate.getSecondChoice();
                    if (runnerUp != null) {
                        print(String.format("\tgeocoded @ %s second place", runnerUp));
                    }
                }
                places.add(candidate.getText());
            }
        } else if (match.isFilteredOut()) {
            System.out.println("\t(filtered out: " + match.getText() + ")");
            continue;
        }

        if (match instanceof GeocoordMatch) {
            GeocoordMatch coord = (GeocoordMatch) match;
            coords.add(coord.getText());
            if (coord.getRelatedPlace() != null) {
                System.out.println("Coordinate at place named " + coord.getRelatedPlace());
            }
        }
    }

    System.out.println("MENTIONS DISTINCT PLACES == " + places.size());
    System.out.println(places);
    System.out.println("MENTIONS COUNTRIES == " + countries.size());
    System.out.println(countries);
    System.out.println("MENTIONS COORDINATES == " + coords.size());
    System.out.println(coords);
}
Also used : GeocoordMatch(org.opensextant.extractors.xcoord.GeocoordMatch) TreeSet(java.util.TreeSet) ScoredPlace(org.opensextant.extractors.geo.ScoredPlace) TextMatch(org.opensextant.extraction.TextMatch) PlaceCandidate(org.opensextant.extractors.geo.PlaceCandidate)

Example 14 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class GISDataFormatter method writeGeocodingResult.

/**
 * Implementation of adding info extraction/geocoding results to GIS outputs.
 *
 * <p>Matches rejected by {@code filterOut} are skipped. Every remaining match
 * advances the running {@code id}, including those later ignored because
 * they carry no geocoding. Only the first {@link ConfigException} hit while
 * building rows is logged; later ones are suppressed so the log is not
 * flooded, and processing continues with the next match.
 *
 * @param rowdata
 *            extraction result whose matches are written as GIS features
 */
@Override
public void writeGeocodingResult(ExtractionResult rowdata) {
    boolean errorLogged = false;
    log.debug("Adding data for File {} Count={}", rowdata.recordFile, rowdata.matches.size());
    for (TextMatch g : rowdata.matches) {
        if (filterOut(g)) {
            continue;
        }
        // Increment ID
        id++;
        // Only TextMatches that implement the Geocoding interface are
        // allowed here:
        Geocoding geocoding = getGeocoding(g);
        if (geocoding == null) {
            log.debug("Non-geo will be ignored: {}", g);
            continue;
        }
        log.debug("Add {}#{}", id, g);
        try {
            for (Feature row : gisDataModel.buildRows(id, geocoding, g, rowdata.attributes, rowdata)) {
                log.debug("FEATURE: {}", row);
                this.os.write(row);
            }
        } catch (ConfigException fieldErr) {
            if (!errorLogged) {
                // Pass the exception as well so the stack trace is preserved.
                log.error("OUTPUTTER, ERR=" + fieldErr, fieldErr);
            }
            errorLogged = true;
        }
    }
}
Also used : ConfigException(org.opensextant.ConfigException) TextMatch(org.opensextant.extraction.TextMatch) Geocoding(org.opensextant.data.Geocoding) Feature(org.opensextant.giscore.events.Feature)

Example 15 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class TestPlaceGeocoder method tagFile.

/**
 * Reads a file as UTF-8 text, runs the geocoder over it and prints a summary
 * of the findings. May be called once per document.
 *
 * <p>Any exception raised during extraction or summarizing is caught and its
 * stack trace printed; a failure while reading the file happens outside the
 * try block and propagates to the caller.
 *
 * @param f
 *            file to read and tag
 * @param langid
 *            language ID assigned to the input before extraction
 * @throws IOException
 *             presumably when the file cannot be read -- confirm against FileUtility.readFile
 */
public void tagFile(File f, String langid) throws IOException {
    String content = FileUtility.readFile(f, "UTF-8");
    TextInput document = new TextInput("test", content);
    document.langid = langid;
    try {
        summarizeFindings(geocoder.extract(document));
    } catch (Exception failure) {
        failure.printStackTrace();
    }
}
Also used : TextMatch(org.opensextant.extraction.TextMatch) TextInput(org.opensextant.data.TextInput) ExtractionException(org.opensextant.extraction.ExtractionException) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException)

Aggregations

TextMatch (org.opensextant.extraction.TextMatch)26 IOException (java.io.IOException)9 ConfigException (org.opensextant.ConfigException)8 TextMatchResult (org.opensextant.extractors.flexpat.TextMatchResult)6 ArrayList (java.util.ArrayList)5 GeocoordMatch (org.opensextant.extractors.xcoord.GeocoordMatch)5 Taxon (org.opensextant.data.Taxon)4 TextInput (org.opensextant.data.TextInput)4 Matcher (java.util.regex.Matcher)3 JSONObject (org.json.JSONObject)3 ExtractionException (org.opensextant.extraction.ExtractionException)3 RegexPattern (org.opensextant.extractors.flexpat.RegexPattern)3 PlaceCandidate (org.opensextant.extractors.geo.PlaceCandidate)3 TaxonMatch (org.opensextant.extractors.xtax.TaxonMatch)3 File (java.io.File)2 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 JSONObject (net.sf.json.JSONObject)2 Text (org.apache.hadoop.io.Text)2 JSONArray (org.json.JSONArray)2