Search in sources :

Example 11 with ConfigException

use of org.opensextant.ConfigException in project Xponents by OpenSextant.

the class GeoTaggerMapper method setup.

/**
 * Prepares this mapper for work: runs the parent setup, then creates and
 * configures the {@link PlaceGeocoder} held in the {@code geocoder} field.
 *
 * Original note: XTax or PlaceGeocoder takes in the SOLR path for Xponents
 * Solr from the JVM environment.
 *
 * @param c the context, handed to {@code super.setup} unchanged
 * @throws IOException propagated from {@code super.setup}, or wrapping the
 *             {@link ConfigException} raised while configuring the geocoder
 */
@Override
public void setup(Context c) throws IOException {
    super.setup(c);
    try {
        geocoder = new PlaceGeocoder();
        geocoder.configure();
    } catch (ConfigException configErr) {
        // This method only declares IOException, so a configuration failure
        // is re-thrown as one with the original exception kept as the cause.
        throw new IOException("setup.PlaceGeocoder", configErr);
    }
    log.info("DONE");
}
Also used : PlaceGeocoder(org.opensextant.extractors.geo.PlaceGeocoder) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException)

Example 12 with ConfigException

use of org.opensextant.ConfigException in project Xponents by OpenSextant.

the class PlaceGeocoder method configure.

/**
 * Initializes the geocoding rules and the supporting resources they need.
 * What is needed varies with the use case (coordinate extraction and
 * person-name matching are optional).
 *
 * Guidelines: this class is custodian of the app controller, Corpus feeder,
 * and any Document instances passed into/out of the feeder.
 *
 * The person-name filter is built from three classpath resources:
 * /filters/person-name-filter.txt, /filters/person-title-filter.txt and
 * /filters/person-suffix-filter.txt. Per the original notes the name list
 * may be empty, but most often it is a list of person names (which are
 * non-place names).
 *
 * Rules added to the generic rule list, in the order they are added here:
 *
 * <pre>
 * NonsenseFilter              -- nonsense is filtered out, rather than scored and ranked low.
 * PersonNameFilter            -- annotate, negate any patterns or matches that appear to be known celebrity persons or organizations.
 *                                Qualified places are not negated, e.g., "Eugene, Oregon" is a place;  "Eugene" with no other evidence is a person name.
 * CoordinateAssociationRule   -- only if coordinate extraction is enabled: parse any coordinate patterns;  Reverse geocode Country + Province.
 * ProvinceAssociationRule     -- only if coordinate extraction is enabled: associate places with Province inferred by coordinates.
 * MajorPlaceRule              -- identify major places by feature type, class or location population.
 * ContextualOrganizationRule  -- un-filter city names inside organization names when other evidence lines up.
 * NameRule                    -- simple patterns such as "city of x" or "abc county".
 *
 * Created and wired to this observer, but NOT added to the generic list here:
 * CountryRule         -- tag all country names
 * NameCodeRule        -- parse any Name, CODE, or Name1, Name2 patterns for "Place, AdminPlace" evidence
 * LocationChooserRule -- final rule that assigns confidence and chooses best location(s)
 *
 * Your Rule Here -- use addRule( GeocodeRule ) to add a rule on the stack.  It will be evaluated just before the final LocationChooserRule.
 * Your rule should improve Place scores on PlaceCandidates and name the rules that fire.
 * </pre>
 *
 * @throws ConfigException
 *             if the XTax person matcher cannot be set up (the underlying
 *             IOException is attached as the cause), or propagated from
 *             any of the configuration calls made here
 */
@Override
public void configure() throws ConfigException {
    // ==============
    // Rule setup:  Create GeocodeRules, add them to this.rules if they can be evaluated generically
    // on a list of place tags.
    // Otherwise such rules are configured, set during the request, and evaluated adhoc as you need.
    //
    /* assess country names and codes */
    countryRule = new CountryRule();
    countryRule.setCountryObserver(this);

    /* assess NAME, CODE patterns */
    nameWithAdminRule = new NameCodeRule();
    nameWithAdminRule.setBoundaryObserver(this);

    // Nonsense is filtered out, rather than scored and ranked low.
    nonsenseFilter = new NonsenseFilter();
    rules.add(nonsenseFilter);

    /*
     * Files for Place Name filter are editable, as you likely have
     * different ideas of who are "person names" to exclude when they
     * conflict with place names. If you are filtering out such things, then
     * it makes sense to filter them out earliest and not incorporate them
     * in geocoding.
     */
    personNameRule = new PersonNameFilter("/filters/person-name-filter.txt", "/filters/person-title-filter.txt", "/filters/person-suffix-filter.txt");
    rules.add(personNameRule);

    /*
     * assess coordinates related to ADM1, CC
     */
    coordRule = new CoordinateAssociationRule();
    coordRule.setCountryObserver(this);
    coordRule.setLocationObserver(this);
    // The coordinate-driven rules join the rule list only when coordinate
    // extraction is enabled and no XCoord instance has been set already.
    if (xcoord == null && (isCoordExtractionEnabled())) {
        xcoord = new XCoord();
        xcoord.configure();
        /*
         * assess ADM1 related to found NAMES as a result of coordinates
         */
        adm1Rule = new ProvinceAssociationRule();
        adm1Rule.setCountryObserver(this);
        rules.add(coordRule);
        rules.add(adm1Rule);
    }

    // Population statistics are optional: on failure fall back to a
    // MajorPlaceRule without them, but record why the data was unavailable.
    try {
        Map<String, Integer> popstats = GeonamesUtility.mapPopulationByLocation(GeonamesUtility.loadMajorCities("/geonames.org/cities15000.txt"));
        majorPlaceRule = new MajorPlaceRule(popstats);
    } catch (IOException err) {
        log.error("Xponents 2.8: cities population data is used for geocoding. Will continue without it.", err);
        majorPlaceRule = new MajorPlaceRule(null);
    }
    majorPlaceRule.setCountryObserver(this);
    majorPlaceRule.setBoundaryObserver(this);
    rules.add(majorPlaceRule);

    if (isPersonNameMatchingEnabled()) {
        try {
            personMatcher = new TaxonMatcher();
            personMatcher.configure();
            /*
             * Default catalog must be built. Extraction ./XTax folder has
             * script for populating a catalog.
             */
            personMatcher.addCatalogFilter("JRC");
            personMatcher.addCatalogFilter("nationality");
        } catch (IOException err) {
            // Keep the underlying I/O failure as the cause so the real
            // reason the XTax resource is missing is not lost.
            throw new ConfigException("XTax resource not available.", err);
        }
    }

    // Un-filter city names that can be resolved if other ADMIN places line up.
    // E.g., "Cleveland Cavaliers" filters out Cleveland, but if Cleveland is mentioned alone
    // then Cleveland itself will be promoted to a location.  E.g., sports teams travel
    // so mention of "Cleveland Cavaliers visiting Seattle" would not geolocate this to Ohio
    // unless the city or state was mentioned separately.
    //
    placeInOrgRule = new ContextualOrganizationRule();
    placeInOrgRule.setBoundaryObserver(this);
    rules.add(placeInOrgRule);

    // Simple patterns such as city of x or  abc county.
    //
    rules.add(new NameRule());

    chooser = new LocationChooserRule();
    chooser.setCountryObserver(this);
    chooser.setBoundaryObserver(this);
    chooser.setLocationObserver(this);
    // NOTE(review): the chooser is deliberately not added to the generic
    // rule list (the original add call was commented out); presumably it is
    // invoked separately as the final step -- confirm against the caller.
    //rules.add(chooser);

    countryCatalog = this.getGazetteer().getCountries();
}
Also used : MajorPlaceRule(org.opensextant.extractors.geo.rules.MajorPlaceRule) LocationChooserRule(org.opensextant.extractors.geo.rules.LocationChooserRule) NameRule(org.opensextant.extractors.geo.rules.NameRule) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException) TaxonMatcher(org.opensextant.extractors.xtax.TaxonMatcher) ContextualOrganizationRule(org.opensextant.extractors.geo.rules.ContextualOrganizationRule) XCoord(org.opensextant.extractors.xcoord.XCoord) CountryRule(org.opensextant.extractors.geo.rules.CountryRule) CoordinateAssociationRule(org.opensextant.extractors.geo.rules.CoordinateAssociationRule) ProvinceAssociationRule(org.opensextant.extractors.geo.rules.ProvinceAssociationRule) NonsenseFilter(org.opensextant.extractors.geo.rules.NonsenseFilter) PersonNameFilter(org.opensextant.extractors.geo.rules.PersonNameFilter) NameCodeRule(org.opensextant.extractors.geo.rules.NameCodeRule)

Example 13 with ConfigException

use of org.opensextant.ConfigException in project Xponents by OpenSextant.

the class TagFilter method loadStopSet.

/**
 * Reads a stop-term list from the given URL and registers it in
 * {@code langStopFilters} under the given language ID.
 *
 * The resource is read as UTF-8, one term per line. Each line is trimmed and
 * lower-cased; lines starting with {@code #} are comments. Blank lines are
 * ignored so that they neither become an empty-string stop term nor hide a
 * file that contains no real terms.
 *
 * @param url    location of the stop-term file
 * @param langid key under which the term set is stored
 * @throws IOException     if the resource cannot be opened or read
 * @throws ConfigException if the file yields no stop terms at all
 */
private void loadStopSet(URL url, String langid) throws IOException, ConfigException {
    try (InputStream strm = url.openStream()) {
        HashSet<String> stopTerms = new HashSet<>();
        for (String line : IOUtils.readLines(strm, Charset.forName("UTF-8"))) {
            String term = line.trim();
            // Skip blank lines and comment lines; only real terms are kept.
            if (term.isEmpty() || term.startsWith("#")) {
                continue;
            }
            stopTerms.add(term.toLowerCase());
        }
        if (stopTerms.isEmpty()) {
            throw new ConfigException("No terms found in stop filter file " + url.toString());
        }
        langStopFilters.put(langid, stopTerms);
    }
}
Also used : InputStream(java.io.InputStream) ConfigException(org.opensextant.ConfigException) HashSet(java.util.HashSet)

Example 14 with ConfigException

use of org.opensextant.ConfigException in project Xponents by OpenSextant.

the class BasicGeoTemporalProcessing method setup.

/**
 * One-shot setup for a processing run: validates the arguments, configures
 * the extractors and the document converter, records the input/output
 * paths, and creates and starts one formatter per requested output format.
 *
 * Ideally the one-time initialization (configuring extractors) would be kept
 * apart from the repetitive steps of setting up jobs and inputs. Outputs
 * might be set up once for the entire JVM session, or periodically. In
 * summary, configure separately:
 * <pre>
 *   a) extractors, converters
 *   b) job inputs and parameters
 *   c) output formatters
 *   d) other resources, e.g., filters
 * </pre>
 *
 * @param inFile     input path; trimmed before being stored in the parameters
 * @param outFormats names of the output formats to create formatters for; may be null, in which case no formatter is created
 * @param outFile    output path; trimmed before being stored in the parameters
 * @param tempDir    conversion cache folder; when null the converter does not save conversions
 * @throws ConfigException     if the document converter cannot start, or propagated from extractor configuration
 * @throws ProcessingException if parameter validation fails
 * @throws IOException         propagated from the calls made here
 */
public void setup(String inFile, List<String> outFormats, String outFile, String tempDir) throws ConfigException, ProcessingException, IOException {
    params.isdefault = false;
    final boolean argsAreValid = validateParameters(inFile, outFormats, outFile, tempDir, params);
    if (!argsAreValid) {
        throw new ProcessingException("VALIDATION ERRORS: " + runnerMessage.toString());
    }

    // If only coordinates are wanted from text, a configured XCoord extractor
    // alone would do (new XCoord(), configure(), addExtractor(..)).
    // The geocoder used below handles both coordinates and names.

    // Testing only
    params.tag_places = true;
    params.tag_coordinates = true;
    params.output_countries = false;

    // Extractor 1: places (with person-name matching switched on).
    PlaceGeocoder placeTagger = new PlaceGeocoder();
    placeTagger.enablePersonNameMatching(true);
    placeTagger.setParameters(params);
    placeTagger.configure();
    this.addExtractor(placeTagger);

    // Extractor 2: temporal expressions.
    XTemporal dateTagger = new XTemporal();
    dateTagger.configure();
    this.addExtractor(dateTagger);

    // Document converter; this object listens for conversion events.
    converter = new XText();
    converter.enableHTMLScrubber(false);
    converter.enableSaving(true);
    converter.enableOverwrite(false);
    converter.setConversionListener(this);

    // Without a temp folder there is nowhere to cache, so saving is turned back off.
    if (tempDir == null) {
        converter.enableSaving(false);
    } else {
        converter.getPathManager().setConversionCache(tempDir);
    }

    try {
        converter.setup();
    } catch (IOException startupErr) {
        throw new ConfigException("Document converter could not start", startupErr);
    }

    params.inputFile = inFile.trim();
    params.outputFile = outFile.trim();

    if (outFormats == null) {
        return;
    }
    // Register, wire up and start one formatter per requested format.
    for (String formatName : outFormats) {
        params.addOutputFormat(formatName);
        AbstractFormatter outputWriter = createFormatter(formatName, params);
        outputWriter.overwrite = overwriteOutput;
        this.addFormatter(outputWriter);
        outputWriter.start(params.getJobName());
    }
}
Also used : PlaceGeocoder(org.opensextant.extractors.geo.PlaceGeocoder) XTemporal(org.opensextant.extractors.xtemporal.XTemporal) XText(org.opensextant.xtext.XText) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException) AbstractFormatter(org.opensextant.output.AbstractFormatter) ProcessingException(org.opensextant.processing.ProcessingException)

Example 15 with ConfigException

use of org.opensextant.ConfigException in project Xponents by OpenSextant.

the class SolrGazetteer method initialize.

/**
 * Connects to the "gazetteer" Solr core, sets the default query parameters
 * and loads the country list into {@code countryCodes}.
 *
 * Per the original notes the Solr location cascades: first the value given
 * by the constructor ({@code solrHome}), then opensextant.solr, then
 * solr.solr.home. Only the first step is visible here; the fallbacks are
 * presumably handled by the single-argument {@code SolrProxy} constructor.
 *
 * @param solrHome explicit Solr home, or null to let SolrProxy resolve it
 * @throws ConfigException
 *             if the countries cannot be loaded because of a Solr or
 *             IO/file error (the original exception is kept as the cause)
 */
private void initialize(String solrHome) throws ConfigException {
    if (solrHome == null) {
        solr = new SolrProxy("gazetteer");
    } else {
        solr = new SolrProxy(solrHome, "gazetteer");
    }

    // Default query: match everything, returning the listed fields.
    final String returnedFields = "id,name,cc,adm1,adm2,feat_class,feat_code,geo,place_id,name_bias,id_bias,name_type";
    params.set(CommonParams.Q, "*:*");
    params.set(CommonParams.FL, returnedFields);

    try {
        this.countryCodes = loadCountries(solr.getInternalSolrServer());
    } catch (SolrServerException solrFailure) {
        throw new ConfigException("SolrGazetteer is unable to load countries due to Solr error", solrFailure);
    } catch (IOException ioFailure) {
        throw new ConfigException("SolrGazetteer is unable to load countries due to IO/file error", ioFailure);
    }
}
Also used : SolrProxy(org.opensextant.util.SolrProxy) SolrServerException(org.apache.solr.client.solrj.SolrServerException) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException)

Aggregations

ConfigException (org.opensextant.ConfigException)28 IOException (java.io.IOException)20 File (java.io.File)5 URL (java.net.URL)4 MimeTypeParseException (javax.activation.MimeTypeParseException)3 MessagingException (javax.mail.MessagingException)3 SolrServerException (org.apache.solr.client.solrj.SolrServerException)3 PlaceGeocoder (org.opensextant.extractors.geo.PlaceGeocoder)3 XText (org.opensextant.xtext.XText)3 PSTFile (com.pff.PSTFile)2 MalformedURLException (java.net.MalformedURLException)2 HashSet (java.util.HashSet)2 TextMatch (org.opensextant.extraction.TextMatch)2 PersonNameFilter (org.opensextant.extractors.geo.rules.PersonNameFilter)2 TaxonMatcher (org.opensextant.extractors.xtax.TaxonMatcher)2 XTemporal (org.opensextant.extractors.xtemporal.XTemporal)2 PSTException (com.pff.PSTException)1 LongOpt (gnu.getopt.LongOpt)1 InputStream (java.io.InputStream)1 InputStreamReader (java.io.InputStreamReader)1