Use of org.opensextant.ConfigException in project Xponents by OpenSextant.
In class GeoTaggerMapper, method setup.
/**
 * Prepares this mapper: runs the superclass setup, then creates and configures
 * the {@code PlaceGeocoder} used by this instance.
 *
 * As noted by the original author, XTax and PlaceGeocoder take the Solr path
 * for the Xponents Solr index from the JVM environment.
 *
 * @param c context passed straight through to {@code super.setup}
 * @throws IOException if configuring the geocoder fails (the ConfigException
 *         is attached as the cause); an IOException raised by
 *         {@code super.setup} propagates unchanged
 */
@Override
public void setup(Context c) throws IOException {
    super.setup(c);

    try {
        geocoder = new PlaceGeocoder();
        geocoder.configure();
    } catch (ConfigException configErr) {
        // Report configuration failures through the only checked exception this method declares.
        throw new IOException("setup.PlaceGeocoder", configErr);
    }

    log.info("DONE");
}
Use of org.opensextant.ConfigException in project Xponents by OpenSextant.
In class PlaceGeocoder, method configure.
/**
 * We do whatever is needed to init resources... that varies depending on
 * the use case.
 *
 * Guidelines: this class is custodian of the app controller, Corpus feeder,
 * and any Document instances passed into/out of the feeder.
 *
 * This geocoder loads its person-name filters from
 * /filters/person-name-filter.txt, /filters/person-title-filter.txt and
 * /filters/person-suffix-filter.txt. Per the original notes the name filter
 * can be empty, but most often it will be a list of person names
 * (which are non-place names).
 *
 * Rules Configured in approximate order:
 *
 * <pre>
 * CountryRule    -- tag all country names
 * NameCodeRule   -- parse any Name, CODE, or Name1, Name2 patterns for "Place, AdminPlace" evidence
 * PersonNameRule -- annotate, negate any patterns or matches that appear to be known celebrity persons or organizations.
 *                   Qualified places are not negated, e.g., "Eugene, Oregon" is a place; "Eugene" with no other evidence is a person name.
 * CoordRule      -- if requested, parse any coordinate patterns; Reverse geocode Country + Province.
 * ProvinceAssociationRule -- associate places with Province inferred by coordinates.
 * MajorPlaceRule -- identify major places by feature type, class or location population.
 * LocationChooserRule -- final rule that assigns confidence and chooses best location(s)
 *
 * Your Rule Here -- use addRule( GeocodeRule ) to add a rule on the stack. It will be evaluated just before the final LocationChooserRule.
 *                   your rule should improve Place scores on PlaceCandidates and name the rules that fire.
 * </pre>
 *
 * Failure handling in this method:
 * <ul>
 * <li>If the major-cities population file cannot be read, the error is logged
 * (with its cause) and configuration continues with a MajorPlaceRule built
 * from {@code null}.</li>
 * <li>If person-name matching is enabled and the TaxonMatcher raises an
 * IOException, a ConfigException carrying that IOException as its cause is
 * thrown.</li>
 * </ul>
 *
 * @throws ConfigException
 *             on err
 */
@Override
public void configure() throws ConfigException {
    // ==============
    // Rule setup: Create GeocodeRules, add them to this.rules if they can be evaluated generically
    // on a list of place tags.
    // Otherwise such rules are configured, set during the request, and evaluated adhoc as you need.
    //

    /* assess country names and codes */
    countryRule = new CountryRule();
    countryRule.setCountryObserver(this);

    /* assess NAME, CODE patterns */
    nameWithAdminRule = new NameCodeRule();
    nameWithAdminRule.setBoundaryObserver(this);

    // Nonsense is filtered out, rather than scored and ranked low.
    nonsenseFilter = new NonsenseFilter();
    rules.add(nonsenseFilter);

    /*
     * Files for Place Name filter are editable, as you likely have
     * different ideas of who are "person names" to exclude when they
     * conflict with place names. If you are filtering out such things, then
     * it makes sense to filter them out earliest and not incorporate them
     * in geocoding.
     */
    personNameRule = new PersonNameFilter("/filters/person-name-filter.txt", "/filters/person-title-filter.txt", "/filters/person-suffix-filter.txt");
    rules.add(personNameRule);

    /*
     * assess coordinates related to ADM1, CC
     */
    coordRule = new CoordinateAssociationRule();
    coordRule.setCountryObserver(this);
    coordRule.setLocationObserver(this);

    // The coordinate and province rules are only put on the rule stack when
    // coordinate extraction is enabled and no XCoord instance exists yet.
    if (xcoord == null && (isCoordExtractionEnabled())) {
        xcoord = new XCoord();
        xcoord.configure();

        /*
         * assess ADM1 related to found NAMES as a result of coordinates
         */
        adm1Rule = new ProvinceAssociationRule();
        adm1Rule.setCountryObserver(this);

        rules.add(coordRule);
        rules.add(adm1Rule);
    }

    // Population statistics are optional: on failure, log the cause and continue without them.
    try {
        Map<String, Integer> popstats = GeonamesUtility.mapPopulationByLocation(GeonamesUtility.loadMajorCities("/geonames.org/cities15000.txt"));
        majorPlaceRule = new MajorPlaceRule(popstats);
    } catch (IOException err) {
        log.error("Xponents 2.8: cities population data is used for geocoding. Will continue without it.", err);
        majorPlaceRule = new MajorPlaceRule(null);
    }
    majorPlaceRule.setCountryObserver(this);
    majorPlaceRule.setBoundaryObserver(this);
    rules.add(majorPlaceRule);

    if (isPersonNameMatchingEnabled()) {
        try {
            personMatcher = new TaxonMatcher();
            personMatcher.configure();
            /*
             * Default catalog must be built. Extraction ./XTax folder has
             * script for populating a catalog.
             */
            personMatcher.addCatalogFilter("JRC");
            personMatcher.addCatalogFilter("nationality");
        } catch (IOException err) {
            // Keep the underlying I/O failure attached so the root cause is not lost.
            throw new ConfigException("XTax resource not available.", err);
        }
    }

    // Un-filter city names that can be resolved if other ADMIN places line up.
    // E.g., "Cleveland Cavaliers" filters out Cleveland, but if Cleveland is mentioned alone
    // then Cleveland itself will be promoted to a location. E.g., sports teams travel
    // so mention of "Cleveland Cavaliers visiting Seattle" would not geolocate this to Ohio
    // unless the city or state was mentioned separately.
    //
    placeInOrgRule = new ContextualOrganizationRule();
    placeInOrgRule.setBoundaryObserver(this);
    rules.add(placeInOrgRule);

    // Simple patterns such as city of x or abc county.
    //
    rules.add(new NameRule());

    // The chooser is configured here but deliberately NOT appended to `rules`
    // in this method (the add call was already disabled in the original code).
    // NOTE(review): presumably it is invoked separately as the final rule -- confirm against callers.
    chooser = new LocationChooserRule();
    chooser.setCountryObserver(this);
    chooser.setBoundaryObserver(this);
    chooser.setLocationObserver(this);

    countryCatalog = this.getGazetteer().getCountries();
}
Use of org.opensextant.ConfigException in project Xponents by OpenSextant.
In class TagFilter, method loadStopSet.
/**
 * Reads a stop-term file and registers its terms under the given language ID.
 *
 * The stream is decoded as UTF-8. Each line is trimmed; blank lines and lines
 * whose first non-blank character is {@code #} are skipped. Every remaining
 * line is lower-cased and added to the set stored in {@code langStopFilters}.
 *
 * @param url    location of the stop-term file
 * @param langid key under which the loaded terms are stored
 * @throws IOException     if the URL cannot be opened or read
 * @throws ConfigException if the file yields no terms at all
 */
private void loadStopSet(URL url, String langid) throws IOException, ConfigException {
    try (InputStream strm = url.openStream()) {
        HashSet<String> stopTerms = new HashSet<>();
        for (String line : IOUtils.readLines(strm, Charset.forName("UTF-8"))) {
            String term = line.trim();
            // Blank lines must not become an empty-string stop term, otherwise a file
            // holding only whitespace would slip past the emptiness check below.
            if (term.isEmpty() || term.startsWith("#")) {
                continue;
            }
            stopTerms.add(term.toLowerCase());
        }
        if (stopTerms.isEmpty()) {
            throw new ConfigException("No terms found in stop filter file " + url.toString());
        }
        langStopFilters.put(langid, stopTerms);
    }
}
Use of org.opensextant.ConfigException in project Xponents by OpenSextant.
In class BasicGeoTemporalProcessing, method setup.
/**
 * Validates the job parameters, then wires up the extractors, the document
 * converter and one formatter per requested output format.
 *
 * Design note from the original author: ideally the one-time initialization
 * (configuring extractors) is kept apart from the repetitive steps of setting
 * up jobs and inputs. Outputs may be set up once per JVM session or
 * periodically. In summary, configure separately:
 * <ul>
 * <li>a) extractors, converters</li>
 * <li>b) job inputs and parameters</li>
 * <li>c) output formatters</li>
 * <li>d) other resources, e.g., filters</li>
 * </ul>
 *
 * @param inFile     input path; stored trimmed in {@code params.inputFile}
 * @param outFormats names of output formats; when {@code null} no formatter is created
 * @param outFile    output path; stored trimmed in {@code params.outputFile}
 * @param tempDir    conversion cache location; when {@code null} the converter's
 *                   saving is switched off instead
 * @throws ConfigException     if the document converter's setup raises an IOException
 *                             (wrapped as the cause), or propagated from the
 *                             extractor configure calls
 * @throws ProcessingException if {@code validateParameters} rejects the arguments
 * @throws IOException         declared for calls made in this method
 */
public void setup(String inFile, List<String> outFormats, String outFile, String tempDir)
        throws ConfigException, ProcessingException, IOException {

    params.isdefault = false;
    boolean parametersValid = validateParameters(inFile, outFormats, outFile, tempDir, params);
    if (!parametersValid) {
        throw new ProcessingException("VALIDATION ERRORS: " + runnerMessage.toString());
    }

    // If only coordinates are wanted from text, a lone XCoord extractor would do;
    // the PlaceGeocoder used here handles both coordinates and names.

    // Testing only
    params.tag_places = true;
    params.tag_coordinates = true;
    params.output_countries = false;

    // --- extractors ---
    PlaceGeocoder placeTagger = new PlaceGeocoder();
    placeTagger.enablePersonNameMatching(true);
    placeTagger.setParameters(params);
    placeTagger.configure();
    this.addExtractor(placeTagger);

    XTemporal dateTagger = new XTemporal();
    dateTagger.configure();
    this.addExtractor(dateTagger);

    // --- document converter ---
    converter = new XText();
    converter.enableHTMLScrubber(false);
    converter.enableSaving(true);
    converter.enableOverwrite(false);
    converter.setConversionListener(this);

    if (tempDir == null) {
        // No cache directory given: turn saving back off.
        converter.enableSaving(false);
    } else {
        converter.getPathManager().setConversionCache(tempDir);
    }

    try {
        converter.setup();
    } catch (IOException converterErr) {
        throw new ConfigException("Document converter could not start", converterErr);
    }

    // --- job inputs/outputs ---
    this.params.inputFile = inFile.trim();
    this.params.outputFile = outFile.trim();

    if (outFormats == null) {
        return;
    }

    // --- output formatters: one per requested format, started immediately ---
    for (String formatName : outFormats) {
        params.addOutputFormat(formatName);

        AbstractFormatter outputWriter = createFormatter(formatName, params);
        outputWriter.overwrite = overwriteOutput;
        this.addFormatter(outputWriter);
        outputWriter.start(params.getJobName());
    }
}
Use of org.opensextant.ConfigException in project Xponents by OpenSextant.
In class SolrGazetteer, method initialize.
/**
 * Opens the "gazetteer" Solr core, presets the default query parameters and
 * loads the country list.
 *
 * According to the original documentation the Solr location cascades: first
 * the value given to the constructor, then {@code opensextant.solr}, then
 * {@code solr.solr.home}. Only the first step (an explicit {@code solrHome})
 * is visible in this method.
 *
 * @param solrHome Solr home to use, or {@code null} to let SolrProxy resolve it
 * @throws ConfigException
 *             if the countries cannot be loaded because of a Solr error or
 *             an IO/file error; the original exception is kept as the cause
 */
private void initialize(String solrHome) throws ConfigException {
    if (solrHome == null) {
        solr = new SolrProxy("gazetteer");
    } else {
        solr = new SolrProxy(solrHome, "gazetteer");
    }

    // Default query: match everything, returning only the listed fields.
    final String fieldList = "id,name,cc,adm1,adm2,feat_class,feat_code,geo,place_id,name_bias,id_bias,name_type";
    params.set(CommonParams.Q, "*:*");
    params.set(CommonParams.FL, fieldList);

    try {
        this.countryCodes = loadCountries(solr.getInternalSolrServer());
    } catch (SolrServerException solrFailure) {
        throw new ConfigException("SolrGazetteer is unable to load countries due to Solr error", solrFailure);
    } catch (IOException ioFailure) {
        throw new ConfigException("SolrGazetteer is unable to load countries due to IO/file error", ioFailure);
    }
}
Aggregations