use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class PlaceGeocoder method parseKnownNonPlaces.
/**
 * Tags known non-place names (persons, organizations, nationalities) in the input text.
 * Poor-man's named-entity extraction; it runs only when person name matching is enabled,
 * and is attempted even if no geo matches were found.
 *
 * Every TaxonMatch returned by the person matcher is marked as filtered out, then sorted
 * into a persons or orgs list according to the prefix of its taxon names. Both lists are
 * handed to the person-name rule along with the place candidates, and are finally
 * appended to {@code matches}.
 *
 * This step is best-effort: a failure in the underlying matcher is logged and the method
 * returns without adding anything; no exception is propagated to the caller.
 *
 * @param input      text input; its buffer is what gets tagged
 * @param candidates place candidates passed to the person-name rule for evaluation
 * @param matches    output list; person and org matches found here are appended to it
 */
private void parseKnownNonPlaces(TextInput input, List<PlaceCandidate> candidates, List<TextMatch> matches) {
    if (!isPersonNameMatchingEnabled()) {
        return;
    }

    // If this step fails miserably, do not raise error. Log the error and return nothing found.
    List<TextMatch> nonPlaces;
    try {
        nonPlaces = personMatcher.extract(input.buffer);
    } catch (Exception err) {
        // Pass the exception itself so the stack trace is kept; getMessage() alone may be null.
        log.error("Non-place (person/org) name tagging failed", err);
        return;
    }
    if (nonPlaces == null || nonPlaces.isEmpty()) {
        return;
    }

    List<TaxonMatch> persons = new ArrayList<>();
    List<TaxonMatch> orgs = new ArrayList<>();
    log.debug("Matched {}", nonPlaces.size());

    for (TextMatch tm : nonPlaces) {
        if (!(tm instanceof TaxonMatch)) {
            continue;
        }
        TaxonMatch tag = (TaxonMatch) tm;

        //
        // For the purposes of geocoding/geoparsing filter out ALL
        // TaxonMatches. Any place names should reside back in
        // gazetteer. If XTax does have place or location data, that would be new.
        //
        tm.setFilteredOut(true);

        for (Taxon taxon : tag.getTaxons()) {
            // Locale-independent lowercasing: under some default locales (e.g. Turkish)
            // an upper-case 'I' would not lower-case to 'i', breaking the prefix tests.
            String node = taxon.name.toLowerCase(java.util.Locale.ROOT);

            // name spans that are not places.
            if (node.startsWith("person.")) {
                persons.add(tag);
                break;
            } else if (node.startsWith("org.")) {
                // An acronym taxon only counts when the matched text is upper case.
                if (taxon.isAcronym && !tm.isUpper()) {
                    continue;
                }
                orgs.add(tag);
                break;
            } else if (node.startsWith("nationality.")) {
                // NOTE(review): there is no break in this branch, so remaining taxons are
                // still examined and the same tag can be added more than once -- confirm intended.
                persons.add(tag);
                if (taxon.tagset == null) {
                    continue;
                }
                // The tag may be absent as some ethnicities may be mixed in and indicate no country.
                for (String t : taxon.tagset) {
                    int x = t.indexOf("cc+");
                    if (x >= 0) {
                        // Everything after the "cc+" marker is taken as the country code.
                        String isocode = t.substring(x + 3);
                        this.countryInScope(isocode);
                        nationalities.put(tag.getText(), isocode);
                    }
                }
            }
        }
    }

    personNameRule.evaluateNamedEntities(candidates, persons, orgs);
    matches.addAll(persons);
    matches.addAll(orgs);
}
use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class PlaceGeocoder method extract.
/**
 * Unfinished Beta; ready for experimentation and improvement on rules.
 *
 * Geotags the input text, parses coordinates, tags known non-place names and then
 * runs the geocoding rules so that the place candidates come back evaluated and scored.
 *
 * LangID can be set on TextInput input.langid. Only lowercase langIDs please:
 * 'zh', 'ar', tag text for those languages in particular. Null and Other values
 * are treated as generic as of v2.8.
 *
 * <pre>
 * Use TextMatch.getType()
 * to determine how to interpret TextMatch / Geocoding results:
 *
 * Given TextMatch match
 *
 * Place tag: ((PlaceCandidate)match).getGeocoding()
 * Coord tag: (Geocoding)match
 *
 * Both methods yield a geocoding.
 * </pre>
 *
 * @param input
 *            input buffer, doc ID, and optional langID.
 * @return coordinate matches (if any), person/org matches (if any), followed by all
 *         evaluated place candidates.
 * @throws ExtractionException
 *             on err
 */
@Override
public List<TextMatch> extract(TextInput input) throws ExtractionException {
    final long startedAt = System.currentTimeMillis();
    reset();

    List<TextMatch> matches = new ArrayList<>();

    // Step 1. GEOTAG raw text. Flag tag-only = false means extra work is done for geocoding.
    // Language-specific tagging exists only for Arabic here; CJK-specific tagging is disabled.
    List<PlaceCandidate> candidates;
    if (input.langid == null) {
        candidates = tagText(input.buffer, input.id, tagOnly);
    } else if (TextUtils.arabicLang.equals(input.langid)) {
        candidates = this.tagArabicText(input.buffer, input.id, tagOnly);
    } else {
        // Any other language is handled by the generic tagger.
        log.debug("Default Language {}. Treating as Generic.", input.langid);
        candidates = tagText(input, tagOnly);
    }

    // Step 2. COORDINATES. Attempt to parse lat/lon; any coordinates found fire rules
    // that resolve lat/lon to a Province/Country if possible.
    List<TextMatch> coordMatches = parseGeoCoordinates(input);
    if (coordMatches != null) {
        matches.addAll(coordMatches);
    }

    /*
     * Step 3. RULE EVALUATION: accumulate all the evidence from everything found so far.
     * Work with observables first, then associations between candidates, then fine tuning:
     *  - Country rule: a named country weighs heavily.
     *  - Place, Boundary rule: a city or location qualified by a geopolitical boundary
     *    name or code. Paris, France; Paris, Texas.
     *  - Coordinate rule: coordinates emit Province ID and Country ID if possible.
     *  - Person name rule: filters out heavily, using JRC Names and custom TaxCat data.
     *  - Major Places rule: well-known large cities, capitals or provinces.
     *  - Province association rule: weight geos falling in Provinces positively ID'd.
     *  - Location Chooser rule: assemble all evidence and account for weights.
     */
    countryRule.evaluate(candidates);
    nameWithAdminRule.evaluate(candidates);

    // Step 4. NON-PLACE ID. Tag person and org names to negate celebrity names or
    // well-known individuals who share a city name. "Tom Jackson", "Bill Clinton"
    parseKnownNonPlaces(input, candidates, matches);

    // Tagging phase ends here; record how long it took.
    this.taggingTimes.addTimeSince(startedAt);

    // Step 5. Remaining configured rules, in their registered order.
    for (GeocodeRule rule : rules) {
        rule.evaluate(candidates);
    }

    // Step 6. Last rule: score, choose, add confidence.
    chooser.evaluate(candidates);

    // All candidates are returned together with their evidence, whether or not a
    // location was chosen; downstream recipients of 'matches' must know how to parse
    // through evaluated place candidates. Non-geocoded matches appear in non-GIS formats.
    matches.addAll(candidates);

    // Record size and full processing duration for this doc.
    this.matcherTotalTimes.addBytes(input.buffer.length());
    this.matcherTotalTimes.addTimeSince(startedAt);

    return matches;
}
use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class XtractorGroup method process.
/**
 * Process one input through every configured extractor and return the combined matches.
 * Use this if you have no need for formatting output at this time, or if you have
 * complex ExtractionResults where you want to add meta attributes yourself.
 *
 * A failure in one extractor does not stop the others: the error is logged, a summary
 * is appended to the current error list, and processing continues with the next extractor.
 *
 * @param input the text input handed to each extractor
 * @return all matches from all extractors, in extractor order; empty if nothing was found
 */
public List<TextMatch> process(TextInput input) {
    List<TextMatch> oneResultSet = new ArrayList<>();
    progressMonitor.setNumberOfSteps(extractors.size());

    // Process all extraction and compile on a single list.
    for (Extractor x : extractors) {
        try {
            List<TextMatch> results = x.extract(input);
            if (results != null && !results.isEmpty()) {
                oneResultSet.addAll(results);
            }
        } catch (ExtractionException loopErr) {
            // Keep a space before "on Input" so the extractor name and input ID stay readable.
            log.error("Extractor=" + x.getName() + " on Input=" + input.id, loopErr);
            currErrors.add("Extractor=" + x.getName() + " ERR=" + loopErr.getMessage());
        }
    }

    progressMonitor.completeDocument();
    return oneResultSet;
}
use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class RegexPatternManager method reduce_matches.
/**
 * Flags each match in the list according to how its span relates to the spans of the
 * matches that follow it: duplicate (identical span), submatch (completely contained
 * in another match) or overlap. Matches whose spans are entirely disjoint are left
 * untouched. The list itself is not modified; only flags on its elements are set.
 *
 * Spans that merely share an endpoint are not treated as disjoint.
 *
 * @param matches a list of related matches from a single text
 */
public static void reduce_matches(List<TextMatch> matches) {
    final int total = matches.size();

    for (int outer = 0; outer < total; ++outer) {
        final TextMatch first = matches.get(outer);
        final long firstStart = first.start;
        final long firstEnd = first.end;

        // Each unordered pair is examined once: only later entries are compared.
        for (int inner = outer + 1; inner < total; ++inner) {
            final TextMatch second = matches.get(inner);
            final long secondStart = second.start;
            final long secondEnd = second.end;

            // One span lies entirely before or after the other: nothing to flag.
            if (firstEnd < secondStart || firstStart > secondEnd) {
                continue;
            }

            if (secondStart == firstStart && secondEnd == firstEnd) {
                // Identical span: the later entry is the duplicate.
                second.is_duplicate = true;
                first.is_overlap = true;
            } else if (secondStart <= firstStart && firstEnd <= secondEnd) {
                // First span sits inside the second.
                first.is_submatch = true;
                second.is_overlap = true;
            } else if (secondStart >= firstStart && firstEnd >= secondEnd) {
                // Second span sits inside the first.
                first.is_overlap = true;
                second.is_submatch = true;
            } else {
                // Partial overlap in either direction.
                first.is_overlap = true;
                second.is_overlap = true;
            }
        }
    }
}
use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class GeoTaggerMapper method map.
/**
 * Geotags one text record and writes each distinct, unfiltered match to the context
 * as a JSON string keyed by the record's ID.
 *
 * Records that cannot be turned into a TextInput are logged and skipped. Any error
 * raised while geotagging or writing is logged and the record is abandoned; it is not
 * propagated to the caller.
 *
 * @param key        record key; used here only in trace logging
 * @param textRecord raw record text, converted to a TextInput by prepareInput
 * @param context    output sink receiving (record ID, match JSON) pairs
 * @throws IOException          declared by the overridden method
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void map(BytesWritable key, Text textRecord, Context context) throws IOException, InterruptedException {
    ++counter;
    TextInput textObj = null;
    try {
        textObj = prepareInput(null, textRecord);
    } catch (java.lang.NullPointerException npe) {
        // Log at most the first 50 characters. String.valueOf tolerates a null record and
        // the length is clamped, so reporting the failure cannot itself throw.
        String raw = String.valueOf(textRecord);
        log.error("Failed on record {}", raw.substring(0, Math.min(50, raw.length())), npe);
    }
    if (textObj == null) {
        return;
    }

    /* LANG ID = 'ENGLISH',
     * If this is not true, then you need to add LangID to your metadata or detect it live
     */
    textObj.langid = "en";

    // Match texts already emitted for this record.
    HashSet<String> dedup = new HashSet<>();
    try {
        List<TextMatch> matches = geocoder.extract(textObj);
        if (matches.isEmpty()) {
            return;
        }
        Text oid = new Text(textObj.id);

        /* NORMALIZE findings.
         * Reduce all matches, minimizing duplicates, removing whitespace, etc.
         */
        int filtered = 0, duplicates = 0;
        for (TextMatch tm : matches) {
            /* DEDUPLICATE */
            if (dedup.contains(tm.getText())) {
                duplicates += 1;
                continue;
            }
            /* FILTER OUT NOISE */
            if (filterOutMatch(tm)) {
                // Count the rejection so the trace statistics below are accurate.
                filtered += 1;
                continue;
            }
            /* FORMAT */
            JSONObject o = match2JSON(tm);
            dedup.add(tm.getText());
            Text matchOutput = new Text(o.toString());
            /* SERIALIZE GEOCODING */
            context.write(oid, matchOutput);
        }
        if (log.isTraceEnabled()) {
            log.trace("For key {}, found={}, junk filtered={}, duplicates={}", key.toString(), matches.size(), filtered, duplicates);
        }
    } catch (Exception err) {
        log.error("Error running geotagger", err);
    }
}
Aggregations