use of org.opensextant.extraction.NormalizationException in project Xponents by OpenSextant.
the class XCoord method extract_coordinates.
/**
 * Extract coordinates from text, limiting the extraction to a particular
 * family of coordinate patterns. Diagnostic messages are recorded on the
 * returned result only when debug = ON.
 *
 * Each enabled pattern of the requested family is matched against the text.
 * Every raw match is optionally context-filtered, normalized, filtered,
 * assigned a precision and (optionally) a context window before it is added
 * to the result. Alternate interpretations of a match are added as well.
 *
 * @param text
 *            text to match
 * @param text_id
 *            id for text; stored as the result_id of the returned result
 * @param family
 *            pattern family or XConstants.ALL_PATTERNS
 * @return TextMatchResult holding the matches. If input text is null, the
 *         result is null
 */
public TextMatchResult extract_coordinates(String text, String text_id, int family) {
    if (text == null) {
        return null;
    }

    int bufsize = text.length();

    TextMatchResult results = new TextMatchResult();
    results.result_id = text_id;
    results.matches = new ArrayList<TextMatch>();

    // The number of patterns does not change while matching; read it once.
    int patternCount = patterns.get_patterns().size();
    int patternsComplete = 0;
    int found = 0;

    for (RegexPattern repat : patterns.get_patterns()) {
        log.debug("pattern={}", repat.id);

        if (!repat.enabled) {
            log.debug("CFG pattern={} not enabled", repat.id);
            continue;
        }

        GeocoordPattern pat = (GeocoordPattern) repat;

        // To limit multiple use enable_XXXX()
        if (family != XConstants.ALL_PATTERNS && pat.cce_family_id != family) {
            log.debug("CFG pattern={} not requested", pat.id);
            continue;
        }

        Matcher match = pat.regex.matcher(text);
        results.evaluated = true;

        while (match.find()) {
            // Counts every raw regex hit, including those rejected below.
            ++found;

            GeocoordMatch coord = new GeocoordMatch();

            // MATCH METHOD aka Pattern ID aka CCE instance
            coord.pattern_id = pat.id;
            coord.cce_family_id = pat.cce_family_id;
            coord.cce_variant = pat.cce_variant;
            coord.start = match.start();
            coord.end = match.end();
            coord.setText(match.group());

            if ((RUNTIME_FLAGS & XConstants.CONTEXT_FILTERS_ON) > 0) {
                if (this.filterOutContext(text, coord.start)) {
                    log.debug("Filtered out noisy match, {} found by {}", coord.getText(), pat.id);
                    continue;
                }
            }

            // Normalize; a match that cannot be normalized is dropped.
            try {
                GeocoordNormalization.normalize_coordinate(coord, patterns.group_matches(pat, match));
            } catch (NormalizationException normErr) {
                if (debug) {
                    // Outside of debug mode the failure is quietly ignored.
                    results.message = "Parse error with '" + coord.getText() + "'";
                    log.error(results.message, normErr);
                }
                continue;
            }

            if (GeocoordNormalization.filter_out(coord)) {
                if (debug) {
                    results.message = "Filtered out coordinate pattern=" + pat.id + " value='" + coord.getText() + "'";
                    log.info("Normalization Filter fired, MSG={}", results.message);
                }
                continue;
            }

            // Establish precision
            GeocoordNormalization.set_precision(coord);

            /*
             * Caller may want to disable getContext operation here for
             * short texts.... or for any use case. This is more helpful for
             * longer texts with many annotations.
             */
            if ((XCoord.RUNTIME_FLAGS & XConstants.FLAG_EXTRACT_CONTEXT) > 0) {
                // returns indices for two windows before and after match
                int[] slices = TextUtils.get_text_window(coord.start, coord.getLength(), bufsize, match_width);

                // This sets the context window before/after.
                coord.setContext(
                        // left l1 to left l2
                        TextUtils.delete_eol(text.substring(slices[0], slices[1])),
                        // right r1 to r2
                        TextUtils.delete_eol(text.substring(slices[2], slices[3])));
            }

            set_match_id(coord, found);
            results.matches.add(coord);

            if (coord.hasOtherIterpretations()) {
                for (GeocoordMatch m2 : coord.getOtherInterpretations()) {
                    // Other interpretations may have different coord text;
                    // only the metadata is copied from the primary match.
                    m2.copyMetadata(coord);
                    results.matches.add(m2);
                }
            }
        }

        patternsComplete++;
        // Report the fraction of patterns completed. The divisor must be
        // parenthesized as (count + 1): without the parentheses the "+ 1"
        // is added to the quotient and every reported value exceeds 1.0.
        // NOTE(review): presumably the extra step leaves room for the
        // post-processing below -- confirm against updateProgress().
        updateProgress(patternsComplete / (double) (patternCount + 1));
    }

    // "pass" is the wrong idea. If no data was found
    // because there was no data, then it still passes.
    //
    results.pass = !results.matches.isEmpty();

    PatternManager.reduce_matches(results.matches);

    return results;
}
use of org.opensextant.extraction.NormalizationException in project Xponents by OpenSextant.
the class GeocoordNormalization method normalize_coordinate.
/**
 * Normalizes a coordinate match in place: derives coord_text and the
 * latitude/longitude from the matched fields, according to the pattern
 * family recorded on the match. Families other than DD, DM, DMS, MGRS and
 * UTM leave the match untouched.
 *
 * @param m      match to normalize; it is modified by this call
 * @param groups matched fields, keyed by field name
 * @throws NormalizationException if a UTM value cannot be parsed, or if
 *                                MGRS parsing fails with anything other than
 *                                an IllegalArgumentException (that case only
 *                                marks the match as filtered out)
 */
public static void normalize_coordinate(GeocoordMatch m, Map<String, TextEntity> groups) throws NormalizationException {
    // The regex groups are not consumed by name directly, so keep two views of the
    // matched fields: name -> text/offset entity (groups) and name -> plain text.
    Map<String, String> fieldText = new HashMap<String, String>();
    for (Map.Entry<String, TextEntity> field : groups.entrySet()) {
        fieldText.put(field.getKey(), field.getValue().getText());
    }

    if (m.cce_family_id == XConstants.DD_PATTERN) {
        // Decimal degrees. Fields: decDegLat, decDegLon, degSym, hemiLat, hemiLon
        DMSOrdinate latitude = new DMSOrdinate(groups, fieldText, DMLAT, m.getText());
        DMSOrdinate longitude = new DMSOrdinate(groups, fieldText, DMLON, m.getText());

        // Coordinate-only text, e.g. "+42.4440 -102.3333", keeping the
        // precision given in the original text.
        m.lat_text = latitude.text;
        m.lon_text = longitude.text;
        m.setSeparator(groups);
        m.setCoordinate(latitude, longitude);

        // DD filters enabled?
        // To Disable: XCoord.RUNTIME_FLAGS XOR XConstants.DD_FILTERS_ON
        if ((XCoord.RUNTIME_FLAGS & XConstants.DD_FILTERS_ON) > 0) {
            // With filters ON: no alphabetic hemisphere (ENSW) on either
            // ordinate and no coordinate symbols on the latitude means this
            // is likely not a DD coordinate -- filter out.
            boolean alphaHemisphere = longitude.hemisphere.isAlpha() || latitude.hemisphere.isAlpha();
            if (!alphaHemisphere && !latitude.hasSymbols()) {
                m.setFilteredOut(true);
            }
        } else {
            // With filters OFF the match is filtered only when the
            // latitude/longitude pair fails validation.
            boolean valid = GeodeticUtility.validateCoordinate(m.getLatitude(), m.getLongitude());
            m.setFilteredOut(!valid);
        }

        m.coord_text = m.lat_text + " " + m.lon_text;
    } else if (m.cce_family_id == XConstants.DM_PATTERN || m.cce_family_id == XConstants.DMS_PATTERN) {
        // Degrees-minutes and degrees-minutes-seconds are handled alike.
        DMSOrdinate latitude = new DMSOrdinate(groups, fieldText, DMLAT, m.getText());
        DMSOrdinate longitude = new DMSOrdinate(groups, fieldText, DMLON, m.getText());

        m.lat_text = latitude.text;
        m.lon_text = longitude.text;
        m.setSeparator(groups);
        m.setCoordinate(latitude, longitude);

        // Only run the dash check when nothing has filtered the match yet.
        if (!m.isFilteredOut()) {
            m.setFilteredOut(m.evaluateInvalidDashes());
        }

        m.coord_text = m.lat_text + " " + m.lon_text;
    } else if (m.cce_family_id == XConstants.MGRS_PATTERN) {
        // Keep a whitespace-free version of the text first; it aids
        // reporting if parsing fails.
        m.coord_text = TextUtils.delete_whitespace(m.getText());

        // TODO: make use of multiple answers.
        try {
            MGRS[] candidates = MGRSParser.parseMGRS(m.getText(), m.coord_text, fieldText);

            if (candidates != null) {
                MGRS primary = candidates[0];
                m.coord_text = primary.toString();

                Geodetic2DPoint primaryPoint = primary.toGeodetic2DPoint();
                m.setLatitude(primaryPoint.getLatitudeAsDegrees());
                m.setLongitude(primaryPoint.getLongitudeAsDegrees());
                m.setBalanced(true);

                if (candidates.length == 2) {
                    // A second candidate is attached as another interpretation.
                    MGRS alternate = candidates[1];

                    GeocoordMatch other = new GeocoordMatch();
                    other.copy(m);
                    other.coord_text = alternate.toString();

                    Geodetic2DPoint alternatePoint = alternate.toGeodetic2DPoint();
                    other.setLatitude(alternatePoint.getLatitudeAsDegrees());
                    other.setLongitude(alternatePoint.getLongitudeAsDegrees());
                    // Really balanced?
                    other.setBalanced(true);

                    m.addOtherInterpretation(other);
                }
            }
        } catch (java.lang.IllegalArgumentException invalidMgrs) {
            // No normalization was possible: the match is an invalid MGRS value.
            m.setFilteredOut(true);
        } catch (Exception failure) {
            throw new NormalizationException("Failed to parse MGRS", failure);
        }
    } else if (m.cce_family_id == XConstants.UTM_PATTERN) {
        m.coord_text = TextUtils.delete_whitespace(m.getText());

        try {
            UTM parsed = UTMParser.parseUTM(m.coord_text, fieldText);
            if (parsed != null) {
                Geodetic2DPoint point = parsed.getGeodetic();
                m.setLatitude(point.getLatitudeAsDegrees());
                m.setLongitude(point.getLongitudeAsDegrees());
                m.coord_text = parsed.toString();
            }
        } catch (java.lang.IllegalArgumentException invalidUtm) {
            // No normalization done; report the offending text.
            throw new NormalizationException("Failed to parse UTM pattern with text=" + m.getText() + " COORD?:" + m.coord_text, invalidUtm);
        } catch (Exception failure) {
            throw new NormalizationException("Failed to parse UTM pattern", failure);
        }
    }
}
use of org.opensextant.extraction.NormalizationException in project Xponents by OpenSextant.
the class TestPoLi method main.
/**
 * Command-line driver for a simple PoLi test run.
 *
 * @param args
 *            command-line options: <code>-f</code> runs the system tests,
 *            <code>-u FILE</code> tests a user-supplied text file, and
 *            <code>-c FILE</code> names a patterns configuration file to
 *            use instead of the default. Any other option prints the usage
 *            and exits with status 1.
 */
public static void main(String[] args) {
    final boolean debug = true;

    boolean runSystemTests = false;
    String userFile = null;
    String patternConfig = null;

    // Parse command-line options.
    try {
        gnu.getopt.Getopt options = new gnu.getopt.Getopt("Poli", args, "c:u:f");
        for (int opt = options.getopt(); opt != -1; opt = options.getopt()) {
            if (opt == 'f') {
                System.out.println("\tSystem TESTS======= ");
                runSystemTests = true;
            } else if (opt == 'u') {
                userFile = options.getOptarg();
                System.out.println("\tUser TESTS======= FILE=" + userFile);
            } else if (opt == 'c') {
                patternConfig = options.getOptarg();
                System.out.println("\tUser Patterns Configuration ======= FILE=" + patternConfig);
            } else {
                TestPoLi.usage();
                System.exit(1);
            }
        }
    } catch (Exception optionErr) {
        optionErr.printStackTrace();
        TestPoLi.usage();
        System.exit(1);
    }

    // Build and configure the extractor; a configuration failure is fatal.
    PatternsOfLife poli = null;
    try {
        poli = new PatternsOfLife(debug);
        if (patternConfig != null) {
            poli.configure(patternConfig);
        } else {
            // No -c option given: use the default config file.
            poli.configure();
        }
    } catch (ConfigException configErr) {
        configErr.printStackTrace();
        System.exit(-1);
    }

    // Run the requested test; system tests take precedence over a user file.
    try {
        TestPoLiReporter reporter = new TestPoLiReporter(poli);
        if (runSystemTests) {
            reporter.test();
        } else if (userFile != null) {
            reporter.testUserFile(userFile);
        }
    } catch (NormalizationException normErr) {
        normErr.printStackTrace();
    } catch (IOException ioErr) {
        ioErr.printStackTrace();
    }
}
Aggregations