use of org.opensextant.extractors.flexpat.RegexPattern in project Xponents by OpenSextant.
the class XCoord method extract_coordinates.
/**
 * Extracts coordinates from a text, optionally limited to a single family
 * of coordinate patterns. Diagnostic messages are stored in the result's
 * {@code message} field only when debug is ON.
 *
 * <p>
 * Each enabled pattern of the requested family is applied to the whole text.
 * A raw match is kept only if it survives the optional context filter,
 * normalizes without error, and is not rejected by
 * {@code GeocoordNormalization.filter_out}. Alternate interpretations of a
 * kept match are added to the result right after it.
 *
 * @param text
 *            text to match
 * @param text_id
 *            id for text; stored as the result's {@code result_id}
 * @param family
 *            pattern family or XConstants.ALL_PATTERNS
 * @return TextMatchResult holding the matches; {@code pass} is true when at
 *         least one match was collected. If input text is null, the result
 *         is null
 */
public TextMatchResult extract_coordinates(String text, String text_id, int family) {
    if (text == null) {
        return null;
    }
    int bufsize = text.length();
    TextMatchResult results = new TextMatchResult();
    results.result_id = text_id;
    results.matches = new ArrayList<TextMatch>();

    // Progress is reported as completed patterns over (pattern count + 1).
    // The original expression "n / (double) size + 1" bound the "+ 1" to the
    // quotient, so every reported value was greater than 1.
    // NOTE(review): assumes updateProgress expects a fraction in [0, 1] and
    // that the extra step stands for the post-processing below -- confirm.
    final int totalSteps = patterns.get_patterns().size() + 1;
    int patternsComplete = 0;
    int found = 0;

    for (RegexPattern repat : patterns.get_patterns()) {
        log.debug("pattern={}", repat.id);
        if (!repat.enabled) {
            log.debug("CFG pattern={} not enabled", repat.id);
            continue;
        }
        GeocoordPattern pat = (GeocoordPattern) repat;
        // To limit multiple use enable_XXXX()
        if (family != XConstants.ALL_PATTERNS && pat.cce_family_id != family) {
            log.debug("CFG pattern={} not requested", pat.id);
            continue;
        }

        Matcher match = pat.regex.matcher(text);
        results.evaluated = true;
        while (match.find()) {
            // Counts every raw regex hit, including those discarded below;
            // the count is what set_match_id receives for kept matches.
            ++found;

            GeocoordMatch coord = new GeocoordMatch();
            // MATCH METHOD aka Pattern ID aka CCE instance
            coord.pattern_id = pat.id;
            coord.cce_family_id = pat.cce_family_id;
            coord.cce_variant = pat.cce_variant;
            coord.start = match.start();
            coord.end = match.end();
            coord.setText(match.group());

            if ((RUNTIME_FLAGS & XConstants.CONTEXT_FILTERS_ON) > 0) {
                if (this.filterOutContext(text, coord.start)) {
                    log.debug("Filtered out noisy match, {} found by {}", coord.getText(), pat.id);
                    continue;
                }
            }

            // Normalize; a match that cannot be normalized is dropped.
            try {
                GeocoordNormalization.normalize_coordinate(coord, patterns.group_matches(pat, match));
            } catch (NormalizationException normErr) {
                // Quietly ignored unless debugging.
                if (debug) {
                    results.message = "Parse error with '" + coord.getText() + "'";
                    log.error(results.message, normErr);
                }
                continue;
            }

            if (GeocoordNormalization.filter_out(coord)) {
                if (debug) {
                    results.message = "Filtered out coordinate pattern=" + pat.id + " value='" + coord.getText() + "'";
                    log.info("Normalization Filter fired, MSG={}", results.message);
                }
                continue;
            }

            // Establish precision
            GeocoordNormalization.set_precision(coord);

            // Caller may want to disable getContext operation here for
            // short texts.... or for any use case. This is more helpful for
            // longer texts with many annotations.
            if ((XCoord.RUNTIME_FLAGS & XConstants.FLAG_EXTRACT_CONTEXT) > 0) {
                // returns indices for two windows before and after match
                int[] slices = TextUtils.get_text_window(coord.start, coord.getLength(), bufsize, match_width);
                // This sets the context window before/after:
                // left is slices[0]..slices[1], right is slices[2]..slices[3].
                coord.setContext(
                        TextUtils.delete_eol(text.substring(slices[0], slices[1])),
                        TextUtils.delete_eol(text.substring(slices[2], slices[3])));
            }

            set_match_id(coord, found);
            results.matches.add(coord);

            if (coord.hasOtherIterpretations()) {
                for (GeocoordMatch m2 : coord.getOtherInterpretations()) {
                    // Other interpretations may have different coord text;
                    // only metadata is copied from the primary match.
                    m2.copyMetadata(coord);
                    results.matches.add(m2);
                }
            }
        }

        patternsComplete++;
        updateProgress(patternsComplete / (double) totalSteps);
    }

    // "pass" is the wrong idea. If no data was found
    // because there was no data, then it still passes.
    // It is computed before reduce_matches runs on the list.
    results.pass = !results.matches.isEmpty();
    PatternManager.reduce_matches(results.matches);
    return results;
}
use of org.opensextant.extractors.flexpat.RegexPattern in project Xponents by OpenSextant.
the class XTemporal method extract_dates.
/**
 * A direct call to extract dates; which is useful for diagnostics and
 * development/testing.
 *
 * <p>
 * Every enabled pattern is applied to the whole text. A raw match is kept
 * only if it normalizes to a non-null date without error and, for patterns
 * of the "YMD" family, is not rejected by {@code isDistantPastYMD}.
 *
 * @param text
 *            text to scan; a null text yields an empty result that was not
 *            evaluated
 * @param text_id
 *            id for text; stored as the result's {@code result_id}
 * @return TextMatchResult holding the date matches; {@code pass} is true
 *         when at least one match was collected. Never null
 */
public TextMatchResult extract_dates(String text, String text_id) {
    TextMatchResult results = new TextMatchResult();
    results.matches = new ArrayList<TextMatch>();
    results.result_id = text_id;

    if (text == null) {
        // Nothing to scan; creating a Matcher on null would throw
        // NullPointerException.
        results.pass = false;
        return results;
    }

    // Progress is reported as completed patterns over (pattern count + 1).
    // The original expression "n / (double) size + 1" bound the "+ 1" to the
    // quotient, so every reported value was greater than 1.
    // NOTE(review): assumes updateProgress expects a fraction in [0, 1] and
    // that the extra step stands for the post-processing below -- confirm.
    final int totalSteps = patterns.get_patterns().size() + 1;
    int found = 0;
    int patternsComplete = 0;

    for (RegexPattern pat : patterns.get_patterns()) {
        log.debug("pattern={}", pat.id);
        if (!pat.enabled) {
            log.debug("CFG pattern={} not enabled.", pat.id);
            continue;
        }

        Matcher match = pat.regex.matcher(text);
        results.evaluated = true;
        while (match.find()) {
            // Counts every raw regex hit, including those discarded below.
            ++found;

            DateMatch dt = new DateMatch();
            dt.pattern_id = pat.id;
            dt.start = match.start();
            dt.end = match.end();
            dt.setText(match.group());

            try {
                DateNormalization.normalize_date(patterns.group_map(pat, match), dt);
                if (dt.datenorm == null) {
                    continue;
                }
                if ("YMD".equalsIgnoreCase(pat.family)) {
                    if (this.isDistantPastYMD(dt.datenorm)) {
                        continue;
                    }
                }
                dt.datenorm_text = DateNormalization.format_date(dt.datenorm);

                // Flags worth setting here.
                dt.isDistantPast = isDistantPast(dt.datenorm.getTime());
                dt.isFuture = isFuture(dt.datenorm.getTime());
                set_match_id(dt, found);
            } catch (Exception err) {
                // Not a date. The match is dropped on purpose; the cause is
                // kept visible at debug level instead of vanishing silently.
                if (log.isDebugEnabled()) {
                    log.debug("Not a date: pattern=" + pat.id + " text='" + match.group() + "'", err);
                }
                continue;
            }
            results.matches.add(dt);
        }

        patternsComplete++;
        updateProgress(patternsComplete / (double) totalSteps);
    }

    // pass reflects only whether anything was collected; it is computed
    // before reduce_matches runs on the list.
    results.pass = !results.matches.isEmpty();
    PatternManager.reduce_matches(results.matches);
    return results;
}
use of org.opensextant.extractors.flexpat.RegexPattern in project Xponents by OpenSextant.
the class PatternsOfLife method extract_patterns.
/**
 * Extract patterns of a certain family from a block of text.
 *
 * <p>
 * Every enabled pattern whose id starts with {@code family} (or every
 * enabled pattern when {@code family} is null) is applied to the whole
 * text. Each hit becomes a PoliMatch -- either the generic class or an
 * instance of the pattern's {@code match_class} -- which is normalized and
 * given a context window before being added to the result.
 *
 * @param text
 *            - data to process; a null text yields an empty result that was
 *            not evaluated
 * @param text_id
 *            - identifier for the data
 * @param family
 *            - optional filter; to reuse the same PatManager but extract
 *            certain patterns only. Compared as a prefix of the pattern id.
 *
 * @return TextMatchResult holding the matches; {@code pass} is true when at
 *         least one match was collected. Never null
 */
public TextMatchResult extract_patterns(String text, String text_id, String family) {
    TextMatchResult results = new TextMatchResult();
    results.result_id = text_id;
    results.matches = new ArrayList<TextMatch>();

    if (text == null) {
        // Nothing to scan; text.length() would throw NullPointerException.
        results.pass = false;
        return results;
    }

    int bufsize = text.length();

    // Progress is reported as completed patterns over (pattern count + 1).
    // The original expression "n / (double) size + 1" bound the "+ 1" to the
    // quotient, so every reported value was greater than 1.
    // NOTE(review): assumes updateProgress expects a fraction in [0, 1] and
    // that the extra step stands for the post-processing below -- confirm.
    final int totalSteps = patterns.get_patterns().size() + 1;
    int found = 0;
    int patternsComplete = 0;

    for (RegexPattern repat : patterns.get_patterns()) {
        if (!repat.enabled) {
            continue;
        }
        if (family != null && !repat.id.startsWith(family)) {
            continue;
        }

        Matcher match = repat.regex.matcher(text);
        results.evaluated = true;
        while (match.find()) {
            // Counts every raw regex hit, including those discarded below.
            ++found;

            Map<String, String> fields = patterns.group_map(repat, match);

            PoliMatch poliMatch = null;
            if (repat.match_class == null) {
                poliMatch = new PoliMatch(fields, match.group());
            } else {
                // Pattern declares its own match class; instantiate it
                // reflectively through its no-argument constructor.
                try {
                    poliMatch = (PoliMatch) repat.match_class.newInstance();
                    poliMatch.setText(match.group());
                    poliMatch.setGroups(fields);
                } catch (InstantiationException classErr1) {
                    poliMatch = null;
                    log.error("Could not create... ", classErr1);
                } catch (IllegalAccessException classErr2) {
                    poliMatch = null;
                    log.error("Could not create... ", classErr2);
                }
            }

            if (poliMatch == null) {
                // This would have been thrown at init.
                log.error("Could not find pattern family for {}", repat.id);
                continue;
            }

            poliMatch.setType(repat.family);
            poliMatch.pattern_id = repat.id;
            poliMatch.start = match.start();
            poliMatch.end = match.end();
            poliMatch.normalize();

            // TODO: Assess filters?

            // returns indices for window around text match
            int[] slices = TextUtils.get_text_window(poliMatch.start, bufsize, match_width);
            // left l1 to left l2
            poliMatch.setContext(TextUtils.delete_eol(text.substring(slices[0], slices[1])));

            set_match_id(poliMatch, found);
            results.matches.add(poliMatch);
        }

        patternsComplete++;
        updateProgress(patternsComplete / (double) totalSteps);
    }

    // pass reflects only whether anything was collected; it is computed
    // before reduce_matches runs on the list.
    results.pass = !results.matches.isEmpty();
    PoliPatternManager.reduce_matches(results.matches);
    return results;
}
Aggregations