use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class Transforms method parseAnnotation.
/**
 * Build a Xponents {@link TextMatch} from the JSON form of a single annotation.
 * Parsing data from JSON/REST representations has very limited capability compared to
 * using the Java API for processing routines directly.
 *
 * <p>The annotation's "type" value selects the concrete match class:
 * "place" and "country" produce a PlaceCandidate, "coordinate" produces a
 * GeocoordMatch, and "person", "org" and "taxon" produce a TaxonMatch.
 * The span is read from the "offset" and "length" values.
 *
 * @param data the annotation to convert; expected to be a JSONObject
 * @return the populated match, or null when {@code data} is not a JSONObject
 * @throws JSONException if the annotation type is not recognized, or if one of the
 *         JSON accessors used here fails (for example a required key is absent)
 */
public static TextMatch parseAnnotation(Object data) throws JSONException {
    if (!(data instanceof JSONObject)) {
        return null;
    }
    JSONObject annot = (JSONObject) data;
    String annotType = annot.getString("type");
    String matchText = annot.getString("matchtext");

    TextMatch result;
    switch (annotType) {
        case "place": {
            PlaceCandidate candidate = new PlaceCandidate();
            Place location = new Place();
            candidate.setText(matchText);
            Transforms.parseGeocoding(location, annot);
            // -1 marks an annotation that carried no confidence value.
            candidate.setConfidence(annot.optInt("confidence", -1));
            candidate.choose(location);
            result = candidate;
            break;
        }
        case "coordinate": {
            GeocoordMatch coordMatch = new GeocoordMatch();
            Place location = new Place();
            coordMatch.setText(matchText);
            // The coordinate is not parsed directly; the generic geocoding parser
            // fills a temporary Place which is then copied onto the match.
            Transforms.parseGeocoding(location, annot);
            coordMatch.setLatLon(location);
            coordMatch.setMethod(location.getMethod());
            /* TODO: GeocoordMatch needs to support setters for Geocoding here.
             * Reverse geocoding info (cc, adm1) is not carried over.
             */
            result = coordMatch;
            break;
        }
        case "country": {
            PlaceCandidate candidate = new PlaceCandidate();
            Place country = new Place();
            candidate.setText(matchText);
            country.setName(matchText);
            candidate.setConfidence(annot.optInt("confidence", -1));
            country.setCountryCode(annot.getString("cc"));
            candidate.isCountry = true;
            candidate.choose(country);
            result = candidate;
            break;
        }
        case "person":
        case "org":
        case "taxon": {
            // All taxon flavors are parsed the same way; the matched type string is
            // passed through as the taxon kind.
            TaxonMatch taxonMatch = new TaxonMatch();
            Transforms.parseTaxon(taxonMatch, annotType, annot);
            result = taxonMatch;
            break;
        }
        default:
            throw new JSONException("Unknown Annotation " + annotType);
    }

    // Fields common to every flavor of match.
    result.setType(annotType);
    result.start = annot.getInt("offset");
    result.end = result.start + annot.getInt("length");
    return result;
}
use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class XponentsGeotagger method format.
/**
 * Render a list of matches as the JSON response for a tagging request.
 *
 * <p>The response object has two members: "response" (status plus "numfound") and
 * "annotations" (one JSON node per reported match). "numfound" is the number of
 * nodes actually placed in "annotations".
 *
 * <ul>
 * <li>TaxonMatch: reported only when {@code jobParams.output_taxons} is set; only the
 * first taxon of the match is written, typed "org", "person" or "taxon" based on the
 * prefix of the taxon name.</li>
 * <li>Filtered-out matches and matches that are neither PlaceCandidate nor
 * GeocoordMatch are skipped.</li>
 * <li>GeocoordMatch: reported as type "coordinate".</li>
 * <li>PlaceCandidate: reported as type "country" or "place"; a place with
 * confidence of 10 or less is still reported but flagged "filtered-out".</li>
 * </ul>
 *
 * @param matches matches to report
 * @param jobParams request options; only {@code output_taxons} is read here
 * @return the JSON representation, with character set UTF-8
 * @throws JSONException if building the JSON content fails
 */
private Representation format(List<TextMatch> matches, RequestParameters jobParams) throws JSONException {
    int tagCount = 0;
    JSONObject resultContent = new JSONObject();
    JSONObject resultMeta = new JSONObject();
    resultMeta.put("status", "ok");
    resultMeta.put("numfound", 0);
    JSONArray resultArray = new JSONArray();
    /*
     * Super loop: Iterate through all found entities. Record Taxons as
     * person or orgs; record Geo tags as country, place, or coordinate.
     */
    for (TextMatch name : matches) {
        /*
         * ==========================
         * ANNOTATIONS: non-geographic entities that are filtered out, but worth tracking
         * ==========================
         */
        if (name instanceof TaxonMatch) {
            if (jobParams.output_taxons) {
                TaxonMatch match = (TaxonMatch) name;
                // Only the first taxon is reported; the loop exits after one pass.
                for (Taxon n : match.getTaxons()) {
                    JSONObject node = populateMatch(name);
                    String t = "taxon";
                    String taxon_name = n.name.toLowerCase();
                    if (taxon_name.startsWith("org.")) {
                        t = "org";
                    } else if (taxon_name.startsWith("person.")) {
                        t = "person";
                    }
                    node.put("type", t);
                    // Name of taxon
                    node.put("taxon", n.name);
                    // Name of catalog or source
                    node.put("catalog", n.catalog);
                    resultArray.put(node);
                    // Count only when a node was emitted, so "numfound" cannot
                    // exceed the number of annotations for a match without taxons.
                    ++tagCount;
                    break;
                }
            }
            continue;
        }
        // Filtered-out matches are never reported. This check was previously
        // duplicated further down where it could not be reached, so the debug
        // message was never logged.
        if (name.isFilteredOut()) {
            debug("Filtered out " + name.getText());
            continue;
        }
        // Ignore non-place tags
        if (!(name instanceof PlaceCandidate || name instanceof GeocoordMatch)) {
            continue;
        }
        JSONObject node = populateMatch(name);
        /*
         * ==========================
         * ANNOTATIONS: coordinates
         * ==========================
         */
        if (name instanceof GeocoordMatch) {
            ++tagCount;
            GeocoordMatch geo = (GeocoordMatch) name;
            node.put("type", "coordinate");
            Transforms.createGeocoding(geo, node);
            resultArray.put(node);
            continue;
        }
        /*
         * ==========================
         * ANNOTATIONS: countries, places, etc.
         * ==========================
         */
        PlaceCandidate place = (PlaceCandidate) name;
        Place resolvedPlace = place.getChosen();
        // NOTE(review): guard in case no location was chosen for this candidate;
        // confirm whether getChosen() can return null for unfiltered candidates.
        if (resolvedPlace == null) {
            debug("No chosen place for " + name.getText());
            continue;
        }
        ++tagCount;
        if (place.isCountry) {
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "country");
            node.put("cc", resolvedPlace.getCountryCode());
            node.put("confidence", place.getConfidence());
        } else {
            Transforms.createGeocoding(resolvedPlace, node);
            node.put("name", resolvedPlace.getPlaceName());
            node.put("type", "place");
            node.put("confidence", place.getConfidence());
            // Low-confidence places are reported but flagged for the caller.
            if (place.getConfidence() <= 10) {
                node.put("filtered-out", true);
            }
        }
        resultArray.put(node);
    }
    resultMeta.put("numfound", tagCount);
    resultContent.put("response", resultMeta);
    resultContent.put("annotations", resultArray);
    // Indent factor 2 for the serialized JSON.
    Representation result = new JsonRepresentation(resultContent.toString(2));
    result.setCharacterSet(CharacterSet.UTF_8);
    return result;
}
use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class XponentsGeotagger method process.
/**
 * Run extraction over one text input and return the formatted response.
 *
 * <p>Missing input or a missing text buffer yields a "FAIL" status. Outside of
 * production mode nothing is extracted and a "TEST" status is returned. Any
 * exception raised during extraction or formatting is logged and reported as a
 * "FAIL" status that includes the running request count.
 *
 * @param input the text input to process
 * @param jobParams the request parameters handed to the formatter
 * @return the formatted matches, or a status representation
 */
public Representation process(TextInput input, RequestParameters jobParams) {
    boolean hasText = input != null && input.buffer != null;
    if (!hasText) {
        return status("FAIL", "No text");
    }

    debug("Processing plain text doc");
    ++requestCount;

    if (!prodMode) {
        return status("TEST", "nothing done in test with doc=" + input.id);
    }

    try {
        PlaceGeocoder geocoder = (PlaceGeocoder) getExtractor();
        List<TextMatch> found = geocoder.extract(input);
        // Formulate matches as JSON output.
        return format(found, jobParams);
    } catch (Exception processingErr) {
        error("Failure on doc " + input.id, processingErr);
        String detail = processingErr.getMessage() + "; requests=" + requestCount;
        return status("FAIL", detail);
    }
}
use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class XlayerClientTest method main.
/**
 * Command-line demo: send the content of a text file through an XlayerClient and
 * print each resulting match to the console.
 *
 * @param args {@code args[0]} is the URL given to the XlayerClient;
 *        {@code args[1]} is the path of the text file to process, which is also
 *        used as the document id
 */
public static void main(String[] args) {
    // Both arguments are required; without this check a missing argument
    // surfaced as an ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 2) {
        System.err.println("Usage: XlayerClientTest <service-url> <text-file>");
        return;
    }
    URL url;
    try {
        url = new URL(args[0]);
        /*
         * Create client.
         */
        XlayerClient c = new XlayerClient(url);
        try {
            /*
             * Prepare request. Text must be UTF-8 encoded.
             * Note -- readFile() here assumes the file is unicode content
             */
            String text = FileUtility.readFile(args[1]);
            String docid = args[1];
            /*
             * Process the text and print results to console.
             * Result is a list of TextMatch objects. For each particular
             * TextMatch (Xponents Basic API), you have some common fields related to the
             * text found, and then class-specific fields and objects you need to evaluate yourself.
             *
             * The XlayerClient process() method makes use of Transforms helper class to
             * digest JSON annotations into Java API TextMatch objects of various flavors.
             */
            List<TextMatch> results = c.process(docid, text);
            for (TextMatch m : results) {
                System.out.println(String.format("Found %s %s @ (%d:%d)", m.getType(), m.getText(), m.start, m.end));
            }
        } catch (Exception parseErr) {
            // Reading the file or processing the text failed.
            parseErr.printStackTrace();
        }
    } catch (MalformedURLException e) {
        System.err.println("Invalid service URL: " + args[0]);
        e.printStackTrace();
    } catch (ConfigException e) {
        System.err.println("Could not create client for " + args[0]);
        e.printStackTrace();
    }
}
use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class XCoord method extract_coordinates.
/**
 * Limit the extraction to a particular family of coordinates. Diagnostic
 * messages appear in the TextMatchResult only when debug = ON.
 *
 * <p>Each enabled pattern of the requested family is applied to the text. Every
 * regex hit becomes a GeocoordMatch that is optionally context-filtered, then
 * normalized, filtered and given a precision before being added to the result.
 * Alternate interpretations of a match are added as well. Matches are reduced
 * with PatternManager.reduce_matches before the result is returned.
 *
 * @param text
 *            text to match
 * @param text_id
 *            id for text
 * @param family
 *            pattern family or XConstants.ALL_PATTERNS
 * @return TextMatchResult result set. If input text is null, result set is
 *         null
 */
public TextMatchResult extract_coordinates(String text, String text_id, int family) {
    if (text == null) {
        return null;
    }
    int bufsize = text.length();
    TextMatchResult results = new TextMatchResult();
    results.result_id = text_id;
    results.matches = new ArrayList<TextMatch>();
    int patternsComplete = 0;
    int found = 0;
    for (RegexPattern repat : patterns.get_patterns()) {
        log.debug("pattern={}", repat.id);
        if (!repat.enabled) {
            log.debug("CFG pattern={} not enabled", repat.id);
            continue;
        }
        GeocoordPattern pat = (GeocoordPattern) repat;
        // To limit multiple use enable_XXXX()
        if (family != XConstants.ALL_PATTERNS && pat.cce_family_id != family) {
            log.debug("CFG pattern={} not requested", pat.id);
            continue;
        }
        Matcher match = pat.regex.matcher(text);
        results.evaluated = true;
        while (match.find()) {
            // Counts every regex hit, including hits that are discarded below.
            ++found;
            GeocoordMatch coord = new GeocoordMatch();
            // MATCH METHOD aka Pattern ID aka CCE instance
            coord.pattern_id = pat.id;
            coord.cce_family_id = pat.cce_family_id;
            coord.cce_variant = pat.cce_variant;
            coord.start = match.start();
            coord.end = match.end();
            coord.setText(match.group());
            if ((RUNTIME_FLAGS & XConstants.CONTEXT_FILTERS_ON) > 0) {
                if (this.filterOutContext(text, coord.start)) {
                    log.debug("Filtered out noisy match, {} found by {}", coord.getText(), pat.id);
                    continue;
                }
            }
            // Normalize
            try {
                GeocoordNormalization.normalize_coordinate(coord, patterns.group_matches(pat, match));
            } catch (NormalizationException normErr) {
                // Quietly ignore unless debugging; the match is dropped either way.
                if (debug) {
                    results.message = "Parse error with '" + coord.getText() + "'";
                    log.error(results.message, normErr);
                }
                continue;
            }
            if (GeocoordNormalization.filter_out(coord)) {
                if (debug) {
                    results.message = "Filtered out coordinate pattern=" + pat.id + " value='" + coord.getText() + "'";
                    log.info("Normalization Filter fired, MSG=" + results.message);
                }
                continue;
            }
            // Establish precision
            GeocoordNormalization.set_precision(coord);
            /*
             * Caller may want to disable getContext operation here for
             * short texts.... or for any use case. This is more helpful for
             * longer texts with many annotations.
             */
            if ((XCoord.RUNTIME_FLAGS & XConstants.FLAG_EXTRACT_CONTEXT) > 0) {
                // returns indices for two windows before and after match
                int[] slices = TextUtils.get_text_window(coord.start, coord.getLength(), bufsize, match_width);
                // This sets the context window before/after.
                coord.setContext(
                        // left l1 to left l2
                        TextUtils.delete_eol(text.substring(slices[0], slices[1])),
                        // right r1 to r2
                        TextUtils.delete_eol(text.substring(slices[2], slices[3])));
            }
            set_match_id(coord, found);
            results.matches.add(coord);
            if (coord.hasOtherIterpretations()) {
                for (GeocoordMatch m2 : coord.getOtherInterpretations()) {
                    // Other interpretations may have different coord text;
                    // only the metadata of the primary match is copied over.
                    m2.copyMetadata(coord);
                    results.matches.add(m2);
                }
            }
        }
        patternsComplete++;
        // The "+ 1" belongs to the denominator. Without the parentheses the
        // expression evaluated as (patternsComplete / size) + 1, which is always
        // greater than 1.
        // NOTE(review): the extra step in the denominator presumably leaves room
        // for the final reduce phase -- confirm against updateProgress().
        updateProgress(patternsComplete / (double) (patterns.get_patterns().size() + 1));
    }
    // "pass" is the wrong idea. If no data was found
    // because there was no data, then it still passes.
    //
    results.pass = !results.matches.isEmpty();
    PatternManager.reduce_matches(results.matches);
    return results;
}
Aggregations