use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class TaxonMatcher method extractorImpl.
/**
 * Tags the input text and turns every raw tag into a taxon match. Usable
 * with or without a formal document ID.
 *
 * @param id
 *            doc id; when null, NO_DOC_ID is used in its place
 * @param buf
 *            input text
 * @return list of matches; only matches that ended up with at least one
 *         taxon are included
 * @throws ExtractionException
 *             propagated from the calls made here (presumably the tagger
 *             request -- confirm against tagTextCallSolrTagger)
 */
private List<TextMatch> extractorImpl(String id, String buf) throws ExtractionException {
    String docid = (id == null) ? NO_DOC_ID : id;

    // beanMap is handed to the tagger call and read back below, keyed by
    // the integer IDs listed on each tag.
    Map<Integer, Object> beanMap = new HashMap<Integer, Object>(100);
    QueryResponse response = tagTextCallSolrTagger(buf, docid, beanMap);

    @SuppressWarnings("unchecked")
    List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");
    log.debug("TAGS SIZE = {}", tags.size());

    List<TextMatch> matches = new ArrayList<TextMatch>();
    int tagNumber = 0;

    for (NamedList<?> tag : tags) {
        int startOffset = ((Integer) tag.get("startOffset")).intValue();
        // endOffset is +1 char after the last matched character.
        int endOffset = ((Integer) tag.get("endOffset")).intValue();

        TaxonMatch match = new TaxonMatch();
        match.start = startOffset;
        match.end = endOffset;

        // Every tag consumes a number, even when it is discarded below.
        tagNumber += 1;
        match.match_id = docid + "#" + tagNumber;

        // The tag's own "matchText" is not reliable (it can be null), so
        // the text is always cut from the buffer by offsets.
        match.setText(buf.substring(match.start, match.end));

        // A phrase with a single TAB is okay; more formatting space is not.
        if (TextUtils.countFormattingSpace(match.getText()) > 1) {
            continue;
        }

        @SuppressWarnings("unchecked")
        List<Integer> taxonIDs = (List<Integer>) tag.get("ids");
        for (Integer solrId : taxonIDs) {
            Object refData = beanMap.get(solrId);
            if (refData != null) {
                Taxon taxon = (Taxon) refData;
                // Filter out non-Acronyms, e.g., 'who' is not a match for 'WHO'.
                boolean rejected = this.filterNonAcronyms && taxon.isAcronym && !match.isUpper();
                if (!rejected) {
                    match.addTaxon(taxon);
                }
            }
        }

        if (match.hasTaxons()) {
            matches.add(match);
        }
    }

    log.debug("FOUND LABELS count={}", matches.size());
    return matches;
}
use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class CSVFormatter method writeGeocodingResult.
/**
 * Writes one output row per match in the given result.
 *
 * The same value map is cleared and refilled for every match. When the
 * configured field set includes the FILEPATH field, the record's file is
 * added before the remaining columns are filled in by buildRow.
 *
 * A failure to write a row is logged and the loop moves on to the next
 * match; it is never propagated to the caller.
 *
 * @param rowdata
 *            extraction result whose matches are written
 */
@Override
public void writeGeocodingResult(ExtractionResult rowdata) {
    HashMap<String, String> values = new HashMap<String, String>();
    for (TextMatch m : rowdata.matches) {
        values.clear();
        if (fieldSet.contains(OpenSextantSchema.FILEPATH.getName())) {
            values.put(OpenSextantSchema.FILEPATH.getName(), rowdata.recordFile);
        }
        buildRow(values, m);
        try {
            writer.write(values, header, outputSchema);
        } catch (Exception err) {
            // Best effort: keep writing the remaining rows, but hand the
            // throwable to the logger so the exception type and stack trace
            // are not lost (the localized message alone may even be null).
            log.error("Delayed error ERR:" + err.getLocalizedMessage(), err);
        }
    }
}
use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class TestGazMatcher method summarizeFindings.
/**
 * Prints a console summary of the given matches: per-match details first,
 * then the sorted, de-duplicated place names, country names and
 * coordinate texts that were collected along the way.
 *
 * Filtered-out matches are reported but not collected.
 *
 * @param matches
 *            matches to summarize
 */
public static void summarizeFindings(List<TextMatch> matches) {
    System.out.println("MENTIONS ALL == " + matches.size());

    Set<String> places = new TreeSet<>();
    Set<String> countries = new TreeSet<>();
    Set<String> coords = new TreeSet<>();

    for (TextMatch match : matches) {
        printGeoTags(match);
        boolean filtered = match.isFilteredOut();

        if (match instanceof PlaceCandidate) {
            PlaceCandidate place = (PlaceCandidate) match;
            if (filtered) {
                print("Filtered Out. Rules = " + place.getRules());
                continue;
            }
            if (!place.getRules().isEmpty()) {
                print("Rules = " + place.getRules());
            }

            if (place.isCountry) {
                countries.add(place.getText());
            } else {
                // Any non-country candidate counts as a place name; a
                // chosen location only adds extra detail to the report.
                if (place.getChosen() != null) {
                    print(String.format("\tgeocoded @ %s with conf=%d", place.getChosen(), place.getConfidence()));
                    ScoredPlace runnerUp = place.getSecondChoice();
                    if (runnerUp != null) {
                        print(String.format("\tgeocoded @ %s second place", runnerUp));
                    }
                }
                places.add(place.getText());
            }
        } else if (filtered) {
            System.out.println("\t(filtered out: " + match.getText() + ")");
            continue;
        }

        if (match instanceof GeocoordMatch) {
            GeocoordMatch coord = (GeocoordMatch) match;
            coords.add(coord.getText());
            if (coord.getRelatedPlace() != null) {
                System.out.println("Coordinate at place named " + coord.getRelatedPlace());
            }
        }
    }

    System.out.println("MENTIONS DISTINCT PLACES == " + places.size());
    System.out.println(places);
    System.out.println("MENTIONS COUNTRIES == " + countries.size());
    System.out.println(countries);
    System.out.println("MENTIONS COORDINATES == " + coords.size());
    System.out.println(coords);
}
use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class GISDataFormatter method writeGeocodingResult.
/**
 * Implementation of adding info extraction/geocoding results to GIS outputs.
 *
 * Matches rejected by filterOut are skipped. Every remaining match
 * increments the running ID, including matches that are then ignored
 * because no Geocoding could be obtained for them. For each geocodable
 * match the rows built by the GIS data model are written to the output
 * stream.
 *
 * Only the first ConfigException raised while building rows is logged;
 * later ones in the same call are suppressed to avoid flooding the log.
 *
 * @param rowdata
 *            extraction result whose matches are written
 */
@Override
public void writeGeocodingResult(ExtractionResult rowdata) {
    boolean error = false;
    log.debug("Adding data for File {} Count={}", rowdata.recordFile, rowdata.matches.size());
    for (TextMatch g : rowdata.matches) {
        if (filterOut(g)) {
            continue;
        }
        // Increment ID
        id++;
        // Only TextMatches that implement the Geocoding interface are
        // allowed here:
        Geocoding geocoding = getGeocoding(g);
        if (geocoding == null) {
            log.debug("Non-geo will be ignored: {}", g);
            continue;
        }
        log.debug("Add {}#{}", id, g);
        try {
            for (Feature row : gisDataModel.buildRows(id, geocoding, g, rowdata.attributes, rowdata)) {
                log.debug("FEATURE: {}", row);
                this.os.write(row);
            }
        } catch (ConfigException fieldErr) {
            if (!error) {
                // Pass the throwable as well so the one report that is
                // emitted carries the stack trace, not just toString().
                log.error("OUTPUTTER, ERR=" + fieldErr, fieldErr);
            }
            error = true;
        }
    }
}
use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.
the class TestPlaceGeocoder method tagFile.
/**
 * Reads the given file as UTF-8 text, runs the geocoder over it and prints
 * a summary of the findings. Call as many times as you have documents.
 *
 * Any exception raised while extracting or summarizing is printed as a
 * stack trace and not rethrown.
 *
 * @param f
 *            file to read
 * @param langid
 *            language ID assigned to the text input
 * @throws IOException
 *             declared for reading the file, which happens outside the
 *             try block
 */
public void tagFile(File f, String langid) throws IOException {
    String content = FileUtility.readFile(f, "UTF-8");
    TextInput input = new TextInput("test", content);
    input.langid = langid;

    try {
        summarizeFindings(geocoder.extract(input));
    } catch (Exception procErr) {
        procErr.printStackTrace();
    }
}
Aggregations