use of org.opensextant.data.TextInput in project Xponents by OpenSextant.
the class AbstractMapper method prepareInput.
/**
 * Builds an Xponents {@link TextInput} (ID plus text) from a JSON-formatted {@code Text} record.
 * <p>
 * This demonstration helper assumes that:
 * <ul>
 * <li>the record can be parsed as a JSON object;</li>
 * <li>the text to be processed is held in the top-level {@code "text"} field;</li>
 * <li>the record ID is {@code key.toString()} when a key is supplied, otherwise the value of the
 * JSON {@code "id"} field.</li>
 * </ul>
 * The language ID is not set here; the caller may assign it on the returned object.
 *
 * @param key record ID, or {@code null} to take the ID from the JSON {@code "id"} field
 * @param textRecord the JSON-formatted record
 * @return the ID/text pair, or {@code null} when the record has no {@code "text"} field
 */
protected static TextInput prepareInput(final Object key, final Text textRecord) {
    final JSONObject json = JSONObject.fromObject(textRecord.toString());
    if (json.containsKey("text")) {
        /* An explicit key wins; the JSON "id" field is only consulted when no key is given. */
        final String recordId = (key != null) ? key.toString() : json.getString("id");
        final String body = json.getString("text");
        return new TextInput(recordId, body);
    }
    /* Nothing to extract from a record without a "text" field. */
    return null;
}
use of org.opensextant.data.TextInput in project Xponents by OpenSextant.
the class GeoTaggerMapper method map.
/**
 * Geotags one JSON input record and writes every distinct, unfiltered match to the job output.
 * <p>
 * The record is converted with {@code prepareInput(null, textRecord)}; no key is handed over, so
 * the output key is the ID carried by the prepared input. Records that cannot be prepared, or
 * that yield no matches, produce no output.
 *
 * @param key input key; only reported in trace logging
 * @param textRecord JSON-formatted record holding the text to tag
 * @param context job context that receives (record ID, match JSON) pairs
 * @throws IOException declared for the overridden signature; failures while tagging or writing
 *         are caught and logged here
 * @throws InterruptedException declared for the overridden signature
 */
@Override
public void map(BytesWritable key, Text textRecord, Context context) throws IOException, InterruptedException {
    ++counter;
    TextInput textObj = null;
    try {
        textObj = prepareInput(null, textRecord);
    } catch (java.lang.NullPointerException npe) {
        /* Log at most the first 50 characters; a short (or null) record must not make the
         * error handler itself throw.
         */
        String raw = String.valueOf(textRecord);
        log.error("Failed on record {}", raw.substring(0, Math.min(50, raw.length())), npe);
    }
    if (textObj == null) {
        return;
    }
    /* LANG ID = 'ENGLISH',
     * If this is not true, then you need to add LangID to your metadata or detect it live
     */
    textObj.langid = "en";
    HashSet<String> dedup = new HashSet<>();
    try {
        List<TextMatch> matches = geocoder.extract(textObj);
        if (matches.isEmpty()) {
            return;
        }
        Text oid = new Text(textObj.id);
        /* NORMALIZE findings.
         * Reduce all matches, minimizing duplicates, removing whitespace, etc.
         */
        int filtered = 0, duplicates = 0;
        for (TextMatch tm : matches) {
            String matchText = tm.getText();
            /* DEDUPLICATE */
            if (dedup.contains(matchText)) {
                duplicates += 1;
                continue;
            }
            /* FILTER OUT NOISE */
            if (filterOutMatch(tm)) {
                /* Count the rejection so the trace summary below reports a real number. */
                filtered += 1;
                continue;
            }
            /* FORMAT */
            JSONObject o = match2JSON(tm);
            dedup.add(matchText);
            Text matchOutput = new Text(o.toString());
            /* SERIALIZE GEOCODING */
            context.write(oid, matchOutput);
        }
        if (log.isTraceEnabled()) {
            /* The logger renders the key itself, which also tolerates a null key. */
            log.trace("For key {}, found={}, junk filtered={}, duplicates={}", key, matches.size(), filtered, duplicates);
        }
    } catch (Exception err) {
        log.error("Error running geotagger", err);
    }
}
use of org.opensextant.data.TextInput in project Xponents by OpenSextant.
the class KeywordTaggerMapper method map.
/**
 * Runs the keyword (XTax) tagger on one JSON input record and writes every distinct, unfiltered
 * match to the job output.
 * <p>
 * The record is converted with {@code prepareInput(null, textRecord)}; no key is handed over, so
 * the output key is the ID carried by the prepared input. Records that cannot be prepared, or
 * that yield no matches, produce no output.
 *
 * @param key input key; only reported in trace logging
 * @param textRecord JSON-formatted record holding the text to tag
 * @param context job context that receives (record ID, match JSON) pairs
 * @throws IOException declared for the overridden signature; failures while tagging or writing
 *         are caught and logged here
 * @throws InterruptedException declared for the overridden signature
 */
@Override
public void map(BytesWritable key, Text textRecord, Context context) throws IOException, InterruptedException {
    ++counter;
    TextInput textObj = null;
    try {
        textObj = prepareInput(null, textRecord);
    } catch (java.lang.NullPointerException npe) {
        /* Log at most the first 50 characters; a short (or null) record must not make the
         * error handler itself throw.
         */
        String raw = String.valueOf(textRecord);
        log.error("Failed on record {}", raw.substring(0, Math.min(50, raw.length())), npe);
    }
    if (textObj == null) {
        return;
    }
    /* LANG ID = 'ENGLISH',
     * If this is not true, then you need to add LangID to your metadata or detect it live
     */
    textObj.langid = "en";
    Text oid = new Text(textObj.id);
    HashSet<String> dedup = new HashSet<>();
    try {
        /*
         * Testing to see if XTax tagger operates in Hadoop job
         */
        List<TextMatch> matches = xtax.extract(textObj);
        if (matches.isEmpty()) {
            return;
        }
        /* NORMALIZE findings.
         * Reduce all matches, minimizing duplicates, removing whitespace, etc.
         */
        int filtered = 0, duplicates = 0;
        for (TextMatch tm : matches) {
            String matchText = tm.getText();
            if (filterCrap(matchText)) {
                filtered += 1;
                continue;
            }
            /* add() reports false when the text was already emitted for this record. */
            if (!dedup.add(matchText)) {
                duplicates += 1;
                continue;
            }
            JSONObject o = match2JSON(tm);
            Text matchOutput = new Text(o.toString());
            context.write(oid, matchOutput);
        }
        if (log.isTraceEnabled()) {
            /* The logger renders the key itself, which also tolerates a null key. */
            log.trace("For key {}, found={}, junk filtered={}, duplicates={}", key, matches.size(), filtered, duplicates);
        }
    } catch (Exception err) {
        log.error("Error running xtax", err);
    }
}
use of org.opensextant.data.TextInput in project Xponents by OpenSextant.
the class TestPlaceGeocoder method main.
/**
 * Command-line entry point; see TestGazMatcher documentation.
 * <p>
 * The number of arguments selects what is run:
 * <ul>
 * <li>1 argument: tag the file named by {@code args[0]};</li>
 * <li>2 arguments: tag the text {@code args[1]} with language ID {@code args[0]};</li>
 * <li>3 arguments: tag the file named by {@code args[2]}, passing {@code args[0]} along;</li>
 * <li>any other count: run the built-in evaluation.</li>
 * </ul>
 * Errors are printed to the console; once the tester has been created, cleanup runs and the
 * JVM exits with status 0.
 *
 * @param args command-line arguments as described above
 */
public static void main(String[] args) {
    try {
        final TestPlaceGeocoder tester = new TestPlaceGeocoder();
        try {
            switch (args.length) {
                case 1: {
                    tester.tagFile(new File(args[0]));
                    break;
                }
                case 2: {
                    final TextInput input = new TextInput("test", args[1]);
                    input.langid = args[0];
                    tester.tagText(input);
                    break;
                }
                case 3: {
                    tester.tagFile(new File(args[2]), args[0]);
                    break;
                }
                default: {
                    tester.tagEvaluation();
                    break;
                }
            }
        } catch (Exception runError) {
            /* A failed run is reported but must not prevent cleanup below. */
            runError.printStackTrace();
        }
        tester.cleanup();
        System.exit(0);
    } catch (Exception setupError) {
        setupError.printStackTrace();
    }
}
use of org.opensextant.data.TextInput in project Xponents by OpenSextant.
the class TestPlaceGeocoderLanguages method tagEvaluation.
/**
 * Tags a few Arabic and CJK sample texts, first with an explicit language ID and then again
 * with no language ID, printing the findings of both passes for comparison.
 * <p>
 * Language-specific parsing will involve more testing; for now, just making it available is
 * enough. Any failure is printed to the console and ends the evaluation.
 *
 * @throws IOException declared in the signature; errors raised during tagging are caught here
 */
public void tagEvaluation() throws IOException {
    final String[] arabicSamples = { "Mixed language text UAE place مدرسة الشيخة لطيفة بنت حمدان", "Hosp here عيادة بدر" };
    final String[] cjkSamples = { // google search, yields baike.com
        "冀州市冀州镇刘家埝小学-河北省衡水冀州冀州市冀州镇刘家埝小学.", "Gazetteer entry in JPN スナモリ" };
    try {
        compareWithGenericTagging(arabicSamples, TextUtils.arabicLang);
        compareWithGenericTagging(cjkSamples, TextUtils.chineseLang);
    } catch (Exception procErr) {
        procErr.printStackTrace();
    }
}

/**
 * Tags each sample with the given language ID, then once more with the language ID cleared,
 * and prints a summary of both results.
 *
 * @param samples texts to tag
 * @param langId language ID applied for the first, language-specific pass
 * @throws Exception whatever the geocoder raises while extracting
 */
private void compareWithGenericTagging(String[] samples, String langId) throws Exception {
    for (String sample : samples) {
        print("TEST:\t" + sample + "\n=====================");
        TextInput input = new TextInput("test", sample);

        /* Language-specific pass. */
        input.langid = langId;
        List<TextMatch> found = geocoder.extract(input);
        summarizeFindings(found);

        /* Generic pass: same text, no language hint. */
        print("\t\t\t Compare to Generic tagging:\n================");
        input.langid = null;
        found = geocoder.extract(input);
        summarizeFindings(found);
        print("\n");
    }
}
Aggregations