Search in sources :

Example 6 with TextInput

use of org.opensextant.data.TextInput in project Xponents by OpenSextant.

the class AbstractMapper method prepareInput.

/**
 * A common method for converting a Text object into an Xponents TextInput tuple.
 * The assumptions for this demonstration method are:
 * <ul>
 * <li>input is JSON data and can be parsed as such</li>
 * <li>JSON data contains a top level "text" field, which will be used for extraction.</li>
 * <li>record ID is the result of key.toString(), or if key is null, then the JSON "id" field</li>
 * </ul>
 * Caller can optionally set Language ID of text on the returned object.
 *
 * @param key record ID, optionally null.
 * @param textRecord a JSON formatted object.
 * @return TextInput pair, or null when the record has no "text" field, or when
 *         key is null and the record has no "id" field to fall back on.
 */
protected static TextInput prepareInput(final Object key, final Text textRecord) {
    JSONObject obj = JSONObject.fromObject(textRecord.toString());
    if (!obj.containsKey("text")) {
        return null;
    }
    final String textId;
    if (key != null) {
        textId = key.toString();
    } else if (obj.containsKey("id")) {
        textId = obj.getString("id");
    } else {
        // No usable record ID. Asking json-lib for an absent key raises an exception
        // rather than returning null, so report "unusable record" the same way as a
        // missing "text" field: by returning null.
        return null;
    }
    return new TextInput(textId, obj.getString("text"));
}
Also used : JSONObject(net.sf.json.JSONObject) TextInput(org.opensextant.data.TextInput)

Example 7 with TextInput

use of org.opensextant.data.TextInput in project Xponents by OpenSextant.

the class GeoTaggerMapper method map.

/**
 * Geotags one JSON text record and writes each distinct, unfiltered match to the
 * job output as a JSON string keyed by the record ID.
 * <p>
 * The record ID is taken from the JSON itself, because {@code null} is passed as the
 * key to {@code prepareInput}. Records that cannot be converted, or that yield no
 * matches, produce no output. Failures during extraction or output are logged and
 * the record is skipped.
 *
 * @param key Hadoop input key; only used in trace logging.
 * @param textRecord a JSON formatted record.
 * @param context job context that receives (record ID, match JSON) pairs.
 * @throws IOException declared by the overridden method.
 * @throws InterruptedException declared by the overridden method.
 */
@Override
public void map(BytesWritable key, Text textRecord, Context context) throws IOException, InterruptedException {
    ++counter;
    TextInput textObj = null;
    try {
        textObj = prepareInput(null, textRecord);
    } catch (java.lang.NullPointerException npe) {
        // Log at most the first 50 characters. Bound the cut by the actual length and
        // tolerate a null record, so the error handler cannot itself throw.
        String raw = String.valueOf(textRecord);
        log.error("Failed on record {}", raw.substring(0, Math.min(50, raw.length())));
    }
    if (textObj == null) {
        return;
    }
    /* LANG ID = 'ENGLISH',
     * If this is not true, then you need to add LangID to your metadata or detect it live
     */
    textObj.langid = "en";
    HashSet<String> dedup = new HashSet<>();
    // NOTE(review): the catch-all below also swallows exceptions raised by context.write().
    try {
        List<TextMatch> matches = geocoder.extract(textObj);
        if (matches.isEmpty()) {
            return;
        }
        Text oid = new Text(textObj.id);
        /* NORMALIZE findings.
         * Reduce all matches, minimizing duplicates, removing whitespace, etc.
         */
        int filtered = 0, duplicates = 0;
        for (TextMatch tm : matches) {
            /* DEDUPLICATE */
            if (dedup.contains(tm.getText())) {
                duplicates += 1;
                continue;
            }
            /* FILTER OUT NOISE */
            if (filterOutMatch(tm)) {
                // Count the rejection so the trace summary below reports a real number.
                filtered += 1;
                continue;
            }
            /* FORMAT */
            JSONObject o = match2JSON(tm);
            dedup.add(tm.getText());
            Text matchOutput = new Text(o.toString());
            /* SERIALIZE GEOCODING */
            context.write(oid, matchOutput);
        }
        if (log.isTraceEnabled()) {
            // Pass the key itself; the logger formats it and copes with a null key.
            log.trace("For key {}, found={}, junk filtered={}, duplicates={}", key, matches.size(), filtered, duplicates);
        }
    } catch (Exception err) {
        log.error("Error running geotagger", err);
    }
}
Also used : JSONObject(net.sf.json.JSONObject) Text(org.apache.hadoop.io.Text) TextMatch(org.opensextant.extraction.TextMatch) TextInput(org.opensextant.data.TextInput) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException) HashSet(java.util.HashSet)

Example 8 with TextInput

use of org.opensextant.data.TextInput in project Xponents by OpenSextant.

the class KeywordTaggerMapper method map.

/**
 * Runs the keyword (XTax) tagger over one JSON text record and writes each distinct,
 * unfiltered match to the job output as a JSON string keyed by the record ID.
 * <p>
 * The record ID is taken from the JSON itself, because {@code null} is passed as the
 * key to {@code prepareInput}. Records that cannot be converted, or that yield no
 * matches, produce no output. Failures during extraction or output are logged and
 * the record is skipped.
 *
 * @param key Hadoop input key; only used in trace logging.
 * @param textRecord a JSON formatted record.
 * @param context job context that receives (record ID, match JSON) pairs.
 * @throws IOException declared by the overridden method.
 * @throws InterruptedException declared by the overridden method.
 */
@Override
public void map(BytesWritable key, Text textRecord, Context context) throws IOException, InterruptedException {
    ++counter;
    TextInput textObj = null;
    try {
        textObj = prepareInput(null, textRecord);
    } catch (java.lang.NullPointerException npe) {
        // Log at most the first 50 characters. Bound the cut by the actual length and
        // tolerate a null record, so the error handler cannot itself throw.
        String raw = String.valueOf(textRecord);
        log.error("Failed on record {}", raw.substring(0, Math.min(50, raw.length())));
    }
    if (textObj == null) {
        return;
    }
    /* LANG ID = 'ENGLISH',
     * If this is not true, then you need to add LangID to your metadata or detect it live
     */
    textObj.langid = "en";
    Text oid = new Text(textObj.id);
    HashSet<String> dedup = new HashSet<>();
    // NOTE(review): the catch-all below also swallows exceptions raised by context.write().
    try {
        /*
         * Testing to see if XTax tagger operates in Hadoop job
         */
        List<TextMatch> matches = xtax.extract(textObj);
        if (matches.isEmpty()) {
            return;
        }
        /* NORMALIZE findings.
         * Reduce all matches, minimizing duplicates, removing whitespace, etc.
         */
        int filtered = 0, duplicates = 0;
        for (TextMatch tm : matches) {
            if (filterCrap(tm.getText())) {
                filtered += 1;
                continue;
            }
            if (dedup.contains(tm.getText())) {
                duplicates += 1;
                continue;
            }
            dedup.add(tm.getText());
            JSONObject o = match2JSON(tm);
            Text matchOutput = new Text(o.toString());
            context.write(oid, matchOutput);
        }
        if (log.isTraceEnabled()) {
            // Pass the key itself; the logger formats it and copes with a null key.
            log.trace("For key {}, found={}, junk filtered={}, duplicates={}", key, matches.size(), filtered, duplicates);
        }
    } catch (Exception err) {
        log.error("Error running xtax", err);
    }
}
Also used : JSONObject(net.sf.json.JSONObject) Text(org.apache.hadoop.io.Text) TextMatch(org.opensextant.extraction.TextMatch) TextInput(org.opensextant.data.TextInput) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException) HashSet(java.util.HashSet)

Example 9 with TextInput

use of org.opensextant.data.TextInput in project Xponents by OpenSextant.

the class TestPlaceGeocoder method main.

/**
 * Command-line entry point; see TestGazMatcher documentation.
 * <p>
 * Dispatch is by argument count:
 * <ul>
 * <li>1 argument: tag the file named by {@code args[0]}</li>
 * <li>2 arguments: tag the literal text {@code args[1]} using language ID {@code args[0]}</li>
 * <li>3 arguments: tag the file named by {@code args[2]}, passing {@code args[0]} along</li>
 * <li>otherwise: run the built-in evaluation</li>
 * </ul>
 * Any failure is printed to standard error. Once the tester has been constructed,
 * cleanup runs and the JVM exits with status 0.
 *
 * @param args command-line arguments as described above
 */
public static void main(String[] args) {
    try {
        TestPlaceGeocoder tester = new TestPlaceGeocoder();
        try {
            switch (args.length) {
            case 1:
                tester.tagFile(new File(args[0]));
                break;
            case 2: {
                TextInput input = new TextInput("test", args[1]);
                input.langid = args[0];
                tester.tagText(input);
                break;
            }
            case 3:
                tester.tagFile(new File(args[2]), args[0]);
                break;
            default:
                tester.tagEvaluation();
                break;
            }
        } catch (Exception taggingErr) {
            taggingErr.printStackTrace();
        }
        tester.cleanup();
        System.exit(0);
    } catch (Exception setupErr) {
        setupErr.printStackTrace();
    }
}
Also used : TextInput(org.opensextant.data.TextInput) File(java.io.File) ExtractionException(org.opensextant.extraction.ExtractionException) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException)

Example 10 with TextInput

use of org.opensextant.data.TextInput in project Xponents by OpenSextant.

the class TestPlaceGeocoderLanguages method tagEvaluation.

/**
 * Tags a few fixed Arabic and CJK sample texts twice each: first with the matching
 * language ID set, then with no language ID (generic tagging), printing a summary of
 * the findings for each pass so the two can be compared.
 * <p>
 * Language-specific parsing will involve more testing...
 * For now, just making it available is enough.
 * <p>
 * Any exception raised while tagging is printed and stops the remaining samples.
 *
 * @throws IOException declared for callers; errors raised here are caught and printed.
 */
public void tagEvaluation() throws IOException {
    String[] arabicSamples = { "Mixed language text UAE place مدرسة الشيخة لطيفة بنت حمدان", "Hosp here عيادة بدر" };
    String[] cjkSamples = { // google search, yields baike.com
    "冀州市冀州镇刘家埝小学-河北省衡水冀州冀州市冀州镇刘家埝小学.", "Gazetteer entry in JPN スナモリ" };
    // Parallel arrays: sampleLangs[n] is the language ID applied to sampleSets[n].
    String[][] sampleSets = { arabicSamples, cjkSamples };
    String[] sampleLangs = { TextUtils.arabicLang, TextUtils.chineseLang };
    try {
        for (int set = 0; set < sampleSets.length; ++set) {
            for (String sample : sampleSets[set]) {
                print("TEST:\t" + sample + "\n=====================");
                TextInput input = new TextInput("test", sample);
                // First pass: language-specific tagging.
                input.langid = sampleLangs[set];
                List<TextMatch> found = geocoder.extract(input);
                summarizeFindings(found);
                print("\t\t\t Compare to Generic tagging:\n================");
                // Second pass: same text with no language hint.
                input.langid = null;
                found = geocoder.extract(input);
                summarizeFindings(found);
                print("\n");
            }
        }
    } catch (Exception procErr) {
        procErr.printStackTrace();
    }
}
Also used : TextMatch(org.opensextant.extraction.TextMatch) TextInput(org.opensextant.data.TextInput) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException)

Aggregations

TextInput (org.opensextant.data.TextInput)11 IOException (java.io.IOException)8 ConfigException (org.opensextant.ConfigException)8 TextMatch (org.opensextant.extraction.TextMatch)4 JSONObject (net.sf.json.JSONObject)3 ExtractionException (org.opensextant.extraction.ExtractionException)3 ParseException (java.text.ParseException)2 HashSet (java.util.HashSet)2 Text (org.apache.hadoop.io.Text)2 ExtractionResult (org.opensextant.extraction.ExtractionResult)2 ProcessingException (org.opensextant.processing.ProcessingException)2 File (java.io.File)1 SolrServerException (org.apache.solr.client.solrj.SolrServerException)1 JSONObject (org.json.JSONObject)1 Form (org.restlet.data.Form)1 Get (org.restlet.resource.Get)1 Post (org.restlet.resource.Post)1