Search in sources :

Example 21 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class KeywordTaggerMapper method map.

/**
 * Tags one input record with the XTax tagger and emits each distinct,
 * non-junk match as a JSON string keyed by the record's id.
 *
 * <p>Records that cannot be converted to a {@code TextInput} are logged and skipped.
 * Matches rejected by {@code filterCrap} and matches whose text was already emitted
 * for this record are counted but not written.
 *
 * @param key        the input key; only used in trace logging
 * @param textRecord the raw record handed to {@code prepareInput}
 * @param context    the job context that receives (record id, match JSON) pairs
 * @throws IOException          declared by the mapper contract
 * @throws InterruptedException declared by the mapper contract
 */
@Override
public void map(BytesWritable key, Text textRecord, Context context) throws IOException, InterruptedException {
    ++counter;
    TextInput textObj = null;
    try {
        textObj = prepareInput(null, textRecord);
    } catch (java.lang.NullPointerException npe) {
        // Log at most the first 50 characters of the record. The record may be shorter
        // than that (or null), so the preview must be bounds-checked to avoid throwing
        // a second exception from inside the handler.
        String raw = String.valueOf(textRecord);
        String preview = raw.length() > 50 ? raw.substring(0, 50) : raw;
        log.error("Failed on record {}", preview, npe);
    }
    if (textObj == null) {
        return;
    }
    /* LANG ID = 'ENGLISH',
     * If this is not true, then you need to add LangID to your metadata or detect it live
     */
    textObj.langid = "en";
    Text oid = new Text(textObj.id);
    HashSet<String> dedup = new HashSet<>();
    try {
        /*
         * Testing to see if XTax tagger operates in Hadoop job
         */
        List<TextMatch> matches = xtax.extract(textObj);
        if (matches.isEmpty()) {
            return;
        }
        /* NORMALIZE findings.
         * Reduce all matches, minimizing duplicates, removing whitespace, etc.
         */
        int filtered = 0, duplicates = 0;
        for (TextMatch tm : matches) {
            String matchText = tm.getText();
            if (filterCrap(matchText)) {
                filtered += 1;
                continue;
            }
            // add() returns false when the text was already seen for this record.
            if (!dedup.add(matchText)) {
                duplicates += 1;
                continue;
            }
            JSONObject o = match2JSON(tm);
            Text matchOutput = new Text(o.toString());
            context.write(oid, matchOutput);
        }
        if (log.isTraceEnabled()) {
            log.trace("For key {}, found={}, junk filtered={}, duplicates={}", key.toString(), matches.size(), filtered, duplicates);
        }
    } catch (Exception err) {
        // NOTE(review): this also swallows IOException/InterruptedException raised by
        // context.write(); kept as-is because the job appears to be deliberately best-effort.
        log.error("Error running xtax", err);
    }
}
Also used : JSONObject(net.sf.json.JSONObject) Text(org.apache.hadoop.io.Text) TextMatch(org.opensextant.extraction.TextMatch) TextInput(org.opensextant.data.TextInput) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException) HashSet(java.util.HashSet)

Example 22 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class TestPlaceGeocoderLanguages method tagEvaluation.

/**
 * Runs the geocoder over a few Arabic and CJK sample texts, first with the
 * language id set and then with no language id, printing the findings of each
 * pass so the two can be compared by eye.
 *
 * <p>Any exception raised while tagging is caught and its stack trace printed.
 *
 * @throws IOException declared by the signature
 */
public void tagEvaluation() throws IOException {
    final String[] arabicSamples = { "Mixed language text UAE place مدرسة الشيخة لطيفة بنت حمدان", "Hosp here عيادة بدر" };
    final String[] cjkSamples = { // google search, yields baike.com
    "冀州市冀州镇刘家埝小学-河北省衡水冀州冀州市冀州镇刘家埝小学.", "Gazetteer entry in JPN スナモリ" };
    try {
        // Sample sets and their language ids are paired by index; Arabic runs first.
        final String[][] sampleSets = { arabicSamples, cjkSamples };
        final String[] languageIds = { TextUtils.arabicLang, TextUtils.chineseLang };
        for (int set = 0; set < sampleSets.length; ++set) {
            for (String sample : sampleSets[set]) {
                print("TEST:\t" + sample + "\n=====================");
                TextInput input = new TextInput("test", sample);

                // Pass 1: language-specific tagging.
                input.langid = languageIds[set];
                List<TextMatch> tagged = geocoder.extract(input);
                summarizeFindings(tagged);

                // Pass 2: same text with the language id cleared.
                print("\t\t\t Compare to Generic tagging:\n================");
                input.langid = null;
                tagged = geocoder.extract(input);
                summarizeFindings(tagged);
                print("\n");
            }
        }
    } catch (Exception procErr) {
        procErr.printStackTrace();
    }
}
Also used : TextMatch(org.opensextant.extraction.TextMatch) TextInput(org.opensextant.data.TextInput) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException)

Example 23 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class TestXTax method main.

/**
 * Ad hoc driver for the taxon matcher: runs a fixed catalog search and, if a
 * file path is given as the first argument, tags that file's content and prints
 * every finding with its taxons.
 *
 * <p>Failures are printed to stderr rather than propagated. The process always
 * ends via {@code System.exit(0)}.
 *
 * @param args
 *            optional; {@code args[0]} is the path of a UTF-8 text file to tag
 */
public static void main(String[] args) {
    TaxonMatcher tax = null;
    try {
        tax = new TaxonMatcher();
        // Find JRC entities that have this random id pattern ID='1*1' and are in Russian form.
        //
        List<Taxon> results = tax.search("tag:jrc_id+1*1 AND tag:lang_id+ru");
        for (Taxon tx : results) {
            System.out.println("Found: " + getJRCTag(tx.tagset) + " = " + tx);
        }
        if (args.length > 0) {
            File f = new File(args[0]);
            String content = FileUtility.readFile(f, "UTF-8");
            List<TextMatch> findings = tax.extract(content);
            for (TextMatch tm : findings) {
                String type = "" + ((TaxonMatch) tm).getTaxons();
                System.out.println("Found: " + tm + "\n\t\t" + type);
            }
        }
    } catch (Exception err) {
        err.printStackTrace();
    } finally {
        // The constructor may have thrown, leaving tax null. Without this guard the
        // cleanup itself would raise a NullPointerException and skip System.exit.
        if (tax != null) {
            tax.shutdown();
        }
        System.exit(0);
    }
}
Also used : Taxon(org.opensextant.data.Taxon) TaxonMatcher(org.opensextant.extractors.xtax.TaxonMatcher) TextMatch(org.opensextant.extraction.TextMatch) File(java.io.File)

Example 24 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class TestXCoord method focusedTests.

/**
 * Use for limited development testing.
 *
 * <p>Enables or disables each coordinate pattern family on {@code xcoord}, gathers
 * the sample strings of the enabled families, runs every sample through
 * {@code extract_coordinates} wrapped in filler text, and logs what was found.
 */
protected void focusedTests() {
    log.info("=== ADHOC TESTS ===");
    log.info("Trying some specific DD tests now:\n=========================");
    xcoord.match_DD(true);
    xcoord.match_DMS(true);
    xcoord.match_DM(true);
    xcoord.match_MGRS(false);
    xcoord.match_UTM(false);
    TextMatchResult results = null;
    // = xcoord.extract_coordinates("text before " + "17S 699999 3335554" + " and after", "UTM");
    // Switches deciding which sample sets are run below.
    boolean dd = true;
    boolean dms = true;
    boolean dm = true;
    boolean mgrs = false;
    boolean utm = false;
    //
    xcoord.match_MGRS(mgrs);
    String[] mgrstest = { "1 FEB 2013", "12 GMT 18", "12 ctf 4000", "04\nSMB800999", "12\nDTF\r7070", "12\rDTF\r7070", "12\n\rDTF\r7070", "7MAR13 1600", "17MAR13 1600", "17MAR13 2014", "17MAY13 2014", "17JUN13 2014", "17JUL13 2014", "17SEP13 2014", "17OCT13 2014", "17NOV13 2014", "17DEC13 2014", "17APR13 2014", "17AUG13 2014", "17JAN13 2014", "7JAN13 2001", "17 JAN 13 2014", "7 JAN 13 2001", // Fail -- too much whitespace.
    "04RAA80099\n\t1", // edge case, bare minimum.
    "12FTF82711", // edge case, bare minimum.
    "15 EST 2008", // edge case, bare minimum.
    "14 MRE\n\n 1445", // edge case, bare minimum.
    "4 jul 2008", // edge case, bare minimum.
    "10 Jan 1994", // edge case, bare minimum.
    "10 Jan 13", // no, this is the real bare minimum.
    "10 Jan 94", // 0-padded Northing/Easting?  7 4 or 0007 0004
    "38SMB 461136560", // 0-padded Northing/Easting?  7 4 or 0007 0004
    "38SMB 461103656", // 0-padded Northing/Easting?  7 4 or 0007 0004
    "38SMB 46110 3656", // 0-padded Northing/Easting?  7 4 or 0007 0004
    "38SMB 4611 03656", // even, but whitespace
    "38SMB 46110365 60", // even, but whitespace
    "38SMB 46110365\n60", // odd, and whitespace
    "38SMB 4611035\n60", // MGRS 01, 10JAN 200502
    "38 SMB 4611 3656", // MGRS 01, 10JAN 200502
    "42 RPR 4611 3656", // MGRS 01, 10JAN 200502
    "10 Jan 2005 02", "10 Jan 1995 02" };
    xcoord.match_DD(dd);
    String[] ddtest = { "N 49°2' 0'' / E 38°22' 0''", "1.718114°  44.699603°", "N34.445566° W078.112233°", "00 N 130 WA", "xxxxxxxxxxxxx-385331-17004121.1466dc9989b3545553c65ef91c14c0f3yyyyyyyyyyyyyyyyyyy", "-385331-17004121", "CAN-385331-17004121", "15S5E", //DD04
    "TARGET [1]  LATITUDE: +32.3345  LONGITUDE: -179.3412", //DD04
    "TARGET [1]  LATITUDE= +32.3345  LONGITUDE= -179.3412", "42.3N; 102.4W", "42.3 N; 102.4 W", "23.34N 88.22E", // DD01
    "N32.3345:W179.3412", // DD03
    "+32.3345:-179.3412", // DD03
    " 32.3345:-179.3412", // DD03
    " 32.3345°;-179.3412°", // DD03  leading 0 on lat;
    "032.3345°;-179.3412°", // DD01
    "N32.3345:W179.3412", // DD03  leading 0 on lat;
    "032.3345°N;-179.3412°W", // DD01
    "N32.3345:E179.3412", // DD02
    "32.3345N/179.3412E", // DD02
    "32.33N 179.34E" };
    xcoord.match_DMS(dms);
    xcoord.match_DM(dm);
    String[] dmtest = { "N 49°2' 0'' / E 38°22' 0''", "xxxxxxxxxxxxx-385331-17004121.1466dc9989b3545553c65ef91c14c0f3yyyyyyyyyyyyyyyyyyy", "-385331-17004121", "41º58'46\"N, 87º54'20\"W ", "Latitude: 41º58'46\"N, Longitude: 87º54'20\"W ", "15S5E", //"01-02-03-04 005-06-07-08",           
    " 79.22.333N, 100.22.333W", " N 01° 44' E 101° 22'", "+42 18.0 x -102 24.0", "42 DEG 18.0N 102 DEG 24.0W", "#TEST   DM      01b      01DEG 44 N 101DEG 44 E", "03bv  4218N 10224W", "03bv      42°18'N 102°24'W", "03bv      42° 18'N 102° 24'W", "N 01° 44' E 101° 22'", "1122N-00 11122W-00", "01DEG 44N 101DEG 44E", "42 9-00 N 102 6-00W", "N42 18-00 x W102 24-00", "N01° 44' 55.5\" E101° 22' 33.0\"", "N 01° 44' 55\" E 101° 22'33.0\"", "33-04-05 12:11:10", "31°24' 70°21'", // No HEMI
    "40°55'23.2\" 9°43'51.1\"", // with HEMI
    "-40°55'23.2\" +9°43'51.1\"", "42 9-00 N 102 6-00W;           ", "42 18-009 N 102 24-009W;        ", // No HEMI
    "08°29.067' 13°14.067'", "08°29.067'N 13°14.067'W", "08°29.067'N 113°14.067'W", "40°55'23.2\"N 9°43'51\"E", "42° 18' 00\" 102° 24' 00", "(42° 18' 00\" 102° 24' 00", "01° 44' 55.5\" 101° 22' 33.0\"", "77°55'33.22\"N 127°33'22.11\"W", "40:26:46.123N,79:56:55.000W", "43-04-30.2720N 073-34-58.4170W", "31 53 45.55N 54 16 38.99E", "42.18.009N x 102.24.003W", "42.18.009N 102.24.003W", "42.18.009 N x 102.24.003 W", "014455666N1012233444E", "N7922333W10022333", "01°44'55.5\"N 101°22'33.0\"E;", "N01°44'55.5\" E101°22'33.0\"", "4025131234N 12015191234W", // original
    "5113N 00425E", // original
    "27° 37' 45’’N, 82° 42' 10’’W", // single second hash sym
    "27° 37' 45’N, 82° 42' 10’W", // no lat/lon sep
    "27° 37' 45’’N 82° 42' 10’’W", // no min hash.
    "27° 37 45N, 82° 42 10W" };
    String[] utm_tests = { "12\n\t\nX\t\n245070175", "12\n\nX\n266070175", "12 X 266070175", "12X 266070 175" };
    xcoord.match_UTM(utm);
    int count = 0;
    // Only the sample sets whose pattern family is switched on are run.
    List<String> tests = new ArrayList<String>();
    if (utm) {
        tests.addAll(Arrays.asList(utm_tests));
    }
    if (dd) {
        tests.addAll(Arrays.asList(ddtest));
    }
    if (dms || dm) {
        tests.addAll(Arrays.asList(dmtest));
    }
    if (mgrs) {
        tests.addAll(Arrays.asList(mgrstest));
    }
    for (String testcase : tests) {
        ++count;
        String test_id = "" + count;
        results = xcoord.extract_coordinates("text before " + testcase + " and after", test_id);
        // Decide "nothing found" once, BEFORE touching results.matches, so a missing
        // match list is reported as NOTHING instead of raising a NullPointerException.
        boolean nothingFound = results == null || results.matches == null || results.matches.isEmpty();
        log.info("TEST (" + count + ") " + testcase + " FOUND:" + (nothingFound ? "NOTHING" : results.matches.size()));
        if (nothingFound) {
            continue;
        }
        for (TextMatch m : results.matches) {
            log.info("\t" + m.toString());
            GeocoordMatch g = (GeocoordMatch) m;
            log.info("\t" + g.formatLatitude() + ", " + g.formatLongitude());
        }
    }
    log.info("=== ADHOC TESTS DONE ===");
}
Also used : GeocoordMatch(org.opensextant.extractors.xcoord.GeocoordMatch) TextMatch(org.opensextant.extraction.TextMatch) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult)

Example 25 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class TestXTemporalReporter method save_result.

/**
 * Writes the outcome of one test to the report: a "PASS" row for every match
 * in the result, or a single "FAIL" row carrying the result's trace when
 * there are no matches.
 *
 * @param results the match result to record; every match is cast to {@code DateMatch}
 * @throws IOException declared by the signature; presumably raised by the report writer
 */
public void save_result(TextMatchResult results) throws IOException {
    // No matches: record a single failure row and stop.
    if (results.matches.isEmpty()) {
        Map<String, Object> failRow = new HashMap<String, Object>();
        failRow.put(header[0], results.result_id);
        failRow.put(header[1], "FAIL");
        failRow.put(header[2], results.get_trace());
        report.write(failRow, header, xtempResultsSpec);
        return;
    }

    for (TextMatch found : results.matches) {
        DateMatch date = (DateMatch) found;
        // Sub-matches get a marker appended to the result message.
        String note = date.is_submatch ? results.message + "; Is Submatch" : results.message;

        Map<String, Object> passRow = new HashMap<String, Object>();
        passRow.put(header[0], results.result_id);
        passRow.put(header[1], "PASS");
        passRow.put(header[2], note);
        passRow.put(header[3], date.pattern_id);
        passRow.put(header[4], date.getText());
        passRow.put(header[5], date.datenorm.toString());
        passRow.put(header[6], date.datenorm_text);
        passRow.put(header[7], date.resolution.toString());
        passRow.put(header[8], date.datenorm.getTime());
        passRow.put(header[9], date.start);
        report.write(passRow, header, xtempResultsSpec);
    }
}
Also used : DateMatch(org.opensextant.extractors.xtemporal.DateMatch) TextMatch(org.opensextant.extraction.TextMatch)

Aggregations

TextMatch (org.opensextant.extraction.TextMatch)26 IOException (java.io.IOException)9 ConfigException (org.opensextant.ConfigException)8 TextMatchResult (org.opensextant.extractors.flexpat.TextMatchResult)6 ArrayList (java.util.ArrayList)5 GeocoordMatch (org.opensextant.extractors.xcoord.GeocoordMatch)5 Taxon (org.opensextant.data.Taxon)4 TextInput (org.opensextant.data.TextInput)4 Matcher (java.util.regex.Matcher)3 JSONObject (org.json.JSONObject)3 ExtractionException (org.opensextant.extraction.ExtractionException)3 RegexPattern (org.opensextant.extractors.flexpat.RegexPattern)3 PlaceCandidate (org.opensextant.extractors.geo.PlaceCandidate)3 TaxonMatch (org.opensextant.extractors.xtax.TaxonMatch)3 File (java.io.File)2 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 JSONObject (net.sf.json.JSONObject)2 Text (org.apache.hadoop.io.Text)2 JSONArray (org.json.JSONArray)2