Search in sources :

Example 11 with TextMatchResult

use of org.opensextant.extractors.flexpat.TextMatchResult in project Xponents by OpenSextant.

the class TestXTemporal method systemTests.

/**
 * Runs every test case registered with the pattern manager through date
 * extraction and hands each evaluated result to a
 * {@code TestXTemporalReporter} created for
 * {@code ./results/xtemp_System.csv}.
 * <p>
 * Any exception is logged and ends the run without the closing "DONE"
 * message. Once the reporter has been created it is always closed, even
 * when a test case fails part-way through.
 */
public void systemTests() {
    log.info("=== SYSTEM TESTS START ===");
    // To run selected pattern families only, call xdt.disableAll() first and
    // then enable the wanted ones, e.g. xdt.match_MonDayYear(true) or
    // xdt.match_DateTime(true).
    xdt.enableAll();
    TestXTemporalReporter tester = null;
    boolean finished = false;
    try {
        tester = new TestXTemporalReporter("./results/xtemp_System.csv");
        for (PatternTestCase tst : xdt.getPatternManager().testcases) {
            TextMatchResult results = xdt.extract_dates(tst.text, tst.id);
            results.add_trace("Test Payload: " + tst.text);
            // Results that were never evaluated are not reported.
            if (!results.evaluated) {
                continue;
            }
            log.info("=========SYSTEM TEST " + tst.id + " FOUND:" + (results.matches == null ? "NOTHING" : results.matches.size()));
            tester.save_result(results);
        }
        finished = true;
    } catch (Exception err) {
        log.error("Not finishing tests", err);
    } finally {
        // Close the report on every path so a failing test case does not
        // leave the output open.
        if (tester != null) {
            try {
                tester.close_report();
            } catch (Exception closeErr) {
                finished = false;
                log.error("Not finishing tests", closeErr);
            }
        }
    }
    if (!finished) {
        return;
    }
    log.info("=== SYSTEM TESTS DONE ===");
}
Also used : PatternTestCase(org.opensextant.extractors.flexpat.PatternTestCase) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult) ConfigException(org.opensextant.ConfigException)

Example 12 with TextMatchResult

use of org.opensextant.extractors.flexpat.TextMatchResult in project Xponents by OpenSextant.

the class PatternsOfLife method extract_patterns.

/**
 * Extract patterns of a certain family from a block of text.
 * <p>
 * Every enabled pattern (optionally restricted by {@code family}) is run
 * against the text. Each hit becomes a {@code PoliMatch} carrying the
 * pattern id, family, offsets, normalized value and a window of
 * surrounding text. After all patterns have run, the collected matches are
 * passed to {@code PoliPatternManager.reduce_matches}.
 *
 * @param text
 *            - data to process
 * @param text_id
 *            - identifier for the data; stored as the result id
 * @param family
 *            - optional filter; to reuse the same PatManager but extract
 *            certain patterns only. When non-null, only patterns whose id
 *            starts with this value are applied.
 *
 * @return a TextMatchResult holding the matches; {@code pass} is true when
 *         at least one match was collected, and {@code evaluated} is set
 *         as soon as one pattern has been applied
 */
public TextMatchResult extract_patterns(String text, String text_id, String family) {
    TextMatchResult results = new TextMatchResult();
    results.result_id = text_id;
    results.matches = new ArrayList<TextMatch>();
    int bufsize = text.length();
    int found = 0;
    int patternsComplete = 0;
    // Denominator for progress reporting: one step per pattern plus one
    // extra step, so the reported fraction stays below 1 inside the loop.
    double progressSteps = patterns.get_patterns().size() + 1;
    for (RegexPattern repat : patterns.get_patterns()) {
        if (!repat.enabled) {
            continue;
        }
        if (family != null && !repat.id.startsWith(family)) {
            continue;
        }
        Matcher match = repat.regex.matcher(text);
        results.evaluated = true;
        while (match.find()) {
            ++found;
            Map<String, String> fields = patterns.group_map(repat, match);
            PoliMatch poliMatch = null;
            if (repat.match_class == null) {
                // No specific match class configured: use the generic match.
                poliMatch = new PoliMatch(fields, match.group());
            } else {
                try {
                    poliMatch = (PoliMatch) repat.match_class.newInstance();
                    poliMatch.setText(match.group());
                    poliMatch.setGroups(fields);
                } catch (InstantiationException classErr1) {
                    poliMatch = null;
                    log.error("Could not create... ", classErr1);
                } catch (IllegalAccessException classErr2) {
                    poliMatch = null;
                    log.error("Could not create... ", classErr2);
                }
            }
            if (poliMatch == null) {
                // This would have been thrown at init.
                log.error("Could not find pattern family for " + repat.id);
                continue;
            }
            poliMatch.setType(repat.family);
            poliMatch.pattern_id = repat.id;
            poliMatch.start = match.start();
            poliMatch.end = match.end();
            poliMatch.normalize();
            // TODO: Assess filters?
            // Indices for a window of text around the match, used as context.
            int[] slices = TextUtils.get_text_window(poliMatch.start, bufsize, match_width);
            poliMatch.setContext(TextUtils.delete_eol(text.substring(slices[0], slices[1])));
            set_match_id(poliMatch, found);
            results.matches.add(poliMatch);
        }
        patternsComplete++;
        // The "+ 1" must be part of the denominator. Without the
        // parentheses the cast and division bind first and the reported
        // value is always greater than 1.
        updateProgress(patternsComplete / progressSteps);
    }
    results.pass = !results.matches.isEmpty();
    PoliPatternManager.reduce_matches(results.matches);
    return results;
}
Also used : RegexPattern(org.opensextant.extractors.flexpat.RegexPattern) Matcher(java.util.regex.Matcher) TextMatch(org.opensextant.extraction.TextMatch) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult)

Example 13 with TextMatchResult

use of org.opensextant.extractors.flexpat.TextMatchResult in project Xponents by OpenSextant.

the class TestXCoord method focusedTests.

/**
 * Use for limited development testing.
 * <p>
 * Enables a hand-picked set of coordinate pattern families on
 * {@code xcoord}, embeds each selected sample string in
 * "text before ... and after", runs coordinate extraction on it and logs
 * the matches that were found.
 */
protected void focusedTests() {
    log.info("=== ADHOC TESTS ===");
    log.info("Trying some specific DD tests now:\n=========================");
    // Pattern families to exercise. Each flag both enables the family on the
    // extractor and selects the matching sample set below.
    boolean dd = true;
    boolean dms = true;
    boolean dm = true;
    boolean mgrs = false;
    boolean utm = false;
    xcoord.match_DD(dd);
    xcoord.match_DMS(dms);
    xcoord.match_DM(dm);
    xcoord.match_MGRS(mgrs);
    xcoord.match_UTM(utm);
    // NOTE(review): the trailing comments inside the arrays below are kept
    // where they were found; they appear shifted relative to the values they
    // describe -- confirm against the original test data.
    String[] mgrstest = { "1 FEB 2013", "12 GMT 18", "12 ctf 4000", "04\nSMB800999", "12\nDTF\r7070", "12\rDTF\r7070", "12\n\rDTF\r7070", "7MAR13 1600", "17MAR13 1600", "17MAR13 2014", "17MAY13 2014", "17JUN13 2014", "17JUL13 2014", "17SEP13 2014", "17OCT13 2014", "17NOV13 2014", "17DEC13 2014", "17APR13 2014", "17AUG13 2014", "17JAN13 2014", "7JAN13 2001", "17 JAN 13 2014", "7 JAN 13 2001", // Fail -- too much whitespace.
    "04RAA80099\n\t1", // edge case, bare minimum.
    "12FTF82711", // edge case, bare minimum.
    "15 EST 2008", // edge case, bare minimum.
    "14 MRE\n\n 1445", // edge case, bare minimum.
    "4 jul 2008", // edge case, bare minimum.
    "10 Jan 1994", // edge case, bare minimum.
    "10 Jan 13", // no, this is the real bare minimum.
    "10 Jan 94", // 0-padded Northing/Easting?  7 4 or 0007 0004
    "38SMB 461136560", // 0-padded Northing/Easting?  7 4 or 0007 0004
    "38SMB 461103656", // 0-padded Northing/Easting?  7 4 or 0007 0004
    "38SMB 46110 3656", // 0-padded Northing/Easting?  7 4 or 0007 0004
    "38SMB 4611 03656", // even, but whitespace
    "38SMB 46110365 60", // even, but whitespace
    "38SMB 46110365\n60", // odd, and whitespace
    "38SMB 4611035\n60", // MGRS 01, 10JAN 200502
    "38 SMB 4611 3656", // MGRS 01, 10JAN 200502
    "42 RPR 4611 3656", // MGRS 01, 10JAN 200502
    "10 Jan 2005 02", "10 Jan 1995 02" };
    String[] ddtest = { "N 49°2' 0'' / E 38°22' 0''", "1.718114°  44.699603°", "N34.445566° W078.112233°", "00 N 130 WA", "xxxxxxxxxxxxx-385331-17004121.1466dc9989b3545553c65ef91c14c0f3yyyyyyyyyyyyyyyyyyy", "-385331-17004121", "CAN-385331-17004121", "15S5E", //DD04
    "TARGET [1]  LATITUDE: +32.3345  LONGITUDE: -179.3412", //DD04
    "TARGET [1]  LATITUDE= +32.3345  LONGITUDE= -179.3412", "42.3N; 102.4W", "42.3 N; 102.4 W", "23.34N 88.22E", // DD01
    "N32.3345:W179.3412", // DD03
    "+32.3345:-179.3412", // DD03
    " 32.3345:-179.3412", // DD03
    " 32.3345°;-179.3412°", // DD03  leading 0 on lat;
    "032.3345°;-179.3412°", // DD01
    "N32.3345:W179.3412", // DD03  leading 0 on lat;
    "032.3345°N;-179.3412°W", // DD01
    "N32.3345:E179.3412", // DD02
    "32.3345N/179.3412E", // DD02
    "32.33N 179.34E" };
    String[] dmtest = { "N 49°2' 0'' / E 38°22' 0''", "xxxxxxxxxxxxx-385331-17004121.1466dc9989b3545553c65ef91c14c0f3yyyyyyyyyyyyyyyyyyy", "-385331-17004121", "41º58'46\"N, 87º54'20\"W ", "Latitude: 41º58'46\"N, Longitude: 87º54'20\"W ", "15S5E", //"01-02-03-04 005-06-07-08",
    " 79.22.333N, 100.22.333W", " N 01° 44' E 101° 22'", "+42 18.0 x -102 24.0", "42 DEG 18.0N 102 DEG 24.0W", "#TEST   DM      01b      01DEG 44 N 101DEG 44 E", "03bv  4218N 10224W", "03bv      42°18'N 102°24'W", "03bv      42° 18'N 102° 24'W", "N 01° 44' E 101° 22'", "1122N-00 11122W-00", "01DEG 44N 101DEG 44E", "42 9-00 N 102 6-00W", "N42 18-00 x W102 24-00", "N01° 44' 55.5\" E101° 22' 33.0\"", "N 01° 44' 55\" E 101° 22'33.0\"", "33-04-05 12:11:10", "31°24' 70°21'", // No HEMI
    "40°55'23.2\" 9°43'51.1\"", // with HEMI
    "-40°55'23.2\" +9°43'51.1\"", "42 9-00 N 102 6-00W;           ", "42 18-009 N 102 24-009W;        ", // No HEMI
    "08°29.067' 13°14.067'", "08°29.067'N 13°14.067'W", "08°29.067'N 113°14.067'W", "40°55'23.2\"N 9°43'51\"E", "42° 18' 00\" 102° 24' 00", "(42° 18' 00\" 102° 24' 00", "01° 44' 55.5\" 101° 22' 33.0\"", "77°55'33.22\"N 127°33'22.11\"W", "40:26:46.123N,79:56:55.000W", "43-04-30.2720N 073-34-58.4170W", "31 53 45.55N 54 16 38.99E", "42.18.009N x 102.24.003W", "42.18.009N 102.24.003W", "42.18.009 N x 102.24.003 W", "014455666N1012233444E", "N7922333W10022333", "01°44'55.5\"N 101°22'33.0\"E;", "N01°44'55.5\" E101°22'33.0\"", "4025131234N 12015191234W", // original
    "5113N 00425E", // original
    "27° 37' 45’’N, 82° 42' 10’’W", // single second hash sym
    "27° 37' 45’N, 82° 42' 10’W", // no lat/lon sep
    "27° 37' 45’’N 82° 42' 10’’W", // no min hash.
    "27° 37 45N, 82° 42 10W" };
    String[] utm_tests = { "12\n\t\nX\t\n245070175", "12\n\nX\n266070175", "12 X 266070175", "12X 266070 175" };
    // Assemble the samples for the enabled families, in a fixed order.
    List<String> tests = new ArrayList<String>();
    if (utm) {
        tests.addAll(Arrays.asList(utm_tests));
    }
    if (dd) {
        tests.addAll(Arrays.asList(ddtest));
    }
    if (dms || dm) {
        tests.addAll(Arrays.asList(dmtest));
    }
    if (mgrs) {
        tests.addAll(Arrays.asList(mgrstest));
    }
    int count = 0;
    for (String testcase : tests) {
        ++count;
        String test_id = String.valueOf(count);
        TextMatchResult results = xcoord.extract_coordinates("text before " + testcase + " and after", test_id);
        // Treat a null match list like an empty one so the summary line can
        // never fail before the matches are inspected.
        int matchCount = (results.matches == null) ? 0 : results.matches.size();
        log.info("TEST (" + count + ") " + testcase + " FOUND:" + (matchCount == 0 ? "NOTHING" : matchCount));
        if (matchCount == 0) {
            continue;
        }
        for (TextMatch m : results.matches) {
            log.info("\t" + m.toString());
            // Only coordinate matches can be formatted as latitude/longitude.
            if (m instanceof GeocoordMatch) {
                GeocoordMatch g = (GeocoordMatch) m;
                log.info("\t" + g.formatLatitude() + ", " + g.formatLongitude());
            }
        }
    }
    log.info("=== ADHOC TESTS DONE ===");
}
Also used : GeocoordMatch(org.opensextant.extractors.xcoord.GeocoordMatch) TextMatch(org.opensextant.extraction.TextMatch) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult)

Aggregations

TextMatchResult (org.opensextant.extractors.flexpat.TextMatchResult) 13, IOException (java.io.IOException) 7, TextMatch (org.opensextant.extraction.TextMatch) 6, FileNotFoundException (java.io.FileNotFoundException) 4, PatternTestCase (org.opensextant.extractors.flexpat.PatternTestCase) 4, Matcher (java.util.regex.Matcher) 3, RegexPattern (org.opensextant.extractors.flexpat.RegexPattern) 3, ConfigException (org.opensextant.ConfigException) 2, GeocoordTestCase (org.opensextant.extractors.xcoord.GeocoordTestCase) 2, File (java.io.File) 1, LineNumberReader (java.io.LineNumberReader) 1, Test (org.junit.Test) 1, NormalizationException (org.opensextant.extraction.NormalizationException) 1, RegexPatternManager (org.opensextant.extractors.flexpat.RegexPatternManager) 1, GeocoordMatch (org.opensextant.extractors.xcoord.GeocoordMatch) 1, DateMatch (org.opensextant.extractors.xtemporal.DateMatch) 1, CsvMapReader (org.supercsv.io.CsvMapReader) 1