Search in sources :

Example 6 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class TestPoLiReporter method testUserFile.

/**
 * Run patterns over a single file using a pre-configured PoLi. Use -c
 * config -u file test
 *
 * <p>First every test case registered with the pattern manager is run, then
 * all patterns are enabled and the full text of the file is run as one
 * additional pseudo test case with the id {@code "FILE:" + basename}. One row
 * per match is written to {@code results/test_<basename>.csv}.
 *
 * <p>The report is closed even when reading the input file or extracting
 * fails part way through.
 *
 * @param f path of the user file to evaluate
 * @throws IOException if I/O outside of the per-row writes fails (for
 *         example reading {@code f}); failures while writing individual
 *         rows are logged and do not abort the run
 * @throws NormalizationException kept in the signature for caller
 *         compatibility; nothing visible in this method raises it directly
 */
public void testUserFile(String f) throws IOException, NormalizationException {
    String fname = FilenameUtils.getBaseName(f);
    createResultsFile("results/test_" + fname + ".csv");
    try {
        log.info("TESTING FILE: " + f);
        for (PatternTestCase test : poli.getPatternManager().testcases) {
            log.info("TEST " + test.id);
            TextMatchResult results = poli.extract_patterns(test.text, test.id, test.family);
            if (!writeMatchRows(test, test.id, results)) {
                log.info("TEST " + test.id + " STATUS: FAILED");
            }
        }

        // NOTE(review): this overload decodes with the platform default
        // charset; confirm whether user files should be read as UTF-8.
        String inputText = FileUtils.readFileToString(new File(f));
        poli.enableAll();
        String fileID = "FILE:" + fname;
        PatternTestCase fileTestCase = new PatternTestCase(fileID, "all", "(file text)");
        // A null family means no family filter is applied.
        TextMatchResult results = poli.extract_patterns(inputText, fileID, null);
        if (!writeMatchRows(fileTestCase, fileID, results)) {
            log.info("FILE TEST " + fileID + " STATUS: FAILED");
        }
    } finally {
        // Always release the report, even if reading or extraction failed.
        closeReport();
    }
}

/**
 * Writes one report row per match of an evaluated result.
 *
 * @param testCase test case the rows are attributed to
 * @param id identifier used in the error log if writing fails
 * @param results extraction output for {@code testCase}
 * @return {@code false} when the result was not evaluated or has no matches
 *         (nothing is written); {@code true} otherwise, including when a
 *         write error was caught and logged
 */
private boolean writeMatchRows(PatternTestCase testCase, String id, TextMatchResult results) {
    if (!results.evaluated || results.matches.isEmpty()) {
        return false;
    }
    try {
        for (TextMatch m : results.matches) {
            Map<String, Object> row = createResultRow(testCase, m);
            report.write(row, header, poliResultsSpec);
        }
    } catch (IOException ioerr) {
        // The first failed write stops the remaining rows of this result only.
        log.error("Failed to write result for " + id, ioerr);
    }
    return true;
}
Also used : PatternTestCase(org.opensextant.extractors.flexpat.PatternTestCase) TextMatch(org.opensextant.extraction.TextMatch) IOException(java.io.IOException) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult) File(java.io.File)

Example 7 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class XTemporal method extract_dates.

/**
 * A direct call to extract dates; which is useful for diagnostics and
 * development/testing.
 *
 * <p>Every enabled pattern is applied to the text. A regex hit is kept only
 * if it normalizes to a date; hits of the {@code YMD} family that
 * {@code isDistantPastYMD} flags are dropped. Kept matches carry the
 * formatted date text plus the distant-past and future flags. The collected
 * matches are finally passed through {@code PatternManager.reduce_matches}.
 *
 * @param text the text to scan
 * @param text_id identifier stored as the {@code result_id} of the result
 * @return the result; {@code evaluated} is true once at least one enabled
 *         pattern was applied, and {@code pass} is true when at least one
 *         match was collected (computed before match reduction)
 */
public TextMatchResult extract_dates(String text, String text_id) {
    TextMatchResult results = new TextMatchResult();
    results.matches = new ArrayList<TextMatch>();
    results.result_id = text_id;

    int found = 0;
    int patternsComplete = 0;
    // Denominator is (pattern count + 1). The original expression added 1 to
    // the quotient instead, so reported progress was always above 1.0.
    // Assumes updateProgress expects a fraction in [0, 1] -- TODO confirm.
    final double progressSteps = patterns.get_patterns().size() + 1;

    for (RegexPattern pat : patterns.get_patterns()) {
        log.debug("pattern={}", pat.id);
        if (!pat.enabled) {
            log.debug("CFG pattern={} not enabled.", pat.id);
            continue;
        }

        Matcher match = pat.regex.matcher(text);
        results.evaluated = true;
        while (match.find()) {
            // Counts every regex hit, including ones rejected below, so the
            // match ids handed to set_match_id may have gaps.
            ++found;

            DateMatch dt = new DateMatch();
            dt.pattern_id = pat.id;
            dt.start = match.start();
            dt.end = match.end();
            dt.setText(match.group());

            try {
                DateNormalization.normalize_date(patterns.group_map(pat, match), dt);
                if (dt.datenorm == null) {
                    continue;
                }
                if ("YMD".equalsIgnoreCase(pat.family) && this.isDistantPastYMD(dt.datenorm)) {
                    continue;
                }

                dt.datenorm_text = DateNormalization.format_date(dt.datenorm);
                // Flags worth setting here.
                dt.isDistantPast = isDistantPast(dt.datenorm.getTime());
                dt.isFuture = isFuture(dt.datenorm.getTime());
                set_match_id(dt, found);
            } catch (Exception err) {
                // Not a date: any failure during normalization rejects the hit.
                continue;
            }
            results.matches.add(dt);
        }

        patternsComplete++;
        updateProgress(patternsComplete / progressSteps);
    }

    // pass is decided once, from the final match list.
    results.pass = !results.matches.isEmpty();
    PatternManager.reduce_matches(results.matches);
    return results;
}
Also used : RegexPattern(org.opensextant.extractors.flexpat.RegexPattern) Matcher(java.util.regex.Matcher) TextMatch(org.opensextant.extraction.TextMatch) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult) IOException(java.io.IOException)

Example 8 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class TestPoLiReporter method test.

/**
 * System tests.
 *
 * <p>Enables all patterns, runs every test case registered with the pattern
 * manager and writes the outcome to {@code results/test_System.csv}: one row
 * per match for cases that produced matches, otherwise a single row built
 * with a {@code null} match. The report is closed even when a write fails.
 *
 * @throws IOException if writing the row of a failed test case fails, or
 *         if other I/O outside of the per-match writes fails; errors while
 *         writing match rows are logged and do not abort the run
 */
public void test() throws IOException {
    poli.enableAll();
    createResultsFile("results/test_System.csv");
    try {
        log.info("TESTING ALL SYSTEM PATTERNS");
        for (PatternTestCase test : this.poli.getPatternManager().testcases) {
            log.info("TEST " + test.id);
            TextMatchResult results = this.poli.extract_patterns(test.text, test.id, test.family);

            boolean hasMatches = results.evaluated && !results.matches.isEmpty();
            if (!hasMatches) {
                // Record the miss as a row with no match attached.
                Map<String, Object> row = createResultRow(test, null);
                report.write(row, header, poliResultsSpec);
                log.info("TEST " + test.id + " STATUS: FAILED");
                continue;
            }

            try {
                for (TextMatch m : results.matches) {
                    Map<String, Object> row = createResultRow(test, m);
                    report.write(row, header, poliResultsSpec);
                }
            } catch (IOException ioerr) {
                log.error("Failed to write result for " + test.id, ioerr);
            }
        }
    } finally {
        // Previously skipped when the failure-row write threw.
        closeReport();
    }
}
Also used : PatternTestCase(org.opensextant.extractors.flexpat.PatternTestCase) TextMatch(org.opensextant.extraction.TextMatch) IOException(java.io.IOException) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult)

Example 9 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class TestXCoordReporter method save_result.

/**
 * Coordinate Test/Eval format
 *
 *
 * Result ID, CCE family, pattern ID, status, message // Reason for failure
 * Result ID, CCE family, pattern ID, status, Match ID, matchtext, lat, lon
 * etc. // Success implied by match
 *
 * <p>When the result has matches, one row is written per match; unless
 * {@code full_report} is set, submatches and duplicates are skipped. When
 * there are no matches, a single row is written whose status is PASS only
 * if the failure was expected: the test case is not a true positive, or,
 * without a test case, the result trace contains "FAIL".
 *
 * @TODO: use TestCase here or rely on truth evaluation in Python
 * GeocoderEval?
 * @param t the test case the result belongs to; may be null
 * @param results extraction result to report; a null match list is treated
 *        as having no matches
 * @throws IOException if the report row cannot be written
 */
public void save_result(GeocoordTestCase t, TextMatchResult results) throws IOException {
    boolean hasMatches = results.matches != null && !results.matches.isEmpty();

    if (hasMatches) {
        for (TextMatch tm : results.matches) {
            GeocoordMatch m = (GeocoordMatch) tm;
            if (!full_report && (m.is_submatch || m.is_duplicate)) {
                // Ignore submatches and duplicates
                continue;
            }

            Map<String, Object> row = createTestCase(t);
            row.put(header[6], results.result_id);
            row.put(header[7], (full_report && m.is_submatch) ? "IGNORE" : "PASS");

            String msg = results.message;
            if (m.is_submatch) {
                // Avoid emitting the literal "null; Is Submatch" when no message is set.
                msg = (msg == null) ? "Is Submatch" : msg + "; Is Submatch";
            }
            row.put(header[8], msg);
            row.put(header[9], XConstants.get_CCE_family(m.cce_family_id));
            row.put(header[10], m.pattern_id);
            row.put(header[11], m.getText());
            row.put(header[12], "" + m.formatLatitude());
            row.put(header[13], "" + m.formatLongitude());

            String mgrs = "";
            try {
                mgrs = m.toMGRS();
            } catch (Exception ignored) {
                // Best effort: leave the MGRS column empty when conversion fails.
            }
            row.put(header[14], mgrs);
            row.put(header[15], m.formatPrecision());
            row.put(header[16], Long.valueOf(m.start));
            report.write(row, header, xcoordResultsSpec);
        }
        return;
    }

    Map<String, Object> row = createTestCase(t);
    row.put(header[6], results.result_id);

    boolean expected_failure;
    if (t != null) {
        expected_failure = !t.true_positive;
    } else {
        // If the match message contains a test payload from the test cases.
        // Locale.ROOT keeps the comparison stable under locales with special
        // casing rules (e.g. Turkish dotted/dotless i).
        String test_status = results.get_trace().toUpperCase(java.util.Locale.ROOT);
        expected_failure = test_status.contains("FAIL");
    }
    // True Negative -- you ignored one correctly
    row.put(header[7], expected_failure ? "PASS" : "FAIL");
    row.put(header[8], results.get_trace());
    report.write(row, header, xcoordResultsSpec);
}
Also used : GeocoordMatch(org.opensextant.extractors.xcoord.GeocoordMatch) TextMatch(org.opensextant.extraction.TextMatch)

Example 10 with TextMatch

use of org.opensextant.extraction.TextMatch in project Xponents by OpenSextant.

the class PatternsOfLife method extract_patterns.

/**
 * Extract patterns of a certain family from a block of text.
 *
 * <p>Each enabled pattern whose id starts with {@code family} (or every
 * enabled pattern when {@code family} is null) is applied to the text. Every
 * regex hit becomes a {@code PoliMatch}, either the generic one or an
 * instance of the pattern's {@code match_class}, which is normalized and
 * given a context window of surrounding text. The collected matches are
 * finally passed through {@code PoliPatternManager.reduce_matches}.
 *
 * @param text
 *            - data to process
 * @param text_id
 *            - identifier for the data
 * @param family
 *            - optional filter; to reuse the same PatManager but extract
 *            certain patterns only.
 *
 * @return the result; {@code evaluated} is true once at least one pattern
 *         was applied, and {@code pass} is true when at least one match was
 *         collected (computed before match reduction)
 */
public TextMatchResult extract_patterns(String text, String text_id, String family) {
    TextMatchResult results = new TextMatchResult();
    results.result_id = text_id;
    results.matches = new ArrayList<TextMatch>();

    int bufsize = text.length();
    int found = 0;
    int patternsComplete = 0;
    // Denominator is (pattern count + 1). The original expression added 1 to
    // the quotient instead, so reported progress was always above 1.0.
    // Assumes updateProgress expects a fraction in [0, 1] -- TODO confirm.
    final double progressSteps = patterns.get_patterns().size() + 1;

    for (RegexPattern repat : patterns.get_patterns()) {
        if (!repat.enabled) {
            continue;
        }
        if (family != null && !repat.id.startsWith(family)) {
            continue;
        }

        Matcher match = repat.regex.matcher(text);
        results.evaluated = true;
        while (match.find()) {
            // Counts every regex hit, including ones skipped below.
            ++found;
            Map<String, String> fields = patterns.group_map(repat, match);

            PoliMatch poliMatch;
            if (repat.match_class == null) {
                poliMatch = new PoliMatch(fields, match.group());
            } else {
                try {
                    poliMatch = (PoliMatch) repat.match_class.newInstance();
                    poliMatch.setText(match.group());
                    poliMatch.setGroups(fields);
                } catch (InstantiationException classErr1) {
                    poliMatch = null;
                    log.error("Could not create... ", classErr1);
                } catch (IllegalAccessException classErr2) {
                    poliMatch = null;
                    log.error("Could not create... ", classErr2);
                }
            }
            if (poliMatch == null) {
                // This would have been thrown at init.
                log.error("Could not find pattern family for " + repat.id);
                continue;
            }

            poliMatch.setType(repat.family);
            poliMatch.pattern_id = repat.id;
            poliMatch.start = match.start();
            poliMatch.end = match.end();
            poliMatch.normalize();
            // TODO: Assess filters?

            // Indices of the window of text around the match start.
            int[] slices = TextUtils.get_text_window(poliMatch.start, bufsize, match_width);
            poliMatch.setContext(TextUtils.delete_eol(text.substring(slices[0], slices[1])));
            set_match_id(poliMatch, found);
            results.matches.add(poliMatch);
        }

        patternsComplete++;
        updateProgress(patternsComplete / progressSteps);
    }

    results.pass = !results.matches.isEmpty();
    PoliPatternManager.reduce_matches(results.matches);
    return results;
}
Also used : RegexPattern(org.opensextant.extractors.flexpat.RegexPattern) Matcher(java.util.regex.Matcher) TextMatch(org.opensextant.extraction.TextMatch) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult)

Aggregations

TextMatch (org.opensextant.extraction.TextMatch)26 IOException (java.io.IOException)9 ConfigException (org.opensextant.ConfigException)8 TextMatchResult (org.opensextant.extractors.flexpat.TextMatchResult)6 ArrayList (java.util.ArrayList)5 GeocoordMatch (org.opensextant.extractors.xcoord.GeocoordMatch)5 Taxon (org.opensextant.data.Taxon)4 TextInput (org.opensextant.data.TextInput)4 Matcher (java.util.regex.Matcher)3 JSONObject (org.json.JSONObject)3 ExtractionException (org.opensextant.extraction.ExtractionException)3 RegexPattern (org.opensextant.extractors.flexpat.RegexPattern)3 PlaceCandidate (org.opensextant.extractors.geo.PlaceCandidate)3 TaxonMatch (org.opensextant.extractors.xtax.TaxonMatch)3 File (java.io.File)2 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 JSONObject (net.sf.json.JSONObject)2 Text (org.apache.hadoop.io.Text)2 JSONArray (org.json.JSONArray)2