Search in sources :

Example 1 with TextMatchResult

use of org.opensextant.extractors.flexpat.TextMatchResult in project Xponents by OpenSextant.

the class XCoord method extract_coordinates.

/**
 * Extracts coordinates from text, optionally limited to one family of
 * coordinate patterns. Diagnostic messages are recorded on the result
 * (and logged) only when debug = ON.
 *
 * <p>Every enabled pattern of the requested family is applied to the text.
 * Each raw match is optionally context-filtered (CONTEXT_FILTERS_ON),
 * normalized, run through the normalization filter, assigned a precision
 * and, when FLAG_EXTRACT_CONTEXT is set, given its surrounding text window.
 * Alternate interpretations of a match are added right after it. The final
 * list is passed through {@code PatternManager.reduce_matches}.
 *
 * @param text
 *            text to match
 * @param text_id
 *            id for text; stored as the result id
 * @param family
 *            pattern family or XConstants.ALL_PATTERNS
 * @return TextMatchResult holding the matches. If input text is null, the
 *         result is null
 */
public TextMatchResult extract_coordinates(String text, String text_id, int family) {
    if (text == null) {
        return null;
    }
    int bufsize = text.length();
    TextMatchResult results = new TextMatchResult();
    results.result_id = text_id;
    results.matches = new ArrayList<TextMatch>();
    int patternsComplete = 0;
    int found = 0;
    for (RegexPattern repat : patterns.get_patterns()) {
        log.debug("pattern={}", repat.id);
        if (!repat.enabled) {
            log.debug("CFG pattern={} not enabled", repat.id);
            continue;
        }
        GeocoordPattern pat = (GeocoordPattern) repat;
        // To limit multiple use enable_XXXX()
        if (family != XConstants.ALL_PATTERNS && pat.cce_family_id != family) {
            log.debug("CFG pattern={} not requested", pat.id);
            continue;
        }
        Matcher match = pat.regex.matcher(text);
        // Marked as soon as at least one pattern is actually run over the text.
        results.evaluated = true;
        while (match.find()) {
            // Counts every raw regex hit, including ones filtered out below.
            ++found;
            GeocoordMatch coord = new GeocoordMatch();
            // MATCH METHOD aka Pattern ID aka CCE instance
            coord.pattern_id = pat.id;
            coord.cce_family_id = pat.cce_family_id;
            coord.cce_variant = pat.cce_variant;
            coord.start = match.start();
            coord.end = match.end();
            coord.setText(match.group());
            if ((RUNTIME_FLAGS & XConstants.CONTEXT_FILTERS_ON) > 0
                    && this.filterOutContext(text, coord.start)) {
                log.debug("Filtered out noisy match, {} found by {}", coord.getText(), pat.id);
                continue;
            }
            // Normalize
            try {
                GeocoordNormalization.normalize_coordinate(coord, patterns.group_matches(pat, match));
            } catch (NormalizationException normErr) {
                // Unparseable matches are dropped; they are only reported in debug mode.
                if (debug) {
                    results.message = "Parse error with '" + coord.getText() + "'";
                    log.error(results.message, normErr);
                }
                continue;
            }
            if (GeocoordNormalization.filter_out(coord)) {
                if (debug) {
                    results.message = "Filtered out coordinate pattern=" + pat.id + " value='" + coord.getText() + "'";
                    log.info("Normalization Filter fired, MSG={}", results.message);
                }
                continue;
            }
            // Establish precision
            GeocoordNormalization.set_precision(coord);
            // Caller may want to disable getContext operation here for
            // short texts.... or for any use case. This is more helpful for
            // longer texts with many annotations.
            if ((XCoord.RUNTIME_FLAGS & XConstants.FLAG_EXTRACT_CONTEXT) > 0) {
                // returns indices for two windows before and after match
                int[] slices = TextUtils.get_text_window(coord.start, coord.getLength(), bufsize, match_width);
                // This sets the context window before/after:
                // left l1 to left l2, then right r1 to r2.
                coord.setContext(
                        TextUtils.delete_eol(text.substring(slices[0], slices[1])),
                        TextUtils.delete_eol(text.substring(slices[2], slices[3])));
            }
            set_match_id(coord, found);
            results.matches.add(coord);
            if (coord.hasOtherIterpretations()) {
                for (GeocoordMatch m2 : coord.getOtherInterpretations()) {
                    // Other interpretations may have different coord text;
                    // only the metadata is copied from the primary match.
                    m2.copyMetadata(coord);
                    results.matches.add(m2);
                }
            }
        }
        patternsComplete++;
        // The denominator must be parenthesized: "done / size + 1" parses as
        // "(done / size) + 1" and always reported a progress value above 1.
        // NOTE(review): the extra "+ 1" step presumably leaves room for the
        // final reduce_matches pass -- confirm intent.
        updateProgress(patternsComplete / (double) (patterns.get_patterns().size() + 1));
    }
    // "pass" is the wrong idea. If no data was found
    // because there was no data, then it still passes.
    //
    results.pass = !results.matches.isEmpty();
    PatternManager.reduce_matches(results.matches);
    return results;
}
Also used : NormalizationException(org.opensextant.extraction.NormalizationException) RegexPattern(org.opensextant.extractors.flexpat.RegexPattern) Matcher(java.util.regex.Matcher) TextMatch(org.opensextant.extraction.TextMatch) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult)

Example 2 with TextMatchResult

use of org.opensextant.extractors.flexpat.TextMatchResult in project Xponents by OpenSextant.

the class TestPoLiReporter method testUserFile.

/**
 * Run patterns over a single file using a pre-configured PoLi. Use -c
 * config -u file test
 *
 * <p>First every test case registered with the pattern manager is run and
 * its matches are written to {@code results/test_NAME.csv}, where NAME is
 * the base name of {@code f}. Then the content of {@code f} itself is run
 * against all patterns and its matches are appended to the same report.
 * The report is closed even when an exception interrupts processing.
 *
 * @param f
 *            path of the user file to test
 * @throws IOException
 *             if the report cannot be created or closed, or the file
 *             cannot be read
 * @throws NormalizationException
 *             declared by the original signature; propagated from the
 *             calls made here
 */
public void testUserFile(String f) throws IOException, NormalizationException {
    String fname = FilenameUtils.getBaseName(f);
    createResultsFile("results/test_" + fname + ".csv");
    try {
        log.info("TESTING FILE: " + f);
        for (PatternTestCase test : poli.getPatternManager().testcases) {
            log.info("TEST " + test.id);
            TextMatchResult results = poli.extract_patterns(test.text, test.id, test.family);
            if (results.evaluated && !results.matches.isEmpty()) {
                try {
                    for (TextMatch m : results.matches) {
                        Map<String, Object> row = createResultRow(test, m);
                        report.write(row, header, poliResultsSpec);
                    }
                } catch (IOException ioerr) {
                    // A failed write abandons this test case only; later cases still run.
                    log.error("Failed to write result for " + test.id, ioerr);
                }
            } else {
                log.info("TEST " + test.id + " STATUS: FAILED");
            }
        }
        String inputText = FileUtils.readFileToString(new File(f));
        poli.enableAll();
        String fileID = "FILE:" + fname;
        // Synthetic test case so that file-level matches share the report row format.
        PatternTestCase fileTestCase = new PatternTestCase(fileID, "all", "(file text)");
        TextMatchResult results = poli.extract_patterns(inputText, fileID, null);
        if (results.evaluated && !results.matches.isEmpty()) {
            try {
                for (TextMatch m : results.matches) {
                    Map<String, Object> row = createResultRow(fileTestCase, m);
                    report.write(row, header, poliResultsSpec);
                }
            } catch (IOException ioerr) {
                log.error("Failed to write result for " + fileID, ioerr);
            }
        } else {
            log.info("FILE TEST " + fileID + " STATUS: FAILED");
        }
    } finally {
        // Previously skipped whenever reading the file or extraction threw,
        // leaving the report open.
        closeReport();
    }
}
Also used : PatternTestCase(org.opensextant.extractors.flexpat.PatternTestCase) TextMatch(org.opensextant.extraction.TextMatch) IOException(java.io.IOException) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult) File(java.io.File)

Example 3 with TextMatchResult

use of org.opensextant.extractors.flexpat.TextMatchResult in project Xponents by OpenSextant.

the class TestXCoord method systemTests.

/**
 * Using the TestUtility, all patterns are tested and reported to the
 * results folder.
 *
 * <p>Does nothing unless the pattern manager has testing turned on. All
 * coordinate families (UTM, MGRS, DD, DMS, DM) are enabled, every test
 * case of the pattern manager is run through
 * {@code xcoord.extract_coordinates}, and evaluated results are saved to
 * {@code ./results/xcoord_System.csv}.
 */
public void systemTests() {
    RegexPatternManager mgr = xcoord.getPatternManager();
    log.info("\n\n=== SYSTEM TESTS ===\n\n");
    if (!mgr.testing) {
        log.info("TESTING OFF -- TURN ON DEBUG in LOG4J");
        return;
    }
    xcoord.match_UTM(true);
    xcoord.match_MGRS(true);
    xcoord.match_DD(true);
    xcoord.match_DMS(true);
    xcoord.match_DM(true);
    try {
        TestXCoordReporter tester = new TestXCoordReporter("./results/xcoord_System.csv");
        try {
            for (PatternTestCase tst : mgr.testcases) {
                TextMatchResult results = xcoord.extract_coordinates(tst.text, tst.id, tst.family_id);
                // extract_coordinates returns null for null text; skip that
                // case instead of aborting the whole run with an NPE.
                if (results == null) {
                    continue;
                }
                results.add_trace("Test Payload: " + tst.text);
                if (!results.evaluated) {
                    continue;
                }
                log.info("=========SYSTEM TEST " + tst.id + " FOUND:" + (results.matches.isEmpty() ? "NOTHING" : results.matches.size()));
                tester.save_result(null, results);
            }
        } finally {
            // Close the report even when a test case throws.
            tester.close_report();
        }
    } catch (Exception err) {
        log.error("Not finishing tests", err);
        return;
    }
    log.info("=== SYSTEM TESTS DONE ===");
}
Also used : PatternTestCase(org.opensextant.extractors.flexpat.PatternTestCase) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult) RegexPatternManager(org.opensextant.extractors.flexpat.RegexPatternManager) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException)

Example 4 with TextMatchResult

use of org.opensextant.extractors.flexpat.TextMatchResult in project Xponents by OpenSextant.

the class TestXCoord method fileTests.

/**
 * Runs all coordinate patterns over the content of one text file and saves
 * the result to {@code ./results/xcoord_NAME.csv}, where NAME is the base
 * name of the file. A failure to read the file or to create the report is
 * logged and ends the test; a failure during extraction or saving is
 * logged and the report is still closed.
 *
 * @param file
 *            path of the text file to test; surrounding whitespace is
 *            trimmed before use
 */
public void fileTests(String file) {
    log.info("\n\n=== TEXT FILE TESTS ===\n\n");
    String content = null;
    TestXCoordReporter reporter = null;
    try {
        String path = file.trim();
        content = FileUtility.readFile(path);
        reporter = new TestXCoordReporter("./results/xcoord_" + FilenameUtils.getBaseName(path) + ".csv");
    } catch (IOException err) {
        log.error("Failed to open test file", err);
        return;
    }

    xcoord.enableAll();
    try {
        // The id of the job is derived from the text itself.
        String docId = TextUtils.text_id(content);
        log.info("Extract coordinates; All patterns enabled");
        reporter.save_result(null, xcoord.extract_coordinates(content, docId));
    } catch (Exception err) {
        log.error("Failed to write report", err);
    }
    reporter.close_report();
    log.info("=== TEXT FILE TESTS DONE ===");
}
Also used : IOException(java.io.IOException) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException)

Example 5 with TextMatchResult

use of org.opensextant.extractors.flexpat.TextMatchResult in project Xponents by OpenSextant.

the class TestXCoord method fileTestByLines.

/**
 * This will accommodate any test file that has at least the following style:
 *
 * FAMILY-XXX COORDINATE TEXT "FAIL"
 *
 * Where the first FAMILY token names the coordinate family; it is
 * extracted with {@code find_family} and resolved with
 * {@code XConstants.get_CCE_family}. Blank lines and lines starting with
 * '#' are skipped; lines with an unknown family are logged and skipped.
 * Results are saved to {@code ./results/xcoord_NAME-lines.csv}, where NAME
 * is the base name of the file.
 *
 * @param coordfile
 *            path of the line-oriented test file; surrounding whitespace
 *            is trimmed before use
 */
public void fileTestByLines(String coordfile) {
    xcoord.match_UTM(true);
    xcoord.match_MGRS(true);
    xcoord.match_DD(true);
    xcoord.match_DMS(true);
    xcoord.match_DM(true);
    try {
        String _file = coordfile.trim();
        String fname = FilenameUtils.getBaseName(_file);
        TestXCoordReporter tester = new TestXCoordReporter("./results/xcoord_" + fname + "-lines.csv");
        // Read from the trimmed path, the same one that names the report.
        // try-with-resources closes the reader, which was previously leaked.
        try (java.io.LineNumberReader in = getLineReader(_file)) {
            String line;
            while ((line = in.readLine()) != null) {
                String text = line.trim();
                if (text.isEmpty() || text.startsWith("#")) {
                    continue;
                }
                String fam = find_family(line);
                int famx = XConstants.get_CCE_family(fam);
                if (famx == XConstants.UNK_PATTERN) {
                    log.error("Unknown test pattern TEXT=" + text);
                    continue;
                }
                // The test id is the line number in the file.
                GeocoordTestCase tst = new GeocoordTestCase("#" + in.getLineNumber(), fam, text);
                // NOTE(review): tst.family_id is not passed here, so the
                // two-argument overload decides which families run.
                TextMatchResult results = xcoord.extract_coordinates(tst.text, tst.id);
                results.add_trace("Test Payload: " + tst.text);
                if (!results.evaluated) {
                    continue;
                }
                log.info("=========FILE TEST " + tst.id + " FOUND:" + (results.matches.isEmpty() ? "NOTHING" : results.matches.size()));
                tester.save_result(tst, results);
            }
        } finally {
            // Close the report even when reading or extraction fails.
            tester.close_report();
        }
        log.info("=== FILE TESTS DONE ===");
    } catch (Exception err) {
        log.error("TEST BY LINES", err);
    }
}
Also used : GeocoordTestCase(org.opensextant.extractors.xcoord.GeocoordTestCase) LineNumberReader(java.io.LineNumberReader) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException)

Aggregations

TextMatchResult (org.opensextant.extractors.flexpat.TextMatchResult): 13; IOException (java.io.IOException): 7; TextMatch (org.opensextant.extraction.TextMatch): 6; FileNotFoundException (java.io.FileNotFoundException): 4; PatternTestCase (org.opensextant.extractors.flexpat.PatternTestCase): 4; Matcher (java.util.regex.Matcher): 3; RegexPattern (org.opensextant.extractors.flexpat.RegexPattern): 3; ConfigException (org.opensextant.ConfigException): 2; GeocoordTestCase (org.opensextant.extractors.xcoord.GeocoordTestCase): 2; File (java.io.File): 1; LineNumberReader (java.io.LineNumberReader): 1; Test (org.junit.Test): 1; NormalizationException (org.opensextant.extraction.NormalizationException): 1; RegexPatternManager (org.opensextant.extractors.flexpat.RegexPatternManager): 1; GeocoordMatch (org.opensextant.extractors.xcoord.GeocoordMatch): 1; DateMatch (org.opensextant.extractors.xtemporal.DateMatch): 1; CsvMapReader (org.supercsv.io.CsvMapReader): 1