Search in sources :

Example 1 with NormalizationException

use of org.opensextant.extraction.NormalizationException in project Xponents by OpenSextant.

the class XCoord method extract_coordinates.

/**
 * Runs every enabled coordinate pattern over the text, optionally limited to
 * one pattern family, and collects the normalized matches.
 *
 * <p>For each regex hit the match is context-filtered (when
 * {@code XConstants.CONTEXT_FILTERS_ON} is set), normalized, filtered again by
 * {@code GeocoordNormalization.filter_out}, given a precision, and optionally
 * given before/after context text (when {@code XConstants.FLAG_EXTRACT_CONTEXT}
 * is set). Alternate interpretations of a match are appended right after it.
 * Diagnostic messages are written to {@code results.message} only when
 * {@code debug} is ON.
 *
 * <p>Progress is reported after each evaluated pattern as
 * {@code patternsComplete / (patternCount + 1)}, i.e. a fraction that stays
 * below 1.0 while this method is running.
 *
 * @param text
 *            text to match
 * @param text_id
 *            id for text; stored as the result id
 * @param family
 *            pattern family or XConstants.ALL_PATTERNS
 * @return TextMatchResult holding the matches; {@code null} if {@code text}
 *         is null
 */
public TextMatchResult extract_coordinates(String text, String text_id, int family) {
    if (text == null) {
        return null;
    }
    int bufsize = text.length();
    TextMatchResult results = new TextMatchResult();
    results.result_id = text_id;
    results.matches = new ArrayList<TextMatch>();
    int patternsComplete = 0;
    // Running count of raw regex hits (including ones later rejected); used to build match IDs.
    int found = 0;
    for (RegexPattern repat : patterns.get_patterns()) {
        log.debug("pattern={}", repat.id);
        if (!repat.enabled) {
            log.debug("CFG pattern={} not enabled", repat.id);
            continue;
        }
        GeocoordPattern pat = (GeocoordPattern) repat;
        // To limit multiple use enable_XXXX()
        if (family != XConstants.ALL_PATTERNS && pat.cce_family_id != family) {
            log.debug("CFG pattern={} not requested", pat.id);
            continue;
        }
        Matcher match = pat.regex.matcher(text);
        results.evaluated = true;
        while (match.find()) {
            ++found;
            GeocoordMatch coord = new GeocoordMatch();
            // MATCH METHOD aka Pattern ID aka CCE instance
            coord.pattern_id = pat.id;
            coord.cce_family_id = pat.cce_family_id;
            coord.cce_variant = pat.cce_variant;
            coord.start = match.start();
            coord.end = match.end();
            coord.setText(match.group());
            // Bit-flag test: compare against zero so the check also holds for a sign-bit flag.
            if ((RUNTIME_FLAGS & XConstants.CONTEXT_FILTERS_ON) != 0) {
                if (this.filterOutContext(text, coord.start)) {
                    log.debug("Filtered out noisy match, {} found by {}", coord.getText(), pat.id);
                    continue;
                }
            }
            // Normalize; a match that cannot be normalized is dropped.
            try {
                GeocoordNormalization.normalize_coordinate(coord, patterns.group_matches(pat, match));
            } catch (NormalizationException normErr) {
                // Outside of debug mode the failure is ignored quietly.
                if (debug) {
                    results.message = "Parse error with '" + coord.getText() + "'";
                    log.error(results.message, normErr);
                }
                continue;
            }
            // Drop matches the normalizer flagged as not being real coordinates.
            if (GeocoordNormalization.filter_out(coord)) {
                if (debug) {
                    results.message = "Filtered out coordinate pattern=" + pat.id + " value='" + coord.getText() + "'";
                    log.info("Normalization Filter fired, MSG={}", results.message);
                }
                continue;
            }
            // Establish precision
            GeocoordNormalization.set_precision(coord);
            /*
             * Caller may want to disable getContext operation here for
             * short texts.... or for any use case. This is more helpful for
             * longer texts with many annotations.
             */
            if ((XCoord.RUNTIME_FLAGS & XConstants.FLAG_EXTRACT_CONTEXT) != 0) {
                // returns indices for two windows before and after match
                int[] slices = TextUtils.get_text_window(coord.start, coord.getLength(), bufsize, match_width);
                // This sets the context window before/after:
                // left window is slices[0]..slices[1], right window is slices[2]..slices[3].
                coord.setContext(
                        TextUtils.delete_eol(text.substring(slices[0], slices[1])),
                        TextUtils.delete_eol(text.substring(slices[2], slices[3])));
            }
            set_match_id(coord, found);
            results.matches.add(coord);
            // Alternate readings of the same text inherit the primary match's metadata
            // but keep their own coordinate text.
            if (coord.hasOtherIterpretations()) {
                for (GeocoordMatch m2 : coord.getOtherInterpretations()) {
                    m2.copyMetadata(coord);
                    results.matches.add(m2);
                }
            }
        }
        patternsComplete++;
        // Parenthesize the denominator: without it the expression was (fraction) + 1,
        // which reported progress values between 1.0 and 2.0.
        updateProgress(patternsComplete / (double) (patterns.get_patterns().size() + 1));
    }
    // "pass" is the wrong idea. If no data was found
    // because there was no data, then it still passes.
    //
    results.pass = !results.matches.isEmpty();
    // NOTE(review): reduce_matches presumably marks or removes duplicate/overlapping matches — confirm.
    PatternManager.reduce_matches(results.matches);
    return results;
}
Also used : NormalizationException(org.opensextant.extraction.NormalizationException) RegexPattern(org.opensextant.extractors.flexpat.RegexPattern) Matcher(java.util.regex.Matcher) TextMatch(org.opensextant.extraction.TextMatch) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult)

Example 2 with NormalizationException

use of org.opensextant.extraction.NormalizationException in project Xponents by OpenSextant.

the class GeocoordNormalization method normalize_coordinate.

/**
 * Normalizes a coordinate match in place, according to the pattern family
 * recorded in {@code m.cce_family_id}.
 *
 * <ul>
 * <li>DD, DM and DMS families: latitude/longitude ordinates are built from the
 * matched fields, the separator and coordinate are set on the match, a
 * family-specific filter is applied, and {@code coord_text} becomes
 * {@code lat_text + " " + lon_text}.</li>
 * <li>MGRS: the text is parsed into one or more MGRS candidates; the first
 * populates the match, and when exactly two candidates come back the second is
 * attached as an other interpretation. An IllegalArgumentException from
 * parsing marks the match as filtered out.</li>
 * <li>UTM: the whitespace-stripped text is parsed and the match takes the
 * resulting latitude, longitude and text.</li>
 * </ul>
 * Any other family leaves the match untouched.
 *
 * @param m match to normalize; modified in place
 * @param groups matched regex fields keyed by field name
 * @throws NormalizationException if MGRS parsing fails unexpectedly, or UTM
 *         parsing fails for any reason
 */
public static void normalize_coordinate(GeocoordMatch m, Map<String, TextEntity> groups) throws NormalizationException {
    // The regex layer does not use named groups, so callers hand us name -> entity
    // (text plus offsets). Parsers below also want a plain name -> text view.
    Map<String, String> fieldValues = new HashMap<String, String>();
    for (Map.Entry<String, TextEntity> field : groups.entrySet()) {
        fieldValues.put(field.getKey(), field.getValue().getText());
    }

    final int family = m.cce_family_id;
    final boolean decimalDegrees = family == XConstants.DD_PATTERN;

    // ---- Degree-based families (DD, DM, DMS) share the same ordinate handling ----
    if (decimalDegrees || family == XConstants.DM_PATTERN || family == XConstants.DMS_PATTERN) {
        DMSOrdinate latitude = new DMSOrdinate(groups, fieldValues, DMLAT, m.getText());
        DMSOrdinate longitude = new DMSOrdinate(groups, fieldValues, DMLON, m.getText());
        // Keep the ordinate text as parsed so the original precision is preserved,
        // e.g. "+42.4440 -102.3333".
        m.lat_text = latitude.text;
        m.lon_text = longitude.text;
        m.setSeparator(groups);
        m.setCoordinate(latitude, longitude);

        if (!decimalDegrees) {
            // DM / DMS: unless already rejected, reject on invalid dash usage.
            if (!m.isFilteredOut()) {
                m.setFilteredOut(m.evaluateInvalidDashes());
            }
        } else if ((XCoord.RUNTIME_FLAGS & XConstants.DD_FILTERS_ON) > 0) {
            // DD filters enabled (to disable: XCoord.RUNTIME_FLAGS XOR XConstants.DD_FILTERS_ON).
            // Neither ordinate has an alpha hemisphere (ENSW) and the latitude carries
            // no coordinate symbols: likely not a DD coordinate.
            if (!longitude.hemisphere.isAlpha() && !latitude.hemisphere.isAlpha() && !latitude.hasSymbols()) {
                m.setFilteredOut(true);
            }
        } else {
            // DD filters OFF: only the coordinate validity check decides.
            boolean valid = GeodeticUtility.validateCoordinate(m.getLatitude(), m.getLongitude());
            m.setFilteredOut(!valid);
        }
        m.coord_text = m.lat_text + " " + m.lon_text;
        return;
    }

    // ---- MGRS ----
    if (family == XConstants.MGRS_PATTERN) {
        // Set a whitespace-free coord text up front; it aids reporting if parsing fails.
        m.coord_text = TextUtils.delete_whitespace(m.getText());
        // TODO: make use of multiple answers.
        try {
            MGRS[] candidates = MGRSParser.parseMGRS(m.getText(), m.coord_text, fieldValues);
            if (candidates == null) {
                return;
            }
            MGRS primary = candidates[0];
            m.coord_text = primary.toString();
            Geodetic2DPoint primaryPoint = primary.toGeodetic2DPoint();
            m.setLatitude(primaryPoint.getLatitudeAsDegrees());
            m.setLongitude(primaryPoint.getLongitudeAsDegrees());
            m.setBalanced(true);

            if (candidates.length == 2) {
                // A second candidate is recorded as an alternate reading of the same text.
                MGRS secondary = candidates[1];
                GeocoordMatch alternate = new GeocoordMatch();
                alternate.copy(m);
                alternate.coord_text = secondary.toString();
                Geodetic2DPoint secondaryPoint = secondary.toGeodetic2DPoint();
                alternate.setLatitude(secondaryPoint.getLatitudeAsDegrees());
                alternate.setLongitude(secondaryPoint.getLongitudeAsDegrees());
                // Really balanced?
                alternate.setBalanced(true);
                m.addOtherInterpretation(alternate);
            }
        } catch (java.lang.IllegalArgumentException invalidMgrs) {
            // The text is not a valid MGRS value, so no normalization is possible;
            // reject the match rather than raising.
            m.setFilteredOut(true);
        } catch (Exception unexpected) {
            throw new NormalizationException("Failed to parse MGRS", unexpected);
        }
        return;
    }

    // ---- UTM ----
    if (family == XConstants.UTM_PATTERN) {
        m.coord_text = TextUtils.delete_whitespace(m.getText());
        try {
            UTM parsed = UTMParser.parseUTM(m.coord_text, fieldValues);
            if (parsed != null) {
                Geodetic2DPoint point = parsed.getGeodetic();
                m.setLatitude(point.getLatitudeAsDegrees());
                m.setLongitude(point.getLongitudeAsDegrees());
                m.coord_text = parsed.toString();
            }
        } catch (java.lang.IllegalArgumentException invalidUtm) {
            // Unlike MGRS, an invalid UTM value is reported to the caller.
            throw new NormalizationException("Failed to parse UTM pattern with text=" + m.getText() + " COORD?:" + m.coord_text, invalidUtm);
        } catch (Exception unexpected) {
            throw new NormalizationException("Failed to parse UTM pattern", unexpected);
        }
    }
}
Also used : HashMap(java.util.HashMap) TextEntity(org.opensextant.extraction.TextEntity) NormalizationException(org.opensextant.extraction.NormalizationException) UTM(org.opensextant.geodesy.UTM) NormalizationException(org.opensextant.extraction.NormalizationException) Geodetic2DPoint(org.opensextant.geodesy.Geodetic2DPoint) MGRS(org.opensextant.geodesy.MGRS)

Example 3 with NormalizationException

use of org.opensextant.extraction.NormalizationException in project Xponents by OpenSextant.

the class TestPoLi method main.

/**
 * Command-line entry point for a simple PoLi test run.
 *
 * <p>Options (getopt string {@code "c:u:f"}):
 * <ul>
 * <li>{@code -f} run the system tests</li>
 * <li>{@code -u FILE} run tests against a user-supplied file</li>
 * <li>{@code -c FILE} use the given patterns configuration instead of the default</li>
 * </ul>
 * An unrecognized option, or a failure while parsing options, prints usage and
 * exits with status 1. A configuration failure exits with status -1.
 * System tests take precedence when both {@code -f} and {@code -u} are given.
 *
 * @param args
 *            command-line arguments as described above
 */
public static void main(String[] args) {
    boolean runSystemTests = false;
    String userFile = null;
    String patternsConfig = null;

    // ---- Parse command line ----
    try {
        gnu.getopt.Getopt parser = new gnu.getopt.Getopt("Poli", args, "c:u:f");
        for (int opt = parser.getopt(); opt != -1; opt = parser.getopt()) {
            switch (opt) {
                case 'c':
                    patternsConfig = parser.getOptarg();
                    System.out.println("\tUser Patterns Configuration ======= FILE=" + patternsConfig);
                    break;
                case 'u':
                    userFile = parser.getOptarg();
                    System.out.println("\tUser TESTS======= FILE=" + userFile);
                    break;
                case 'f':
                    System.out.println("\tSystem TESTS======= ");
                    runSystemTests = true;
                    break;
                default:
                    TestPoLi.usage();
                    System.exit(1);
            }
        }
    } catch (Exception optionErr) {
        optionErr.printStackTrace();
        TestPoLi.usage();
        System.exit(1);
    }

    // ---- Build and configure the extractor (always in debug mode) ----
    PatternsOfLife poli = null;
    try {
        poli = new PatternsOfLife(true);
        if (patternsConfig != null) {
            poli.configure(patternsConfig);
        } else {
            // No -c given: fall back to the default configuration.
            poli.configure();
        }
    } catch (ConfigException configErr) {
        configErr.printStackTrace();
        System.exit(-1);
    }

    // ---- Run whichever test mode was requested; nothing runs if neither was ----
    try {
        TestPoLiReporter reporter = new TestPoLiReporter(poli);
        if (runSystemTests) {
            reporter.test();
        } else if (userFile != null) {
            reporter.testUserFile(userFile);
        }
    } catch (NormalizationException normErr) {
        normErr.printStackTrace();
    } catch (IOException ioErr) {
        ioErr.printStackTrace();
    }
}
Also used : ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException) ConfigException(org.opensextant.ConfigException) NormalizationException(org.opensextant.extraction.NormalizationException) IOException(java.io.IOException) NormalizationException(org.opensextant.extraction.NormalizationException) PatternsOfLife(org.opensextant.extractors.poli.PatternsOfLife)

Aggregations

NormalizationException (org.opensextant.extraction.NormalizationException)3 IOException (java.io.IOException)1 HashMap (java.util.HashMap)1 Matcher (java.util.regex.Matcher)1 ConfigException (org.opensextant.ConfigException)1 TextEntity (org.opensextant.extraction.TextEntity)1 TextMatch (org.opensextant.extraction.TextMatch)1 RegexPattern (org.opensextant.extractors.flexpat.RegexPattern)1 TextMatchResult (org.opensextant.extractors.flexpat.TextMatchResult)1 PatternsOfLife (org.opensextant.extractors.poli.PatternsOfLife)1 Geodetic2DPoint (org.opensextant.geodesy.Geodetic2DPoint)1 MGRS (org.opensextant.geodesy.MGRS)1 UTM (org.opensextant.geodesy.UTM)1