Search in sources :

Example 1 with RegexPattern

use of org.opensextant.extractors.flexpat.RegexPattern in project Xponents by OpenSextant.

the class XCoord method extract_coordinates.

/**
 * Extracts geographic coordinates from text, optionally limited to a
 * particular family of coordinate patterns. Diagnostic messages are written
 * to the result's {@code message} field only when debug is on.
 *
 * <p>Every enabled pattern of the requested family is run over the text.
 * Each regex hit is normalized, filtered, given a precision and (if the
 * runtime flag is set) a before/after context window, then added to the
 * result along with any alternate interpretations of the same text.
 *
 * @param text
 *            text to match
 * @param text_id
 *            id for text; stored as the result id
 * @param family
 *            pattern family or XConstants.ALL_PATTERNS
 * @return TextMatchResult holding the matches found; {@code null} if
 *         {@code text} is null
 */
public TextMatchResult extract_coordinates(String text, String text_id, int family) {
    if (text == null) {
        return null;
    }
    int bufsize = text.length();
    TextMatchResult results = new TextMatchResult();
    results.result_id = text_id;
    results.matches = new ArrayList<TextMatch>();
    int patternsComplete = 0;
    int found = 0;
    // Progress denominator is (pattern count + 1) so the reported fraction
    // stays below 1.0 while patterns are still being evaluated. The original
    // expression "done / (double) size + 1" added 1 to the quotient instead,
    // due to operator precedence.
    // NOTE(review): assumes updateProgress expects a 0..1 fraction -- confirm.
    final double progressSteps = patterns.get_patterns().size() + 1;
    for (RegexPattern repat : patterns.get_patterns()) {
        log.debug("pattern={}", repat.id);
        if (!repat.enabled) {
            log.debug("CFG pattern={} not enabled", repat.id);
            continue;
        }
        GeocoordPattern pat = (GeocoordPattern) repat;
        // To limit multiple use enable_XXXX()
        if (family != XConstants.ALL_PATTERNS && pat.cce_family_id != family) {
            log.debug("CFG pattern={} not requested", pat.id);
            continue;
        }
        Matcher match = pat.regex.matcher(text);
        results.evaluated = true;
        while (match.find()) {
            // Counts every raw regex hit, including ones filtered out below;
            // the running count is what gets passed to set_match_id.
            ++found;
            GeocoordMatch coord = new GeocoordMatch();
            // MATCH METHOD aka Pattern ID aka CCE instance
            coord.pattern_id = pat.id;
            coord.cce_family_id = pat.cce_family_id;
            coord.cce_variant = pat.cce_variant;
            coord.start = match.start();
            coord.end = match.end();
            coord.setText(match.group());
            if ((RUNTIME_FLAGS & XConstants.CONTEXT_FILTERS_ON) != 0
                    && this.filterOutContext(text, coord.start)) {
                log.debug("Filtered out noisy match, {} found by {}", coord.getText(), pat.id);
                continue;
            }
            // Normalize
            try {
                GeocoordNormalization.normalize_coordinate(coord, patterns.group_matches(pat, match));
            } catch (NormalizationException normErr) {
                // Unparseable hits are dropped quietly; details are recorded
                // only in debug mode.
                if (debug) {
                    results.message = "Parse error with '" + coord.getText() + "'";
                    log.error(results.message, normErr);
                }
                continue;
            }
            if (GeocoordNormalization.filter_out(coord)) {
                if (debug) {
                    results.message = "Filtered out coordinate pattern=" + pat.id + " value='" + coord.getText() + "'";
                    log.info("Normalization Filter fired, MSG={}", results.message);
                }
                continue;
            }
            // Establish precision
            GeocoordNormalization.set_precision(coord);
            // Caller may want to disable the context operation here for
            // short texts.... or for any use case. This is more helpful for
            // longer texts with many annotations.
            if ((XCoord.RUNTIME_FLAGS & XConstants.FLAG_EXTRACT_CONTEXT) != 0) {
                // returns indices for two windows before and after match
                int[] slices = TextUtils.get_text_window(coord.start, coord.getLength(), bufsize, match_width);
                // This sets the context window before/after:
                // left is [l1, l2), right is [r1, r2).
                String before = TextUtils.delete_eol(text.substring(slices[0], slices[1]));
                String after = TextUtils.delete_eol(text.substring(slices[2], slices[3]));
                coord.setContext(before, after);
            }
            set_match_id(coord, found);
            results.matches.add(coord);
            if (coord.hasOtherIterpretations()) {
                for (GeocoordMatch m2 : coord.getOtherInterpretations()) {
                    // Other interpretations may have different coord text;
                    // only the metadata is copied from the primary match.
                    m2.copyMetadata(coord);
                    results.matches.add(m2);
                }
            }
        }
        patternsComplete++;
        updateProgress(patternsComplete / progressSteps);
    }
    // "pass" is the wrong idea. If no data was found
    // because there was no data, then it still passes.
    // As coded, pass simply means at least one match was kept.
    results.pass = !results.matches.isEmpty();
    PatternManager.reduce_matches(results.matches);
    return results;
}
Also used : NormalizationException(org.opensextant.extraction.NormalizationException) RegexPattern(org.opensextant.extractors.flexpat.RegexPattern) Matcher(java.util.regex.Matcher) TextMatch(org.opensextant.extraction.TextMatch) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult)

Example 2 with RegexPattern

use of org.opensextant.extractors.flexpat.RegexPattern in project Xponents by OpenSextant.

the class XTemporal method extract_dates.

/**
 * A direct call to extract dates; which is useful for diagnostics and
 * development/testing.
 *
 * <p>Every enabled pattern is run over the text. A regex hit is kept only
 * if it normalizes to a date and, for patterns of the "YMD" family, is not
 * rejected by {@code isDistantPastYMD}.
 *
 * @param text
 *            text to match
 * @param text_id
 *            id for text; stored as the result id
 * @return TextMatchResult holding the date matches found; {@code null} if
 *         {@code text} is null
 */
public TextMatchResult extract_dates(String text, String text_id) {
    // Same null-input contract as extract_coordinates: without this guard
    // the regex matcher below throws NullPointerException.
    if (text == null) {
        return null;
    }
    TextMatchResult results = new TextMatchResult();
    results.matches = new ArrayList<TextMatch>();
    results.result_id = text_id;
    int found = 0;
    int patternsComplete = 0;
    // Progress denominator is (pattern count + 1) so the reported fraction
    // stays below 1.0 while patterns are still being evaluated. The original
    // expression "done / (double) size + 1" added 1 to the quotient instead,
    // due to operator precedence.
    // NOTE(review): assumes updateProgress expects a 0..1 fraction -- confirm.
    final double progressSteps = patterns.get_patterns().size() + 1;
    for (RegexPattern pat : patterns.get_patterns()) {
        log.debug("pattern={}", pat.id);
        if (!pat.enabled) {
            log.debug("CFG pattern={} not enabled.", pat.id);
            continue;
        }
        Matcher match = pat.regex.matcher(text);
        results.evaluated = true;
        while (match.find()) {
            // Counts every raw regex hit, including ones rejected below;
            // the running count is what gets passed to set_match_id.
            ++found;
            DateMatch dt = new DateMatch();
            dt.pattern_id = pat.id;
            dt.start = match.start();
            dt.end = match.end();
            dt.setText(match.group());
            try {
                DateNormalization.normalize_date(patterns.group_map(pat, match), dt);
                if (dt.datenorm == null) {
                    continue;
                }
                if ("YMD".equalsIgnoreCase(pat.family) && this.isDistantPastYMD(dt.datenorm)) {
                    continue;
                }
                dt.datenorm_text = DateNormalization.format_date(dt.datenorm);
                // Flags worth setting here.
                dt.isDistantPast = isDistantPast(dt.datenorm.getTime());
                dt.isFuture = isFuture(dt.datenorm.getTime());
                set_match_id(dt, found);
            } catch (Exception err) {
                // Not a date. Deliberately best-effort: the hit is dropped,
                // but the reason is kept visible at debug level.
                log.debug("Not a date, pattern={}", pat.id, err);
                continue;
            }
            results.matches.add(dt);
        }
        patternsComplete++;
        updateProgress(patternsComplete / progressSteps);
    }
    // pass is decided once, here: true when at least one match was kept.
    results.pass = !results.matches.isEmpty();
    PatternManager.reduce_matches(results.matches);
    return results;
}
Also used : RegexPattern(org.opensextant.extractors.flexpat.RegexPattern) Matcher(java.util.regex.Matcher) TextMatch(org.opensextant.extraction.TextMatch) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult) IOException(java.io.IOException)

Example 3 with RegexPattern

use of org.opensextant.extractors.flexpat.RegexPattern in project Xponents by OpenSextant.

the class PatternsOfLife method extract_patterns.

/**
 * Extract patterns of a certain family from a block of text.
 *
 * <p>Every enabled pattern whose id starts with {@code family} (or every
 * enabled pattern, when {@code family} is null) is run over the text. Each
 * regex hit becomes a PoliMatch -- either the generic class or the
 * pattern's own {@code match_class} -- which is normalized and given a
 * context window before being added to the result.
 *
 * @param text
 *            - data to process
 * @param text_id
 *            - identifier for the data; stored as the result id
 * @param family
 *            - optional filter; to reuse the same PatManager but extract
 *            certain patterns only. Compared as a prefix of the pattern id.
 *
 * @return TextMatchResult holding the matches found; {@code null} if
 *         {@code text} is null
 */
public TextMatchResult extract_patterns(String text, String text_id, String family) {
    // Same null-input contract as extract_coordinates: without this guard
    // text.length() below throws NullPointerException.
    if (text == null) {
        return null;
    }
    TextMatchResult results = new TextMatchResult();
    results.result_id = text_id;
    results.matches = new ArrayList<TextMatch>();
    int bufsize = text.length();
    int found = 0;
    int patternsComplete = 0;
    // Progress denominator is (pattern count + 1) so the reported fraction
    // stays below 1.0 while patterns are still being evaluated. The original
    // expression "done / (double) size + 1" added 1 to the quotient instead,
    // due to operator precedence.
    // NOTE(review): assumes updateProgress expects a 0..1 fraction -- confirm.
    final double progressSteps = patterns.get_patterns().size() + 1;
    for (RegexPattern repat : patterns.get_patterns()) {
        if (!repat.enabled) {
            continue;
        }
        if (family != null && !repat.id.startsWith(family)) {
            continue;
        }
        Matcher match = repat.regex.matcher(text);
        results.evaluated = true;
        while (match.find()) {
            // Counts every raw regex hit, including ones skipped below;
            // the running count is what gets passed to set_match_id.
            ++found;
            Map<String, String> fields = patterns.group_map(repat, match);
            // Scoped per hit so a failed instantiation can never leave a
            // previous hit's object in play.
            PoliMatch poliMatch = null;
            if (repat.match_class == null) {
                poliMatch = new PoliMatch(fields, match.group());
            } else {
                try {
                    poliMatch = (PoliMatch) repat.match_class.newInstance();
                    poliMatch.setText(match.group());
                    poliMatch.setGroups(fields);
                } catch (InstantiationException classErr1) {
                    poliMatch = null;
                    log.error("Could not create... ", classErr1);
                } catch (IllegalAccessException classErr2) {
                    poliMatch = null;
                    log.error("Could not create... ", classErr2);
                }
            }
            if (poliMatch == null) {
                // This would have been thrown at init.
                log.error("Could not find pattern family for {}", repat.id);
                continue;
            }
            poliMatch.setType(repat.family);
            poliMatch.pattern_id = repat.id;
            poliMatch.start = match.start();
            poliMatch.end = match.end();
            poliMatch.normalize();
            // TODO: Assess filters? No match filtering is applied here.
            // returns indices for window around text match
            int[] slices = TextUtils.get_text_window(poliMatch.start, bufsize, match_width);
            // left l1 to left l2
            poliMatch.setContext(TextUtils.delete_eol(text.substring(slices[0], slices[1])));
            set_match_id(poliMatch, found);
            results.matches.add(poliMatch);
        }
        patternsComplete++;
        updateProgress(patternsComplete / progressSteps);
    }
    // pass means at least one match was kept.
    results.pass = !results.matches.isEmpty();
    PoliPatternManager.reduce_matches(results.matches);
    return results;
}
Also used : RegexPattern(org.opensextant.extractors.flexpat.RegexPattern) Matcher(java.util.regex.Matcher) TextMatch(org.opensextant.extraction.TextMatch) TextMatchResult(org.opensextant.extractors.flexpat.TextMatchResult)

Aggregations

Matcher (java.util.regex.Matcher)3 TextMatch (org.opensextant.extraction.TextMatch)3 RegexPattern (org.opensextant.extractors.flexpat.RegexPattern)3 TextMatchResult (org.opensextant.extractors.flexpat.TextMatchResult)3 IOException (java.io.IOException)1 NormalizationException (org.opensextant.extraction.NormalizationException)1