Search in sources :

Example 91 with Matcher

use of java.util.regex.Matcher in project lucida by claritylab.

the class EnglishFeatureExtractor method addWordLevelFeatures.

/**
 * Adds word-level binary features for a question to the given instance.
 * <ul>
 * <li><code>UNIGRAM.w</code> for every term text <code>w</code> (whitespace
 * replaced by "_", "-" for terms without text),</li>
 * <li><code>BIGRAM.w1-w2</code> for every pair of adjacent terms,</li>
 * <li><code>WH_WORD.x</code> for the first pattern in <code>whPtrns</code> that
 * matches at the beginning of the question or, failing that, anywhere in
 * the question,</li>
 * <li><code>OF_HEAD.h</code> for the first word <code>h</code> in
 * <code>OF_HEAD_WORDS</code> such that "h of focus" or "hs of focus" occurs
 * in the question.</li>
 * </ul>
 *
 * @param instance the instance the features are added to
 * @param terms the terms of the question, in order
 * @param focus the focus term, or <code>null</code> to skip the OF_HEAD feature
 */
private static void addWordLevelFeatures(MutableInstance instance, List<Term> terms, Term focus) {
    String[] words = new String[terms.size()];
    for (int i = 0; i < terms.size(); i++) {
        String text = terms.get(i).getText();
        words[i] = (text != null) ? text.replaceAll("\\s+", "_") : "-";
    }
    // UNIGRAM
    for (String word : words) {
        instance.addBinary(new Feature("UNIGRAM" + "." + word));
    }
    // BIGRAM
    for (int i = 0; i < words.length - 1; i++) {
        instance.addBinary(new Feature("BIGRAM" + "." + words[i] + "-" + words[i + 1]));
    }
    // WH_WORD
    // rebuild the question as the space-separated term texts
    StringBuilder questionBuilder = new StringBuilder();
    for (Term term : terms) {
        questionBuilder.append(term.getText()).append(' ');
    }
    String question = questionBuilder.toString().trim();
    String whWord = null;
    // first look at sentence beginning
    for (String ptrn : whPtrns) {
        Matcher m = Pattern.compile("^" + ptrn + REST_PTRN).matcher(question);
        if (m.matches()) {
            whWord = m.group(1).toLowerCase().replaceAll("\\s+", "_");
            instance.addBinary(new Feature("WH_WORD" + "." + whWord));
            break;
        }
    }
    if (whWord == null) {
        // then look anywhere in the sentence
        for (String ptrn : whPtrns) {
            Matcher m = Pattern.compile(ptrn + REST_PTRN).matcher(question);
            if (m.find()) {
                whWord = m.group(1).toLowerCase().replaceAll("\\s+", "_");
                instance.addBinary(new Feature("WH_WORD" + "." + whWord));
                break;
            }
        }
    }
    // OF_HEAD
    if (focus == null) {
        return;
    }
    String focusText = focus.getText();
    if (focusText == null) {
        // no text to look for; do not search for the literal string "null"
        return;
    }
    // the focus text is literal question text, not a regex: quote it so that
    // metacharacters (e.g. in "C++" or "U.S.") neither break compilation nor
    // match unintended text
    String quotedFocus = Pattern.quote(focusText);
    for (String word : OF_HEAD_WORDS) {
        Matcher m = Pattern.compile(word + "s? of " + quotedFocus).matcher(question);
        if (m.find()) {
            instance.addBinary(new Feature("OF_HEAD" + "." + word));
            break;
        }
    }
}
Also used : Matcher(java.util.regex.Matcher) Term(edu.cmu.lti.javelin.qa.Term) Feature(edu.cmu.minorthird.classify.Feature)

Example 92 with Matcher

use of java.util.regex.Matcher in project lucida by claritylab.

the class FeatureExtractor method loadFile.

/**
     * Reads the dataset file at the given location as UTF-8 and converts every
     * match of {@link #datasetExamplePattern} into
     * edu.cmu.minorthird.classify.Example objects via
     * {@link #createExample(String) createExample}. A match whose conversion
     * throws an exception is logged and skipped. The counter
     * <code>numLoaded</code> is incremented once per loaded example.
     * 
     * @param fileName the name of the dataset file
     * @return all examples that could be created from the file
     */
public Example[] loadFile(String fileName) {
    String contents = FileUtil.readFile(fileName, "UTF-8");
    List<Example> loaded = new ArrayList<Example>();
    for (Matcher matcher = datasetExamplePattern.matcher(contents); matcher.find(); ) {
        try {
            Example[] parsed = createExample(matcher.group());
            for (int k = 0; k < parsed.length; k++) {
                loaded.add(parsed[k]);
                numLoaded++;
            }
        } catch (Exception e) {
            // keep going: one malformed entry should not abort the whole file
            log.error("Error reading Example from file: ", e);
        }
    }
    Example[] result = new Example[loaded.size()];
    return loaded.toArray(result);
}
Also used : Matcher(java.util.regex.Matcher) Example(edu.cmu.minorthird.classify.Example) ArrayList(java.util.ArrayList)

Example 93 with Matcher

use of java.util.regex.Matcher in project lucida by claritylab.

the class KeywordExtractor method tokenizeWithSpaces.

/**
	 * Rule-based tokenizer for extracting the keywords of a query. It is
	 * deliberately conservative, e.g. "F16" and "1,000.00" are kept intact.
	 * Tokens are separated by inserting spaces around matches of the patterns
	 * <code>DELIMS1</code>, <code>DELIMS2</code> and <code>DELIMS3</code>;
	 * runs of whitespace are then collapsed to single spaces.
	 * 
	 * @param text text to tokenize
	 * @return string of space-delimited tokens
	 */
public static String tokenizeWithSpaces(String text) {
    // Step 1: put a space on both sides of every DELIMS1 match. The matcher
    // keeps scanning the string it was created on, while replace() rewrites
    // all occurrences in the current text.
    Matcher first = DELIMS1.matcher(text);
    while (first.find()) {
        String delim = first.group(0);
        text = text.replace(delim, " " + delim + " ");
    }
    // Step 2: separate the three groups of every DELIMS2 match by spaces
    Matcher second = DELIMS2.matcher(text);
    while (second.find()) {
        String spaced = second.group(1) + " " + second.group(2) + " " + second.group(3);
        text = text.replace(second.group(0), spaced);
    }
    // Step 3: on a DELIMS3 match, drop the last character of the text and
    // append the match preceded by a space
    Matcher third = DELIMS3.matcher(text);
    if (third.find()) {
        String tail = " " + third.group(0);
        text = text.substring(0, text.length() - 1) + tail;
    }
    // normalize whitespace
    return text.replaceAll("\\s++", " ").trim();
}
Also used : Matcher(java.util.regex.Matcher)

Example 94 with Matcher

use of java.util.regex.Matcher in project lucida by claritylab.

the class PredicateExtractor method handleThing.

/**
 * Rewrites <code>verbMod</code> by replacing the interrogative at token index
 * <code>i</code>, together with any noun phrases following it and any
 * prepositions preceding it, with a placeholder.
 * <p>
 * If the interrogative is not followed by a noun phrase, the first match of
 * <code>THING_P</code> is replaced with <code>THING_R</code>. Otherwise the
 * whole phrase is replaced; for date/time and location answer types a
 * dedicated placeholder is appended, in all other cases the phrase as found
 * in <code>qn</code> with <code>THING_P</code> replaced by
 * <code>UNKNOWN_R</code> is used.
 *
 * @param qn the question string in which the phrase is looked up
 * @param verbMod the string to be rewritten
 * @param ats answer types, checked for the prefixes "NEdate", "NEtime" and
 *            "NElocation"
 * @param tokens tokens of the question
 * @param pos part-of-speech tags, indexed like <code>tokens</code>
 * @param chunks chunk tags, indexed like <code>tokens</code>
 * @param i index of the interrogative in <code>tokens</code>
 * @return the rewritten <code>verbMod</code>
 */
private static String handleThing(String qn, String verbMod, String[] ats, String[] tokens, String[] pos, String[] chunks, int i) {
    if (i + 1 == tokens.length || !chunks[i + 1].endsWith("-NP")) {
        // interrogative is not followed by a noun phrase
        // is the interrogative followed by an auxiliary verb that has been shifted in verbMod?
        boolean auxiliary = i + 2 < tokens.length
                && !verbMod.matches(".*?" + phraseToRegex(tokens[i + 1] + " " + tokens[i + 2]) + ".*+");
        // replace interrogative
        if (auxiliary) {
            return verbMod.replaceFirst(THING_P, "") + " " + THING_R;
        }
        return verbMod.replaceFirst(THING_P, THING_R);
    }

    // interrogative is followed by noun phrases...
    // get interrogative + noun phrases: extend the phrase up to the first
    // verb or the sentence-final period
    String phrase = tokens[i];
    int j;
    for (j = i + 1; j < tokens.length; j++) {
        boolean stop = pos[j].startsWith("VB") || chunks[j].endsWith("-VP")
                || (j == tokens.length - 1 && pos[j].equals("."));
        if (stop) {
            break;
        }
        phrase += " " + tokens[j];
    }

    boolean precededByPreposition = i > 0 && chunks[i - 1].endsWith("-PP");
    if (!precededByPreposition) {
        // ...and not preceded by prepositions
        // replace phrase
        phrase = phraseToRegex(phrase);
        // special handling for certain answer types
        String special = replaceByAnswerType(verbMod, phrase, ats);
        if (special != null) {
            return special;
        }
        // general case
        Matcher m = Pattern.compile(phrase).matcher(qn);
        if (m.find()) {
            // is the phrase followed by an auxiliary verb that has been shifted in verbMod?
            boolean auxiliary = j + 1 < tokens.length
                    && !verbMod.matches(".*?" + phraseToRegex(tokens[j] + " " + tokens[j + 1]) + ".*+");
            String replacement = m.group(0).replaceFirst(THING_P, UNKNOWN_R);
            if (auxiliary) {
                verbMod = verbMod.replaceFirst(phrase, "") + " " + replacement;
            } else {
                // replacement is literal question text: quote it so that '$'
                // and '\' are not interpreted as group references/escapes
                verbMod = verbMod.replaceFirst(phrase, Matcher.quoteReplacement(replacement));
            }
        }
        return verbMod;
    }

    // get prepositions + interrogative + noun phrases
    for (int k = i - 1; k >= 0; k--) {
        if (!chunks[k].endsWith("-PP")) {
            break;
        }
        phrase = tokens[k] + " " + phrase;
    }
    // replace phrase
    phrase = phraseToRegex(phrase);
    // special handling for certain answer types
    String special = replaceByAnswerType(verbMod, phrase, ats);
    if (special != null) {
        return special;
    }
    // general case
    Matcher m = Pattern.compile(phrase).matcher(qn);
    if (m.find()) {
        String replacement = m.group(0).replaceFirst(THING_P, UNKNOWN_R);
        verbMod = verbMod.replaceFirst(phrase, "") + " " + replacement;
    }
    return verbMod;
}

/**
 * Removes the first match of the phrase from <code>verbMod</code> and appends
 * the date/time or location placeholder, depending on the first answer type
 * that starts with "NEdate", "NEtime" or "NElocation".
 *
 * @return the rewritten string, or <code>null</code> if no answer type applies
 */
private static String replaceByAnswerType(String verbMod, String phraseRegex, String[] ats) {
    for (String at : ats) {
        if (at.startsWith("NEdate") || at.startsWith("NEtime")) {
            return verbMod.replaceFirst(phraseRegex, "") + " " + DATE_TIME_R;
        }
        if (at.startsWith("NElocation")) {
            return verbMod.replaceFirst(phraseRegex, "") + " " + LOCATION_R;
        }
    }
    return null;
}
Also used : Matcher(java.util.regex.Matcher)

Example 95 with Matcher

use of java.util.regex.Matcher in project lucida by claritylab.

the class PredicateExtractor method handleIgnore.

// methods to replace phrases with interrogatives
/**
 * Rewrites <code>verbMod</code> by replacing the phrase that starts at the
 * ignore-list word at token index <code>i</code> and ends at the next
 * interrogative. The replacement is the phrase as found in <code>qn</code>
 * with the first match of <code>IGNORE_P</code> replaced by
 * <code>UNKNOWN_R</code> and the first match of <code>INTERROGATIVE_P</code>
 * removed. Nothing is changed unless an interrogative is found at least two
 * tokens after <code>i</code> and the phrase occurs in <code>qn</code>.
 *
 * @param qn the question string in which the phrase is looked up
 * @param verbMod the string to be rewritten
 * @param tokens tokens of the question
 * @param pos part-of-speech tags, indexed like <code>tokens</code>
 * @param chunks chunk tags, indexed like <code>tokens</code>
 * @param i index of the word on the ignore-list in <code>tokens</code>
 * @return the rewritten <code>verbMod</code>
 */
private static String handleIgnore(String qn, String verbMod, String[] tokens, String[] pos, String[] chunks, int i) {
    // get phrase from word on ignore-list to next interrogative
    StringBuilder phraseBuilder = new StringBuilder(tokens[i]);
    int interrogative = 0;
    for (int j = i + 1; j < tokens.length; j++) {
        phraseBuilder.append(' ').append(tokens[j]);
        if (tokens[j].matches(INTERROGATIVE_P)) {
            interrogative = j;
            break;
        }
    }
    if (interrogative <= i + 1) {
        // no interrogative found, or it directly follows the ignored word
        return verbMod;
    }
    // is the interrogative followed by a verb?
    boolean verb = interrogative + 1 < tokens.length
            && (pos[interrogative + 1].startsWith("VB") || chunks[interrogative + 1].endsWith("-VP"));
    // replace phrase
    String phrase = phraseToRegex(phraseBuilder.toString());
    Matcher m = Pattern.compile(phrase).matcher(qn);
    if (m.find()) {
        String replacement = m.group(0).replaceFirst(IGNORE_P, UNKNOWN_R).replaceFirst(INTERROGATIVE_P, "");
        if (verb) {
            // replacement is literal question text: quote it so that '$' and
            // '\' are not interpreted as group references/escapes
            verbMod = verbMod.replaceFirst(phrase, Matcher.quoteReplacement(replacement));
        } else {
            verbMod = verbMod.replaceFirst(phrase, "") + " " + replacement;
        }
    }
    return verbMod;
}
Also used : Matcher(java.util.regex.Matcher)

Aggregations

Matcher (java.util.regex.Matcher): 12473, Pattern (java.util.regex.Pattern): 5010, ArrayList (java.util.ArrayList): 1516, IOException (java.io.IOException): 904, HashMap (java.util.HashMap): 565, File (java.io.File): 487, Test (org.junit.Test): 442, BufferedReader (java.io.BufferedReader): 428, Map (java.util.Map): 363, List (java.util.List): 287, InputStreamReader (java.io.InputStreamReader): 266, HashSet (java.util.HashSet): 236, MalformedURLException (java.net.MalformedURLException): 163, URL (java.net.URL): 155, Date (java.util.Date): 152, InputStream (java.io.InputStream): 147, Field (java.lang.reflect.Field): 130, PatternSyntaxException (java.util.regex.PatternSyntaxException): 128, ParseException (java.text.ParseException): 127, LinkedHashMap (java.util.LinkedHashMap): 120