Search in sources :

Example 16 with Matcher

use of java.util.regex.Matcher in project lucida by claritylab.

the class KnowledgeAnnotator method getContent.

/**
 * Extracts the relevant content of a question by resolving the group
 * identifiers of the format <code>[group_no]</code> in the content string
 * that corresponds to the matching pattern.
 * <p>
 * All identifiers are resolved in a single pass over the content string, so
 * text copied in from the question is never re-scanned for identifiers. A
 * group that did not participate in the question match is resolved to the
 * empty string. Brackets without a number (<code>[]</code>) are not group
 * identifiers and are left untouched.
 *
 * @return relevant content of the question
 */
protected String getContent() {
    String content = qContents.get(index);
    // require at least one digit: "\\d*" would also match "[]" and make
    // Integer.parseInt fail on the empty string
    Pattern p = Pattern.compile("\\[(\\d+)\\]");
    Matcher m = p.matcher(content);
    // StringBuffer (not StringBuilder) keeps appendReplacement usable on
    // older Java versions
    StringBuffer resolved = new StringBuffer();
    // replace all group IDs by the corresponding parts of the question
    while (m.find()) {
        int group = Integer.parseInt(m.group(1));
        String part = matcher.group(group);
        if (part == null) {
            // optional group that did not take part in the match
            part = "";
        }
        // quote the question text so that '$' and '\' are inserted literally
        m.appendReplacement(resolved, Matcher.quoteReplacement(part));
    }
    m.appendTail(resolved);
    return resolved.toString();
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher)

Example 17 with Matcher

use of java.util.regex.Matcher in project lucida by claritylab.

the class WorldFactbookKA method doSearch.

/**
 * Searches the World Factbook for country details and returns an array
 * containing a single <code>Result</code> object or an empty array, if the
 * search failed.
 * <p>
 * The content string is expected to have the form
 * <code>info#country</code>. Any exception raised while parsing the
 * content, fetching the page or extracting the sentence is reported through
 * <code>MsgPrinter.printSearchError</code> and results in an empty array.
 *
 * @return array containing a single <code>Result</code> or an empty array
 */
protected Result[] doSearch() {
    try {
        // get country name and demanded information
        String[] content = getContent().split("#");
        String info = content[0];
        String country = content[1];
        // get URL of country web page
        String countryPage = countries.get(country.toLowerCase());
        if (countryPage == null)
            return new Result[0];
        URL page = new URL(URL + countryPage);
        // retrieve document
        StringBuilder html = new StringBuilder();
        BufferedReader in = new BufferedReader(new InputStreamReader(page.openStream(), Charset.forName("iso-8859-1")));
        try {
            // read until end of stream; ready() only tells whether a read
            // would block and may report false before the document is complete
            String line;
            while ((line = in.readLine()) != null) {
                html.append(line).append(' ');
            }
        } finally {
            // release the connection even if reading fails
            in.close();
        }
        // extract information
        Pattern p = Pattern.compile("(?i).*" + info + ":</div>\\s*</td>" + "\\s*<td .*?>(.*?)</td>.*");
        Matcher m = p.matcher(html.toString());
        if (m.matches()) {
            // extract sentence
            String sentence = SentenceExtractor.getSentencesFromHtml(m.group(1))[0];
            // create result from that sentence
            return getResult(sentence, page.toString());
        }
    } catch (Exception e) {
        // print search error message
        MsgPrinter.printSearchError(e);
    }
    // search failed
    return new Result[0];
}
Also used : Pattern(java.util.regex.Pattern) InputStreamReader(java.io.InputStreamReader) Matcher(java.util.regex.Matcher) BufferedReader(java.io.BufferedReader) URL(java.net.URL) IOException(java.io.IOException) Result(info.ephyra.search.Result)

Example 18 with Matcher

use of java.util.regex.Matcher in project lucida by claritylab.

the class CorefResolver method resolvePronounsToTarget.

/**
 * Resolves references ONLY to the target description. This method is called
 * once for each factoid and list question in the series.
 * <p>
 * Only the pronoun class that occurs first in the question is resolved. The
 * question string of <code>questions[next]</code> is overwritten with the
 * resolved question and the result is reported through
 * <code>MsgPrinter.printResolvedQuestion</code>.
 *
 * @param target
 *            the question series including answers to previous questions
 * @param next
 *            the next question in the series to be answered
 */
public static void resolvePronounsToTarget(TRECTarget target, int next) {
    String currentTarget = target.getCondensedTarget();
    TRECQuestion[] questions = target.getQuestions();
    String currentQuestionString = questions[next].getQuestionString();
    // non-null only if isTargetPerson accepts the target as a person
    String temp = isTargetPerson(currentTarget);
    boolean personFlag = temp != null;
    String currentTargetPerson = personFlag ? temp : currentTarget;
    // rest of the sentence after pronoun occurred
    String rest = null;
    // tokenized target
    String[] tokens = OpenNLP.tokenize(currentTarget);
    // genitive of currentTarget and of currentTargetPerson
    String currentTargetGen = currentTarget.concat(currentTarget.endsWith("s") ? "'" : "'s");
    String currentTargetPersonGen = currentTargetPerson.concat(currentTargetPerson.endsWith("s") ? "'" : "'s");
    // The target is plain text, not a replacement template: quote it so that
    // '$' and '\' in a target are inserted literally by replaceFirst.
    String targetRepl = Matcher.quoteReplacement(currentTarget);
    String targetGenRepl = Matcher.quoteReplacement(currentTargetGen);
    String targetPersonRepl = Matcher.quoteReplacement(currentTargetPerson);
    String targetPersonGenRepl = Matcher.quoteReplacement(currentTargetPersonGen);
    /*
     * Resolve personal, possessive and demonstrative pronouns by the target
     * as antecedent. First determine which pronoun class occurs earliest:
     * the length of the first split segment is the offset of the first
     * occurrence; on ties the class checked first wins.
     */
    String firstPronoun = "";
    int firstIndex = Integer.MAX_VALUE;
    String[] splitSgpers = currentQuestionString.split(singularThirdPersonPronounString);
    int firstSgpers = splitSgpers[0].length();
    if (splitSgpers.length > 1 && firstSgpers < firstIndex) {
        firstPronoun = "sgpers";
        firstIndex = firstSgpers;
    }
    String[] splitSgthing = currentQuestionString.split(singularThirdThingPronounString);
    int firstSgthing = splitSgthing[0].length();
    if (splitSgthing.length > 1 && firstSgthing < firstIndex) {
        firstPronoun = "sgthing";
        firstIndex = firstSgthing;
    }
    String[] splitPlpers = currentQuestionString.split(pluralThirdPersonPronounString);
    int firstPlpers = splitPlpers[0].length();
    if (splitPlpers.length > 1 && firstPlpers < firstIndex) {
        firstPronoun = "plpers";
        firstIndex = firstPlpers;
    }
    String[] splitSgposs = currentQuestionString.split(singularThirdPersonPronounStringGen);
    int firstSgposs = splitSgposs[0].length();
    if (splitSgposs.length > 1 && firstSgposs < firstIndex) {
        firstPronoun = "sgposs";
        firstIndex = firstSgposs;
    }
    String[] splitSgthingposs = currentQuestionString.split(singularThirdThingPronounStringGen);
    int firstSgthingposs = splitSgthingposs[0].length();
    if (splitSgthingposs.length > 1 && firstSgthingposs < firstIndex) {
        firstPronoun = "sgthingposs";
        firstIndex = firstSgthingposs;
    }
    String[] splitPlposs = currentQuestionString.split(pluralThirdPersonPronounStringGen);
    int firstPlposs = splitPlposs[0].length();
    if (splitPlposs.length > 1 && firstPlposs < firstIndex) {
        firstPronoun = "plposs";
        firstIndex = firstPlposs;
    }
    Matcher her = singularThirdPersonPronounPatternAmb.matcher(currentQuestionString);
    String[] splitHer = currentQuestionString.split(singularThirdPersonPronounStringAmb);
    int firstHer = splitHer[0].length();
    // compare the offset of "her" itself (previously the offset of the plural
    // possessive was compared here by mistake)
    if (splitHer.length > 1 && firstHer < firstIndex) {
        firstPronoun = "her";
        firstIndex = firstHer;
    }
    Matcher sgdem = singularDemPronounPattern.matcher(currentQuestionString);
    String[] splitSgdem = currentQuestionString.split(singularDemPronounString);
    int firstSgdem = splitSgdem[0].length();
    if (splitSgdem.length > 1 && firstSgdem < firstIndex) {
        firstPronoun = "sgdem";
        firstIndex = firstSgdem;
    }
    Matcher pldem = pluralDemPronounPattern.matcher(currentQuestionString);
    String[] splitPldem = currentQuestionString.split(pluralDemPronounString);
    int firstPldem = splitPldem[0].length();
    if (splitPldem.length > 1 && firstPldem < firstIndex) {
        firstPronoun = "pldem";
        firstIndex = firstPldem;
    }
    // possessive pronouns are replaced by the genitive of the target
    if (personFlag && firstPronoun.equals("sgposs")) {
        currentQuestionString = currentQuestionString.replaceFirst(singularThirdPersonPronounStringGen, targetPersonGenRepl);
    }
    if (firstPronoun.equals("sgthingposs")) {
        currentQuestionString = currentQuestionString.replaceFirst(singularThirdThingPronounStringGen, targetGenRepl);
    }
    if (firstPronoun.equals("plposs")) {
        currentQuestionString = currentQuestionString.replaceFirst(pluralThirdPersonPronounStringGen, targetGenRepl);
    }
    if (personFlag && firstPronoun.equals("her") && her.matches()) {
        rest = currentQuestionString.substring(currentQuestionString.indexOf(her.group(2)) + her.group(2).length() + 1).toLowerCase();
        String[] questionTokens = OpenNLP.tokenize(rest);
        String[] pos = OpenNLP.tagPos(questionTokens);
        // "her" followed by a noun is possessive, otherwise a personal
        // pronoun
        if (pos[0].equalsIgnoreCase("NN")) {
            currentQuestionString = currentQuestionString.replaceFirst(singularThirdPersonPronounStringAmb, targetPersonGenRepl);
        } else {
            currentQuestionString = currentQuestionString.replaceFirst(singularThirdPersonPronounStringAmb, targetPersonRepl);
        }
    }
    if (firstPronoun.equals("sgdem") && sgdem.matches()) {
        // check whether target contains the same word as the rest of the
        // question string
        rest = currentQuestionString.substring(currentQuestionString.indexOf(sgdem.group(2)) + sgdem.group(2).length() + 1).toLowerCase();
        for (int i = 0; i < tokens.length; i++) {
            String token = tokens[i].toLowerCase();
            if (rest.contains(token)) {
                // tokens are literal text; quote them so that e.g. "(" or
                // "+" in a target cannot break or alter the regex
                String quotedToken = java.util.regex.Pattern.quote(token);
                currentQuestionString = currentQuestionString.replaceFirst(" " + quotedToken + "\\b", "");
                currentQuestionString = currentQuestionString.replaceFirst("\\b" + quotedToken + " ", "");
                currentQuestionString = currentQuestionString.replaceFirst(singularDemPronounString, targetRepl);
            }
        }
        currentQuestionString = currentQuestionString.replaceFirst(singularDemPronounString, targetGenRepl);
    }
    if (firstPronoun.equals("pldem") && pldem.matches()) {
        // check whether target contains the same word as the rest of the
        // question string
        rest = currentQuestionString.substring(currentQuestionString.indexOf(pldem.group(2)) + pldem.group(2).length() + 1).toLowerCase();
        for (int i = 0; i < tokens.length; i++) {
            String token = tokens[i].toLowerCase();
            if (rest.contains(token)) {
                String quotedToken = java.util.regex.Pattern.quote(token);
                currentQuestionString = currentQuestionString.replaceFirst(" " + quotedToken + "\\b", "");
                currentQuestionString = currentQuestionString.replaceFirst("\\b" + quotedToken + " ", "");
                currentQuestionString = currentQuestionString.replaceFirst(pluralDemPronounString, targetRepl);
            }
        }
        currentQuestionString = currentQuestionString.replaceFirst(pluralDemPronounString, targetGenRepl);
    }
    // personal pronouns are replaced by the target itself
    if (personFlag && firstPronoun.equals("sgpers")) {
        currentQuestionString = currentQuestionString.replaceFirst(singularThirdPersonPronounString, targetPersonRepl);
    }
    if (firstPronoun.equals("sgthing")) {
        currentQuestionString = currentQuestionString.replaceFirst(singularThirdThingPronounString, targetRepl);
    }
    if (firstPronoun.equals("plpers")) {
        currentQuestionString = currentQuestionString.replaceFirst(pluralThirdPersonPronounString, targetRepl);
    }
    questions[next].setQuestionString(currentQuestionString);
    MsgPrinter.printResolvedQuestion(currentQuestionString);
}
Also used : Matcher(java.util.regex.Matcher)

Example 19 with Matcher

use of java.util.regex.Matcher in project lucida by claritylab.

the class CorefResolver method resolvePronouns.

/**
 * Resolves references to the target description, previous questions or
 * answers. This method is called once for each factoid and list question in
 * the series.
 * <p>
 * The question string of <code>questions[next]</code> is overwritten with
 * the resolved question and the result is reported through
 * <code>MsgPrinter.printResolvedQuestion</code>.
 *
 * @param target
 *            the question series including answers to previous questions
 * @param next
 *            the next question in the series to be answered
 */
public static void resolvePronouns(TRECTarget target, int next) {
    String currentTarget = target.getCondensedTarget();
    TRECQuestion[] questions = target.getQuestions();
    String currentQuestionString = questions[next].getQuestionString();
    // rest of the sentence after pronoun occurred
    String rest = null;
    // tokenized target
    String[] tokens = OpenNLP.tokenize(currentTarget);
    // expected answer type - 1: thing, 2: person
    int exp = 0;
    // genitive of currentTarget
    String currentTargetGen = currentTarget.concat(currentTarget.endsWith("s") ? "'" : "'s");
    String[] targetTypes = target.getTargetTypes();
    // is target a person? (compare string contents, not references)
    boolean targetPerson = (targetTypes.length == 1) && "PERSON".equals(targetTypes[0]);
    // is target a thing? True if there are not exactly four types and one of
    // the first (up to) three types is not PERSON; only existing entries are
    // inspected so short arrays cannot be indexed out of bounds.
    boolean targetThing = false;
    if (targetTypes.length != 4) {
        int checked = Math.min(3, targetTypes.length);
        for (int i = 0; i < checked; i++) {
            if (!"PERSON".equals(targetTypes[i])) {
                targetThing = true;
                break;
            }
        }
    }
    /*
     * Resolve personal, possessive and demonstrative pronouns by the target
     * as antecedent. All matchers work on the unmodified question string.
     * Replacement texts are passed through Matcher.quoteReplacement so that
     * '$' and '\' in a target or answer are inserted literally.
     */
    Matcher sgpers = singularThirdPersonPronounPattern.matcher(currentQuestionString);
    Matcher sgthing = singularThirdThingPronounPattern.matcher(currentQuestionString);
    Matcher plpers = pluralThirdPersonPronounPattern.matcher(currentQuestionString);
    Matcher sgposs = singularThirdPersonPronounPatternGen.matcher(currentQuestionString);
    Matcher sgthingposs = singularThirdThingPronounPatternGen.matcher(currentQuestionString);
    Matcher plposs = pluralThirdPersonPronounPatternGen.matcher(currentQuestionString);
    Matcher her = singularThirdPersonPronounPatternAmb.matcher(currentQuestionString);
    Matcher sgdem = singularDemPronounPattern.matcher(currentQuestionString);
    Matcher pldem = pluralDemPronounPattern.matcher(currentQuestionString);
    if (sgposs.matches()) {
        exp = 2;
        // if targetType is a thing, do not use it (except for the first
        // question, where no previous answer exists)
        if (targetThing && next != 0) {
            String previous = usePreviousAnswer(questions, next, exp);
            if (previous != null) {
                currentTarget = previous;
                // keep the genitive in sync with the new antecedent;
                // otherwise the previous answer would never be inserted
                currentTargetGen = currentTarget.concat(currentTarget.endsWith("s") ? "'" : "'s");
            }
        }
        currentQuestionString = currentQuestionString.replaceAll(singularThirdPersonPronounStringGen, Matcher.quoteReplacement(currentTargetGen));
    }
    if (sgthingposs.matches()) {
        exp = 1;
        // if targetType is a person, do not use it
        if (targetPerson && next != 0) {
            String previous = usePreviousAnswer(questions, next, exp);
            if (previous != null) {
                currentTarget = previous;
                currentTargetGen = currentTarget.concat(currentTarget.endsWith("s") ? "'" : "'s");
            }
        }
        currentQuestionString = currentQuestionString.replaceAll(singularThirdThingPronounStringGen, Matcher.quoteReplacement(currentTargetGen));
    }
    if (plposs.matches()) {
        currentQuestionString = currentQuestionString.replaceAll(pluralThirdPersonPronounStringGen, Matcher.quoteReplacement(currentTargetGen));
    }
    if (her.matches()) {
        rest = currentQuestionString.substring(currentQuestionString.indexOf(her.group(2)) + her.group(2).length() + 1).toLowerCase();
        String[] questionTokens = OpenNLP.tokenize(rest);
        String[] pos = OpenNLP.tagPos(questionTokens);
        // "her" followed by a noun is possessive, otherwise a personal
        // pronoun; nothing following counts as "not a noun"
        if (pos.length > 0 && pos[0].equalsIgnoreCase("NN")) {
            currentQuestionString = currentQuestionString.replaceAll(singularThirdPersonPronounStringAmb, Matcher.quoteReplacement(currentTargetGen));
        } else {
            currentQuestionString = currentQuestionString.replaceAll(singularThirdPersonPronounStringAmb, Matcher.quoteReplacement(currentTarget));
        }
    }
    if (sgdem.matches()) {
        // check whether target contains the same word as the rest of the
        // question string
        rest = currentQuestionString.substring(currentQuestionString.indexOf(sgdem.group(2)) + sgdem.group(2).length() + 1).toLowerCase();
        for (int i = 0; i < tokens.length; i++) {
            String token = tokens[i].toLowerCase();
            if (rest.contains(token)) {
                // tokens are literal text; quote them so that e.g. "(" or
                // "+" in a target cannot break or alter the regex
                String quotedToken = java.util.regex.Pattern.quote(token);
                currentQuestionString = currentQuestionString.replaceAll(" " + quotedToken + "\\b", "");
                currentQuestionString = currentQuestionString.replaceAll("\\b" + quotedToken + " ", "");
                currentQuestionString = currentQuestionString.replaceAll(singularDemPronounString, Matcher.quoteReplacement(currentTarget));
            }
        }
        currentQuestionString = currentQuestionString.replaceAll(singularDemPronounString, Matcher.quoteReplacement(currentTargetGen));
    }
    if (pldem.matches()) {
        // check whether target contains the same word as the rest of the
        // question string
        rest = currentQuestionString.substring(currentQuestionString.indexOf(pldem.group(2)) + pldem.group(2).length() + 1).toLowerCase();
        for (int i = 0; i < tokens.length; i++) {
            String token = tokens[i].toLowerCase();
            if (rest.contains(token)) {
                String quotedToken = java.util.regex.Pattern.quote(token);
                currentQuestionString = currentQuestionString.replaceAll(" " + quotedToken + "\\b", "");
                currentQuestionString = currentQuestionString.replaceAll("\\b" + quotedToken + " ", "");
                currentQuestionString = currentQuestionString.replaceAll(pluralDemPronounString, Matcher.quoteReplacement(currentTarget));
            }
        }
        currentQuestionString = currentQuestionString.replaceAll(pluralDemPronounString, Matcher.quoteReplacement(currentTargetGen));
    }
    if (sgpers.matches()) {
        exp = 2;
        // fall back to a previous answer only if checkPl(tokens) holds, the
        // target is not a thing and this is not the first question
        if (checkPl(tokens) && !targetThing && next != 0) {
            String previous = usePreviousAnswer(questions, next, exp);
            if (previous != null) {
                currentTarget = previous;
            }
        }
        currentQuestionString = currentQuestionString.replaceAll(singularThirdPersonPronounString, Matcher.quoteReplacement(currentTarget));
    }
    if (sgthing.matches()) {
        exp = 1;
        // fall back to a previous answer only if checkPl(tokens) holds, the
        // target is not a person and this is not the first question
        if (checkPl(tokens) && !targetPerson && next != 0) {
            String previous = usePreviousAnswer(questions, next, exp);
            if (previous != null) {
                currentTarget = previous;
            }
        }
        currentQuestionString = currentQuestionString.replaceAll(singularThirdThingPronounString, Matcher.quoteReplacement(currentTarget));
    }
    if (plpers.matches()) {
        currentQuestionString = currentQuestionString.replaceAll(pluralThirdPersonPronounString, Matcher.quoteReplacement(currentTarget));
    }
    questions[next].setQuestionString(currentQuestionString);
    MsgPrinter.printResolvedQuestion(questions[next].getQuestionString());
}
Also used : Matcher(java.util.regex.Matcher)

Example 20 with Matcher

use of java.util.regex.Matcher in project lucida by claritylab.

the class CorefResolver method isTargetPerson.

/**
 * Checks whether a target looks like a person and, if so, returns the name
 * part of it.
 * <p>
 * The target is rejected (<code>null</code>) if it does not match
 * <code>verifyTargetPattern</code>, if <code>isAllUpper</code> holds for it,
 * if a word starting with a lower-case letter follows a word that does not,
 * if no word starts with a non-lower-case letter, or if more than one
 * leading word starts with a lower-case letter.
 *
 * @param currentTarget
 *            the target description
 * @return the target without its (at most one) leading lower-case word,
 *         words joined by single blanks, or <code>null</code> if the target
 *         is rejected
 */
private static String isTargetPerson(String currentTarget) {
    if (!verifyTargetPattern.matcher(currentTarget).matches() || isAllUpper(currentTarget)) {
        return null;
    }
    String[] words = currentTarget.split("\\s+");
    // number of leading words that start with a lower-case letter
    int leadingLower = 0;
    // set once a word not starting with a lower-case letter has been seen
    boolean seenNonLower = false;
    for (int i = 0; i < words.length; i++) {
        if (!Character.isLowerCase(words[i].charAt(0))) {
            seenNonLower = true;
            continue;
        }
        if (seenNonLower) {
            // lower-case word after the name has started
            return null;
        }
        leadingLower++;
    }
    if (!seenNonLower || leadingLower > 1) {
        return null;
    }
    // join the remaining words with single blanks
    StringBuilder name = new StringBuilder(words[leadingLower]);
    for (int i = leadingLower + 1; i < words.length; i++) {
        name.append(' ').append(words[i]);
    }
    return name.toString();
}
Also used : Matcher(java.util.regex.Matcher)

Aggregations

Matcher (java.util.regex.Matcher)12640 Pattern (java.util.regex.Pattern)5059 ArrayList (java.util.ArrayList)1525 IOException (java.io.IOException)913 HashMap (java.util.HashMap)575 File (java.io.File)490 Test (org.junit.Test)448 BufferedReader (java.io.BufferedReader)433 Map (java.util.Map)369 List (java.util.List)292 InputStreamReader (java.io.InputStreamReader)268 HashSet (java.util.HashSet)237 MalformedURLException (java.net.MalformedURLException)164 URL (java.net.URL)157 Date (java.util.Date)153 InputStream (java.io.InputStream)148 Field (java.lang.reflect.Field)130 ParseException (java.text.ParseException)130 PatternSyntaxException (java.util.regex.PatternSyntaxException)128 LinkedHashMap (java.util.LinkedHashMap)122