use of java.util.regex.Matcher in project lucida by claritylab.
the class EnglishFeatureExtractor method addWordLevelFeatures.
/**
 * Adds word-level binary features for a question to the given instance:
 * <ul>
 * <li>UNIGRAM: one feature per term text (whitespace replaced by "_",
 * "-" for terms without text)</li>
 * <li>BIGRAM: one feature per pair of adjacent term texts</li>
 * <li>WH_WORD: group 1 of the first pattern in whPtrns that matches the
 * question, tried first against the whole question and then anywhere in
 * it</li>
 * <li>OF_HEAD: the first entry of OF_HEAD_WORDS that occurs in the question
 * as "&lt;word&gt;(s) of &lt;focus text&gt;"</li>
 * </ul>
 *
 * @param instance instance that receives the features
 * @param terms terms of the question, in order
 * @param focus focus term of the question; if it is <code>null</code> or has
 *        no text, no OF_HEAD feature is added
 */
private static void addWordLevelFeatures(MutableInstance instance, List<Term> terms, Term focus) {
    String[] words = new String[terms.size()];
    for (int i = 0; i < terms.size(); i++) {
        Term term = terms.get(i);
        if (term.getText() != null) {
            words[i] = term.getText().replaceAll("\\s+", "_");
        } else {
            words[i] = "-";
        }
    }
    // UNIGRAM
    for (int i = 0; i < words.length; i++) {
        instance.addBinary(new Feature("UNIGRAM" + "." + words[i]));
    }
    // BIGRAM
    for (int i = 0; i < words.length - 1; i++) {
        instance.addBinary(new Feature("BIGRAM" + "." + words[i] + "-" + words[i + 1]));
    }
    // WH_WORD
    // rebuild the question from the term texts, separated by single blanks
    // (a term without text is rendered as "null", as with plain string concatenation)
    StringBuilder questionBuilder = new StringBuilder();
    for (Term term : terms) {
        questionBuilder.append(term.getText()).append(' ');
    }
    String question = questionBuilder.toString().trim();
    String whWord = null;
    // first look at sentence beginning
    for (String ptrn : whPtrns) {
        Matcher m = Pattern.compile("^" + ptrn + REST_PTRN).matcher(question);
        if (m.matches()) {
            whWord = m.group(1).toLowerCase().replaceAll("\\s+", "_");
            break;
        }
    }
    if (whWord == null) {
        // then look anywhere in the sentence
        for (String ptrn : whPtrns) {
            Matcher m = Pattern.compile(ptrn + REST_PTRN).matcher(question);
            if (m.find()) {
                whWord = m.group(1).toLowerCase().replaceAll("\\s+", "_");
                break;
            }
        }
    }
    if (whWord != null) {
        instance.addBinary(new Feature("WH_WORD" + "." + whWord));
    }
    // OF_HEAD
    if (focus == null || focus.getText() == null) {
        return;
    }
    // the focus text is plain text, not a regular expression: quote it so that
    // characters such as '.', '+' or '(' neither break the pattern nor match
    // unintended text
    String quotedFocus = Pattern.quote(focus.getText());
    for (String word : OF_HEAD_WORDS) {
        Matcher m = Pattern.compile(word + "s? of " + quotedFocus).matcher(question);
        if (m.find()) {
            instance.addBinary(new Feature("OF_HEAD" + "." + word));
            break;
        }
    }
}
use of java.util.regex.Matcher in project lucida by claritylab.
the class FeatureExtractor method loadFile.
/**
 * Reads the dataset file at the given location and turns it into an array of
 * edu.cmu.minorthird.classify.Example objects. Each match of
 * {@link #datasetExamplePattern} in the file content is passed to
 * {@link #createExample(String) createExample}; if handling a match throws an
 * exception, the error is logged and loading continues with the next match.
 * The counter numLoaded is incremented for every example that is added.
 *
 * @param fileName the name of the dataset file
 * @return the examples created from the file, in the order they were found
 */
public Example[] loadFile(String fileName) {
    String content = FileUtil.readFile(fileName, "UTF-8");
    List<Example> loaded = new ArrayList<Example>();
    for (Matcher matcher = datasetExamplePattern.matcher(content); matcher.find(); ) {
        try {
            Example[] created = createExample(matcher.group());
            for (int k = 0; k < created.length; k++) {
                loaded.add(created[k]);
                numLoaded++;
            }
        } catch (Exception e) {
            log.error("Error reading Example from file: ", e);
        }
    }
    Example[] result = new Example[loaded.size()];
    return (Example[]) loaded.toArray(result);
}
use of java.util.regex.Matcher in project lucida by claritylab.
the class KeywordExtractor method tokenizeWithSpaces.
/**
 * Rule-based tokenizer for extracting the keywords of a query. It is
 * deliberately conservative, e.g. "F16" and "1,000.00" are left intact.
 * Delimiters are separated from their neighbors by blanks in three passes
 * (DELIMS1, DELIMS2, DELIMS3), then whitespace is normalized.
 *
 * @param text text to tokenize
 * @return string of space-delimited tokens
 */
public static String tokenizeWithSpaces(String text) {
    // pass 1: surround every DELIMS1 match with blanks; the matcher scans the
    // text as it was before this pass, each replacement hits all occurrences
    // of the matched string in the current text
    for (Matcher first = DELIMS1.matcher(text); first.find(); ) {
        String padded = " " + first.group(0) + " ";
        text = text.replace(first.group(0), padded);
    }
    // pass 2: put blanks between the three groups of every DELIMS2 match
    for (Matcher second = DELIMS2.matcher(text); second.find(); ) {
        String spaced = second.group(1) + " " + second.group(2) + " " + second.group(3);
        text = text.replace(second.group(0), spaced);
    }
    // pass 3: on a DELIMS3 match, drop the last character of the text and
    // append the match preceded by a blank
    Matcher third = DELIMS3.matcher(text);
    if (third.find()) {
        String tail = " " + third.group(0);
        text = text.substring(0, text.length() - 1) + tail;
    }
    // collapse runs of whitespace and strip the ends
    return text.replaceAll("\\s++", " ").trim();
}
use of java.util.regex.Matcher in project lucida by claritylab.
the class PredicateExtractor method handleThing.
/**
 * Rewrites the modified question string <code>verbMod</code> for the
 * interrogative at token position <code>i</code>. Depending on whether the
 * interrogative is followed by a noun phrase and preceded by prepositions,
 * either the interrogative alone or the surrounding phrase is replaced, and
 * in some cases the replacement is moved to the end of the string.
 *
 * @param qn question string in which the phrase is looked up
 * @param verbMod question string to be modified
 * @param ats answer types; types starting with "NEdate", "NEtime" or
 *        "NElocation" get a dedicated replacement
 * @param tokens tokens of the question
 * @param pos part-of-speech tags, parallel to <code>tokens</code>
 * @param chunks chunk tags, parallel to <code>tokens</code>
 * @param i index of the interrogative in <code>tokens</code>
 * @return the modified string, or <code>verbMod</code> unchanged if nothing
 *         was replaced
 */
private static String handleThing(String qn, String verbMod, String[] ats, String[] tokens, String[] pos, String[] chunks, int i) {
    if (i + 1 == tokens.length || !chunks[i + 1].endsWith("-NP")) {
        // interrogative is not followed by a noun phrase
        // is the interrogative followed by an auxiliary verb that has been shifted in verbMod?
        boolean auxiliary = i + 2 < tokens.length
                && !verbMod.matches(".*?" + phraseToRegex(tokens[i + 1] + " " + tokens[i + 2]) + ".*+");
        // replace interrogative
        if (auxiliary) {
            verbMod = verbMod.replaceFirst(THING_P, "") + " " + THING_R;
        } else {
            verbMod = verbMod.replaceFirst(THING_P, THING_R);
        }
    } else {
        // interrogative is followed by noun phrases...
        // get interrogative + noun phrases
        String phrase = tokens[i];
        // after the loop, j is the index of the first token that is not part
        // of the phrase (or tokens.length)
        int j;
        for (j = i + 1; j < tokens.length; j++) {
            if (!pos[j].startsWith("VB") && !chunks[j].endsWith("-VP") && !(j == tokens.length - 1 && pos[j].equals("."))) {
                phrase += " " + tokens[j];
            } else {
                break;
            }
        }
        if (i == 0 || !chunks[i - 1].endsWith("-PP")) {
            // ...and not preceded by prepositions
            // replace phrase
            phrase = phraseToRegex(phrase);
            // special handling for certain answer types
            boolean replaced = false;
            for (String at : ats) {
                if (at.startsWith("NEdate") || at.startsWith("NEtime")) {
                    verbMod = verbMod.replaceFirst(phrase, "") + " " + DATE_TIME_R;
                    replaced = true;
                    break;
                } else if (at.startsWith("NElocation")) {
                    verbMod = verbMod.replaceFirst(phrase, "") + " " + LOCATION_R;
                    replaced = true;
                    break;
                }
            }
            // general case
            if (!replaced) {
                Matcher m = Pattern.compile(phrase).matcher(qn);
                if (m.find()) {
                    // is the phrase followed by an auxiliary verb that has been shifted in verbMod?
                    boolean auxiliary = j + 1 < tokens.length
                            && !verbMod.matches(".*?" + phraseToRegex(tokens[j] + " " + tokens[j + 1]) + ".*+");
                    String replacement = m.group(0).replaceFirst(THING_P, UNKNOWN_R);
                    if (auxiliary) {
                        verbMod = verbMod.replaceFirst(phrase, "") + " " + replacement;
                    } else {
                        // the replacement is built from question text, so '$' and '\'
                        // in it must be taken literally, not as group references
                        verbMod = verbMod.replaceFirst(phrase, Matcher.quoteReplacement(replacement));
                    }
                }
            }
        } else {
            // get prepositions + interrogative + noun phrases
            for (j = i - 1; j >= 0; j--) {
                if (chunks[j].endsWith("-PP")) {
                    phrase = tokens[j] + " " + phrase;
                } else {
                    break;
                }
            }
            // replace phrase
            phrase = phraseToRegex(phrase);
            // special handling for certain answer types
            boolean replaced = false;
            for (String at : ats) {
                if (at.startsWith("NEdate") || at.startsWith("NEtime")) {
                    verbMod = verbMod.replaceFirst(phrase, "") + " " + DATE_TIME_R;
                    replaced = true;
                    break;
                } else if (at.startsWith("NElocation")) {
                    verbMod = verbMod.replaceFirst(phrase, "") + " " + LOCATION_R;
                    replaced = true;
                    break;
                }
            }
            // general case
            if (!replaced) {
                Matcher m = Pattern.compile(phrase).matcher(qn);
                if (m.find()) {
                    String replacement = m.group(0).replaceFirst(THING_P, UNKNOWN_R);
                    verbMod = verbMod.replaceFirst(phrase, "") + " " + replacement;
                }
            }
        }
    }
    return verbMod;
}
use of java.util.regex.Matcher in project lucida by claritylab.
the class PredicateExtractor method handleIgnore.
// methods to replace phrases with interrogatives
/**
 * Rewrites the modified question string <code>verbMod</code> for a word on
 * the ignore-list at token position <code>i</code>. The phrase from that
 * word up to the next interrogative is looked up in <code>qn</code>; if
 * there is at least one token between the word and the interrogative and the
 * phrase is found, it is replaced in <code>verbMod</code>, either in place
 * (interrogative followed by a verb) or by moving the replacement to the end.
 *
 * @param qn question string in which the phrase is looked up
 * @param verbMod question string to be modified
 * @param tokens tokens of the question
 * @param pos part-of-speech tags, parallel to <code>tokens</code>
 * @param chunks chunk tags, parallel to <code>tokens</code>
 * @param i index of the word on the ignore-list in <code>tokens</code>
 * @return the modified string, or <code>verbMod</code> unchanged if nothing
 *         was replaced
 */
private static String handleIgnore(String qn, String verbMod, String[] tokens, String[] pos, String[] chunks, int i) {
    // get phrase from word on ignore-list to next interrogative
    String phrase = tokens[i];
    // compiled once instead of once per token
    Pattern interrogativePattern = Pattern.compile(INTERROGATIVE_P);
    int interrogative = 0;
    for (int j = i + 1; j < tokens.length; j++) {
        phrase += " " + tokens[j];
        if (interrogativePattern.matcher(tokens[j]).matches()) {
            interrogative = j;
            break;
        }
    }
    if (interrogative > i + 1) {
        // is the interrogative followed by a verb?
        boolean verb = interrogative + 1 < tokens.length
                && (pos[interrogative + 1].startsWith("VB") || chunks[interrogative + 1].endsWith("-VP"));
        // replace phrase
        phrase = phraseToRegex(phrase);
        Matcher m = Pattern.compile(phrase).matcher(qn);
        if (m.find()) {
            String replacement = m.group(0).replaceFirst(IGNORE_P, UNKNOWN_R).replaceFirst(INTERROGATIVE_P, "");
            if (verb) {
                // the replacement is built from question text, so '$' and '\'
                // in it must be taken literally, not as group references
                verbMod = verbMod.replaceFirst(phrase, Matcher.quoteReplacement(replacement));
            } else {
                verbMod = verbMod.replaceFirst(phrase, "") + " " + replacement;
            }
        }
    }
    return verbMod;
}
Aggregations