Use of edu.illinois.cs.cogcomp.comma.annotators.PreProcessor in the cogcomp-nlp project by CogComp.
From the class PrettyCorpusReader, method readData.
/**
 * Reads a comma annotation file and rebuilds {@code sentences} from it.
 *
 * <p>The file is consumed as consecutive three-line records: a sentence id line, a line of
 * whitespace-separated tokens, and a blank separator line. A record is turned into a
 * {@code CommaSRLSentence} only when {@code getTAMap()} holds a gold annotation for its id and
 * pre-processing of the cleaned tokens succeeds. Progress counters are printed to standard
 * output after every record.
 *
 * @param annotationFileName path of the annotation file to read
 * @throws Exception if the file cannot be opened or a record lacks its token line; failures
 *         while pre-processing or constructing an individual sentence are printed and counted
 *         rather than propagated
 */
private void readData(String annotationFileName) throws Exception {
    PreProcessor preProcessor = new PreProcessor();
    sentences = new ArrayList<>();
    Map<String, TextAnnotation> taMap = getTAMap();
    int count = 0;
    int failures = 0, skipped = 0;
    // try-with-resources: the scanner (and its file handle) is released even when a
    // malformed record makes nextLine() throw part-way through the file.
    try (Scanner scanner = new Scanner(new File(annotationFileName))) {
        while (scanner.hasNext()) {
            count++;
            // wsj penn treebank sentence id
            String textId = scanner.nextLine().trim();
            assert textId.startsWith("wsj") : textId;
            String[] tokens = scanner.nextLine().trim().split("\\s+");
            String[] cleanedTokens = cleanTokens(tokens);
            // The last record may end the file without a trailing separator line;
            // treat the missing line as blank instead of failing on nextLine().
            String blankLine = scanner.hasNextLine() ? scanner.nextLine() : "";
            assert blankLine.trim().length() == 0 : String.format("line is not blank:%s", blankLine);
            // should we skip this sentence due to some error?
            boolean skip = false;
            TextAnnotation goldTa = null, ta = null;
            if (taMap.containsKey(textId)) {
                goldTa = taMap.get(textId);
                try {
                    ta = preProcessor.preProcess(Collections.singletonList(cleanedTokens));
                } catch (Exception e) {
                    e.printStackTrace();
                    skip = true;
                    failures++;
                }
            } else {
                System.out.println("No gold standard annotation available for:" + textId);
                skip = true;
                skipped++;
            }
            if (!skip) {
                // One entry per comma token, in sentence order.
                List<List<String>> commaLabels = new ArrayList<>();
                for (String token : tokens) {
                    if (token.matches(",\\[.*\\]")) {
                        // Annotated comma of the form ",[A,B]": drop the leading ",[" and
                        // the trailing "]" and split the remaining comma-separated labels.
                        String labels = token.substring(2, token.length() - 1);
                        List<String> commaLabelsForIdx = Arrays.asList(labels.split(","));
                        commaLabels.add(commaLabelsForIdx);
                    } else if (token.equals(",")) {
                        // add null for commas which have not been annotated.
                        // The sentence constructor will optionally discard
                        // these or convert them to Other labels based on
                        // comma.properties.INCLUDE_NULL_LABEL_COMMAS
                        commaLabels.add(null);
                    }
                }
                try {
                    CommaSRLSentence sentence = new CommaSRLSentence(ta, goldTa, commaLabels);
                    sentences.add(sentence);
                } catch (Exception e) {
                    e.printStackTrace();
                    failures++;
                }
            }
            System.out.print(count);
            if (skipped > 0)
                System.out.print(" SKIPPED(" + skipped + ")");
            if (failures > 0)
                System.out.print(" ANNOTATION FAILED(" + failures + ")");
        }
    }
}
Use of edu.illinois.cs.cogcomp.comma.annotators.PreProcessor in the cogcomp-nlp project by CogComp.
From the class CommaLabeler, method annotate.
/**
 * Annotates a text file line by line and writes the resulting comma views to another file.
 *
 * <p>Every non-empty input line is pre-processed into a {@code TextAnnotation}, handed to
 * {@code addView}, and its {@code commaViewToString} rendering is written to the output
 * followed by an empty line. Empty input lines are ignored. The output is encoded as UTF-8.
 *
 * @param inFileName path of the file to read, one piece of text per line
 * @param outFileName path of the file to write
 * @throws Exception if either file cannot be opened, reading fails, or annotating a line fails
 */
public void annotate(String inFileName, String outFileName) throws Exception {
    PreProcessor preProcessor = new PreProcessor();
    // try-with-resources: both streams are closed (and the writer flushed) even when
    // annotating a line throws, so no handle leaks and partial output is not lost.
    // NOTE(review): FileReader decodes with the platform default charset on older JDKs
    // while the output is forced to UTF-8 -- confirm the input encoding is as intended.
    try (BufferedReader reader = new BufferedReader(new FileReader(inFileName));
            PrintWriter writer = new PrintWriter(outFileName, "UTF-8")) {
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.length() > 0) {
                TextAnnotation ta = preProcessor.preProcess(line);
                addView(ta);
                writer.format("%s\n\n", commaViewToString(ta));
            }
        }
    }
}
Use of edu.illinois.cs.cogcomp.comma.annotators.PreProcessor in the cogcomp-nlp project by CogComp.
From the class CommaLabelerTest, method testGetCommaSRLFromPlainText.
/**
 * Checks the comma SRL view produced for {@code untokenizedText}: it must contain two
 * predicates, the first with sense "Substitute" and two arguments (the first targeting
 * "Mary"), the second with a single "LeftOfSubstitute" argument.
 */
public void testGetCommaSRLFromPlainText() throws Exception {
    // Build the annotation straight from raw text and fetch the view under test.
    TextAnnotation annotatedText = new PreProcessor().preProcess(untokenizedText);
    PredicateArgumentView commaView = (PredicateArgumentView) annotator.getView(annotatedText);

    assertEquals(2, commaView.getPredicates().size());

    // First predicate: sense, argument count, and surface form of the first argument's target.
    Constituent firstPredicate = commaView.getPredicates().get(0);
    assertEquals("Substitute", commaView.getPredicateSense(firstPredicate));
    assertEquals(2, commaView.getArguments(firstPredicate).size());
    assertEquals("Mary",
            commaView.getArguments(firstPredicate).get(0).getTarget().getSurfaceForm());

    // Second predicate: a single argument with the expected relation name.
    Constituent secondPredicate = commaView.getPredicates().get(1);
    assertEquals(1, commaView.getArguments(secondPredicate).size());
    assertEquals("LeftOfSubstitute",
            commaView.getArguments(secondPredicate).get(0).getRelationName());
}
Aggregations