Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
The class SieveCoreferenceSystem, method printConllOutput:
private static void printConllOutput(Document document, PrintWriter writer, List<List<Mention>> orderedMentions, boolean gold) {
  Annotation anno = document.annotation;
  List<List<String[]>> conllDocSentences = document.conllDoc.sentenceWordLists;
  String docID = anno.get(CoreAnnotations.DocIDAnnotation.class);
  StringBuilder sb = new StringBuilder();
  sb.append("#begin document ").append(docID).append("\n");
  List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
  for (int sentNum = 0; sentNum < sentences.size(); sentNum++) {
    List<CoreLabel> sentence = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class);
    List<String[]> conllSentence = conllDocSentences.get(sentNum);
    Map<Integer, Set<Mention>> mentionBeginOnly = Generics.newHashMap();
    Map<Integer, Set<Mention>> mentionEndOnly = Generics.newHashMap();
    Map<Integer, Set<Mention>> mentionBeginEnd = Generics.newHashMap();
    for (int i = 0; i < sentence.size(); i++) {
      mentionBeginOnly.put(i, new LinkedHashSet<>());
      mentionEndOnly.put(i, new LinkedHashSet<>());
      mentionBeginEnd.put(i, new LinkedHashSet<>());
    }
    for (Mention m : orderedMentions.get(sentNum)) {
      if (m.startIndex == m.endIndex - 1) {
        mentionBeginEnd.get(m.startIndex).add(m);
      } else {
        mentionBeginOnly.get(m.startIndex).add(m);
        mentionEndOnly.get(m.endIndex - 1).add(m);
      }
    }
    for (int i = 0; i < sentence.size(); i++) {
      StringBuilder sb2 = new StringBuilder();
      for (Mention m : mentionBeginOnly.get(i)) {
        if (sb2.length() > 0) {
          sb2.append("|");
        }
        int corefClusterId = (gold) ? m.goldCorefClusterID : m.corefClusterID;
        sb2.append("(").append(corefClusterId);
      }
      for (Mention m : mentionBeginEnd.get(i)) {
        if (sb2.length() > 0) {
          sb2.append("|");
        }
        int corefClusterId = (gold) ? m.goldCorefClusterID : m.corefClusterID;
        sb2.append("(").append(corefClusterId).append(")");
      }
      for (Mention m : mentionEndOnly.get(i)) {
        if (sb2.length() > 0) {
          sb2.append("|");
        }
        int corefClusterId = (gold) ? m.goldCorefClusterID : m.corefClusterID;
        sb2.append(corefClusterId).append(")");
      }
      if (sb2.length() == 0) sb2.append("-");
      String[] columns = conllSentence.get(i);
      for (int j = 0; j < columns.length - 1; j++) {
        String column = columns[j];
        sb.append(column).append("\t");
      }
      sb.append(sb2).append("\n");
    }
    sb.append("\n");
  }
  sb.append("#end document").append("\n");
  // sb.append("#end document ").append(docID).append("\n");
  writer.print(sb.toString());
  writer.flush();
}
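For reference, the last column built here follows the CoNLL-2011/2012 coreference notation: "(7" marks the first token of a multi-token mention in cluster 7, "7)" marks its last token, "(7)" marks a single-token mention, several markers on one token are joined with "|", and tokens outside any mention get "-". A hypothetical fragment (word and coref column only; real CoNLL lines carry more columns) might look like:

  the      (12
  company  12)
  said     -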
Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
The class MUCMentionExtractor, method nextDoc:
@Override
public Document nextDoc() throws Exception {
  List<List<CoreLabel>> allWords = new ArrayList<>();
  List<Tree> allTrees = new ArrayList<>();
  List<List<Mention>> allGoldMentions = new ArrayList<>();
  List<List<Mention>> allPredictedMentions;
  List<CoreMap> allSentences = new ArrayList<>();
  Annotation docAnno = new Annotation("");
  Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
  Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
  Matcher docMatcher = docPattern.matcher(fileContents);
  if (!docMatcher.find(currentOffset)) return null;
  currentOffset = docMatcher.end();
  String doc = docMatcher.group(1);
  Matcher sentenceMatcher = sentencePattern.matcher(doc);
  String ner = null;
  // maintain the current document ID
  Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
  Matcher docIDMatcher = docIDPattern.matcher(doc);
  if (docIDMatcher.find()) currentDocumentID = docIDMatcher.group(1);
  else currentDocumentID = "documentAfter " + currentDocumentID;
  while (sentenceMatcher.find()) {
    String sentenceString = sentenceMatcher.group(2);
    List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize();
    // FIXING TOKENIZATION PROBLEMS
    for (int i = 0; i < words.size(); i++) {
      CoreLabel w = words.get(i);
      if (i > 0 && w.word().equals("$")) {
        if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
          continue;
        words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
        words.remove(i);
        i--;
      } else if (w.word().equals("\\/")) {
        if (words.get(i - 1).word().equals("</COREF>"))
          continue;
        w.set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
        words.remove(i + 1);
        words.remove(i - 1);
      }
    }
    // END FIXING TOKENIZATION PROBLEMS
    List<CoreLabel> sentence = new ArrayList<>();
    // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
    Stack<Mention> stack = new Stack<>();
    List<Mention> mentions = new ArrayList<>();
    allWords.add(sentence);
    allGoldMentions.add(mentions);
    for (CoreLabel word : words) {
      String w = word.get(CoreAnnotations.TextAnnotation.class);
      if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
        // found a regular token: WORD/POS
        int i = w.lastIndexOf("\\/");
        String w1 = w.substring(0, i);
        // we do NOT set POS info here; we take the POS tags from the parser!
        word.set(CoreAnnotations.TextAnnotation.class, w1);
        word.remove(CoreAnnotations.OriginalTextAnnotation.class);
        if (Constants.USE_GOLD_NE) {
          if (ner != null) {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
          } else {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
          }
        }
        sentence.add(word);
      } else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
        // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
        Pattern nerPattern = Pattern.compile("<(.*?)>");
        Matcher m = nerPattern.matcher(w);
        m.find();
        ner = m.group(1);
      } else if (w.startsWith("</") && !w.startsWith("</COREF")) {
        // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
        Pattern nerPattern = Pattern.compile("</(.*?)>");
        Matcher m = nerPattern.matcher(w);
        m.find();
        String ner1 = m.group(1);
        if (ner != null && !ner.equals(ner1))
          throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
        ner = null;
      } else if (w.startsWith("<COREF")) {
        // found the start SGML tag for a coref mention
        Mention mention = new Mention();
        // position of this mention in the sentence
        mention.startIndex = sentence.size();
        // extract GOLD info about this coref chain; needed for eval
        Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
        Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");
        Matcher m = idPattern.matcher(w);
        m.find();
        mention.mentionID = Integer.parseInt(m.group(1));
        m = refPattern.matcher(w);
        if (m.find()) {
          mention.originalRef = Integer.parseInt(m.group(1));
        }
        // open mention: keep track of all open mentions using the stack
        stack.push(mention);
      } else if (w.equals("</COREF>")) {
        // found the end SGML tag for a coref mention
        Mention mention = stack.pop();
        mention.endIndex = sentence.size();
        // this is a closed mention; add it to the final list of mentions
        // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
        mentions.add(mention);
      } else {
        word.remove(CoreAnnotations.OriginalTextAnnotation.class);
        if (Constants.USE_GOLD_NE) {
          if (ner != null) {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
          } else {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
          }
        }
        sentence.add(word);
      }
    }
    StringBuilder textContent = new StringBuilder();
    for (int i = 0; i < sentence.size(); i++) {
      CoreLabel w = sentence.get(i);
      w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
      w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
      if (i > 0) textContent.append(" ");
      textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
    }
    CoreMap sentCoreMap = new Annotation(textContent.toString());
    allSentences.add(sentCoreMap);
    sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
  }
  // assign goldCorefClusterID
  // temporary map from mention ID to mention, used to resolve REF chains
  Map<Integer, Mention> idMention = Generics.newHashMap();
  for (List<Mention> goldMentions : allGoldMentions) {
    for (Mention m : goldMentions) {
      idMention.put(m.mentionID, m);
    }
  }
  for (List<Mention> goldMentions : allGoldMentions) {
    for (Mention m : goldMentions) {
      if (m.goldCorefClusterID == -1) {
        if (m.originalRef == -1) {
          m.goldCorefClusterID = m.mentionID;
        } else {
          // follow the REF chain until we hit a mention whose cluster ID is
          // already set, or a chain head (originalRef == -1)
          int ref = m.originalRef;
          while (true) {
            Mention m2 = idMention.get(ref);
            if (m2.goldCorefClusterID != -1) {
              m.goldCorefClusterID = m2.goldCorefClusterID;
              break;
            } else if (m2.originalRef == -1) {
              m2.goldCorefClusterID = m2.mentionID;
              m.goldCorefClusterID = m2.goldCorefClusterID;
              break;
            } else {
              ref = m2.originalRef;
            }
          }
        }
      }
    }
  }
  docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
  stanfordProcessor.annotate(docAnno);
  if (allSentences.size() != allWords.size())
    throw new IllegalStateException("allSentences != allWords");
  for (int i = 0; i < allSentences.size(); i++) {
    List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
    List<CoreLabel> unannotatedSent = allWords.get(i);
    List<Mention> mentionInSent = allGoldMentions.get(i);
    for (Mention m : mentionInSent) {
      m.dependency = allSentences.get(i).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
    }
    if (annotatedSent.size() != unannotatedSent.size()) {
      throw new IllegalStateException("annotatedSent != unannotatedSent");
    }
    for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
      CoreLabel annotatedWord = annotatedSent.get(j);
      CoreLabel unannotatedWord = unannotatedSent.get(j);
      if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class).equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
        throw new IllegalStateException("annotatedWord != unannotatedWord");
      }
    }
    allWords.set(i, annotatedSent);
    allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
  }
  // extract predicted mentions
  if (Constants.USE_GOLD_MENTIONS) allPredictedMentions = allGoldMentions;
  else allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);
  // add the relevant fields to mentions and order them for coref
  return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}
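As a usage sketch (not part of the CoreNLP source), nextDoc() follows an iterator-like contract: it is called repeatedly until it returns null, each call yielding one fully arranged Document. The helper below is hypothetical; the extractor is passed in already configured, since its construction depends on dictionaries, properties, and corpus settings not shown here.

// Hypothetical driver, sketching the calling convention only.
static void readAllDocuments(MentionExtractor extractor) throws Exception {
  Document doc;
  while ((doc = extractor.nextDoc()) != null) {
    // each Document bundles the annotation, parse trees, and the
    // gold/predicted mentions arranged for the coref sieves
    int numSents = doc.annotation.get(CoreAnnotations.SentencesAnnotation.class).size();
    System.err.println("read one document with " + numSents + " sentences");
  }
}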
Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
The class ACEMentionExtractor, method nextDoc:
public Document nextDoc() throws Exception {
  List<List<CoreLabel>> allWords = new ArrayList<>();
  List<List<Mention>> allGoldMentions = new ArrayList<>();
  List<List<Mention>> allPredictedMentions;
  List<Tree> allTrees = new ArrayList<>();
  Annotation anno;
  try {
    String filename = "";
    // advance to the next ACE annotation file (*.apf.xml)
    while (files.length > fileIndex) {
      if (files[fileIndex].contains("apf.xml")) {
        filename = files[fileIndex];
        fileIndex++;
        break;
      } else {
        fileIndex++;
        filename = "";
      }
    }
    if (files.length <= fileIndex && filename.equals("")) return null;
    anno = aceReader.parse(corpusPath + filename);
    stanfordProcessor.annotate(anno);
    List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap s : sentences) {
      int i = 1;
      for (CoreLabel w : s.get(CoreAnnotations.TokensAnnotation.class)) {
        w.set(CoreAnnotations.IndexAnnotation.class, i++);
        if (!w.containsKey(CoreAnnotations.UtteranceAnnotation.class)) {
          w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
        }
      }
      allTrees.add(s.get(TreeCoreAnnotations.TreeAnnotation.class));
      allWords.add(s.get(CoreAnnotations.TokensAnnotation.class));
      EntityComparator comparator = new EntityComparator();
      extractGoldMentions(s, allGoldMentions, comparator);
    }
    if (Constants.USE_GOLD_MENTIONS) allPredictedMentions = allGoldMentions;
    else allPredictedMentions = mentionFinder.extractPredictedMentions(anno, maxID, dictionaries);
    printRawDoc(sentences, allGoldMentions, filename, true);
    printRawDoc(sentences, allPredictedMentions, filename, false);
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
  return arrange(anno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}
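Note the shared contract with MUCMentionExtractor above: the method scans the file list for the next name containing "apf.xml" (the ACE annotation files), returns null once the list is exhausted, and otherwise hands the annotated document, trees, and gold/predicted mentions to arrange(...). As in the MUC extractor, whether gold mentions are reused as the predicted mentions is controlled by the compile-time flag Constants.USE_GOLD_MENTIONS.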
Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
The class GenericDataSetReader, method parse(List<CoreLabel>, List<ParserConstraint>):
protected Tree parse(List<CoreLabel> tokens, List<ParserConstraint> constraints) {
  CoreMap sent = new Annotation("");
  sent.set(CoreAnnotations.TokensAnnotation.class, tokens);
  sent.set(ParserAnnotations.ConstraintAnnotation.class, constraints);
  Annotation doc = new Annotation("");
  List<CoreMap> sents = new ArrayList<>();
  sents.add(sent);
  doc.set(CoreAnnotations.SentencesAnnotation.class, sents);
  getParser().annotate(doc);
  sents = doc.get(CoreAnnotations.SentencesAnnotation.class);
  return sents.get(0).get(TreeCoreAnnotations.TreeAnnotation.class);
}
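A minimal, hypothetical call site (not from the CoreNLP source): assuming ParserConstraint's (start, end, pattern) constructor, the fragment below asks the parser to realize a token span as a constituent whose label matches "NP.*". The token list and span indices are invented for illustration, and since parse is protected the call would sit inside a GenericDataSetReader subclass.

// Sketch only: constrain tokens [2, 5) to form an NP-like constituent.
List<ParserConstraint> constraints = new ArrayList<>();
constraints.add(new ParserConstraint(2, 5, "NP.*"));
Tree tree = parse(tokens, constraints);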
Use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
The class GenericDataSetReader, method parse(String path):
/**
 * Parses one file or directory with data from one domain.
 * @param path the file or directory to read
 * @throws IOException if reading or preprocessing fails
 */
public final Annotation parse(String path) throws IOException {
  // set below, or an exception is thrown
  Annotation retVal;
  try {
    //
    // this must return a dataset Annotation; each sentence in this dataset must contain:
    // - TokensAnnotation
    // - EntityMentionAnnotation
    // - RelationMentionAnnotation
    // - EventMentionAnnotation
    // the other annotations (parse, NER) are generated in preProcessSentences
    //
    retVal = this.read(path);
  } catch (Exception ex) {
    IOException iox = new IOException();
    iox.initCause(ex);
    throw iox;
  }
  if (preProcessSentences) {
    preProcessSentences(retVal);
    if (MachineReadingProperties.trainUsePipelineNER) {
      logger.severe("Changing NER tags using the CoreNLP pipeline.");
      modifyUsingCoreNLPNER(retVal);
    }
  }
  return retVal;
}
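A rough usage sketch, with the concrete reader and path as assumptions: read(path) is supplied by a domain-specific subclass (per the contract in the comment above), so "reader" below stands for some already-constructed GenericDataSetReader subclass.

// Hypothetical usage; the path is invented for illustration.
Annotation dataset = reader.parse("/path/to/domain/data");
for (CoreMap sentence : dataset.get(CoreAnnotations.SentencesAnnotation.class)) {
  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
  // after parsing, each sentence also carries the entity/relation/event
  // mention annotations attached by read(path) and preProcessSentences
}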