Search in sources :

Example 51 with Annotation

use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

the class SieveCoreferenceSystem method printConllOutput.

/**
 * Renders one document in CoNLL layout and sends it to {@code writer}.
 *
 * <p>The output starts with a {@code #begin document <docID>} line and ends with
 * {@code #end document}. Every token produces one line made of all but the last
 * column of the matching row in {@code document.conllDoc.sentenceWordLists},
 * tab-separated, followed by a coreference column. A blank line follows each sentence.
 *
 * <p>The coreference column lists, separated by {@code |}: {@code (id} for every
 * multi-token mention starting at the token, {@code (id)} for every single-token
 * mention, and {@code id)} for every multi-token mention whose last token this is.
 * A token with no mention boundary gets {@code -}.
 *
 * @param document        document supplying the annotation and the CoNLL rows
 * @param writer          destination; flushed before returning
 * @param orderedMentions mentions per sentence, indexed like the sentence list
 * @param gold            when true the gold cluster id is printed, otherwise the
 *                        system cluster id
 */
private static void printConllOutput(Document document, PrintWriter writer, List<List<Mention>> orderedMentions, boolean gold) {
    Annotation annotation = document.annotation;
    List<List<String[]>> conllRowsBySentence = document.conllDoc.sentenceWordLists;
    String documentId = annotation.get(CoreAnnotations.DocIDAnnotation.class);

    StringBuilder out = new StringBuilder();
    out.append("#begin document ");
    out.append(documentId);
    out.append("\n");

    List<CoreMap> sentenceMaps = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    for (int s = 0; s < sentenceMaps.size(); s++) {
        List<CoreLabel> tokens = sentenceMaps.get(s).get(CoreAnnotations.TokensAnnotation.class);
        List<String[]> conllRows = conllRowsBySentence.get(s);

        // Per-token buckets: mentions opening here, closing here, or doing both (one-token mentions).
        Map<Integer, Set<Mention>> opening = Generics.newHashMap();
        Map<Integer, Set<Mention>> closing = Generics.newHashMap();
        Map<Integer, Set<Mention>> singleToken = Generics.newHashMap();
        for (int t = 0; t < tokens.size(); t++) {
            opening.put(t, new LinkedHashSet<>());
            closing.put(t, new LinkedHashSet<>());
            singleToken.put(t, new LinkedHashSet<>());
        }

        // endIndex is exclusive, so the last token of a mention sits at endIndex - 1.
        for (Mention mention : orderedMentions.get(s)) {
            int lastToken = mention.endIndex - 1;
            if (mention.startIndex != lastToken) {
                opening.get(mention.startIndex).add(mention);
                closing.get(lastToken).add(mention);
            } else {
                singleToken.get(mention.startIndex).add(mention);
            }
        }

        for (int t = 0; t < tokens.size(); t++) {
            StringBuilder corefColumn = new StringBuilder();

            for (Mention mention : opening.get(t)) {
                if (corefColumn.length() > 0) {
                    corefColumn.append("|");
                }
                int clusterId;
                if (gold) {
                    clusterId = mention.goldCorefClusterID;
                } else {
                    clusterId = mention.corefClusterID;
                }
                corefColumn.append("(").append(clusterId);
            }

            for (Mention mention : singleToken.get(t)) {
                if (corefColumn.length() > 0) {
                    corefColumn.append("|");
                }
                int clusterId;
                if (gold) {
                    clusterId = mention.goldCorefClusterID;
                } else {
                    clusterId = mention.corefClusterID;
                }
                corefColumn.append("(").append(clusterId).append(")");
            }

            for (Mention mention : closing.get(t)) {
                if (corefColumn.length() > 0) {
                    corefColumn.append("|");
                }
                int clusterId;
                if (gold) {
                    clusterId = mention.goldCorefClusterID;
                } else {
                    clusterId = mention.corefClusterID;
                }
                corefColumn.append(clusterId).append(")");
            }

            if (corefColumn.length() == 0) {
                corefColumn.append("-");
            }

            // Copy every original column except the last one, which the coref column replaces.
            String[] row = conllRows.get(t);
            int keep = row.length - 1;
            for (int c = 0; c < keep; c++) {
                out.append(row[c]).append("\t");
            }
            out.append(corefColumn).append("\n");
        }
        out.append("\n");
    }

    out.append("#end document").append("\n");
    //    out.append("#end document ").append(documentId).append("\n");
    writer.print(out.toString());
    writer.flush();
}
Also used : TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Set(java.util.Set) Annotation(edu.stanford.nlp.pipeline.Annotation) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CorefMention(edu.stanford.nlp.dcoref.CorefChain.CorefMention) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) ArrayList(java.util.ArrayList) List(java.util.List)

Example 52 with Annotation

use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

the class MUCMentionExtractor method nextDoc.

/**
 * Reads the next {@code <DOC>...</DOC>} element from {@code fileContents}, starting the
 * search at {@code currentOffset}, and turns it into a {@code Document}.
 *
 * <p>Steps, in order: split the document into sentences on the s/hl/dd/DATELINE tags,
 * tokenize each sentence and repair a few tokenization artifacts, walk the tokens to
 * separate real words from NE and COREF SGML tags (collecting gold mentions), resolve
 * gold cluster ids from the ID/REF attributes, run {@code stanfordProcessor} over the
 * sentences, verify the annotated tokens still match the extracted ones, and finally
 * hand everything to {@code arrange}.
 *
 * @return the value returned by {@code arrange(...)}, or {@code null} when no further
 *         {@code <DOC>} element is found after {@code currentOffset}
 * @throws Exception declared by the signature; this body itself throws
 *         {@code RuntimeException} on mismatched NE tags and {@code IllegalStateException}
 *         when the annotated sentences or tokens do not line up with the extracted ones
 */
@Override
public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<>();
    List<Tree> allTrees = new ArrayList<>();
    List<List<Mention>> allGoldMentions = new ArrayList<>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<>();
    Annotation docAnno = new Annotation("");
    // (.*?) is reluctant and DOTALL lets it span newlines, so each match is a single element
    Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    // no more documents in this file
    if (!docMatcher.find(currentOffset))
        return null;
    // remember where this document ends so the next call resumes after it
    currentOffset = docMatcher.end();
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    // NE label of the currently open NE tag, or null when outside any NE tag;
    // declared outside the sentence loop, so it is not reset between sentences
    String ner = null;
    // Maintain the current document ID: use <DOCNO> when present, otherwise derive a
    // placeholder from the previous document's ID.
    Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find())
        currentDocumentID = docIDMatcher.group(1);
    else
        currentDocumentID = "documentAfter " + currentDocumentID;
    while (sentenceMatcher.find()) {
        // group(2) is the text between the opening and closing sentence tag
        String sentenceString = sentenceMatcher.group(2);
        List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize();
        // FIXING TOKENIZATION PROBLEMS
        // The list is edited in place while being indexed, so statement order matters here.
        for (int i = 0; i < words.size(); i++) {
            CoreLabel w = words.get(i);
            if (i > 0 && w.word().equals("$")) {
                // a lone "$" is glued back onto the previous token only when that token
                // ends with "PRP" or "WP" (presumably the PRP$ / WP$ tags — TODO confirm)
                if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
                    continue;
                words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
                words.remove(i);
                // step back so the element that slid into position i is examined next
                i--;
            } else if (w.word().equals("\\/")) {
                // NOTE(review): unlike the "$" branch there is no i > 0 guard, and i + 1 is
                // not range-checked; a "\/" token at either end of the list would make the
                // neighbor lookups below fail — confirm the input never produces that.
                if (words.get(i - 1).word().equals("</COREF>"))
                    continue;
                // merge previous token, separator and next token into this token's text
                w.set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
                // remove the higher index first so the lower index is still valid
                words.remove(i + 1);
                words.remove(i - 1);
                // NOTE(review): the merged token is now at i - 1 and the following token at
                // i, so the loop's i++ moves past that following token without inspecting
                // it — confirm this is intended.
            }
        }
        // END FIXING TOKENIZATION PROBLEMS
        List<CoreLabel> sentence = new ArrayList<>();
        // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
        Stack<Mention> stack = new Stack<>();
        List<Mention> mentions = new ArrayList<>();
        // registered before being filled; both lists are populated by the loop below
        allWords.add(sentence);
        allGoldMentions.add(mentions);
        for (CoreLabel word : words) {
            String w = word.get(CoreAnnotations.TextAnnotation.class);
            // found regular token: WORD/POS
            // (not a tag, contains the "\/" separator, and that separator is not the final two characters)
            if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
                int i = w.lastIndexOf("\\/");
                // keep only the part before the last separator
                String w1 = w.substring(0, i);
                // we do NOT set POS info here. We take the POS tags from the parser!
                word.set(CoreAnnotations.TextAnnotation.class, w1);
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    // tokens inside an open NE tag inherit its label; all others get "O"
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            } else // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
            if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
                Pattern nerPattern = Pattern.compile("<(.*?)>");
                Matcher m = nerPattern.matcher(w);
                // NOTE(review): the result of find() is not checked; group(1) below would
                // throw if the token had no closing '>' — confirm tokens are always full tags.
                m.find();
                ner = m.group(1);
            } else // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
            if (w.startsWith("</") && !w.startsWith("</COREF")) {
                Pattern nerPattern = Pattern.compile("</(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                String ner1 = m.group(1);
                // a closing tag must name the NE that is currently open (if any is open)
                if (ner != null && !ner.equals(ner1))
                    throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
                ner = null;
            } else // found the start SGML tag for a coref mention
            if (w.startsWith("<COREF")) {
                Mention mention = new Mention();
                // position of this mention in the sentence
                // (index of the next word to be added, since tags are not added to `sentence`)
                mention.startIndex = sentence.size();
                // extract GOLD info about this coref chain. needed for eval
                Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
                Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");
                Matcher m = idPattern.matcher(w);
                // the ID attribute is treated as mandatory: find() is unchecked here
                m.find();
                mention.mentionID = Integer.parseInt(m.group(1));
                m = refPattern.matcher(w);
                // the REF attribute is optional; originalRef keeps its initial value when absent
                if (m.find()) {
                    mention.originalRef = Integer.parseInt(m.group(1));
                }
                // open mention. keep track of all open mentions using the stack
                stack.push(mention);
            } else // found the end SGML tag for a coref mention
            if (w.equals("</COREF>")) {
                // closes the most recently opened mention
                // NOTE(review): pop() is unguarded; an unbalanced </COREF> would throw — confirm input is well-formed
                Mention mention = stack.pop();
                // exclusive end: number of words added to the sentence so far
                mention.endIndex = sentence.size();
                // this is a closed mention. add it to the final list of mentions
                // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
                mentions.add(mention);
            } else {
                // anything else is kept as an ordinary token with its text left unchanged
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
        }
        // number the kept tokens (1-based index, utterance 0) and rebuild the sentence
        // text as the token texts joined by single spaces
        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < sentence.size(); i++) {
            CoreLabel w = sentence.get(i);
            w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
            w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
            if (i > 0)
                textContent.append(" ");
            textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
        }
        CoreMap sentCoreMap = new Annotation(textContent.toString());
        allSentences.add(sentCoreMap);
        sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }
    // assign goldCorefClusterID
    // temporary use: lookup table from mention ID to mention, across all sentences
    Map<Integer, Mention> idMention = Generics.newHashMap();
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            idMention.put(m.mentionID, m);
        }
    }
    // -1 is treated below as "not set" for both goldCorefClusterID and originalRef
    // (presumably the initial values assigned by Mention — TODO confirm)
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            if (m.goldCorefClusterID == -1) {
                if (m.originalRef == -1)
                    // no REF: this mention starts its own cluster, identified by its own ID
                    m.goldCorefClusterID = m.mentionID;
                else {
                    // follow the REF chain until a mention with a known cluster id, or a
                    // chain head (no REF), is reached
                    // NOTE(review): idMention.get(ref) is dereferenced without a null check and
                    // the walk has no cycle guard — confirm REFs always point at existing IDs.
                    int ref = m.originalRef;
                    while (true) {
                        Mention m2 = idMention.get(ref);
                        if (m2.goldCorefClusterID != -1) {
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else if (m2.originalRef == -1) {
                            m2.goldCorefClusterID = m2.mentionID;
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else {
                            ref = m2.originalRef;
                        }
                    }
                }
            }
        }
    }
    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);
    // sanity checks: annotation must not have changed the number of sentences or tokens,
    // nor any token text
    if (allSentences.size() != allWords.size())
        throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
        List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
        List<CoreLabel> unannotatedSent = allWords.get(i);
        List<Mention> mentionInSent = allGoldMentions.get(i);
        // every gold mention gets the dependency graph of the sentence it belongs to
        for (Mention m : mentionInSent) {
            m.dependency = allSentences.get(i).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
        }
        if (annotatedSent.size() != unannotatedSent.size()) {
            throw new IllegalStateException("annotatedSent != unannotatedSent");
        }
        for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
            CoreLabel annotatedWord = annotatedSent.get(j);
            CoreLabel unannotatedWord = unannotatedSent.get(j);
            if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class).equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
                throw new IllegalStateException("annotatedWord != unannotatedWord");
            }
        }
        // from here on use the token list taken from the annotated sentence
        allWords.set(i, annotatedSent);
        allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }
    // extract predicted mentions
    // (with USE_GOLD_MENTIONS the same list object serves as both gold and predicted)
    if (Constants.USE_GOLD_MENTIONS)
        allPredictedMentions = allGoldMentions;
    else
        allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);
    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}
Also used : Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) StringReader(java.io.StringReader) Tree(edu.stanford.nlp.trees.Tree) ArrayList(java.util.ArrayList) List(java.util.List) Pattern(java.util.regex.Pattern) Annotation(edu.stanford.nlp.pipeline.Annotation) Stack(java.util.Stack) CoreLabel(edu.stanford.nlp.ling.CoreLabel) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 53 with Annotation

use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

the class ACEMentionExtractor method nextDoc.

/**
 * Loads the next ACE document and prepares it for coreference.
 *
 * <p>Advances {@code fileIndex} through {@code files} until an entry whose name contains
 * {@code "apf.xml"} is found, parses {@code corpusPath + filename} with {@code aceReader},
 * runs {@code stanfordProcessor} over it, numbers the tokens of every sentence from 1,
 * defaults a missing utterance annotation to 0, collects trees, tokens and gold
 * mentions per sentence, prints the raw document twice (gold and predicted mentions),
 * and passes the result to {@code arrange}.
 *
 * @return the value returned by {@code arrange(...)}, or {@code null} when no remaining
 *         entry of {@code files} contains {@code "apf.xml"}
 * @throws Exception declared by the signature; an {@code IOException} raised inside is
 *         rethrown wrapped in a {@code RuntimeIOException}
 */
public Document nextDoc() throws Exception {
    List<List<CoreLabel>> wordsBySentence = new ArrayList<>();
    List<List<Mention>> goldMentions = new ArrayList<>();
    List<List<Mention>> predictedMentions;
    List<Tree> trees = new ArrayList<>();
    Annotation docAnnotation;
    try {
        // Skip ahead to the next annotation file; fileIndex always ends one past the entry consumed.
        String filename = "";
        for (; fileIndex < files.length; fileIndex++) {
            if (files[fileIndex].contains("apf.xml")) {
                filename = files[fileIndex++];
                break;
            }
        }
        if (filename.isEmpty() && fileIndex >= files.length) {
            return null;
        }

        docAnnotation = aceReader.parse(corpusPath + filename);
        stanfordProcessor.annotate(docAnnotation);

        List<CoreMap> sentences = docAnnotation.get(CoreAnnotations.SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
            for (int pos = 0; pos < tokens.size(); pos++) {
                CoreLabel token = tokens.get(pos);
                // token indices are 1-based
                token.set(CoreAnnotations.IndexAnnotation.class, pos + 1);
                if (!token.containsKey(CoreAnnotations.UtteranceAnnotation.class)) {
                    token.set(CoreAnnotations.UtteranceAnnotation.class, 0);
                }
            }
            trees.add(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
            wordsBySentence.add(tokens);
            EntityComparator comparator = new EntityComparator();
            extractGoldMentions(sentence, goldMentions, comparator);
        }

        // With gold mentions enabled the very same list is used as the predicted mentions.
        predictedMentions = Constants.USE_GOLD_MENTIONS
                ? goldMentions
                : mentionFinder.extractPredictedMentions(docAnnotation, maxID, dictionaries);

        printRawDoc(sentences, goldMentions, filename, true);
        printRawDoc(sentences, predictedMentions, filename, false);
    } catch (IOException e) {
        throw new RuntimeIOException(e);
    }
    return arrange(docAnnotation, wordsBySentence, trees, predictedMentions, goldMentions, true);
}
Also used : RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) ArrayList(java.util.ArrayList) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) IOException(java.io.IOException) Annotation(edu.stanford.nlp.pipeline.Annotation) CoreLabel(edu.stanford.nlp.ling.CoreLabel) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) Tree(edu.stanford.nlp.trees.Tree) ArrayList(java.util.ArrayList) List(java.util.List) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 54 with Annotation

use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

the class GenericDataSetReader method parse.

/**
 * Parses a single sentence with the annotator returned by {@code getParser()}.
 *
 * <p>The tokens and constraints are attached to a fresh sentence, which is wrapped in
 * a fresh one-sentence document; after annotation the tree of the document's first
 * sentence is returned.
 *
 * @param tokens      tokens of the sentence, stored under {@code TokensAnnotation}
 * @param constraints parser constraints, stored under {@code ConstraintAnnotation}
 * @return the {@code TreeAnnotation} value of the first sentence after annotation
 */
protected Tree parse(List<CoreLabel> tokens, List<ParserConstraint> constraints) {
    // Build the one-sentence document the annotator expects.
    Annotation document = new Annotation("");
    CoreMap sentence = new Annotation("");
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
    sentence.set(ParserAnnotations.ConstraintAnnotation.class, constraints);

    List<CoreMap> singleSentence = new ArrayList<>();
    singleSentence.add(sentence);
    document.set(CoreAnnotations.SentencesAnnotation.class, singleSentence);

    getParser().annotate(document);

    // Re-read the sentence list from the document rather than reusing the local one.
    List<CoreMap> annotatedSentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    CoreMap first = annotatedSentences.get(0);
    return first.get(TreeCoreAnnotations.TreeAnnotation.class);
}
Also used : ParserAnnotations(edu.stanford.nlp.parser.common.ParserAnnotations) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) ArrayList(java.util.ArrayList) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap) Annotation(edu.stanford.nlp.pipeline.Annotation)

Example 55 with Annotation

use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.

the class GenericDataSetReader method parse.

/**
 * Parses one file or directory with data from one domain.
 *
 * <p>The raw dataset is obtained from {@code read(path)}. When
 * {@code preProcessSentences} is set, the sentences are pre-processed and, if
 * {@code MachineReadingProperties.trainUsePipelineNER} is also set, their NER tags are
 * replaced using the CoreNLP pipeline.
 *
 * @param path location of the file or directory to read; passed unchanged to {@code read}
 * @return the dataset annotation produced by {@code read}, after the optional
 *         processing described above
 * @throws IOException if {@code read} throws any exception; the original exception is
 *         available as the cause and the message names the offending path
 */
public final Annotation parse(String path) throws IOException {
    // set below or exceptions
    Annotation retVal;
    try {
        //
        // this must return a dataset Annotation. each sentence in this dataset must contain:
        // - TokensAnnotation
        // - EntityMentionAnnotation
        // - RelationMentionAnnotation
        // - EventMentionAnnotation
        // the other annotations (parse, NER) are generated in preProcessSentences
        //
        retVal = this.read(path);
    } catch (Exception ex) {
        // Every failure is still surfaced as IOException, as before, but now with a
        // message identifying the path instead of a null message.
        throw new IOException("Failed to read dataset from " + path, ex);
    }
    if (preProcessSentences) {
        preProcessSentences(retVal);
        if (MachineReadingProperties.trainUsePipelineNER) {
            logger.severe("Changing NER tags using the CoreNLP pipeline.");
            modifyUsingCoreNLPNER(retVal);
        }
    }
    return retVal;
}
Also used : IOException(java.io.IOException) Annotation(edu.stanford.nlp.pipeline.Annotation) IOException(java.io.IOException)

Aggregations

Annotation (edu.stanford.nlp.pipeline.Annotation)91 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)58 CoreMap (edu.stanford.nlp.util.CoreMap)50 CoreLabel (edu.stanford.nlp.ling.CoreLabel)30 StanfordCoreNLP (edu.stanford.nlp.pipeline.StanfordCoreNLP)27 ArrayList (java.util.ArrayList)25 Properties (java.util.Properties)25 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)19 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)14 TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations)13 SentencesAnnotation (edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation)12 TreeAnnotation (edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation)12 List (java.util.List)11 Tree (edu.stanford.nlp.trees.Tree)10 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)8 IOException (java.io.IOException)8 TokensAnnotation (edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation)7 CorefChain (edu.stanford.nlp.coref.data.CorefChain)6 EntityMentionsAnnotation (edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations.EntityMentionsAnnotation)6 CoreAnnotation (edu.stanford.nlp.ling.CoreAnnotation)6