Search in sources :

Example 11 with XmlTextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.

the class OntonotesNamedEntityReader method nextAnnotation.

/**
 * Parse the contents of a named entity file (text with inline ENAMEX markup), producing an
 * annotation covering the entire file. Every "enamex" span found in the markup is mapped from
 * its character offsets in the original XML to token offsets in the cleaned text, and added as
 * a constituent of a {@link SpanLabelView} named {@code VIEW_NAME}. Per-label counts are
 * accumulated in {@code entityCounts}.
 * <p>
 * Spans that cannot be mapped onto tokens, or that carry no "type" attribute, are reported on
 * stderr and skipped; they do not abort processing of the document.
 *
 * @param data the complete contents of the file.
 * @param docid the id representing the document name.
 * @return the xml text annotation; its text annotation carries the named entity view.
 * @throws AnnotatorException declared by the signature; nothing in this method body throws it
 *         directly.
 */
private XmlTextAnnotation nextAnnotation(String data, String docid) throws AnnotatorException {
    // we keep everything.
    XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    StatefulTokenizer st = new StatefulTokenizer();
    TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
    XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);
    // read the file and create the annotation.
    XmlTextAnnotation xta = xtam.createTextAnnotation(data, "OntoNotes 5.0", docid);
    TextAnnotation ta = xta.getTextAnnotation();
    List<SpanInfo> fudge = xta.getXmlMarkup();
    // create the named entity view
    View nerView = new SpanLabelView(VIEW_NAME, ta);
    for (SpanInfo si : fudge) {
        if (!"enamex".equalsIgnoreCase(si.label)) {
            continue;
        }
        IntPair charOffsets = si.spanOffsets;
        // an ENAMEX tag without a TYPE attribute gives us no label; skip it rather than fail
        // with a NullPointerException and lose the whole document.
        Pair<String, IntPair> neLabelPair = si.attributes == null ? null : si.attributes.get("type");
        if (neLabelPair == null) {
            System.err.println("Entity without a \"type\" attribute in \"" + docid + "\", at " + charOffsets + "; skipping it.");
            continue;
        }
        String neLabel = neLabelPair.getFirst();
        int cleanTextCharStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
        int cleanTextCharEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
        int cleanTextNeTokStart = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
        // StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
        int cleanTextNeTokEnd = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1);
        if (cleanTextNeTokStart == -1 || cleanTextNeTokEnd == -1) {
            // dump what we have so far to help locate the problem.
            for (Constituent c : nerView.getConstituents()) {
                System.err.println(c);
            }
            // the offsets are already suspect here, so make sure reporting them cannot throw.
            boolean offsetsUsable = cleanTextCharStart >= 0 && cleanTextCharStart <= cleanTextCharEnd
                    && cleanTextCharEnd <= ta.text.length();
            String surface = offsetsUsable ? ta.text.substring(cleanTextCharStart, cleanTextCharEnd) : "<offsets out of range>";
            System.err.println("Something wonky in \"" + docid + "\", at " + charOffsets + ", " + cleanTextCharStart + " - " + cleanTextCharEnd + " = " + surface);
        } else {
            if (entityCounts.containsKey(neLabel)) {
                entityCounts.put(neLabel, (entityCounts.get(neLabel) + 1));
            } else {
                entityCounts.put(neLabel, 1);
            }
            // constituent token indexing uses one-past-the-end
            Constituent neCon = new Constituent(neLabel, nerView.getViewName(), ta, cleanTextNeTokStart, cleanTextNeTokEnd + 1);
            nerView.addConstituent(neCon);
        }
    }
    ta.addView(VIEW_NAME, nerView);
    return xta;
}
Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) SpanInfo(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 12 with XmlTextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.

the class OntonotesCorefReader method parseLines.

/**
 * Lines have no meaning, since the input data is XML. We construct a single input buffer by
 * joining the lines with a single space so we can use an XML parser to produce the data.
 * <p>
 * The result is the treebank text annotation for the same document, extended with a
 * coreference view built from the XML and a named entity view projected from the named
 * entity reader's tokenization onto the treebank tokenization.
 *
 * @param lines the lines of the coref file.
 * @return the annotated treebank text annotation, or null if the treebank annotation, the
 *         named entity annotation or the coref text is missing, or if the two tokenizations
 *         could not be aligned.
 * @throws AnnotatorException if the coref XML could not be parsed.
 */
@Override
protected TextAnnotation parseLines(ArrayList<String> lines) throws AnnotatorException {
    // get the treebank parse data
    if (!this.otr.hasNext())
        throw new RuntimeException("There were not as many treebank files as there were coref files.");
    // get the treebank parse using the ontonotes treebank reader.
    TextAnnotation resultTA = this.otr.next();
    if (resultTA == null)
        return null;
    View nerView = null;
    String[] nerTokens = null;
    TextAnnotation nerTA = null;
    // get the named entity data.
    if (!this.oner.hasNext()) {
        // no NER, return.
        return null;
    } else {
        // All this code is just to get the named entity
        XmlTextAnnotation xmlta = this.oner.next();
        if (xmlta != null) {
            nerTA = xmlta.getTextAnnotation();
            if (nerTA == null) {
                logger.error("There was no NER text annotation in \"" + this.oner.currentfile + "\"");
                return null;
            } else {
                nerView = nerTA.getView(OntonotesNamedEntityReader.VIEW_NAME);
                if (nerView == null) {
                    logger.error("There was no NER view in \"" + this.oner.currentfile + "\"");
                    return null;
                }
                nerTokens = nerTA.getTokens();
            }
        } else {
            // the file did not exist.
            return null;
        }
    }
    // nothing to work on, just return. This check deliberately comes after both readers have
    // been advanced, so the three readers consume one document each per call.
    if (lines.size() == 0)
        return null;
    // construct a single string
    StringBuilder sb = new StringBuilder(lines.get(0));
    for (int i = 1; i < lines.size(); i++) {
        sb.append(" ");
        sb.append(lines.get(i));
    }
    // produce a document object.
    String text = sb.toString();
    Document doc = null;
    try {
        doc = SimpleXMLParser.getDocument(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8.name())));
    } catch (UnsupportedEncodingException | XMLException e) {
        // keep the underlying parser failure attached so it is not lost to the caller.
        AnnotatorException failure = new AnnotatorException("Could not decode the text from the XML document.");
        failure.initCause(e);
        throw failure;
    }
    if (doc == null) {
        throw new AnnotatorException("Could not decode the text from the XML document.");
    }
    // Get a list of coref mentions object wrappers, these contain all the info we need
    // to construct the coref chains.
    ArrayList<CorefMention> hits = new ArrayList<>();
    traverse(resultTA, 0, hits, doc.getDocumentElement(), "");
    // we have all the hits, organize them into referent chains keyed by mention id.
    HashMap<String, ArrayList<CorefMention>> chains = new HashMap<>();
    for (CorefMention cm : hits) {
        ArrayList<CorefMention> chain = chains.get(cm.id);
        if (chain == null) {
            chain = new ArrayList<CorefMention>();
            chains.put(cm.id, chain);
        }
        chain.add(cm);
    }
    CoreferenceView corefView = new CoreferenceView(VIEW_NAME, VIEW_NAME, resultTA, 0.0);
    for (Entry<String, ArrayList<CorefMention>> entry : chains.entrySet()) {
        ArrayList<CorefMention> mentions = entry.getValue();
        // the first mention of a chain acts as its head; the rest are linked to it.
        CorefMention head = mentions.get(0);
        Constituent headconst = new Constituent(head.id, VIEW_NAME, resultTA, head.location.getFirst(), head.location.getSecond());
        head.constituent = headconst;
        // These are added by the addCorefEdges call. corefView.addConstituent(headconst);
        if (debug)
            System.out.println(head + " -> " + head.constituent.getSurfaceForm());
        ArrayList<Constituent> referants = new ArrayList<>();
        ArrayList<HashMap<String, String>> attributes = new ArrayList<>();
        for (int i = 1; i < mentions.size(); i++) {
            CorefMention cm = mentions.get(i);
            Constituent constituent = new Constituent(cm.id, VIEW_NAME, resultTA, cm.location.getFirst(), cm.location.getSecond());
            cm.constituent = constituent;
            // These are added by the addCorefEdges call. corefView.addConstituent(constituent);
            referants.add(constituent);
            // set up the attributes for the relation: type, subtype and speaker when present.
            HashMap<String, String> attribute = new HashMap<>();
            if (cm.type != null)
                attribute.put("TYPE", cm.type);
            if (cm.subtype != null)
                attribute.put("SUBTYPE", cm.subtype);
            if (cm.speaker != null)
                attribute.put("SPEAKER", cm.speaker);
            attributes.add(attribute);
            logger.debug("    " + cm + " -> " + cm.constituent.getSurfaceForm());
        }
        corefView.addCorefEdges(headconst, referants, attributes);
    }
    // now for each constituent in our view, determine what type of mention it is.
    // Here we will project the named entities from the ".name" file onto this annotation
    // and create a named entity view
    String[] coreftokens = resultTA.getTokens();
    // align token offsets: for each ner token, the index of the coref token it falls in.
    int[] tokenAlignment = new int[nerTokens.length];
    for (int ci = 0, ni = 0; ci < coreftokens.length && ni < nerTokens.length; ) {
        tokenAlignment[ni] = ci;
        if (coreftokens[ci].equals(nerTokens[ni])) {
            ni++;
            ci++;
        } else {
            // our tokens didn't align. Some symbols are treated differently
            // so where we see "&", "$", "-" and so on, there maybe different
            // tokenizations, so try to append successive ner tokens to see if they
            // then match.
            String ctok = coreftokens[ci];
            String ntok = nerTokens[ni];
            int niplus = 0;
            // as long as the appended ner token contains the coref token, possible match
            while (true) {
                if (ctok.equals(ntok)) {
                    break;
                } else {
                    if (ctok.contains(ntok)) {
                        niplus++;
                        if (ni + niplus >= nerTokens.length)
                            // ran out of ner tokens to append, give up.
                            break;
                        ntok += nerTokens[ni + niplus];
                    } else if (ntok.contains("-")) {
                        // check for XML escapes.
                        if (this.compareWithXMLEscapesIgnoreGarbageIn(ctok, ntok)) {
                            break;
                        } else {
                            niplus++;
                            if (ni + niplus >= nerTokens.length)
                                // give up.
                                break;
                            ntok += nerTokens[ni + niplus];
                        }
                    } else {
                        logger.error(this.describeAlignmentContext("\nTokens were simply different in " + this.currentfile + " around " + ni + " and " + ci + "\n", coreftokens, ci, nerTokens, ni));
                        return null;
                    }
                }
            }
            if (ctok.equals(ntok) || this.compareWithXMLEscapes(ctok, ntok)) {
                // we matched. Every ner token folded into this match lies inside coref token
                // ci; record that, otherwise those entries keep the array default of 0 and
                // entities touching them would be projected onto the start of the document.
                for (int k = 1; k <= niplus && ni + k < nerTokens.length; k++) {
                    tokenAlignment[ni + k] = ci;
                }
                ni += niplus;
                ni++;
                ci++;
            } else {
                logger.error(this.describeAlignmentContext("\nNo alignment in " + this.currentfile + " around " + ni + " and " + ci, coreftokens, ci, nerTokens, ni));
                return null;
            }
        }
    }
    // now transpose the NER view to the coref tokenization.
    SpanLabelView tv = new SpanLabelView(OntonotesNamedEntityReader.VIEW_NAME, this.getClass().getCanonicalName(), resultTA, 1.0, true);
    for (Constituent c : nerView.getConstituents()) {
        int start = tokenAlignment[c.getStartSpan()];
        // the end span is one-past-the-end, so map the last covered ner token and step one
        // past the coref token it falls in. This also covers entities ending on the final
        // token, which have no "next" ner token to look up.
        int lastNerToken = Math.max(0, Math.min(c.getEndSpan(), tokenAlignment.length) - 1);
        int end = tokenAlignment[lastNerToken] + 1;
        try {
            String lbl = c.getLabel();
            tv.addSpanLabel(start, end, lbl, c.getConstituentScore());
        } catch (IllegalArgumentException iae) {
            logger.error("Overlapping labels are not supported.", iae);
        }
    }
    // resultTA is known to be non-null here (checked right after it was read).
    resultTA.addView(OntonotesCorefReader.VIEW_NAME, corefView);
    resultTA.addView(OntonotesNamedEntityReader.VIEW_NAME, tv);
    View posView = (View) resultTA.getView(OntonotesTreebankReader.VIEW_NAME);
    // now identify mention types.
    for (Constituent c : corefView.getConstituents()) {
        this.setMentionType(c, tv, posView);
    }
    processed++;
    return resultTA;
}

/**
 * Build the diagnostic text logged when the coref and ner tokenizations cannot be aligned:
 * the header, then up to 30 coref tokens starting 10 before {@code ci}, then up to 30 ner
 * tokens starting 8 before {@code ni} with the token at {@code ni} marked by a "*". Both
 * windows are clamped to the array bounds so that reporting a problem near the beginning or
 * end of a document cannot itself fail.
 */
private String describeAlignmentContext(String header, String[] coreftokens, int ci, String[] nerTokens, int ni) {
    StringBuilder context = new StringBuilder(header);
    int corefFrom = Math.max(0, ci - 10);
    int corefTo = Math.min(coreftokens.length, ci - 10 + 30);
    for (int cci = corefFrom; cci < corefTo; cci++) {
        context.append(" ").append(coreftokens[cci]);
    }
    context.append('\n');
    int nerFrom = Math.max(0, ni - 8);
    int nerTo = Math.min(nerTokens.length, ni - 8 + 30);
    for (int nni = nerFrom; nni < nerTo; nni++) {
        context.append(nni == ni ? " *" : " ");
        context.append(nerTokens[nni]);
    }
    context.append('\n');
    return context.toString();
}
Also used : CoreferenceView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.CoreferenceView) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Document(org.w3c.dom.Document) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) AnnotatorException(edu.illinois.cs.cogcomp.annotation.AnnotatorException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) CoreferenceView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.CoreferenceView) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) XMLException(edu.illinois.cs.cogcomp.nlp.corpusreaders.aceReader.XMLException) ByteArrayInputStream(java.io.ByteArrayInputStream)

Example 13 with XmlTextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.

the class TACReaderTest method main.

/**
 * Manual check of the TAC reader: scans the corpus under {@code CORPUS_ROOT} for the document
 * with a known id, then verifies that the document id, date/time and author recorded in the
 * xml markup can be recovered from the original xml text at the expected offsets.
 *
 * @param args unused.
 */
public static void main(String[] args) {
    TACReader tacReader = null;
    try {
        tacReader = new TACReader(CORPUS_ROOT, true);
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println("ERROR: " + NAME + ": couldn't instantiate TACReader: " + e.getMessage());
    }
    if (tacReader == null) {
        // without a reader there is nothing to check; stop here instead of failing below
        // with a NullPointerException that hides the real cause.
        fail("ERROR: " + NAME + ": couldn't instantiate TACReader.");
        return;
    }
    String wantedId = "ENG_NW_001278_20130318_F00012HTB.xml";
    XmlTextAnnotation outputXmlTa = null;
    boolean found = false;
    // ask hasNext() before every next(), and only inspect a document that was actually read.
    while (!found && tacReader.hasNext()) {
        try {
            outputXmlTa = tacReader.next();
        } catch (IllegalStateException e) {
            e.printStackTrace();
            continue;
        }
        found = outputXmlTa != null && wantedId.equals(outputXmlTa.getTextAnnotation().getId());
    }
    if (!found) {
        fail("ERROR: didn't find corpus entry with id '" + wantedId + "'.");
        return;
    }
    StringTransformation xmlSt = outputXmlTa.getXmlSt();
    String origXml = xmlSt.getOrigText();
    List<XmlDocumentProcessor.SpanInfo> markup = outputXmlTa.getXmlMarkup();
    // the result is not inspected; the call is kept so the mapping code is still exercised.
    Map<IntPair, XmlDocumentProcessor.SpanInfo> markupInfo = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
    Map<IntPair, Set<String>> markupAttributes = XmlDocumentProcessor.compileAttributeValues(markup);
    Set<String> docIdReported = markupAttributes.get(IDOFFSETS);
    // an explicit check rather than the assert keyword, which is skipped unless the JVM runs with -ea.
    if (docIdReported == null || !docIdReported.contains(ID))
        fail("ERROR: document id '" + ID + "' was not reported at offsets " + IDOFFSETS + ".");
    assertEquals(DATETIMEVAL, origXml.substring(DATETIMEOFFSETS.getFirst(), DATETIMEOFFSETS.getSecond()));
    assertEquals(AUTHORVAL, origXml.substring(AUTHOROFFSETS.getFirst(), AUTHOROFFSETS.getSecond()));
}
Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Set(java.util.Set) StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)

Example 14 with XmlTextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.

the class XmlTextAnnotationMakerOntonotesTest method testNestedNames.

/**
 * Nested tags used to throw off the edit offsets. For every markup span found in a sentence
 * with one ENAMEX nested inside another, map its original offsets into the cleaned text and
 * check that the text it covers is one of the reference entities.
 */
@Test
public void testNestedNames() {
    String nestedMarkup = "He spoke with Paul <ENAMEX TYPE=\"PERSON\"><ENAMEX TYPE=\"PERSON\" E_OFF=\"1\">Paula</ENAMEX> Zahn</ENAMEX> .";
    // the processor is configured to keep everything.
    XmlDocumentProcessor processor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    TokenizerTextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    XmlTextAnnotationMaker maker = new XmlTextAnnotationMaker(builder, processor);
    // build the annotation straight from the in-memory string.
    XmlTextAnnotation annotated = maker.createTextAnnotation(nestedMarkup, "OntoNotes 5.0", "test");
    TextAnnotation cleanTa = annotated.getTextAnnotation();
    List<XmlDocumentProcessor.SpanInfo> spans = annotated.getXmlMarkup();
    StringTransformation offsetMap = annotated.getXmlSt();
    for (int idx = 0; idx < spans.size(); idx++) {
        XmlDocumentProcessor.SpanInfo span = spans.get(idx);
        // translate both ends of the span from original-xml offsets to clean-text offsets.
        int from = offsetMap.computeModifiedOffsetFromOriginal(span.spanOffsets.getFirst());
        int to = offsetMap.computeModifiedOffsetFromOriginal(span.spanOffsets.getSecond());
        String covered = cleanTa.getText().substring(from, to);
        assertTrue(REF_ENTITIES.contains(covered));
    }
}
Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Test(org.junit.Test)

Aggregations

XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)14 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)13 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)6 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)5 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)5 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)4 XmlTextAnnotationMaker (edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker)3 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)3 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)3 EREMentionRelationReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader)3 StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)3 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)3 SpanInfo (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo)2 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 Test (org.junit.Test)2 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)1 CoreferenceView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.CoreferenceView)1 Counter (edu.illinois.cs.cogcomp.core.stats.Counter)1 TextCleanerStringTransformation (edu.illinois.cs.cogcomp.core.utilities.TextCleanerStringTransformation)1