Search in sources :

Example 1 with SpanInfo

use of edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo in project cogcomp-nlp by CogComp.

the class ConvertOntonotesToColumn method getNameTextAnnotation.

/**
 * Reads the given file and builds an {@link XmlTextAnnotation} from its contents, then adds a
 * {@code ViewNames.NER_ONTONOTES} span-label view holding one constituent per {@code enamex}
 * markup span whose character offsets can be mapped onto tokens of the cleaned text.
 * Spans that cannot be mapped to tokens are reported on {@code System.err} and skipped.
 *
 * @param file the file to read.
 * @return the XmlTextAnnotation containing the text annotation, and xml markup offset data.
 * @throws IOException if the canonical path cannot be resolved or the file cannot be read.
 */
private static XmlTextAnnotation getNameTextAnnotation(File file) throws IOException {
    String document = LineIO.slurp(file.getCanonicalPath());
    // we keep everything.
    XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    StatefulTokenizer st = new StatefulTokenizer();
    TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
    XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);
    // create the annotation from the file contents.
    XmlTextAnnotation xta = xtam.createTextAnnotation(document, "OntoNotes 5.0", "test");
    TextAnnotation ta = xta.getTextAnnotation();
    List<SpanInfo> xmlMarkup = xta.getXmlMarkup();
    // create the named entity view
    View nerView = new SpanLabelView(ViewNames.NER_ONTONOTES, ta);
    for (SpanInfo si : xmlMarkup) {
        if (!"enamex".equalsIgnoreCase(si.label)) {
            continue;
        }
        IntPair charOffsets = si.spanOffsets;
        String neLabel = si.attributes.get("type").getFirst();
        // translate offsets in the original (marked-up) text to offsets in the cleaned text
        int cleanTextCharStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
        int cleanTextCharEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
        int cleanTextNeTokStart = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
        // StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
        int cleanTextNeTokEnd = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1);
        if (cleanTextNeTokStart == -1 || cleanTextNeTokEnd == -1) {
            // a -1 token id means the character offset did not land on a token; building a
            // constituent from it would produce an invalid token range, so skip the span.
            System.err.println("Could not map enamex span " + charOffsets + " in \"" + file
                    + "\" to tokens (clean text offsets " + cleanTextCharStart + " - "
                    + cleanTextCharEnd + "); skipping.");
            continue;
        }
        // constituent token indexing uses one-past-the-end
        Constituent neCon = new Constituent(neLabel, nerView.getViewName(), ta, cleanTextNeTokStart, cleanTextNeTokEnd + 1);
        nerView.addConstituent(neCon);
    }
    ta.addView(ViewNames.NER_ONTONOTES, nerView);
    return xta;
}
Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) SpanInfo(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 2 with SpanInfo

use of edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo in project cogcomp-nlp by CogComp.

the class OntonotesNamedEntityReader method nextAnnotation.

/**
 * Builds an {@link XmlTextAnnotation} from the raw document text and attaches a span-label
 * view named {@code VIEW_NAME} holding one constituent per {@code enamex} markup span.
 * For every span that is added, the count for its label in {@code entityCounts} is incremented.
 * A span whose start or end character offset maps to token id -1 is not added; instead the
 * constituents collected so far and a diagnostic message are written to {@code System.err}.
 *
 * @param data the raw document text, including the xml markup.
 * @param docid the id used as the text id of the created annotation and in diagnostics.
 * @return the xml text annotation, whose text annotation carries the named entity view.
 * @throws AnnotatorException declared by this method; presumably raised while the
 *         annotation is being created (not visible in this snippet).
 */
private XmlTextAnnotation nextAnnotation(String data, String docid) throws AnnotatorException {
    // every tag is retained by the processor.
    XmlDocumentProcessor processor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    XmlTextAnnotationMaker maker =
            new XmlTextAnnotationMaker(new TokenizerTextAnnotationBuilder(new StatefulTokenizer()), processor);

    // build the annotation from the document text.
    XmlTextAnnotation xta = maker.createTextAnnotation(data, "OntoNotes 5.0", docid);
    TextAnnotation ta = xta.getTextAnnotation();

    // collect the named entities into their own view.
    View nerView = new SpanLabelView(VIEW_NAME, ta);
    for (SpanInfo span : xta.getXmlMarkup()) {
        if (!"enamex".equalsIgnoreCase(span.label)) {
            continue;
        }
        IntPair originalOffsets = span.spanOffsets;
        Pair<String, IntPair> typeAttribute = span.attributes.get("type");
        String entityType = typeAttribute.getFirst();

        // offsets in the marked-up text -> offsets in the cleaned text.
        int cleanStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(originalOffsets.getFirst());
        int cleanEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(originalOffsets.getSecond());

        int firstToken = ta.getTokenIdFromCharacterOffset(cleanStart);
        // cleanEnd is one past the last character, but the token lookup wants the last character itself.
        int lastToken = ta.getTokenIdFromCharacterOffset(cleanEnd - 1);

        boolean mapped = firstToken != -1 && lastToken != -1;
        if (!mapped) {
            // dump what has been collected so far, then report the offending span.
            for (Constituent existing : nerView.getConstituents()) {
                System.err.println(existing);
            }
            System.err.println("Something wonky in \"" + docid + "\", at " + originalOffsets + ", " + cleanStart + " - " + cleanEnd + " = " + ta.text.substring(cleanStart, cleanEnd));
            continue;
        }

        // keep a running tally of how often each entity type is seen.
        if (!entityCounts.containsKey(entityType)) {
            entityCounts.put(entityType, 1);
        } else {
            entityCounts.put(entityType, (entityCounts.get(entityType) + 1));
        }

        // constituents take an exclusive end token index, hence the + 1.
        nerView.addConstituent(
                new Constituent(entityType, nerView.getViewName(), ta, firstToken, lastToken + 1));
    }
    ta.addView(VIEW_NAME, nerView);
    return xta;
}
Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) SpanInfo(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Aggregations

XmlTextAnnotationMaker (edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker)2 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)2 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)2 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)2 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)2 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)2 XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)2 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)2 SpanInfo (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo)2 StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)2 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)2