Search in sources :

Example 6 with XmlTextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.

the class OntonotesNamedEntityReader method main.

/**
 * Reads the OntoNotes data under the input directory and writes one output file per document
 * beneath the output directory, retaining the directory structure of the original data.
 * <p>
 * While {@code producejson} is {@code true} (the hard-wired setting) each document's
 * TextAnnotation is serialized to JSON; the alternative branch writes the NER view in CoNLL
 * column format to a file carrying a ".conll" suffix.
 *
 * @param args command line args specify input data directory, language and output directory.
 * @throws IOException declared for I/O failures raised outside the per-document JSON handling
 *         (which reports its own failures and moves on to the next document).
 */
public static void main(String[] args) throws IOException {
    if (args.length < 3) {
        System.err.println("This executable requires three arguments:\n" + " OntonotesNamedEntityReader <OntoNotes Directory> <language> <output_directory>");
        System.exit(-1);
    }
    String topdir = args[0];
    String outputdir = args[2];
    OntonotesNamedEntityReader otr = new OntonotesNamedEntityReader(topdir, args[1]);
    int count = 0;
    final boolean producejson = true;
    while (otr.hasNext()) {
        XmlTextAnnotation xta = otr.next();
        String path = otr.currentfile;
        if (producejson) {
            try {
                String json = SerializationHelper.serializeToJson(xta.getTextAnnotation());
                // Swap only the leading input directory for the output directory, the same way
                // the CoNLL branch does. String.replace would also rewrite any later path
                // segment that happens to contain the input directory name; it is kept solely
                // as a fallback for paths that do not start with the input directory.
                String outfile = path.startsWith(topdir)
                        ? outputdir + path.substring(topdir.length())
                        : path.replace(topdir, outputdir);
                File outputfile = new File(outfile);
                // getParentFile() is null for a bare file name; there is nothing to create then.
                File parent = outputfile.getParentFile();
                if (parent != null) {
                    parent.mkdirs();
                }
                // NOTE(review): PrintWriter(File) encodes with the platform default charset;
                // confirm that matches what the consumers of these JSON files expect.
                try (PrintWriter out = new PrintWriter(outputfile)) {
                    out.print(json);
                }
            } catch (Throwable t) {
                // Deliberately best-effort: one bad document must not abort the whole run. The
                // catch stays broad because the failure types raised by serialization are not
                // visible from here, but the actual cause is now reported alongside the
                // original message instead of being discarded.
                System.out.println(path + " produced the incorrect offset.");
                System.err.println("  cause: " + t);
            }
        } else {
            TextAnnotation ta = xta.getTextAnnotation();
            path = outputdir + path.substring(topdir.length());
            path += ".conll";
            CoNLL2002Writer.writeViewInCoNLL2003Format(ta.getView(VIEW_NAME), ta, path);
        }
        count++;
        // progress report every ten documents
        if ((count % 10) == 0) {
            System.out.println("Completed " + count + " of " + otr.filelist.size());
        }
    }
    System.out.println(otr.generateReport());
}
Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) File(java.io.File) PrintWriter(java.io.PrintWriter)

Example 7 with XmlTextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.

the class ConvertOntonotesToColumn method getNameTextAnnotation.

/**
 * Loads the given file, builds an XmlTextAnnotation from its contents, and attaches a
 * {@code ViewNames.NER_ONTONOTES} view holding one constituent per "enamex" markup span.
 *
 * @param file the file to read.
 * @return the XmlTextAnnotation containing the text annotation, and xml markup offset data.
 * @throws IOException if resolving the canonical path or reading the file raises one.
 */
private static XmlTextAnnotation getNameTextAnnotation(File file) throws IOException {
    String rawXml = LineIO.slurp(file.getCanonicalPath());

    // Assemble the xml-aware annotation maker; the final 'true' argument is kept as in the
    // original, which described it as "we keep everything".
    XmlDocumentProcessor processor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    TokenizerTextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    XmlTextAnnotationMaker maker = new XmlTextAnnotationMaker(builder, processor);

    // Create the annotation from the raw document text.
    XmlTextAnnotation result = maker.createTextAnnotation(rawXml, "OntoNotes 5.0", "test");
    TextAnnotation text = result.getTextAnnotation();
    List<SpanInfo> markup = result.getXmlMarkup();

    // Build the named entity view from the enamex spans.
    View entities = new SpanLabelView(ViewNames.NER_ONTONOTES, text);
    for (SpanInfo span : markup) {
        if (!"enamex".equalsIgnoreCase(span.label)) {
            continue;
        }
        IntPair sourceOffsets = span.spanOffsets;
        String entityType = span.attributes.get("type").getFirst();

        // Map character offsets in the original xml onto the cleaned text.
        int charStart = result.getXmlSt().computeModifiedOffsetFromOriginal(sourceOffsets.getFirst());
        int charEnd = result.getXmlSt().computeModifiedOffsetFromOriginal(sourceOffsets.getSecond());

        int firstToken = ta(text, charStart);
        // The mapped end offset is one-past-the-end, so look up the token holding the last
        // character, then add one because constituent token spans are one-past-the-end too.
        int tokenEndExclusive = ta(text, charEnd - 1) + 1;

        entities.addConstituent(new Constituent(entityType, entities.getViewName(), text, firstToken, tokenEndExclusive));
    }
    text.addView(ViewNames.NER_ONTONOTES, entities);
    return result;
}

/** Returns the id of the token covering the given character offset of the annotation. */
private static int ta(TextAnnotation text, int charOffset) {
    return text.getTokenIdFromCharacterOffset(charOffset);
}
Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) SpanInfo(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 8 with XmlTextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.

the class MultilingualEreReaderTest method testReader.

/**
 * Pulls the first document from the reader and checks that its mention and event views exist
 * and hold more than the minimum expected number of constituents and relations.
 *
 * @param reader the ERE event reader under test; must yield at least one document.
 */
private static void testReader(EREEventReader reader) {
    assertTrue(reader.hasNext());
    final TextAnnotation doc = reader.next().getTextAnnotation();

    // Mention view: presence is asserted before the view is fetched.
    final String mentionView = ViewNames.MENTION_ERE;
    assertTrue(doc.hasView(mentionView));
    assertTrue(doc.getView(mentionView).getConstituents().size() > 5);
    assertTrue(doc.getView(mentionView).getRelations().size() > 0);

    // Event view: same pattern, different thresholds.
    final String eventView = ViewNames.EVENT_ERE;
    assertTrue(doc.hasView(eventView));
    assertTrue(doc.getView(eventView).getConstituents().size() > 1);
    assertTrue(doc.getView(eventView).getRelations().size() > 2);
}
Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)

Example 9 with XmlTextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.

the class MentionDetectionTest method testHeadInference.

/**
 * Runs the POS annotator and the mention annotator over every document the ERE reader yields
 * from {@code src/test/resources/ERE}, failing if any document ends up with fewer than 60
 * constituents in its "MENTION" view.
 * <p>
 * Any exception raised while reading or annotating fails the test; it is not merely printed,
 * because a swallowed exception would let the test pass without checking anything.
 */
@Test
public void testHeadInference() {
    try {
        String path = "src/test/resources/ERE";
        EREMentionRelationReader ereMentionRelationReader = new EREMentionRelationReader(EREDocumentReader.EreCorpus.ENR3, path, false);
        POSAnnotator posAnnotator = new POSAnnotator();
        MentionAnnotator mentionAnnotator = new MentionAnnotator();
        for (XmlTextAnnotation xta : ereMentionRelationReader) {
            TextAnnotation ta = xta.getTextAnnotation();
            ta.addView(posAnnotator);
            mentionAnnotator.addView(ta);
            if (ta.getView("MENTION").getNumberOfConstituents() < 60) {
                fail("Mention Head predicted performance dropped");
            }
        }
    } catch (Exception e) {
        // fail() raises an Error, so it never lands here; only genuine setup, reading or
        // annotation failures do. Surface them as a test failure with the cause attached.
        throw new AssertionError("testHeadInference could not complete", e);
    }
}
Also used : EREMentionRelationReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) POSAnnotator(edu.illinois.cs.cogcomp.pos.POSAnnotator) MentionAnnotator(org.cogcomp.md.MentionAnnotator) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Test(org.junit.Test)

Example 10 with XmlTextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.

the class XmlTextAnnotationMaker method createTextAnnotation.

/**
 * Builds an {@link XmlTextAnnotation} from raw xml: the xml processor splits the input into
 * cleaned text plus markup spans, and the cleaned text is handed to the TextAnnotation
 * builder.
 *
 * @param xmlText Raw xml text from corpus document
 * @param corpusId corpus identifier
 * @param docId text identifier
 * @return an XmlTextAnnotation with the cleaned text (StringTransformation), TextAnnotation for
 *          the cleaned text, and xml markup extracted from source
 */
public XmlTextAnnotation createTextAnnotation(String xmlText, String corpusId, String docId) {
    logger.debug("processing text from document {}", docId);

    Pair<StringTransformation, List<XmlDocumentProcessor.SpanInfo>> processed = xmlProcessor.processXml(xmlText);

    // First component: the transformation holding the cleaned text.
    StringTransformation cleaned = processed.getFirst();
    TextAnnotation annotation = taBuilder.createTextAnnotation(corpusId, docId, cleaned.getTransformedText());

    // Second component: the markup spans extracted from the source xml.
    List<XmlDocumentProcessor.SpanInfo> markup = processed.getSecond();
    return new XmlTextAnnotation(cleaned, annotation, markup);
}
Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) TextCleanerStringTransformation(edu.illinois.cs.cogcomp.core.utilities.TextCleanerStringTransformation) List(java.util.List) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)

Aggregations

XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)14 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)13 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)6 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)5 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)5 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)4 XmlTextAnnotationMaker (edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker)3 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)3 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)3 EREMentionRelationReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader)3 StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)3 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)3 SpanInfo (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo)2 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 Test (org.junit.Test)2 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)1 CoreferenceView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.CoreferenceView)1 Counter (edu.illinois.cs.cogcomp.core.stats.Counter)1 TextCleanerStringTransformation (edu.illinois.cs.cogcomp.core.utilities.TextCleanerStringTransformation)1