Examples with XmlTextAnnotation - edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation

Example 1 with XmlTextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.

the class BIOTester method statistics.

/**
 * Tallies entity mentions per mention type (NAM / NOM / PRO) in the ACE and ERE corpora, and
 * the number of mentions in the two TAC column-format files, then prints all totals to
 * standard out.
 *
 * Any exception raised while reading a corpus is printed and counting stops at that point;
 * the totals gathered so far are still reported.
 */
public static void statistics() {
    int aceNam = 0;
    int aceNom = 0;
    int acePro = 0;
    int ereNam = 0;
    int ereNom = 0;
    int erePro = 0;
    int tacNam = 0;
    int tacNom = 0;
    try {
        // ACE: one counter per mention type found in the ACE mention view.
        ACEReaderWithTrueCaseFixer aceReader = new ACEReaderWithTrueCaseFixer("data/all", false);
        for (TextAnnotation aceTa : aceReader) {
            for (Constituent mention : aceTa.getView(ViewNames.MENTION_ACE)) {
                switch (mention.getAttribute("EntityMentionType")) {
                    case "NAM":
                        aceNam++;
                        break;
                    case "NOM":
                        aceNom++;
                        break;
                    case "PRO":
                        acePro++;
                        break;
                    default:
                        // other mention types are not counted
                        break;
                }
            }
        }

        // ERE: the reader yields XmlTextAnnotation wrappers; the mentions live on the wrapped TextAnnotation.
        EREMentionRelationReader ereReader = new EREMentionRelationReader(EREDocumentReader.EreCorpus.ENR3, "data/ere/data", false);
        for (XmlTextAnnotation wrapped : ereReader) {
            TextAnnotation ereTa = wrapped.getTextAnnotation();
            for (Constituent mention : ereTa.getView(ViewNames.MENTION_ERE)) {
                switch (mention.getAttribute("EntityMentionType")) {
                    case "NAM":
                        ereNam++;
                        break;
                    case "NOM":
                        ereNom++;
                        break;
                    case "PRO":
                        erePro++;
                        break;
                    default:
                        // other mention types are not counted
                        break;
                }
            }
        }

        // TAC: each file holds a single mention type, so every constituent in the view is counted.
        ColumnFormatReader tacReader = new ColumnFormatReader("data/tac/2016.nam");
        for (TextAnnotation tacTa : tacReader) {
            for (Constituent ignored : tacTa.getView("MENTIONS")) {
                tacNam++;
            }
        }
        tacReader = new ColumnFormatReader("data/tac/2016.nom");
        for (TextAnnotation tacTa : tacReader) {
            for (Constituent ignored : tacTa.getView("MENTIONS")) {
                tacNom++;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    System.out.println("ACE_NAM: " + aceNam);
    System.out.println("ACE_NOM: " + aceNom);
    System.out.println("ACE_PRO: " + acePro);
    System.out.println("ERE_NAM: " + ereNam);
    System.out.println("ERE_NOM: " + ereNom);
    System.out.println("ERE_PRO: " + erePro);
    System.out.println("TAC_NAM: " + tacNam);
    System.out.println("TAC_NOM: " + tacNom);
}

Also used : EREMentionRelationReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader) ACEReaderWithTrueCaseFixer(edu.illinois.cs.cogcomp.nlp.corpusreaders.ACEReaderWithTrueCaseFixer) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) DatastoreException(org.cogcomp.DatastoreException) JWNLException(net.didion.jwnl.JWNLException) InvalidEndpointException(io.minio.errors.InvalidEndpointException) IOException(java.io.IOException) InvalidPortException(io.minio.errors.InvalidPortException)

Example 2 with XmlTextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.

the class EREDocumentReader method createAndAddXmlMarkupAnnotations.

/**
 * Builds a view whose constituents mark post and quote spans, and attaches it to the
 * TextAnnotation wrapped by the given XmlTextAnnotation.
 *
 * Each constituent is labelled with the span type. Attribute AUTHOR carries the post or quote
 * author name, and attributes NAME_START and NAME_END carry the offsets stored alongside that
 * name in the xml markup. A span with no author information is not added to the view, and the
 * view itself is only attached when it ends up with at least one constituent.
 *
 * @param xmlTa an XmlTextAnnotation whose xml markup is used to populate the post view.
 * @throws IllegalStateException if the clean-text offsets of a post or quote span cannot be
 *         computed from its original xml offsets.
 */
private void createAndAddXmlMarkupAnnotations(XmlTextAnnotation xmlTa) {
    List<XmlDocumentProcessor.SpanInfo> markup = xmlTa.getXmlMarkup();
    TextAnnotation ta = xmlTa.getTextAnnotation();
    String viewName = getPostViewName();
    View postView = new View(viewName, NAME, ta, 1.0);

    for (XmlDocumentProcessor.SpanInfo span : markup) {
        String label = span.label;

        // Only post and quote spans are of interest; they keep the author under different keys.
        final Pair<String, IntPair> authorInfo;
        if (POST.equals(label)) {
            authorInfo = span.attributes.get(AUTHOR);
        } else if (QUOTE.equals(label)) {
            authorInfo = span.attributes.get(ORIG_AUTHOR);
        } else {
            continue;
        }

        // Map the span's offsets in the original xml onto offsets in the cleaned text.
        int cleanStart = xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(span.spanOffsets.getFirst());
        int cleanEnd = xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(span.spanOffsets.getSecond());
        if (-1 == cleanStart || -1 == cleanEnd) {
            throw new IllegalStateException("could not compute cleanText offsets for " + label + " span with offsets " + span.spanOffsets.getFirst() + ", " + span.spanOffsets.getSecond());
        }

        int tokStart = ta.getTokenIdFromCharacterOffset(cleanStart);
        int tokEnd = ta.getTokenIdFromCharacterOffset(cleanEnd);
        assert (tokStart >= 0 && tokEnd >= 0 && tokEnd > tokStart);

        Constituent postSpan = new Constituent(label, viewName, ta, tokStart, tokEnd);
        if (null == authorInfo) {
            // spans without author information are left out of the view
            continue;
        }
        IntPair nameOffsets = authorInfo.getSecond();
        postSpan.addAttribute(AUTHOR, authorInfo.getFirst());
        postSpan.addAttribute(NAME_START, Integer.toString(nameOffsets.getFirst()));
        postSpan.addAttribute(NAME_END, Integer.toString(nameOffsets.getSecond()));
        postView.addConstituent(postSpan);
    }

    if (!postView.getConstituents().isEmpty()) {
        ta.addView(viewName, postView);
    }
}

Also used : XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 3 with XmlTextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.

the class ConvertEREToCoNLLFormat method main.

/**
 * Reads an ERE corpus and writes the mention view of every document in CoNLL column format.
 *
 * @param args command line arguments, exactly four: the ERE corpus identifier (an EreCorpus
 *        enumeration name), the corpus root directory, whether to include nominals
 *        ({@code true} or {@code false}), and the output directory.
 * @throws Exception if the corpus reader cannot be created or a document cannot be written.
 */
public static void main(String[] args) throws Exception {
    // The usage message documents four arguments and only args[0..3] are read.
    if (args.length != 4) {
        System.err.println("Usage: " + NAME + " ERECorpusVal corpusRoot includeNominals<true|false> outDir\n\nSee " + "module README or ERECorpusReader.EreCorpus enumeration for possible values.");
        System.exit(-1);
    }
    final String ereCorpusVal = args[0];
    final String corpusRoot = args[1];
    final boolean includeNominals = Boolean.parseBoolean(args[2]);
    final String conllDir = args[3];

    // Explicit braces: the 'else' must pair with the existence check so that a missing
    // output directory is created, and an existing non-directory path is rejected.
    if (IOUtils.exists(conllDir)) {
        if (!IOUtils.isDirectory(conllDir)) {
            System.err.println("Output directory '" + conllDir + "' exists and is not a directory.");
            System.exit(-1);
        }
    } else {
        IOUtils.mkdir(conllDir);
    }

    boolean throwExceptionOnXmlTagMismatch = true;
    ERENerReader reader = new ERENerReader(EreCorpus.valueOf(ereCorpusVal), corpusRoot, throwExceptionOnXmlTagMismatch, includeNominals, includeNominals);
    while (reader.hasNext()) {
        XmlTextAnnotation xmlTa = reader.next();
        TextAnnotation ta = xmlTa.getTextAnnotation();
        View nerView = ta.getView(reader.getMentionViewName());
        // NOTE(review): the output file name is built from getCorpusId(); if that value is the
        // same for every document in a corpus, each iteration overwrites the same file — confirm.
        CoNLL2002Writer.writeViewInCoNLL2003Format(nerView, ta, conllDir + "/" + ta.getCorpusId() + ".txt");
    }
}

Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)

Example 4 with XmlTextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.

the class CreateTrainDevTestSplit method main.

/**
 * Splits an ERE corpus into train/dev/test partitions using the fractions from the default
 * CorpusSplitConfigurator configuration, trying to balance all (or at least, the lowest
 * frequency) label counts across the partitions.
 *
 * Writes a "countInfo.txt" file with per-document, per-split and total label counts, plus one
 * file per split listing the document ids assigned to it. Exits with status -1 if the reader
 * cannot be created or an output file cannot be written.
 *
 * @param args exactly three values: the EreCorpus enumeration name, the corpus directory,
 *        and the directory to write the split files to.
 */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.println("Usage: " + NAME + " EreCorpusType corpusDir splitDir");
        System.exit(-1);
    }
    EREDocumentReader.EreCorpus corpusType = EREDocumentReader.EreCorpus.valueOf(args[0]);
    String corpusRoot = args[1];
    String outDir = args[2];

    ResourceManager config = new CorpusSplitConfigurator().getDefaultConfig();
    final boolean throwExceptionOnXmlParserFail = false;
    double trainFrac = config.getDouble(CorpusSplitConfigurator.TRAIN_FRACTION.key);
    double devFrac = config.getDouble(CorpusSplitConfigurator.DEV_FRACTION.key);
    double testFrac = config.getDouble(CorpusSplitConfigurator.TEST_FRACTION.key);

    IOUtils.mkdir(outDir);
    String outFileStem = outDir + "/";
    String[] viewNames = config.getCommaSeparatedValues(CorpusSplitConfigurator.VIEWS_TO_CONSIDER.key);
    // An empty label list makes the counter below consider every label.
    String[] labelsToCount = {};

    EREMentionRelationReader reader = null;
    try {
        reader = new EREEventReader(corpusType, corpusRoot, throwExceptionOnXmlParserFail);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }

    // Index every document by id, together with whichever of the configured views it has.
    Map<String, XmlTextAnnotation> ereTas = new HashMap<>();
    Map<String, Set<View>> ereViews = new HashMap<>();
    while (reader.hasNext()) {
        XmlTextAnnotation xmlTa = reader.next();
        TextAnnotation ta = xmlTa.getTextAnnotation();
        String docId = ta.getId();
        ereTas.put(docId, xmlTa);

        Set<View> docViews = new HashSet<>();
        for (String viewName : viewNames) {
            if (ta.hasView(viewName)) {
                docViews.add(ta.getView(viewName));
            }
        }
        ereViews.put(docId, docViews);
    }

    boolean countAllLabels = labelsToCount.length == 0;
    TextAnnotationLabelCounter labelCounter = new TextAnnotationLabelCounter(countAllLabels, labelsToCount, ereViews);
    CreateTrainDevTestSplit creator = new CreateTrainDevTestSplit(labelCounter);
    Map<Split, Set<String>> splits = creator.getSplits(trainFrac, devFrac, testFrac);
    Map<Split, Counter<String>> splitCounts = creator.getBestRelSplitCounts();
    Map<String, Counter<String>> docCounts = creator.getExampleLabelCounts();

    // Report: per-document counts, then per-split counts, then the corpus totals.
    List<String> outLines = new ArrayList<>(splitCounts.size() + 2);
    for (Map.Entry<String, Counter<String>> docEntry : docCounts.entrySet()) {
        outLines.add(docEntry.getKey() + ": " + printCounts(docEntry.getValue()));
    }
    for (Map.Entry<Split, Counter<String>> splitEntry : splitCounts.entrySet()) {
        outLines.add(splitEntry.getKey().name() + ": " + printCounts(splitEntry.getValue()));
    }
    Counter<String> totalLabelCounts = creator.getLabelTotals();
    outLines.add("TOTALS: " + printCounts(totalLabelCounts));
    try {
        LineIO.write(outFileStem + "countInfo.txt", outLines);
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
    }

    // One id-list file per split.
    for (Map.Entry<Split, Set<String>> splitEntry : splits.entrySet()) {
        List<String> ids = new ArrayList<>(splitEntry.getValue());
        try {
            LineIO.write(outFileStem + splitEntry.getKey().name() + ".txt", ids);
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(-1);
        }
    }
}

Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) EREDocumentReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREDocumentReader) Counter(edu.illinois.cs.cogcomp.core.stats.Counter) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) ResourceManager(edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager) IOException(java.io.IOException) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) IOException(java.io.IOException) EREMentionRelationReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader) EREEventReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREEventReader)

Example 5 with XmlTextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.

the class XmlDocumentReader method getAnnotationsFromFile.

/**
 * Parses one entry of the corpus file list generated by {@link #getFileListing()} into zero or
 * one XmlTextAnnotation objects.
 *
 * An entry is a list of files so that corpora with standoff annotations can supply the source
 * document first, followed by its markup files. This default implementation assumes a single
 * file holds both source and markup, and so reads only the first file of the entry.
 *
 * @param corpusFileListEntry a list of files, the first of which is a source file.
 * @return a list holding the XmlTextAnnotation built from the source file, or an empty list if
 *         none was produced.
 * @throws Exception if the source file cannot be read or processed.
 */
@Override
public List<XmlTextAnnotation> getAnnotationsFromFile(List<Path> corpusFileListEntry) throws Exception {
    Path sourcePath = corpusFileListEntry.get(0);
    // The last path element (the file name) serves as the document id.
    int lastNameIndex = sourcePath.getNameCount() - 1;
    fileId = sourcePath.getName(lastNameIndex).toString();
    logger.debug("read source file {}", fileId);
    numFiles++;

    String rawText = LineIO.slurp(sourcePath.toString());
    List<XmlTextAnnotation> annotations = new ArrayList<>(1);
    XmlTextAnnotation parsed = xmlTextAnnotationMaker.createTextAnnotation(rawText, this.corpusName, fileId);
    if (null == parsed) {
        return annotations;
    }
    annotations.add(parsed);
    numTextAnnotations++;
    return annotations;
}

Also used : Path(java.nio.file.Path) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) ArrayList(java.util.ArrayList)

Aggregations

XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)14 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)13 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)6 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)5 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)5 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)4 XmlTextAnnotationMaker (edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker)3 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)3 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)3 EREMentionRelationReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader)3 StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)3 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)3 SpanInfo (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor.SpanInfo)2 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 Test (org.junit.Test)2 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)1 CoreferenceView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.CoreferenceView)1 Counter (edu.illinois.cs.cogcomp.core.stats.Counter)1 TextCleanerStringTransformation (edu.illinois.cs.cogcomp.core.utilities.TextCleanerStringTransformation)1