Search in sources :

Example 26 with TextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.

the class EREDocumentReader method createAndAddXmlMarkupAnnotations.

/**
 * Builds a view of post and quote spans from the xml markup of the given annotation and, if
 * the view ends up non-empty, adds it to the underlying TextAnnotation under the post view name.
 * Each constituent is labeled with the span type. Attribute AUTHOR holds the post author (or the
 * original author, for a quote); NAME_START and NAME_END hold the offsets of that name as
 * recorded in the markup attributes.
 *
 * @param xmlTa an XmlTextAnnotation containing the markup information to use for a POST_ERE view.
 * @throws IllegalStateException if either span offset maps to -1 when converted to clean-text
 *         offsets.
 */
private void createAndAddXmlMarkupAnnotations(XmlTextAnnotation xmlTa) {
    List<XmlDocumentProcessor.SpanInfo> markup = xmlTa.getXmlMarkup();
    TextAnnotation ta = xmlTa.getTextAnnotation();
    View postView = new View(getPostViewName(), NAME, ta, 1.0);

    for (XmlDocumentProcessor.SpanInfo span : markup) {
        final String spanType = span.label;

        // Only post and quote spans are of interest; each keeps its author under a different key.
        final Pair<String, IntPair> author;
        if (POST.equals(spanType)) {
            author = span.attributes.get(AUTHOR);
        } else if (QUOTE.equals(spanType)) {
            author = span.attributes.get(ORIG_AUTHOR);
        } else {
            continue;
        }

        // Translate the span's offsets in the original xml into offsets in the cleaned text.
        final int cleanStart =
                xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(span.spanOffsets.getFirst());
        final int cleanEnd =
                xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(span.spanOffsets.getSecond());
        if (cleanStart == -1 || cleanEnd == -1) {
            throw new IllegalStateException("could not compute cleanText offsets for " + spanType + " span with offsets " + span.spanOffsets.getFirst() + ", " + span.spanOffsets.getSecond());
        }

        final int firstTok = ta.getTokenIdFromCharacterOffset(cleanStart);
        final int lastTok = ta.getTokenIdFromCharacterOffset(cleanEnd);
        assert (firstTok >= 0 && lastTok >= 0 && lastTok > firstTok);

        Constituent c = new Constituent(spanType, getPostViewName(), ta, firstTok, lastTok);

        // NOTE(review): spans without author information are never added to the view --
        // confirm this is intended rather than an accident of brace placement.
        if (author == null) {
            continue;
        }
        final IntPair nameOffsets = author.getSecond();
        c.addAttribute(AUTHOR, author.getFirst());
        c.addAttribute(NAME_START, Integer.toString(nameOffsets.getFirst()));
        c.addAttribute(NAME_END, Integer.toString(nameOffsets.getSecond()));
        postView.addConstituent(c);
    }

    // An empty view is not attached to the TextAnnotation at all.
    if (!postView.getConstituents().isEmpty()) {
        ta.addView(getPostViewName(), postView);
    }
}
Also used : XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 27 with TextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.

the class PennTreebankPOSReader method createTextAnnotation.

/**
 * Turns one line of bracketed text into a {@link TextAnnotation} consisting of a single
 * sentence, and attaches a {@link ViewNames#POS} view built from the tags found in the line.
 * The first and last character of the line are discarded before it is split into pairs; within
 * each pair the first field is taken as the POS tag and the second as the word.
 *
 * @param line The bracketed string to be processed
 * @param lineId The ID of the {@link TextAnnotation}
 * @return A {@link TextAnnotation} with a populated {@link ViewNames#POS} view
 */
public TextAnnotation createTextAnnotation(String line, String lineId) {
    final String inner = line.substring(1, line.length() - 1);
    final String[] pairs = splitWordsPattern.split(inner);

    final String[] tokens = new String[pairs.length];
    final String[] tags = new String[pairs.length];
    for (int k = 0; k < pairs.length; k++) {
        final String[] fields = whitespacePattern.split(pairs[k]);
        tokens[k] = fields[1];
        tags[k] = fields[0];
    }

    // The whole line is treated as one tokenized sentence.
    final List<String[]> sentences = Collections.singletonList(tokens);
    final TextAnnotation ta =
            BasicTextAnnotationBuilder.createTextAnnotationFromTokens(corpusName, lineId, sentences);

    final TokenLabelView posView = new TokenLabelView(ViewNames.POS, ta);
    int tokenId = 0;
    for (String tag : tags) {
        posView.addTokenLabel(tokenId++, tag, 1.0);
    }
    ta.addView(ViewNames.POS, posView);
    return ta;
}
Also used : ArrayList(java.util.ArrayList) TokenLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TokenLabelView) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)

Example 28 with TextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.

the class TreebankChunkReader method next.

/**
 * Returns the next TextAnnotation from the superclass reader after passing it through
 * {@code addChunkAnnotation}. When no chunk lines are loaded yet, or the current chunk line
 * index has reached the end of the loaded lines, the chunk file matching the current section
 * and file is read first and the chunk line index is reset to zero.
 *
 * @return the result of {@code addChunkAnnotation} for the next TextAnnotation
 */
@Override
public TextAnnotation next() {
    final TextAnnotation ta = super.next();

    final boolean chunkFileExhausted =
            (null == chunkLines) || (chunkLines.size() == currentChunkLineId);
    if (chunkFileExhausted) {
        // The section/file counters are one past the entry currently being processed.
        final int sectionIdx = this.currentSectionId - 1;
        final int fileIdx = this.currentFileId - 1;
        final String chunkFile =
                chunkHome + "/" + sections[sectionIdx] + "/" + currentSectionFiles[fileIdx];
        try {
            chunkLines = LineIO.read(chunkFile);
            currentChunkLineId = 0;
        } catch (FileNotFoundException e) {
            // NOTE(review): a missing chunk file is only reported here; the previous
            // chunkLines value (possibly null) is then handed on -- confirm this is intended.
            e.printStackTrace();
        }
    }

    return addChunkAnnotation(ta, currentChunkLineId);
}
Also used : FileNotFoundException(java.io.FileNotFoundException) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)

Example 29 with TextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.

the class ConvertEREToCoNLLFormat method main.

/**
 * Reads an ERE corpus and writes the mention view of each document to a file in the output
 * directory using {@code CoNLL2002Writer.writeViewInCoNLL2003Format}. Output files are named
 * after each document's corpus id with a ".txt" suffix.
 *
 * @param args command line arguments, in order: the ERE corpus value (a name of an
 *        {@code EreCorpus} enumeration constant), the corpus root directory, whether to include
 *        nominals ({@code true} or {@code false}), and the output directory. Any additional
 *        arguments are ignored.
 * @throws Exception if reading the corpus or writing the output fails
 */
public static void main(String[] args) throws Exception {
    // Four arguments are documented and used. The earlier check demanded exactly five, which
    // rejected the documented invocation; extra arguments are tolerated so that callers that
    // padded the command line keep working.
    if (args.length < 4) {
        System.err.println("Usage: " + NAME + " ERECorpusVal corpusRoot includeNominals<true|false> outDir\n\nSee " + "module README or ERECorpusReader.EreCorpus enumeration for possible values.");
        System.exit(-1);
    }
    final String ereCorpusVal = args[0];
    final String corpusRoot = args[1];
    final boolean includeNominals = Boolean.parseBoolean(args[2]);
    final String conllDir = args[3];

    // Explicit braces: without them the else attached to the inner if, so the directory was
    // only "created" when it already existed and never when it was missing.
    if (IOUtils.exists(conllDir)) {
        if (!IOUtils.isDirectory(conllDir)) {
            System.err.println("Output directory '" + conllDir + "' exists and is not a directory.");
            System.exit(-1);
        }
    } else {
        IOUtils.mkdir(conllDir);
    }

    boolean throwExceptionOnXmlTagMismatch = true;
    ERENerReader reader = new ERENerReader(EreCorpus.valueOf(ereCorpusVal), corpusRoot, throwExceptionOnXmlTagMismatch, includeNominals, includeNominals);
    while (reader.hasNext()) {
        XmlTextAnnotation xmlTa = reader.next();
        TextAnnotation ta = xmlTa.getTextAnnotation();
        View nerView = ta.getView(reader.getMentionViewName());
        CoNLL2002Writer.writeViewInCoNLL2003Format(nerView, ta, conllDir + "/" + ta.getCorpusId() + ".txt");
    }
}
Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)

Example 30 with TextAnnotation

use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.

the class JsonSerializerTest method testJsonSerializedTaUpdate.

/**
 * Checks that a TextAnnotation which is serialized to a json file, read back, modified, and
 * then serialized again to the same target file results in a correctly updated file: the
 * second deserialization must contain the view that was added between the two writes.
 */
@Test
public void testJsonSerializedTaUpdate() {
    // generate a fresh TA so that it cannot already carry the "rhyme" view
    final String[] requestedViews = { ViewNames.POS, ViewNames.NER_CONLL, ViewNames.SRL_VERB };
    final boolean withNoise = false;
    TextAnnotation original =
            DummyTextAnnotationGenerator.generateAnnotatedTextAnnotation(requestedViews, withNoise, 3);

    // prepare an empty working directory for the serialized file
    final String serTestDir = "serTestDir";
    if (IOUtils.exists(serTestDir)) {
        if (IOUtils.isFile(serTestDir)) {
            throw new IllegalStateException("ERROR: test directory " + serTestDir + " already exists as file.");
        }
        try {
            IOUtils.cleanDir(serTestDir);
        } catch (IOException e) {
            e.printStackTrace();
            throw new IllegalStateException("ERROR: test directory " + serTestDir + " could not be cleaned. Permissions?");
        }
    } else {
        IOUtils.mkdir(serTestDir);
    }
    if (!IOUtils.getListOfFilesInDir(serTestDir).isEmpty()) {
        throw new IllegalStateException("ERROR: test directory " + serTestDir + " already contains files even after cleaning.");
    }

    final String fileName = serTestDir + "/arbitrary.json";
    final boolean forceOverwrite = true;
    final boolean useJson = true;

    // first round trip: write the original TA, then read it back
    try {
        SerializationHelper.serializeTextAnnotationToFile(original, fileName, forceOverwrite, useJson);
    } catch (IOException e) {
        e.printStackTrace();
        fail("error trying to serialize json file " + fileName + ".");
    }
    TextAnnotation firstRead = null;
    try {
        firstRead = SerializationHelper.deserializeTextAnnotationFromFile(fileName, useJson);
    } catch (Exception e) {
        e.printStackTrace();
        fail("error trying to deserialize json file " + fileName + ".");
    }
    assertTrue(firstRead.hasView(ViewNames.SRL_VERB));
    assertFalse(firstRead.hasView(RHYME_VIEW_NAME));

    // modify the deserialized TA, then overwrite the same file with it
    addRhymeViewToTa(firstRead);
    assertTrue(firstRead.hasView(RHYME_VIEW_NAME));
    try {
        SerializationHelper.serializeTextAnnotationToFile(firstRead, fileName, forceOverwrite, useJson);
    } catch (IOException e) {
        e.printStackTrace();
        fail("error trying to serialize json file " + fileName + " for second time.");
    }

    // second round trip: the file must now contain the added view and its constituents
    TextAnnotation secondRead = null;
    try {
        secondRead = SerializationHelper.deserializeTextAnnotationFromFile(fileName, useJson);
    } catch (Exception e) {
        e.printStackTrace();
        fail("error trying to deserialize json file " + fileName + " for second time.");
    }
    assertTrue(secondRead.hasView(RHYME_VIEW_NAME));
    assertTrue(!secondRead.getView(RHYME_VIEW_NAME).getConstituents().isEmpty());
}
Also used : IOException(java.io.IOException) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Test(org.junit.Test)

Aggregations

TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)292 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)121 Test (org.junit.Test)84 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)60 Feature (edu.illinois.cs.cogcomp.edison.features.Feature)48 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)40 ArrayList (java.util.ArrayList)33 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)32 DiscreteFeature (edu.illinois.cs.cogcomp.edison.features.DiscreteFeature)28 TreeView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TreeView)27 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)24 EdisonException (edu.illinois.cs.cogcomp.edison.utilities.EdisonException)22 IOException (java.io.IOException)22 LinkedHashSet (java.util.LinkedHashSet)21 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)20 StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)19 PredicateArgumentView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView)18 Relation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Relation)18 File (java.io.File)18 XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)16