use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
the class EREDocumentReader method createAndAddXmlMarkupAnnotations.
/**
 * Builds a view whose constituents mark post and quotation boundaries, and adds it to the
 * TextAnnotation wrapped by the given XmlTextAnnotation.
 * <p>
 * Each constituent is labeled with its span type. Attribute AUTHOR holds the post or quote author
 * name, and attributes NAME_START and NAME_END hold the offsets of that name in the original xml
 * text. The view is only added when it ends up with at least one constituent.
 *
 * @param xmlTa an XmlTextAnnotation containing the xml markup to use for a POST_ERE view.
 * @throws IllegalStateException if a post/quote span's offsets cannot be mapped into the clean text.
 */
private void createAndAddXmlMarkupAnnotations(XmlTextAnnotation xmlTa) {
    List<XmlDocumentProcessor.SpanInfo> spans = xmlTa.getXmlMarkup();
    TextAnnotation ta = xmlTa.getTextAnnotation();
    View postView = new View(getPostViewName(), NAME, ta, 1.0);

    for (XmlDocumentProcessor.SpanInfo span : spans) {
        final String spanType = span.label;

        // only post and quote spans are of interest; each keeps its author under a different key
        final Pair<String, IntPair> author;
        if (POST.equals(spanType)) {
            author = span.attributes.get(AUTHOR);
        } else if (QUOTE.equals(spanType)) {
            author = span.attributes.get(ORIG_AUTHOR);
        } else {
            continue;
        }

        // map the span's offsets in the original xml onto offsets in the cleaned text
        int xmlStart = span.spanOffsets.getFirst();
        int xmlEnd = span.spanOffsets.getSecond();
        int cleanStart = xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(xmlStart);
        int cleanEnd = xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(xmlEnd);
        if (cleanStart == -1 || cleanEnd == -1) {
            throw new IllegalStateException("could not compute cleanText offsets for " + spanType + " span with offsets " + xmlStart + ", " + xmlEnd);
        }

        int firstTok = ta.getTokenIdFromCharacterOffset(cleanStart);
        int endTok = ta.getTokenIdFromCharacterOffset(cleanEnd);
        // a non-negative start plus end > start also implies a non-negative end
        assert firstTok >= 0 && endTok > firstTok;

        // the constituent is built before the author check, as in the original statement order
        Constituent postOrQuote = new Constituent(spanType, getPostViewName(), ta, firstTok, endTok);
        if (author == null) {
            // NOTE(review): spans without author info are not added to the view -- confirm intended
            continue;
        }
        IntPair nameOffsets = author.getSecond();
        postOrQuote.addAttribute(AUTHOR, author.getFirst());
        postOrQuote.addAttribute(NAME_START, Integer.toString(nameOffsets.getFirst()));
        postOrQuote.addAttribute(NAME_END, Integer.toString(nameOffsets.getSecond()));
        postView.addConstituent(postOrQuote);
    }

    if (!postView.getConstituents().isEmpty()) {
        ta.addView(getPostViewName(), postView);
    }
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
the class PennTreebankPOSReader method createTextAnnotation.
/**
 * Turn one line of bracketed text into a {@link TextAnnotation}.
 * <p>
 * The first and last characters of the line are dropped, the remainder is split into word/POS
 * pairs with {@code splitWordsPattern}, and each pair is split on {@code whitespacePattern}: the
 * first piece is taken as the POS tag and the second as the word.
 *
 * @param line The bracketed string to be processed
 * @param lineId The ID of the {@link TextAnnotation}
 * @return A {@link TextAnnotation} with a populated {@link ViewNames#POS} view
 */
public TextAnnotation createTextAnnotation(String line, String lineId) {
    String inner = line.substring(1, line.length() - 1);
    String[] pairs = splitWordsPattern.split(inner);

    // parallel arrays: tokens[i] is tagged with tags[i]
    String[] tokens = new String[pairs.length];
    String[] tags = new String[pairs.length];
    for (int idx = 0; idx < pairs.length; idx++) {
        String[] tagAndWord = whitespacePattern.split(pairs[idx]);
        tokens[idx] = tagAndWord[1];
        tags[idx] = tagAndWord[0];
    }

    // the whole line is treated as a single sentence
    List<String[]> sentences = Collections.singletonList(tokens);
    TextAnnotation ta = BasicTextAnnotationBuilder.createTextAnnotationFromTokens(corpusName, lineId, sentences);

    TokenLabelView posView = new TokenLabelView(ViewNames.POS, ta);
    int tokenId = 0;
    for (String tag : tags) {
        posView.addTokenLabel(tokenId, tag, 1.0);
        tokenId++;
    }
    ta.addView(ViewNames.POS, posView);
    return ta;
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
the class TreebankChunkReader method next.
/**
 * Returns the next TextAnnotation from the parent reader with chunk annotation added.
 * <p>
 * When no chunk lines are loaded yet, or all loaded chunk lines have been consumed, the chunk
 * file for the current section/file is read from {@code chunkHome} before annotating.
 *
 * @return the result of {@code addChunkAnnotation} for the parent's next TextAnnotation
 * @throws IllegalStateException if the chunk file for the current section/file does not exist
 */
@Override
public TextAnnotation next() {
    TextAnnotation textAnnotation = super.next();
    // NOTE(review): the -1 presumably compensates for super.next() having advanced the ids -- confirm
    int currentSection = this.currentSectionId - 1;
    int currentFile = this.currentFileId - 1;
    if (chunkLines == null || currentChunkLineId == chunkLines.size()) {
        String chunkFile = chunkHome + "/" + sections[currentSection] + "/" + currentSectionFiles[currentFile];
        try {
            chunkLines = LineIO.read(chunkFile);
            currentChunkLineId = 0;
        } catch (FileNotFoundException e) {
            // Fail loudly: continuing would annotate with missing or already-consumed chunk lines.
            throw new IllegalStateException("Chunk file not found: " + chunkFile, e);
        }
    }
    return addChunkAnnotation(textAnnotation, currentChunkLineId);
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
the class ConvertEREToCoNLLFormat method main.
/**
 * Reads an ERE corpus and writes the mention view of each document to a file in the output
 * directory using {@code CoNLL2002Writer.writeViewInCoNLL2003Format}.
 *
 * @param args command line arguments, exactly four: ERE corpus value (an {@code EreCorpus}
 *        enumeration name), corpus root directory, include Nominals or not (true|false), and
 *        output directory.
 * @throws Exception if reading the corpus or writing the output fails
 */
public static void main(String[] args) throws Exception {
    // the usage message and the code below use exactly four arguments (args[0]..args[3])
    if (args.length != 4) {
        System.err.println("Usage: " + NAME + " ERECorpusVal corpusRoot includeNominals<true|false> outDir\n\nSee " + "module README or ERECorpusReader.EreCorpus enumeration for possible values.");
        System.exit(-1);
    }
    final String ereCorpusVal = args[0];
    final String corpusRoot = args[1];
    final boolean includeNominals = Boolean.parseBoolean(args[2]);
    final String conllDir = args[3];

    // Explicit braces: without them the 'else' binds to the inner 'if', so a missing output
    // directory was never created.
    if (IOUtils.exists(conllDir)) {
        if (!IOUtils.isDirectory(conllDir)) {
            System.err.println("Output directory '" + conllDir + "' exists and is not a directory.");
            System.exit(-1);
        }
    } else {
        IOUtils.mkdir(conllDir);
    }

    boolean throwExceptionOnXmlTagMismatch = true;
    // NOTE(review): includeNominals is passed for both trailing boolean parameters -- confirm intended
    ERENerReader reader = new ERENerReader(EreCorpus.valueOf(ereCorpusVal), corpusRoot, throwExceptionOnXmlTagMismatch, includeNominals, includeNominals);
    while (reader.hasNext()) {
        XmlTextAnnotation xmlTa = reader.next();
        TextAnnotation ta = xmlTa.getTextAnnotation();
        View nerView = ta.getView(reader.getMentionViewName());
        CoNLL2002Writer.writeViewInCoNLL2003Format(nerView, ta, conllDir + "/" + ta.getCorpusId() + ".txt");
    }
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
the class JsonSerializerTest method testJsonSerializedTaUpdate.
/**
 * Checks that when a TextAnnotation that was already serialized is modified and serialized again
 * to the same target file, the file reflects the modification.
 * <p>
 * Steps: serialize a fresh TA as json, read it back, add the rhyme view, serialize over the same
 * file, read it back once more and verify the rhyme view is present and non-empty.
 */
@Test
public void testJsonSerializedTaUpdate() {
    // build a fresh TA so we are not using one already updated with the "rhyme" view
    String[] initialViews = { ViewNames.POS, ViewNames.NER_CONLL, ViewNames.SRL_VERB };
    boolean withNoise = false; // no noise
    TextAnnotation freshTa = DummyTextAnnotationGenerator.generateAnnotatedTextAnnotation(initialViews, withNoise, 3);

    // prepare an empty scratch directory for the serialized file
    String serTestDir = "serTestDir";
    if (IOUtils.exists(serTestDir)) {
        if (IOUtils.isFile(serTestDir)) {
            throw new IllegalStateException("ERROR: test directory " + serTestDir + " already exists as file.");
        }
        try {
            IOUtils.cleanDir(serTestDir);
        } catch (IOException e) {
            e.printStackTrace();
            throw new IllegalStateException("ERROR: test directory " + serTestDir + " could not be cleaned. Permissions?");
        }
    } else {
        IOUtils.mkdir(serTestDir);
    }
    if (!IOUtils.getListOfFilesInDir(serTestDir).isEmpty()) {
        throw new IllegalStateException("ERROR: test directory " + serTestDir + " already contains files even after cleaning.");
    }

    final String fileName = serTestDir + "/arbitrary.json";
    final boolean forceOverwrite = true;
    final boolean useJson = true;

    // first round trip: the original views survive and there is no rhyme view yet
    try {
        SerializationHelper.serializeTextAnnotationToFile(freshTa, fileName, forceOverwrite, useJson);
    } catch (IOException e) {
        e.printStackTrace();
        fail("error trying to serialize json file " + fileName + ".");
    }
    TextAnnotation reloaded = null;
    try {
        reloaded = SerializationHelper.deserializeTextAnnotationFromFile(fileName, useJson);
    } catch (Exception e) {
        e.printStackTrace();
        fail("error trying to deserialize json file " + fileName + ".");
    }
    assertTrue(reloaded.hasView(ViewNames.SRL_VERB));
    assertFalse(reloaded.hasView(RHYME_VIEW_NAME));

    // modify the deserialized TA, then overwrite the same file
    addRhymeViewToTa(reloaded);
    assertTrue(reloaded.hasView(RHYME_VIEW_NAME));
    try {
        SerializationHelper.serializeTextAnnotationToFile(reloaded, fileName, forceOverwrite, useJson);
    } catch (IOException e) {
        e.printStackTrace();
        fail("error trying to serialize json file " + fileName + " for second time.");
    }

    // second round trip: the added view must have been written to the file
    TextAnnotation reloadedAgain = null;
    try {
        reloadedAgain = SerializationHelper.deserializeTextAnnotationFromFile(fileName, useJson);
    } catch (Exception e) {
        e.printStackTrace();
        fail("error trying to deserialize json file " + fileName + " for second time.");
    }
    assertTrue(reloadedAgain.hasView(RHYME_VIEW_NAME));
    assertTrue(reloadedAgain.getView(RHYME_VIEW_NAME).getConstituents().size() > 0);
}
Aggregations