use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.
the class BIOTester method statistics.
/**
 * Counts mentions in the ACE, ERE and TAC data sets and prints the totals to standard output.
 *
 * For ACE and ERE, each constituent of the mention view is bucketed by the value of its
 * "EntityMentionType" attribute (NAM, NOM or PRO); any other value is ignored. For TAC,
 * every constituent of the "MENTIONS" view is counted, once for the ".nam" file and once
 * for the ".nom" file.
 *
 * Any exception raised while reading stops the counting and is reported through
 * printStackTrace; the totals accumulated up to that point are still printed.
 */
public static void statistics() {
    int aceNam = 0;
    int aceNom = 0;
    int acePro = 0;
    int ereNam = 0;
    int ereNom = 0;
    int erePro = 0;
    int tacNam = 0;
    int tacNom = 0;
    try {
        // ACE: bucket by mention type.
        ACEReaderWithTrueCaseFixer aceDocs = new ACEReaderWithTrueCaseFixer("data/all", false);
        for (TextAnnotation doc : aceDocs) {
            for (Constituent mention : doc.getView(ViewNames.MENTION_ACE)) {
                switch (mention.getAttribute("EntityMentionType")) {
                    case "NAM":
                        aceNam++;
                        break;
                    case "NOM":
                        aceNom++;
                        break;
                    case "PRO":
                        acePro++;
                        break;
                    default:
                        break;
                }
            }
        }

        // ERE: same bucketing, but the reader yields XmlTextAnnotation wrappers.
        EREMentionRelationReader ereDocs = new EREMentionRelationReader(EREDocumentReader.EreCorpus.ENR3, "data/ere/data", false);
        for (XmlTextAnnotation wrapped : ereDocs) {
            TextAnnotation doc = wrapped.getTextAnnotation();
            for (Constituent mention : doc.getView(ViewNames.MENTION_ERE)) {
                switch (mention.getAttribute("EntityMentionType")) {
                    case "NAM":
                        ereNam++;
                        break;
                    case "NOM":
                        ereNom++;
                        break;
                    case "PRO":
                        erePro++;
                        break;
                    default:
                        break;
                }
            }
        }

        // TAC: the mention type is given by the file, so every constituent counts.
        ColumnFormatReader tacNamDocs = new ColumnFormatReader("data/tac/2016.nam");
        for (TextAnnotation doc : tacNamDocs) {
            for (Constituent ignored : doc.getView("MENTIONS")) {
                tacNam++;
            }
        }
        ColumnFormatReader tacNomDocs = new ColumnFormatReader("data/tac/2016.nom");
        for (TextAnnotation doc : tacNomDocs) {
            for (Constituent ignored : doc.getView("MENTIONS")) {
                tacNom++;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    System.out.println("ACE_NAM: " + aceNam);
    System.out.println("ACE_NOM: " + aceNom);
    System.out.println("ACE_PRO: " + acePro);
    System.out.println("ERE_NAM: " + ereNam);
    System.out.println("ERE_NOM: " + ereNom);
    System.out.println("ERE_PRO: " + erePro);
    System.out.println("TAC_NAM: " + tacNam);
    System.out.println("TAC_NOM: " + tacNom);
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.
the class EREDocumentReader method createAndAddXmlMarkupAnnotations.
/**
 * Builds a view whose constituents mark post and quote spans found in the xml markup.
 * Each constituent is labeled with the span type. Attribute AUTHOR holds the post or quote
 * author name, and attributes NAME_START and NAME_END hold the offsets of that name in the
 * original xml text.
 *
 * A span is only added to the view when author information is present for it, and the view
 * is only attached to the TextAnnotation when it ends up with at least one constituent.
 *
 * @param xmlTa an XmlTextAnnotation supplying the xml markup, the offset mapping and the
 *        TextAnnotation that receives the view.
 * @throws IllegalStateException if a post or quote span's offsets cannot be mapped to
 *         clean-text offsets.
 */
private void createAndAddXmlMarkupAnnotations(XmlTextAnnotation xmlTa) {
    final List<XmlDocumentProcessor.SpanInfo> spans = xmlTa.getXmlMarkup();
    final TextAnnotation ta = xmlTa.getTextAnnotation();
    final String viewName = getPostViewName();
    final View postView = new View(viewName, NAME, ta, 1.0);

    for (XmlDocumentProcessor.SpanInfo span : spans) {
        final String label = span.label;

        // Only post and quote spans are of interest; each keeps its author under a different key.
        final Pair<String, IntPair> authorInfo;
        if (POST.equals(label)) {
            authorInfo = span.attributes.get(AUTHOR);
        } else if (QUOTE.equals(label)) {
            authorInfo = span.attributes.get(ORIG_AUTHOR);
        } else {
            continue;
        }

        // Map the span's offsets in the original xml onto the cleaned text.
        final int cleanStart = xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(span.spanOffsets.getFirst());
        final int cleanEnd = xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(span.spanOffsets.getSecond());
        if (cleanStart == -1 || cleanEnd == -1) {
            throw new IllegalStateException("could not compute cleanText offsets for " + label + " span with offsets " + span.spanOffsets.getFirst() + ", " + span.spanOffsets.getSecond());
        }

        final int tokStart = ta.getTokenIdFromCharacterOffset(cleanStart);
        final int tokEnd = ta.getTokenIdFromCharacterOffset(cleanEnd);
        assert (0 <= tokStart && tokStart < tokEnd);

        // The constituent is built before the author check, so an invalid span is still rejected here.
        final Constituent postSpan = new Constituent(label, viewName, ta, tokStart, tokEnd);
        if (authorInfo == null) {
            continue;
        }
        postSpan.addAttribute(AUTHOR, authorInfo.getFirst());
        postSpan.addAttribute(NAME_START, Integer.toString(authorInfo.getSecond().getFirst()));
        postSpan.addAttribute(NAME_END, Integer.toString(authorInfo.getSecond().getSecond()));
        postView.addConstituent(postSpan);
    }

    if (!postView.getConstituents().isEmpty()) {
        ta.addView(viewName, postView);
    }
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.
the class ConvertEREToCoNLLFormat method main.
/**
 * Reads an ERE corpus and writes the mention view of each document in CoNLL column format.
 *
 * @param args command line arguments, exactly four: the ERE corpus value (name of an
 *        EreCorpus enumeration constant), the corpus root directory, whether to include
 *        nominals (true|false), and the output directory.
 * @throws Exception if the reader cannot be created or a document cannot be read or written.
 */
public static void main(String[] args) throws Exception {
    // The usage message and the code below both use four arguments (args[0]..args[3]).
    if (args.length != 4) {
        System.err.println("Usage: " + NAME + " ERECorpusVal corpusRoot includeNominals<true|false> outDir\n\nSee " + "module README or ERECorpusReader.EreCorpus enumeration for possible values.");
        System.exit(-1);
    }
    final String ereCorpusVal = args[0];
    final String corpusRoot = args[1];
    final boolean includeNominals = Boolean.parseBoolean(args[2]);
    final String conllDir = args[3];

    // Explicit braces: the 'else' must pair with the existence check, so that a missing
    // output directory is created and an existing non-directory path is rejected.
    if (IOUtils.exists(conllDir)) {
        if (!IOUtils.isDirectory(conllDir)) {
            System.err.println("Output directory '" + conllDir + "' exists and is not a directory.");
            System.exit(-1);
        }
    } else {
        IOUtils.mkdir(conllDir);
    }

    boolean throwExceptionOnXmlTagMismatch = true;
    // NOTE(review): includeNominals is passed for both trailing boolean arguments; confirm
    // against the ERENerReader constructor that this is intended.
    ERENerReader reader = new ERENerReader(EreCorpus.valueOf(ereCorpusVal), corpusRoot, throwExceptionOnXmlTagMismatch, includeNominals, includeNominals);
    while (reader.hasNext()) {
        XmlTextAnnotation xmlTa = reader.next();
        TextAnnotation ta = xmlTa.getTextAnnotation();
        View nerView = ta.getView(reader.getMentionViewName());
        // NOTE(review): the file name is derived from getCorpusId(); if that value is shared by
        // all documents of a corpus, each write replaces the previous one -- confirm whether
        // the document id was intended instead.
        CoNLL2002Writer.writeViewInCoNLL2003Format(nerView, ta, conllDir + "/" + ta.getCorpusId() + ".txt");
    }
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.
the class CreateTrainDevTestSplit method main.
/**
 * Splits an ERE corpus into train/dev/test document sets, trying to balance label counts
 * across the splits, and writes the per-document counts, per-split counts and the document
 * ids of each split into the output directory.
 *
 * The split fractions and the views to consider are read from the default
 * CorpusSplitConfigurator configuration (previously documented as 0.7/0.1/0.2
 * train/dev/test -- confirm against the configurator defaults).
 *
 * @param args exactly three values: the EreCorpus enumeration constant name, the corpus
 *        directory, and the directory to write the split files to.
 */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.println("Usage: " + NAME + " EreCorpusType corpusDir splitDir");
        System.exit(-1);
    }
    final EREDocumentReader.EreCorpus corpusType = EREDocumentReader.EreCorpus.valueOf(args[0]);
    final String corpusDir = args[1];
    final String splitDir = args[2];

    final ResourceManager config = new CorpusSplitConfigurator().getDefaultConfig();
    final boolean throwExceptionOnXmlParserFail = false;
    final double trainFrac = config.getDouble(CorpusSplitConfigurator.TRAIN_FRACTION.key);
    final double devFrac = config.getDouble(CorpusSplitConfigurator.DEV_FRACTION.key);
    final double testFrac = config.getDouble(CorpusSplitConfigurator.TEST_FRACTION.key);

    IOUtils.mkdir(splitDir);
    final String outFileStem = splitDir + "/";

    final String[] viewNames = config.getCommaSeparatedValues(CorpusSplitConfigurator.VIEWS_TO_CONSIDER.key);
    // An empty label list makes the first TextAnnotationLabelCounter argument below true.
    final String[] labelsToCount = {};

    EREMentionRelationReader reader = null;
    try {
        reader = new EREEventReader(corpusType, corpusDir, throwExceptionOnXmlParserFail);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }

    // Collect, per document id, the annotation itself and the subset of requested views it has.
    // ereTas is filled here but not read again within this method.
    final Map<String, XmlTextAnnotation> ereTas = new HashMap<>();
    final Map<String, Set<View>> ereViews = new HashMap<>();
    while (reader.hasNext()) {
        final XmlTextAnnotation xmlTa = reader.next();
        final TextAnnotation ta = xmlTa.getTextAnnotation();
        final String docId = ta.getId();
        ereTas.put(docId, xmlTa);

        final Set<View> presentViews = new HashSet<>();
        for (String viewName : viewNames) {
            if (ta.hasView(viewName)) {
                presentViews.add(ta.getView(viewName));
            }
        }
        ereViews.put(docId, presentViews);
    }

    final TextAnnotationLabelCounter labelCounter = new TextAnnotationLabelCounter(labelsToCount.length == 0, labelsToCount, ereViews);
    final CreateTrainDevTestSplit creator = new CreateTrainDevTestSplit(labelCounter);
    final Map<Split, Set<String>> splits = creator.getSplits(trainFrac, devFrac, testFrac);
    final Map<Split, Counter<String>> splitCounts = creator.getBestRelSplitCounts();
    final Map<String, Counter<String>> docCounts = creator.getExampleLabelCounts();

    // Report lines: one per document, one per split, then the overall totals.
    final List<String> report = new ArrayList<>(splitCounts.size() + 2);
    for (Map.Entry<String, Counter<String>> docEntry : docCounts.entrySet()) {
        report.add(docEntry.getKey() + ": " + printCounts(docEntry.getValue()));
    }
    for (Map.Entry<Split, Counter<String>> splitEntry : splitCounts.entrySet()) {
        report.add(splitEntry.getKey().name() + ": " + printCounts(splitEntry.getValue()));
    }
    final Counter<String> totalLabelCounts = creator.getLabelTotals();
    report.add("TOTALS: " + printCounts(totalLabelCounts));

    // Any write failure is reported and terminates the program.
    try {
        LineIO.write(outFileStem + "countInfo.txt", report);
        for (Map.Entry<Split, Set<String>> splitEntry : splits.entrySet()) {
            final List<String> docIds = new ArrayList<>(splitEntry.getValue());
            LineIO.write(outFileStem + splitEntry.getKey().name() + ".txt", docIds);
        }
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
    }
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation in project cogcomp-nlp by CogComp.
the class XmlDocumentReader method getAnnotationsFromFile.
/**
 * Parses one entry of the corpus file list produced by {@link #getFileListing()} into zero
 * or one XmlTextAnnotation objects.
 *
 * The list form of the entry allows for corpora whose annotations are provided in standoff
 * format in files separate from the source document; in such cases the first file should be
 * the source document and the rest the corresponding markup files. This default
 * implementation reads only the first file and assumes it contains both source and markup.
 *
 * @param corpusFileListEntry a list of files, the first of which is a source file.
 * @return a list holding the XmlTextAnnotation built from the file, or an empty list if
 *         none was produced.
 * @throws Exception propagated from reading the file or from creating the annotation.
 */
@Override
public List<XmlTextAnnotation> getAnnotationsFromFile(List<Path> corpusFileListEntry) throws Exception {
    final Path sourceFile = corpusFileListEntry.get(0);

    // The file id is the last name element of the path.
    final int lastNameIndex = sourceFile.getNameCount() - 1;
    fileId = sourceFile.getName(lastNameIndex).toString();
    logger.debug("read source file {}", fileId);
    numFiles++;

    final String rawText = LineIO.slurp(sourceFile.toString());
    final XmlTextAnnotation annotation = xmlTextAnnotationMaker.createTextAnnotation(rawText, this.corpusName, fileId);

    final List<XmlTextAnnotation> result = new ArrayList<>(1);
    if (annotation == null) {
        return result;
    }
    result.add(annotation);
    numTextAnnotations++;
    return result;
}
Aggregations