Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp: class MascReader, method main.
/**
 * Read sections of corpus into TextAnnotations, write out TextAnnotations in json format.
 * Specify MASC root dir of written files, e.g. /home/mssammon/work/data/masc-ccg/written/
 *
 * @param args expects two values: mascCorpusDir (MASC corpus root directory) and outDir
 *             (directory for gold json output; predicted output goes to outDir + "_PRED").
 */
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage: " + NAME + " mascCorpusDir outDir");
        System.exit(-1);
    }
    String corpusDir = args[0];
    String outDirGold = args[1];
    String outDirPred = outDirGold + "_PRED";

    Properties props = new Properties();
    props.setProperty(CorpusReaderConfigurator.CORPUS_DIRECTORY.key, corpusDir);
    props.setProperty(CorpusReaderConfigurator.SOURCE_DIRECTORY.key, corpusDir);

    IOUtils.mkdir(outDirGold);
    IOUtils.mkdir(outDirPred);

    ResourceManager rm = new ResourceManager(props);
    MascReader reader = null;
    try {
        reader = new MascReader(rm);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }

    // Tokenizer that produces the "predicted" segmentation scored against the gold MASC views.
    TextAnnotationBuilder taBldr =
            new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, false));

    int numGoldTokCorrect = 0;
    int numGoldTokTotal = 0;
    int numGoldSentCorrect = 0;
    int numGoldSentTotal = 0;

    while (reader.hasNext()) {
        TextAnnotation goldTa = reader.next();
        String text = goldTa.getText();
        TextAnnotation predTa =
                taBldr.createTextAnnotation(goldTa.getCorpusId() + "_PREDICTED", goldTa.getId(), text);

        // Score predicted token boundaries against gold character offsets.
        IntPair[] goldTokCharOffsets = getCharacterOffsets(goldTa.getView(ViewNames.TOKENS));
        numGoldTokTotal += goldTokCharOffsets.length;
        numGoldTokCorrect += countCorrectSpans(predTa.getView(ViewNames.TOKENS), goldTokCharOffsets);

        // Score predicted sentence boundaries against gold character offsets.
        IntPair[] goldSentCharOffsets = getCharacterOffsets(goldTa.getView(ViewNames.SENTENCE));
        numGoldSentTotal += goldSentCharOffsets.length;
        numGoldSentCorrect += countCorrectSpans(predTa.getView(ViewNames.SENTENCE), goldSentCharOffsets);

        // Persist both gold and predicted annotations as single-line json files.
        writeTaJson(Paths.get(outDirGold, goldTa.getId() + ".json").toString(),
                SerializationHelper.serializeToJson(goldTa, true));
        writeTaJson(Paths.get(outDirPred, predTa.getId() + ".json").toString(),
                SerializationHelper.serializeToJson(predTa, true));

        logger.debug("## finished processing file '{}'.", goldTa.getId());
    }

    System.out.println(reader.generateReport());
    System.out.print("TOKEN PERFORMANCE:");
    computeAndPrintAcc(numGoldTokCorrect, numGoldTokTotal);
    System.out.print("SENTENCE PERFORMANCE:");
    computeAndPrintAcc(numGoldSentCorrect, numGoldSentTotal);
}

/**
 * Write a serialized TextAnnotation json string to the named file, exiting the VM on failure
 * (same error handling the inline code previously duplicated for gold and predicted output).
 *
 * @param outFile path of the file to write.
 * @param taJson  the serialized TextAnnotation json content.
 */
private static void writeTaJson(String outFile, String taJson) {
    try {
        logger.debug("writing file '{}'...", outFile);
        LineIO.write(outFile, Collections.singletonList(taJson));
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
    }
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp: class MascReader, method removeOverlappingSentences.
/**
 * Remove sentences whose spans are contained in, or overlap, another sentence's span,
 * keeping the larger sentence of any conflicting pair.
 * This method may be redundant at this point.
 *
 * BUG FIX: the original version never populated {@code offsetsToSentences}, so the inner
 * comparison loop iterated an empty key set and the method silently removed nothing.
 *
 * @param sentences the sentence list to prune in place.
 */
private void removeOverlappingSentences(List<SentenceStaxParser.MascSentence> sentences) {
    Set<IntPair> offsetsToRemove = new HashSet<>();
    Map<IntPair, SentenceStaxParser.MascSentence> offsetsToSentences = new HashMap<>();
    Set<SentenceStaxParser.MascSentence> sentsToRemove = new HashSet<>();
    for (SentenceStaxParser.MascSentence sent : sentences) {
        IntPair sentOffset = new IntPair(sent.start, sent.end);
        boolean keepCurrent = true;
        // compare the current sentence against every sentence retained so far
        for (IntPair offset : offsetsToSentences.keySet()) {
            if (isInside(sentOffset, offset)) {
                // current sentence is contained in an earlier one: drop the current
                sentsToRemove.add(sent);
                keepCurrent = false;
                break;
            } else if (isInside(offset, sentOffset))
                // an earlier sentence is contained in the current one: drop the earlier
                offsetsToRemove.add(offset);
            else if (isOverlap(offset, sentOffset)) {
                // partial overlap: keep the larger of the two
                if (isLarger(offset, sentOffset)) {
                    sentsToRemove.add(sent);
                    keepCurrent = false;
                    break;
                } else
                    offsetsToRemove.add(offset);
            }
        }
        // register the retained sentence so later sentences can be compared against it
        if (keepCurrent)
            offsetsToSentences.put(sentOffset, sent);
    }
    logger.debug("## removing at least {}, and at most {}, sentences...", sentsToRemove.size(), (sentsToRemove.size() + offsetsToRemove.size()));
    sentences.removeAll(sentsToRemove);
    for (IntPair offset : offsetsToRemove)
        sentences.remove(offsetsToSentences.get(offset));
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp: class SRLNode, method compileLinks.
/**
 * Compile the list of predicate arguments for this node's SRL links.
 * For each link, the terminal node for the link's token index is walked up the tree the
 * indicated number of levels, and that ancestor's constituent span becomes the argument span.
 *
 * @param tokenmap maps token indexes to the corresponding terminal tree nodes.
 * @return the predicate arguments with their token spans.
 */
public ArrayList<PredicateArgument> compileLinks(HashMap<Integer, Tree<Constituent>> tokenmap) {
    ArrayList<PredicateArgument> results = new ArrayList<PredicateArgument>();
    for (SRLLink link : links) {
        // start at the terminal node for this link's token index
        Tree<Constituent> current = tokenmap.get(link.where.getFirst());
        // climb up to the requested ancestor, stopping at the root
        int levelsUp = link.where.getSecond();
        while (levelsUp > 0 && current.getParent() != null) {
            current = current.getParent();
            levelsUp--;
        }
        Constituent label = current.getLabel();
        IntPair span = new IntPair(label.getStartSpan(), label.getEndSpan());
        results.add(new PredicateArgument(link.argument, link.link, span));
    }
    return results;
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp: class PARC3Reader, method getAnnotationsFromFile.
/**
 * Parse a document into an {@link edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation
 * TextAnnotation}. By default TOKEN and SENTENCE view will be populated. Other gold views will only be
 * populated if set in configurations
 *
 * @param list a list containing one path to an xml document
 * @return a list containing one TextAnnotation, corresponding to one source text file plus
 * annotations
 * @throws Exception if files can't be found, or if parser fails to read annotation format
 */
@Override
public List<TextAnnotation> getAnnotationsFromFile(List<Path> list) throws Exception {
List<TextAnnotation> result = new ArrayList<>();
for (Path p : list) {
String fileStem = IOUtils.getFileStem(p.toFile().getName());
logger.info("Processing: {}", fileStem);
// Tokens, Sentences, POS and Lemma, accumulated in parallel lists indexed by token
List<String> tokens = new ArrayList<>();
List<IntPair> charOffsets = new ArrayList<>();
List<Integer> sentTokOffset = new ArrayList<>();
List<String> POStags = new ArrayList<>();
List<String> lemmas = new ArrayList<>();
// Attribution Relations - each entry in the map corresponds to one set of attribution relation
Map<String, AttributionRelation> attrRelations = new HashMap<>();
// Text is rebuilt token by token; whitespace between tokens is reconstructed from byte offsets
StringBuilder text = new StringBuilder();
int lastWordEndByteOffset = 0;
int tokenIdx = 0;
Document doc = XMLUtils.getXMLDOM(p.toString());
// Optional, we don't actually need this, as of now.
doc.getDocumentElement().normalize();
NodeList sentences = doc.getElementsByTagName(NODE_SENTENCE);
for (int sid = 0; sid < sentences.getLength(); sid++) {
Element sent = (Element) sentences.item(sid);
NodeList words = sent.getElementsByTagName(NODE_WORD);
for (int wid = 0; wid < words.getLength(); wid++) {
Element word = (Element) words.item(wid);
// Collect any attribution relations/roles annotated on this word, keyed by relation id
NodeList attrRels = word.getElementsByTagName(NODE_ATTRIBUTION);
for (int aid = 0; aid < attrRels.getLength(); aid++) {
Element attrRel = (Element) attrRels.item(aid);
String relationId = attrRel.getAttribute(ATTR_RELATION_ID);
// Get attribution role(s) for current token
NodeList attrRoles = attrRel.getElementsByTagName(NODE_ATTRIBUTION_ROLE);
for (int arid = 0; arid < attrRoles.getLength(); arid++) {
Element attrRole = (Element) attrRoles.item(arid);
String role = attrRole.getAttribute(ATTR_ROLE_VALUE);
updateAttributionRelation(attrRelations, relationId, role, tokenIdx);
}
}
String wordText = word.getAttribute(ATTR_WORD_TEXT);
String pos = word.getAttribute(ATTR_POS);
String lem = word.getAttribute(ATTR_LEM);
// ByteCount attribute has the form "start,end"
String[] byteOffsetStr = word.getAttribute(ATTR_BYTE_COUNT).split(",");
IntPair oracleByteOffset = new IntPair(Integer.parseInt(byteOffsetStr[0]), // This is byte offset according to PARC, which is not accurate
Integer.parseInt(byteOffsetStr[1]));
// fill whitespace and update current word to text
// NOTE(review): if PARC's byte offsets ever overlap (numWhiteSpace < 0), nCopies throws -- confirm offsets are monotone
int numWhiteSpace = oracleByteOffset.getFirst() - lastWordEndByteOffset;
text.append(String.join("", Collections.nCopies(numWhiteSpace, " ")));
int startCharOffset = text.length();
text.append(wordText);
int endCharOffset = text.length();
lastWordEndByteOffset = oracleByteOffset.getSecond();
// Update token and token offset
tokens.add(wordText);
charOffsets.add(new IntPair(startCharOffset, endCharOffset));
tokenIdx++;
// Update sentence token offset: record the one-past-the-end token index of each sentence
if (wid == words.getLength() - 1)
sentTokOffset.add(tokenIdx);
// Update POS tags
POStags.add(pos);
lemmas.add(lem);
}
}
// Build the TextAnnotation from the reconstructed text and the parallel token/offset lists
TextAnnotation ta = new TextAnnotation(super.corpusName, fileStem, text.toString(), charOffsets.toArray(new IntPair[0]), tokens.toArray(new String[0]), sentTokOffset.stream().mapToInt(i -> i).toArray());
if (bPopulatePOS)
populatePOS(ta, POStags);
if (bPopulateLemma)
populateLemma(ta, lemmas);
populateAttribution(ta, attrRelations);
result.add(ta);
}
return result;
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp: class ConvertOntonotesToColumn, method getNameTextAnnotation.
/**
 * Read the file indicated by the argument, tokenize it, and build an NER view from the
 * "enamex" xml spans found in the markup.
 * @param file the file to read (its entire contents are slurped as the source document).
 * @return the XmlTextAnnotation containing the text annotation, and xml markup offset data.
 * @throws IOException if the file cannot be read.
 */
private static XmlTextAnnotation getNameTextAnnotation(File file) throws IOException {
String document = LineIO.slurp(file.getCanonicalPath());
// we keep everything.
XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
StatefulTokenizer st = new StatefulTokenizer();
TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);
// read the file and create the annotation.
XmlTextAnnotation xta = xtam.createTextAnnotation(document, "OntoNotes 5.0", "test");
TextAnnotation ta = xta.getTextAnnotation();
List<SpanInfo> fudge = xta.getXmlMarkup();
// create the named entity view from "enamex" markup spans
View nerView = new SpanLabelView(ViewNames.NER_ONTONOTES, ta);
for (SpanInfo si : fudge) {
if ("enamex".equalsIgnoreCase(si.label)) {
IntPair charOffsets = si.spanOffsets;
// NOTE(review): assumes every enamex span carries a "type" attribute; a missing one would NPE here -- confirm against the corpus
String neLabel = si.attributes.get("type").getFirst();
// map original-document character offsets to offsets in the cleaned (markup-stripped) text
int cleanTextCharStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
int cleanTextCharEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
int cleanTextNeTokStart = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
// StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
int cleanTextNeTokEnd = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1);
// constituent token indexing uses one-past-the-end
Constituent neCon = new Constituent(neLabel, nerView.getViewName(), ta, cleanTextNeTokStart, cleanTextNeTokEnd + 1);
nerView.addConstituent(neCon);
}
}
ta.addView(ViewNames.NER_ONTONOTES, nerView);
return xta;
}
Aggregations