Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp:
class XmlTextAnnotationMakerTest, method testWithFile.
/**
 * Reads an XML file, builds an {@code XmlTextAnnotation} from its contents, and verifies
 * that character offsets in the cleaned (transformed) text map back — via the stored
 * {@code StringTransformation} — to the identical surface strings in the original XML.
 * Diagnostics are printed to stdout; an offset-mapping mismatch is reported on stderr.
 * Exits the JVM if the file cannot be read.
 *
 * @param maker   the XmlTextAnnotationMaker under test
 * @param xmlFile path to the source XML file
 */
private static void testWithFile(XmlTextAnnotationMaker maker, String xmlFile) {
    String xmlStr = null;
    try {
        xmlStr = LineIO.slurp(xmlFile);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        System.exit(-1);
    }
    XmlTextAnnotation output = maker.createTextAnnotation(xmlStr, "test", "test");
    TextAnnotation ta = output.getTextAnnotation();
    Sentence firstSentence = ta.getSentence(0);
    String firstSentenceText = firstSentence.getText();
    System.out.println(firstSentenceText);
    // Probe a single token (the third one): map its cleaned-text offsets back to the original XML.
    Constituent thirdWord = ta.getView(ViewNames.TOKENS).getConstituentsCoveringSpan(2, 3).get(0);
    int thirdStartChar = thirdWord.getStartCharOffset();
    int thirdEndChar = thirdWord.getEndCharOffset();
    String thirdWordForm = thirdWord.getSurfaceForm();
    StringTransformation st = output.getXmlSt();
    IntPair origSpan = st.getOriginalOffsets(thirdStartChar, thirdEndChar);
    String origWordForm = st.getOrigText().substring(origSpan.getFirst(), origSpan.getSecond());
    System.out.println("Third word: " + thirdWordForm);
    String transformStr = st.getTransformedText().substring(thirdStartChar, thirdEndChar);
    System.out.println("corresponding substring from transformed text: " + transformStr);
    System.out.println("original text substring using mapped offsets: " + origWordForm);
    // The substring taken from the transformed text must round-trip to the same string
    // in the original XML; otherwise the offset mapping is broken.
    if (!transformStr.equals(origWordForm))
        System.err.println("ERROR: test failed: word '" + transformStr + "' not identical to original word '" + origWordForm + "'. ");
    // Renamed from 'mentionView': this view actually holds SENTENCE constituents, not mentions.
    View sentenceView = output.getTextAnnotation().getView(ViewNames.SENTENCE);
    for (Constituent c : sentenceView.getConstituents()) {
        int start = c.getStartCharOffset();
        int end = c.getEndCharOffset();
        String cleanForm = c.getSurfaceForm();
        IntPair sourceSpan = st.getOriginalOffsets(start, end);
        System.out.println("------\nclean: " + cleanForm + ", (" + start + ", " + end + ")");
        System.out.println("------\nsource: " + st.getOrigText().substring(sourceSpan.getFirst(), sourceSpan.getSecond()) + ", (" + sourceSpan.getFirst() + ", " + sourceSpan.getSecond() + ")\n");
    }
    // Dump the XML markup spans (attributes captured during XML processing), keyed by offsets.
    List<XmlDocumentProcessor.SpanInfo> markup = output.getXmlMarkup();
    Map<IntPair, XmlDocumentProcessor.SpanInfo> markupMap = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
    for (IntPair offsets : markupMap.keySet()) {
        System.out.print(offsets.getFirst() + "-" + offsets.getSecond() + ": ");
        Map<String, Pair<String, IntPair>> attVals = markupMap.get(offsets).attributes;
        for (String attType : attVals.keySet()) System.out.println(attType + ": " + attVals.get(attType).getFirst());
        System.out.println();
    }
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp:
class HeadFinderDependencyViewGenerator, method getDependencyTree.
/**
 * Builds a labeled head-dependency tree view from an existing constituency parse view.
 * For each sentence with a non-null parse tree, runs the Collins head-dependency parser
 * on the parse root and installs the resulting labeled tree in the new view. After each
 * sentence it sanity-checks that the dependency view contains one node per token seen so
 * far, logging the uncovered tokens when it does not.
 *
 * @param input              the annotated text to read the parse from
 * @param parseViewName      name of the constituency parse view to convert
 * @param dependencyViewName name for the new dependency tree view
 * @return the populated dependency TreeView
 * @throws IllegalStateException rethrown from setDependencyTree after printing debug info
 */
public static TreeView getDependencyTree(TextAnnotation input, String parseViewName, String dependencyViewName) {
    CollinsHeadDependencyParser headParser = new CollinsHeadDependencyParser(false);
    TreeView parseView = (TreeView) input.getView(parseViewName);
    TreeView dependencyView = new TreeView(dependencyViewName, viewGenerator, input, 1d);
    int tokensSoFar = 0;
    for (int sentenceId = 0; sentenceId < input.getNumberOfSentences(); sentenceId++) {
        // Sentences without a parse tree are skipped entirely.
        if (parseView.getTree(sentenceId) == null) {
            continue;
        }
        Constituent parseRoot = parseView.getRootConstituent(sentenceId);
        Tree<Pair<String, Integer>> labeledTree = headParser.getLabeledDependencyTree(parseRoot);
        try {
            dependencyView.setDependencyTree(sentenceId, labeledTree);
        } catch (IllegalStateException e) {
            // Dump the parse and the unlabeled dependency tree before propagating the failure.
            System.err.println(parseView);
            System.err.println("Unlabeled dependency tree (for debugging): ");
            System.err.println(headParser.getDependencyTree(parseRoot));
            throw e;
        }
        tokensSoFar += input.getSentence(sentenceId).size();
        int nodeCount = dependencyView.getNumberOfConstituents();
        if (nodeCount != tokensSoFar) {
            logger.error("{} nodes in dependency tree, " + "{} tokens in text so far", nodeCount, tokensSoFar);
            // Collect every token id seen so far, then remove the ones the view covers;
            // what remains are the tokens missing from the dependency tree.
            Set<Integer> uncovered = new LinkedHashSet<>();
            for (int tokenId = 0; tokenId < tokensSoFar; tokenId++) {
                uncovered.add(tokenId);
            }
            for (Constituent c : dependencyView.getConstituents()) {
                uncovered.remove(c.getStartSpan());
            }
            StringBuilder missing = new StringBuilder();
            for (int tokenId : uncovered) {
                missing.append(input.getToken(tokenId)).append(" ");
            }
            logger.error("Dependency tree does not cover tokens: {}", missing.toString());
        }
    }
    return dependencyView;
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp:
class NombankFields, method createPredicate.
/**
 * Creates a one-token predicate constituent at the position of this record's predicate
 * terminal, tagging it with the NomBank lemma and sense attributes.
 *
 * @param ta       the text annotation the constituent belongs to
 * @param viewName name of the view the constituent is created in
 * @param yield    terminal nodes of the parse; each label pairs a token with its span
 * @return the new predicate constituent with lemma and sense attributes set
 */
@Override
public Constituent createPredicate(TextAnnotation ta, String viewName, List<Tree<Pair<String, IntPair>>> yield) {
    // The terminal's label pairs the token string with its (start, end) span;
    // only the start token index is needed for a single-token predicate.
    Pair<String, IntPair> terminalLabel = yield.get(predicateTerminal).getLabel();
    int tokenStart = terminalLabel.getSecond().getFirst();
    Constituent predicate = new Constituent("Predicate", viewName, ta, tokenStart, tokenStart + 1);
    predicate.addAttribute(PropbankReader.LemmaIdentifier, lemma);
    predicate.addAttribute(PropbankReader.SenseIdentifier, sense);
    return predicate;
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp:
class PropbankFields, method createPredicate.
/**
 * Creates a one-token predicate constituent at the position of this record's predicate
 * terminal, tagging it with the PropBank lemma, sense, inflection features, and tagger.
 *
 * @param ta       the text annotation the constituent belongs to
 * @param viewName name of the view the constituent is created in
 * @param yield    terminal nodes of the parse; each label pairs a token with its span
 * @return the new predicate constituent with all PropBank attributes set
 */
public Constituent createPredicate(TextAnnotation ta, String viewName, List<Tree<Pair<String, IntPair>>> yield) {
    // The terminal's label pairs the token string with its (start, end) span;
    // only the start token index is needed for a single-token predicate.
    Pair<String, IntPair> terminalLabel = yield.get(predicateTerminal).getLabel();
    int tokenStart = terminalLabel.getSecond().getFirst();
    Constituent predicate = new Constituent("Predicate", viewName, ta, tokenStart, tokenStart + 1);
    predicate.addAttribute(PropbankReader.LemmaIdentifier, lemma);
    predicate.addAttribute(PropbankReader.SenseIdentifier, sense);
    // The five-character inflection field encodes, position by position:
    // form, tense, aspect, person, voice.
    predicate.addAttribute(PropbankReader.FormIdentifier, PropbankReader.Forms.getForm(inflection.charAt(0)).name());
    predicate.addAttribute(PropbankReader.TenseIdentifier, PropbankReader.Tenses.getTense(inflection.charAt(1)).name());
    predicate.addAttribute(PropbankReader.AspectIdentifier, PropbankReader.Aspects.getAspect(inflection.charAt(2)).name());
    predicate.addAttribute(PropbankReader.PersonIdentifier, PropbankReader.Person.getPerson(inflection.charAt(3)).name());
    predicate.addAttribute(PropbankReader.VoiceIdentifier, PropbankReader.Voices.getVoice(inflection.charAt(4)).name());
    predicate.addAttribute(PropbankReader.Tagger, tagger);
    return predicate;
}
Use of edu.illinois.cs.cogcomp.core.datastructures.Pair in the cogcomp-nlp project by CogComp:
class ACE_BN_Reader, method parse.
/**
 * Parses the raw text of an ACE broadcast-news source file into a list of
 * ("text", Paragraph) pairs plus document-level metadata (document id, creation time).
 * Paragraph regions are the TURN-delimited spans of the TEXT section; the 2004 format
 * uses different tag names and TURN regions that share their delimiting tag.
 *
 * @param content             raw file contents, including the SGML-style tags
 * @param contentRemovingTags the same contents with tags stripped; used to compute each
 *                            paragraph's character offset in the tag-free text
 * @param is2004              true for the 2004 ACE format, false for the later format
 * @return a pair of (paragraph list, metadata map)
 */
public static Pair<List<Pair<String, Paragraph>>, Map<String, String>> parse(String content, String contentRemovingTags, boolean is2004) {
List<Pair<String, Paragraph>> paragraphs = new ArrayList<>();
Map<String, String> metadata = new HashMap<>();
Pattern pattern = null;
Matcher matcher = null;
String docID = "";
String dateTime = "";
String headLine = "";
String text = "";
// Document id tag differs between formats: DOCNO (2004) vs DOCID (later).
pattern = is2004 ? Pattern.compile("<DOCNO>(.*?)</DOCNO>") : Pattern.compile("<DOCID>(.*?)</DOCID>");
matcher = pattern.matcher(content);
while (matcher.find()) {
docID = (matcher.group(1)).trim();
}
metadata.put(DocumentMetadata.DocumentID, docID);
// Creation-time tag also differs: DATE_TIME (2004) vs DATETIME (later).
pattern = is2004 ? Pattern.compile("<DATE_TIME>(.*?)</DATE_TIME>") : Pattern.compile("<DATETIME>(.*?)</DATETIME>");
matcher = pattern.matcher(content);
while (matcher.find()) {
dateTime = (matcher.group(1)).trim();
}
metadata.put(DocumentMetadata.DocumentCreationTime, dateTime);
if (is2004) {
// 2004: a TURN region may start at <TEXT> or <TURN> and end at <TURN> or </TEXT>,
// so the four alternations cover every combination; adjacent regions share a <TURN>.
pattern = Pattern.compile("<TEXT>(.*?)<TURN>|<TURN>(.*?)<TURN>|<TURN>(.*?)</TEXT>|<TEXT>(.*?)</TEXT>");
} else {
pattern = Pattern.compile("<TURN>(.*?)</TURN>");
}
matcher = pattern.matcher(content);
int regionStart = 0;
while (matcher.find(regionStart)) {
// Pick the first non-empty group.
for (int i = 1; i <= matcher.groupCount(); ++i) {
if (matcher.group(i) != null) {
text = (matcher.group(i)).trim();
break;
}
}
// NOTE(review): indexOf finds the FIRST occurrence of the paragraph text in the whole
// document; if an identical paragraph appears twice this offset may be wrong — verify.
int index4 = content.indexOf(text);
Paragraph para4 = new Paragraph(index4, text);
Pair<String, Paragraph> pair4 = new Pair<String, Paragraph>("text", para4);
paragraphs.add(pair4);
if (is2004) {
// Hack to move back to the overlapping <TURN> tag
regionStart = matcher.end() - 6;
} else {
regionStart = matcher.end();
}
}
// Second pass: locate each paragraph in the tag-stripped text, searching forward from
// a running index so earlier paragraphs are not matched again.
int index = 0;
for (int i = 0; i < paragraphs.size(); ++i) {
String paraContent = paragraphs.get(i).getSecond().content;
int offsetWithFiltering = contentRemovingTags.indexOf(paraContent, index);
paragraphs.get(i).getSecond().offsetFilterTags = offsetWithFiltering;
// Advance past this paragraph's content (paragraphs appear in document order).
index += paraContent.length();
}
if (isDebug) {
// Echo each paragraph alongside the substrings recovered from both the raw and the
// tag-stripped text, to eyeball that the two offset bookkeepings agree.
for (int i = 0; i < paragraphs.size(); ++i) {
logger.info(paragraphs.get(i).getFirst() + "--> " + paragraphs.get(i).getSecond().content);
logger.info(content.substring(paragraphs.get(i).getSecond().offset, paragraphs.get(i).getSecond().offset + paragraphs.get(i).getSecond().content.length()));
logger.info(contentRemovingTags.substring(paragraphs.get(i).getSecond().offsetFilterTags, paragraphs.get(i).getSecond().offsetFilterTags + paragraphs.get(i).getSecond().content.length()));
logger.info("\n");
}
}
return new Pair<>(paragraphs, metadata);
}
Aggregations