use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
the class EREDocumentReader method createAndAddXmlMarkupAnnotations.
/**
 * Builds a view (named by {@code getPostViewName()}) from the xml markup of a document and attaches it
 * to the underlying TextAnnotation.
 * <p>
 * Only markup spans labelled POST or QUOTE are considered. For each such span that carries author
 * information, a constituent labelled with the span type is added; its AUTHOR attribute holds the
 * author name (read from the AUTHOR attribute of a post, or the ORIG_AUTHOR attribute of a quote) and
 * its NAME_START / NAME_END attributes hold the offsets stored alongside that name in the markup.
 * Spans without author information are not added. The view is attached only if it ends up non-empty.
 *
 * @param xmlTa an XmlTextAnnotation supplying the xml markup, the offset mapping and the TextAnnotation.
 * @throws IllegalStateException if either original span offset maps to -1 in the cleaned text.
 */
private void createAndAddXmlMarkupAnnotations(XmlTextAnnotation xmlTa) {
    List<XmlDocumentProcessor.SpanInfo> xmlSpans = xmlTa.getXmlMarkup();
    TextAnnotation ta = xmlTa.getTextAnnotation();
    View postView = new View(getPostViewName(), NAME, ta, 1.0);

    for (XmlDocumentProcessor.SpanInfo span : xmlSpans) {
        String spanType = span.label;

        // Posts and quotes keep their author under different attribute keys; skip everything else.
        Pair<String, IntPair> author;
        if (POST.equals(spanType)) {
            author = span.attributes.get(AUTHOR);
        } else if (QUOTE.equals(spanType)) {
            author = span.attributes.get(ORIG_AUTHOR);
        } else {
            continue;
        }

        // Map the span's offsets in the original xml onto offsets in the cleaned text.
        int origStart = span.spanOffsets.getFirst();
        int cleanStart = xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(origStart);
        int origEnd = span.spanOffsets.getSecond();
        int cleanEnd = xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(origEnd);
        if (cleanStart == -1 || cleanEnd == -1) {
            throw new IllegalStateException("could not compute cleanText offsets for " + spanType + " span with offsets " + span.spanOffsets.getFirst() + ", " + span.spanOffsets.getSecond());
        }

        int firstTok = ta.getTokenIdFromCharacterOffset(cleanStart);
        int lastTok = ta.getTokenIdFromCharacterOffset(cleanEnd);
        assert firstTok >= 0 && lastTok >= 0 && lastTok > firstTok;

        // The constituent is built for every post/quote span, but only kept when an author is known.
        Constituent spanConstituent = new Constituent(spanType, getPostViewName(), ta, firstTok, lastTok);
        if (author == null) {
            continue;
        }
        IntPair nameOffsets = author.getSecond();
        spanConstituent.addAttribute(AUTHOR, author.getFirst());
        spanConstituent.addAttribute(NAME_START, Integer.toString(nameOffsets.getFirst()));
        spanConstituent.addAttribute(NAME_END, Integer.toString(nameOffsets.getSecond()));
        postView.addConstituent(spanConstituent);
    }

    boolean viewIsEmpty = postView.getConstituents().isEmpty();
    if (!viewIsEmpty) {
        ta.addView(getPostViewName(), postView);
    }
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
the class SimpleGazetteerAnnotator method addView.
/**
 * Adds a span-label view of gazetteer matches to the given TextAnnotation.
 * <p>
 * Every position in the TOKENS view is offered, in order, to each case-sensitive dictionary and then
 * to the case-insensitive dictionary at the same index; the dictionaries record their matches in the
 * new view. Matches may overlap, since a token can match entries from several gazetteers.
 *
 * @param ta the TextAnnotation to annotate; its TOKENS view is read.
 * @throws AnnotatorException declared by the overridden method.
 */
@Override
public void addView(TextAnnotation ta) throws AnnotatorException {
    View tokenView = ta.getView(ViewNames.TOKENS);
    List<Constituent> tokens = tokenView.getConstituents();
    String generator = this.getClass().getName();
    SpanLabelView gazetteerView = new SpanLabelView(this.getViewName(), generator, ta, 1.0, true);

    int tokenPos = 0;
    while (tokenPos < tokens.size()) {
        int dictPos = 0;
        while (dictPos < dictionaries.size()) {
            // Case-sensitive lookup first, then the case-insensitive dictionary at the same index.
            dictionaries.get(dictPos).match(tokens, tokenPos, gazetteerView);
            dictionariesIgnoreCase.get(dictPos).match(tokens, tokenPos, gazetteerView);
            dictPos++;
        }
        tokenPos++;
    }

    ta.addView(gazetteerView.getViewName(), gazetteerView);
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
the class POSWindowTwo method getFeatures.
/**
 * Produces one discrete feature per non-null tag in a POS window around the constituent.
 * <p>
 * This feature extractor assumes that the TOKENS view and the POS view have been generated in the
 * constituent's TextAnnotation. The tags are obtained from {@code getwindowtagskfrom} with a window
 * size of two (per the original description: the POS tags of the two context words before and after
 * the constituent). Each feature is named {@code "POSWindowTwo:" + i + "(" + tag + ")"}, where
 * {@code i} is the tag's index in the returned window.
 *
 * @param c the constituent to extract features for; assumed to be a single word (token).
 * @return the features in window order; slots whose tag is null are skipped.
 * @throws EdisonException declared by the overridden method.
 */
@Override
public Set<Feature> getFeatures(Constituent c) throws EdisonException {
    TextAnnotation ta = c.getTextAnnotation();
    View TOKENS = null, POS = null;
    try {
        TOKENS = ta.getView(ViewNames.TOKENS);
        POS = ta.getView(ViewNames.POS);
    } catch (Exception e) {
        // Report through the class logger rather than printStackTrace(). Control flow is unchanged:
        // the views stay null and are passed on as before.
        logger.error("Could not retrieve the TOKENS and POS views", e);
    }
    // We can assume that the constituent in this case is a Word(Token) described by the LBJ
    // chunk definition
    int startspan = c.getStartSpan();
    int endspan = c.getEndSpan();
    // All our constituents are words(tokens)
    // words two before & after
    int k = 2;
    String[] tags = getwindowtagskfrom(TOKENS, POS, startspan, endspan, k);
    String classifier = "POSWindowTwo";
    Set<Feature> result = new LinkedHashSet<Feature>();
    for (int i = 0; i < tags.length; i++) {
        if (tags[i] == null) {
            continue;
        }
        String featureName = classifier + ":" + i + "(" + tags[i] + ")";
        logger.info(featureName);
        result.add(new DiscreteFeature(featureName));
    }
    return result;
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
the class TestSrlNomIdentifier method test.
/**
 * Compares the features produced by the nom-identifier feature manifest with those of
 * {@code SrlNomIdentifier} on a dummy annotated TextAnnotation.
 * <p>
 * Only constituents of the SRL_VERB view covering the token spans (10, 13) and (26, 27) are used as
 * test inputs.
 *
 * @throws Exception if the manifest file cannot be read or feature extraction fails.
 */
public final void test() throws Exception {
    logger.info("Nom_Identifier Feature Extractor");
    String[] viewsToAdd = { ViewNames.POS, ViewNames.LEMMA, ViewNames.SHALLOW_PARSE, ViewNames.PARSE_GOLD, ViewNames.SRL_VERB, ViewNames.PARSE_STANFORD, ViewNames.NER_CONLL };
    TextAnnotation ta = DummyTextAnnotationGenerator.generateAnnotatedTextAnnotation(viewsToAdd, true, 3);
    ta.addView(ClauseViewGenerator.STANFORD);
    ta.addView(PseudoParse.STANFORD);
    logger.info("This textannotation annotates the text: \n" + ta.getText());
    View SRL_VERB = ta.getView("SRL_VERB");
    List<Constituent> testlist = SRL_VERB.getConstituentsCoveringSpan(10, 13);
    testlist.addAll(SRL_VERB.getConstituentsCoveringSpan(26, 27));
    String fileName = Constant.prefix + "/Nom/Identifier/nom-identifier.fex";
    // The manifest stream was previously never closed by this method. It stays open for the whole
    // lifetime of the manifest, so this is safe whether the manifest reads eagerly or lazily.
    try (FileInputStream manifestStream = new FileInputStream(fileName)) {
        FeatureManifest featureManifest = new FeatureManifest(manifestStream);
        FeatureManifest.setFeatureExtractor("hyphen-argument-feature", FeatureGenerators.hyphenTagFeature);
        FeatureManifest.setTransformer("parse-left-sibling", FeatureGenerators.getParseLeftSibling(ViewNames.PARSE_STANFORD));
        FeatureManifest.setTransformer("parse-right-sibling", FeatureGenerators.getParseRightSibling(ViewNames.PARSE_STANFORD));
        FeatureManifest.setFeatureExtractor("pp-features", FeatureGenerators.ppFeatures(ViewNames.PARSE_STANFORD));
        FeatureManifest.setFeatureExtractor("projected-path", new ProjectedPath(ViewNames.PARSE_STANFORD));
        featureManifest.useCompressedName();
        featureManifest.setVariable("*default-parser*", ViewNames.PARSE_STANFORD);
        FeatureExtractor fex = featureManifest.createFex();
        SrlNomIdentifier ni = new SrlNomIdentifier();
        for (Constituent test : testlist) {
            assertTrue(SRLFeaturesComparator.isEqual(test, fex, ni));
        }
    }
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.View in project cogcomp-nlp by CogComp.
the class TestCorlex method test.
/**
 * Checks that every word feature returned by {@code CorelexFeatureExtractor} for position 1 of a
 * fixture TextAnnotation has a name listed in the expected outputs.
 *
 * @throws EdisonException if feature extraction fails.
 */
public final void test() throws EdisonException {
    log.debug("Corlex Feature Extractor");
    // Uses the TextAnnotation stored at index 1 of the fixture list and requests features for
    // position 1 (presumably a word/token index -- confirm against getWordFeatures).
    TextAnnotation ta = tas.get(1);
    // The view itself is not used below; the lookup is kept so that any failure it raised before
    // still surfaces here.
    ta.getView("TOKENS");
    log.debug("Got tokens FROM TextAnnotation");
    CorelexFeatureExtractor testInstance = new CorelexFeatureExtractor(true);
    Set<Feature> feats = testInstance.getWordFeatures(ta, 1);
    String[] expected_outputs = { "atr" };
    if (feats == null) {
        log.debug("Feats are returning NULL.");
    }
    // Fail the test explicitly rather than hitting a NullPointerException in the loop below.
    assertTrue(feats != null);
    log.debug("Printing Set of Features");
    for (Feature f : feats) {
        log.debug(f.getName());
        assertTrue(ArrayUtils.contains(expected_outputs, f.getName()));
    }
}
Aggregations