Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
The class ParseHelper, method getTokenIndexedCleanedParseTreeNodeCovering.
public static Tree<Pair<String, IntPair>> getTokenIndexedCleanedParseTreeNodeCovering(
        Constituent c, String parseViewName) {
    TextAnnotation ta = c.getTextAnnotation();
    Tree<String> tree = getParseTree(parseViewName, ta, ta.getSentenceId(c));
    int start = c.getStartSpan();
    int end = c.getEndSpan();
    // Clean the tree before indexing it by token: drop null nodes, strip
    // function tags (e.g. NP-SBJ becomes NP), and remove trace index references.
    tree = ParseUtils.snipNullNodes(tree);
    tree = ParseUtils.stripFunctionTags(tree);
    tree = ParseUtils.stripIndexReferences(tree);
    return getTokenIndexedTreeCovering(tree, start, end);
}
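A minimal usage sketch (not from the source): it assumes a TextAnnotation ta whose TOKENS view and ViewNames.PARSE_STANFORD view are already populated, and asks for the cleaned, token-indexed parse node covering the first token.

// Sketch only: `ta` is assumed to already carry ViewNames.PARSE_STANFORD.
Constituent firstToken = ta.getView(ViewNames.TOKENS).getConstituents().get(0);
Tree<Pair<String, IntPair>> covering =
        ParseHelper.getTokenIndexedCleanedParseTreeNodeCovering(firstToken, ViewNames.PARSE_STANFORD);
System.out.println(covering);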
Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
The class ParsePath, method getFeatures.
@Override
public Set<Feature> getFeatures(Constituent c) throws EdisonException {
    TextAnnotation ta = c.getTextAnnotation();
    TreeView parse = (TreeView) ta.getView(parseViewName);
    Set<Feature> features = new LinkedHashSet<>();
    List<Relation> incomingRelations = c.getIncomingRelations();
    if (!incomingRelations.isEmpty()) {
        Constituent c1, c2;
        try {
            c1 = parse.getParsePhrase(incomingRelations.get(0).getSource());
            c2 = parse.getParsePhrase(c);
        } catch (Exception e) {
            throw new EdisonException(e);
        }
        Pair<List<Constituent>, List<Constituent>> paths =
                PathFeatureHelper.getPathsToCommonAncestor(c1, c2, 400);
        // Collect the full path: up from c1 to the common ancestor, then down to c2.
        List<Constituent> list = new ArrayList<>();
        for (int i = 0; i < paths.getFirst().size() - 1; i++) {
            list.add(paths.getFirst().get(i));
        }
        Constituent top = paths.getFirst().get(paths.getFirst().size() - 1);
        list.add(top);
        for (int i = paths.getSecond().size() - 2; i >= 0; i--) {
            list.add(paths.getSecond().get(i));
        }
        // The upward half of the path as a string: each node label followed by
        // the UP marker.
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < paths.getFirst().size() - 1; i++) {
            Constituent cc = paths.getFirst().get(i);
            sb.append(cc.getLabel());
            sb.append(PathFeatureHelper.PATH_UP_STRING);
        }
        String pathToAncestor = sb.toString();
        String pathString = PathFeatureHelper.getPathString(paths, true, false);
        features.add(DiscreteFeature.create(pathString));
        features.add(DiscreteFeature.create(pathToAncestor));
        // The path length as a real-valued feature.
        features.add(RealFeature.create("l", list.size()));
    }
    return features;
}
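A hypothetical usage sketch (not from the source): the parseViewName field above suggests ParsePath is configured with a parse view name, but the constructor shown here is an assumption, as is the Constituent arg, which must have at least one incoming relation (e.g. an SRL argument node).

// Sketch only: the constructor signature and `arg` are assumptions.
ParsePath parsePath = new ParsePath(ViewNames.PARSE_STANFORD);
Set<Feature> pathFeats = parsePath.getFeatures(arg);
for (Feature f : pathFeats)
    System.out.println(f.getName());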
Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
The class WordConjunctionOneTwoThreeGramWindowTwo, method getFeatures.
/**
 * This feature extractor assumes that the TOKENS view has been generated for the
 * constituent's TextAnnotation. It generates features over the forms (original text)
 * of tokens in a [-2, +2] window around each constituent, conjoining one-, two-,
 * and three-word combinations within that window.
 */
@Override
public Set<Feature> getFeatures(Constituent c) throws EdisonException {
    TextAnnotation ta = c.getTextAnnotation();
    TOKENS = ta.getView(ViewNames.TOKENS);
    // We can assume that the constituent in this case is a word (token).
    int startspan = c.getStartSpan();
    int endspan = c.getEndSpan();
    // k is 3 since we need up to 3-grams.
    int k = 3;
    int window = 2;
    // All our constituents are words (tokens).
    String[] forms = getWindowK(TOKENS, startspan, endspan, window);
    String id, value;
    String classifier = "WordConjunctionOneTwoThreeGramWindowTwo";
    Set<Feature> result = new LinkedHashSet<>();
    for (int j = 0; j < k; j++) {
        // forms.length = 5, so i ranges over each string in the window.
        for (int i = 0; i < forms.length; i++) {
            StringBuilder f = new StringBuilder();
            // Build the (j + 1)-gram starting at position i, joining words with '_'.
            for (int context = 0; context <= j && i + context < forms.length; context++) {
                if (context != 0) {
                    f.append("_");
                }
                f.append(forms[i + context]);
            }
            // (i - window) is the position relative to the center token (-2..+2);
            // (j + 1) is the size of the n-gram (1..3).
            id = classifier + ":" + ((i - window) + "_" + (j + 1));
            value = "(" + f.toString() + ")";
            result.add(new DiscreteFeature(id + value));
        }
    }
    return result;
}
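A brief usage sketch (not from the source), assuming an instance fex of this extractor and a token-level Constituent tok from the TOKENS view; the printed ids follow the classifier:position_n(words) pattern built above.

// Sketch only: `fex` and `tok` are assumed to exist.
Set<Feature> ngramFeats = fex.getFeatures(tok);
for (Feature f : ngramFeats)
    // e.g. "WordConjunctionOneTwoThreeGramWindowTwo:-2_2(the_cat)" (illustrative values)
    System.out.println(f.getName());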
Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
The class POSandPositionWindowThree, method getFeatures.
/**
 * This feature extractor assumes that the TOKENS and POS views have been generated
 * for the constituent's TextAnnotation. It uses the POS tag of the constituent
 * itself as well as the POS tags of the three tokens before and after it, conjoined
 * over windows of one to three positions, and returns them as discrete features.
 */
@Override
public Set<Feature> getFeatures(Constituent c) throws EdisonException {
    String classifier = "POSandPositionWindowThree";
    TextAnnotation ta = c.getTextAnnotation();
    TOKENS = ta.getView(ViewNames.TOKENS);
    POS = ta.getView(ViewNames.POS);
    // We can assume that the constituent in this case is a word (token) as described
    // by the LBJ chunk definition.
    int startspan = c.getStartSpan();
    int endspan = c.getEndSpan();
    int before = 3;
    int after = 3;
    // All our constituents are words (tokens).
    String[] tags = new String[before + after + 1];
    // POS tags of the three words before the constituent.
    int k = -3;
    List<Constituent> wordsthreebefore = getwordskfrom(TOKENS, startspan, endspan, k);
    int i = 0;
    for (Constituent token : wordsthreebefore) {
        // There should be exactly one POS tag per token.
        List<String> POS_tag = POS.getLabelsCoveringSpan(token.getStartSpan(), token.getEndSpan());
        if (POS_tag.size() != 1) {
            logger.warn("Error: token does not have exactly one POS tag.");
        }
        tags[i] = POS_tag.get(0);
        i++;
    }
    // POS tag of the constituent itself.
    tags[i] = POS.getLabelsCoveringSpan(c.getStartSpan(), c.getEndSpan()).get(0);
    i++;
    // POS tags of the three words after the constituent.
    k = 3;
    List<Constituent> wordsthreeafter = getwordskfrom(TOKENS, startspan, endspan, k);
    for (Constituent token : wordsthreeafter) {
        // There should be exactly one POS tag per token.
        List<String> POS_tag = POS.getLabelsCoveringSpan(token.getStartSpan(), token.getEndSpan());
        if (POS_tag.size() != 1) {
            logger.warn("Error: token does not have exactly one POS tag.");
        }
        tags[i] = POS_tag.get(0);
        i++;
    }
    Set<Feature> __result = new LinkedHashSet<Feature>();
    String __id;
    String __value;
    int contextmax = 3;
    for (int j = 0; j < contextmax; j++) {
        for (i = 0; i < tags.length; i++) {
            StringBuilder f = new StringBuilder();
            // Conjoin up to (j + 1) consecutive POS tags starting at position i.
            for (int context = 0; context <= j && i + context < tags.length; context++) {
                if (context != 0) {
                    f.append("_");
                }
                f.append(tags[i + context]);
            }
            __id = "" + (i + "_" + j);
            __value = "" + f.toString();
            __result.add(new DiscreteFeature(classifier + ":" + __id + "(" + __value + ")"));
        }
    }
    return __result;
}
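A brief usage sketch (not from the source), assuming an instance posFex of this extractor and a token Constituent tok whose TextAnnotation carries TOKENS and POS views; ids follow the classifier:i_j(tags) pattern built above.

// Sketch only: `posFex` and `tok` are assumed to exist.
Set<Feature> posFeats = posFex.getFeatures(tok);
for (Feature f : posFeats)
    // e.g. "POSandPositionWindowThree:3_0(NN)" (illustrative values)
    System.out.println(f.getName());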
Use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation in project cogcomp-nlp by CogComp.
The class CreateTrainDevTestSplit, method main.
/**
 * Split an ERE corpus with 0.7/0.1/0.2 train/dev/test proportions, trying to
 * balance the counts of all types (or at least of the lowest-frequency types).
 *
 * @param args EreCorpusType, corpusDir, splitDir
 */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.println("Usage: " + NAME + " EreCorpusType corpusDir splitDir");
        System.exit(-1);
    }
    EREDocumentReader.EreCorpus ereCorpus = EREDocumentReader.EreCorpus.valueOf(args[0]);
    String corpusRoot = args[1];
    String outDir = args[2];
    ResourceManager fullRm = new CorpusSplitConfigurator().getDefaultConfig();
    boolean throwExceptionOnXmlParserFail = false;
    double trainFrac = fullRm.getDouble(CorpusSplitConfigurator.TRAIN_FRACTION.key);
    double devFrac = fullRm.getDouble(CorpusSplitConfigurator.DEV_FRACTION.key);
    double testFrac = fullRm.getDouble(CorpusSplitConfigurator.TEST_FRACTION.key);
    // Path corpusPath = Paths.get(corpusRoot);
    // String corpusName = corpusPath.getName(corpusPath.getNameCount() - 2).toString();
    IOUtils.mkdir(outDir);
    String outFileStem = outDir + "/";
    // e.g. {ViewNames.EVENT_ERE}
    String[] viewNames = fullRm.getCommaSeparatedValues(CorpusSplitConfigurator.VIEWS_TO_CONSIDER.key);
    String[] labelsToCount = {};
    EREMentionRelationReader reader = null;
    try {
        reader = new EREEventReader(ereCorpus, corpusRoot, throwExceptionOnXmlParserFail);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }
    // Collect, for each document, the views whose labels should be balanced.
    Map<String, XmlTextAnnotation> ereTas = new HashMap<>();
    Map<String, Set<View>> ereViews = new HashMap<>();
    while (reader.hasNext()) {
        XmlTextAnnotation xmlTextAnnotation = reader.next();
        ereTas.put(xmlTextAnnotation.getTextAnnotation().getId(), xmlTextAnnotation);
        Set<View> views = new HashSet<>();
        TextAnnotation ta = xmlTextAnnotation.getTextAnnotation();
        for (String viewName : viewNames)
            if (ta.hasView(viewName))
                views.add(ta.getView(viewName));
        ereViews.put(ta.getId(), views);
    }
    TextAnnotationLabelCounter lce =
            new TextAnnotationLabelCounter(labelsToCount.length == 0, labelsToCount, ereViews);
    CreateTrainDevTestSplit creator = new CreateTrainDevTestSplit(lce);
    Map<Split, Set<String>> splits = creator.getSplits(trainFrac, devFrac, testFrac);
    Map<Split, Counter<String>> splitCounts = creator.getBestRelSplitCounts();
    Map<String, Counter<String>> counts = creator.getExampleLabelCounts();
    // Write per-document, per-split, and total label counts for inspection.
    List<String> outLines = new ArrayList<>(splitCounts.size() + 2);
    for (String docId : counts.keySet()) {
        outLines.add(docId + ": " + printCounts(counts.get(docId)));
    }
    for (Split s : splitCounts.keySet()) {
        outLines.add(s.name() + ": " + printCounts(splitCounts.get(s)));
    }
    Counter<String> totalLabelCounts = creator.getLabelTotals();
    outLines.add("TOTALS: " + printCounts(totalLabelCounts));
    try {
        LineIO.write(outFileStem + "countInfo.txt", outLines);
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
    }
    // Write one file per split, listing the document ids assigned to it.
    for (Split s : splits.keySet()) {
        List<String> ids = new ArrayList<>(splits.get(s));
        try {
            LineIO.write(outFileStem + s.name() + ".txt", ids);
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(-1);
        }
    }
}
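A hypothetical invocation (not from the source): all three argument values below are placeholders, and the corpus enum value must be one of EREDocumentReader.EreCorpus's constants.

// Sketch only: argument values are placeholders, not from the source.
CreateTrainDevTestSplit.main(new String[] {
        "ENR3",                  // an EREDocumentReader.EreCorpus value (assumed)
        "/path/to/ereCorpus",    // corpusDir
        "/path/to/splitDir"      // splitDir
});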