Use of edu.illinois.cs.cogcomp.edison.utilities.WordNetManager in project cogcomp-nlp by CogComp.
Class BIOCombinedReader, method getTokensFromTAs.
/**
 * Builds a token-level "BIO" view for every {@link TextAnnotation} in {@code currentTas} and
 * returns all constituents of those views in one flat list.
 *
 * <p>For each annotation:
 * <ul>
 *   <li>the mention view is {@code ViewNames.MENTION_ACE} when the annotation id starts with
 *       "bn" or "nw", and {@code ViewNames.MENTION_ERE} otherwise;</li>
 *   <li>unless {@code _type} is "ALL", only mentions whose "EntityMentionType" attribute equals
 *       {@code _type} are used;</li>
 *   <li>mentions for which no head constituent can be found are skipped;</li>
 *   <li>each remaining mention has its "EntityType" attribute overwritten with "MENTION", and
 *       its head tokens are tagged with the BIOLU scheme (U = single token, B/I/L = multi-token);
 *       when heads overlap, a later mention overwrites the tags written by an earlier one;</li>
 *   <li>every token is cloned into the "BIO" view and receives the attributes "BIO",
 *       "EntityMentionType" (non-"O" tokens only), "GAZ", "BC", "WORDNETTAG", "WORDNETHYM"
 *       and "isTraining".</li>
 * </ul>
 *
 * <p>Side effects: the mention constituents' "EntityType" attribute is modified (added from the
 * label when absent, then set to "MENTION" when a head exists), and the new "BIO" view is added
 * to each annotation. Gazetteers, Brown clusters and WordNet are (re)requested on every call.
 *
 * @return the constituents of all newly created "BIO" views, in annotation order
 * @throws InvalidPortException declared for the resource-loading calls below
 * @throws InvalidEndpointException declared for the resource-loading calls below
 * @throws IOException declared for the resource-loading calls below
 * @throws JWNLException declared for the resource-loading calls below
 * @throws DatastoreException declared for the resource-loading calls below
 */
private List<Constituent> getTokensFromTAs() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException {
    List<Constituent> ret = new ArrayList<>();

    // --- Resource loading (order kept: gazetteers, Brown clusters, WordNet) ---
    Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
    File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
    Gazetteers gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);

    String[] brownClusterPaths = {
        "brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt",
        "brown-clusters/brownBllipClusters",
        "brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt"
    };
    // BrownClusters.get takes three parallel vectors; every cluster file uses the same
    // settings (5, false).
    // NOTE(review): presumably a frequency threshold and a "lowercase" flag - confirm against
    // BrownClusters.get.
    Vector<String> bcs = new Vector<>();
    Vector<Integer> bcst = new Vector<>();
    Vector<Boolean> bcsl = new Vector<>();
    for (String path : brownClusterPaths) {
        bcs.add(path);
        bcst.add(5);
        bcsl.add(false);
    }
    BrownClusters brownClusters = BrownClusters.get(bcs, bcst, bcsl);

    WordNetManager.loadConfigAsClasspathResource(true);
    WordNetManager wordNet = WordNetManager.getInstance();

    for (TextAnnotation ta : currentTas) {
        View tokenView = ta.getView(ViewNames.TOKENS);

        // The id prefix decides which mention view this document carries.
        String mentionViewName = (ta.getId().startsWith("bn") || ta.getId().startsWith("nw"))
                ? ViewNames.MENTION_ACE
                : ViewNames.MENTION_ERE;
        View mentionView = ta.getView(mentionViewName);
        View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f);

        // One tag per token; everything starts as "O" (outside any mention head).
        String[] token2tags = new String[tokenView.getConstituents().size()];
        for (int i = 0; i < token2tags.length; i++) {
            token2tags[i] = "O";
        }

        for (Constituent c : mentionView.getConstituents()) {
            // When a specific mention type is requested, drop every other type.
            if (!_type.equals("ALL") && !c.getAttribute("EntityMentionType").equals(_type)) {
                continue;
            }
            Constituent cHead = ACEReader.getEntityHeadForConstituent(c, ta, "HEAD");
            // Done before the head check on purpose: a mention without a head still gets its
            // "EntityType" filled in from the label, exactly as before.
            if (!c.hasAttribute("EntityType")) {
                c.addAttribute("EntityType", c.getLabel());
            }
            if (cHead == null) {
                continue;
            }
            // All entity types are collapsed into the single label "MENTION".
            c.addAttribute("EntityType", "MENTION");

            // The part of the tag after the B/I/L/U prefix is the same for every head token.
            String tagSuffix = c.getAttribute("EntityType") + "," + c.getAttribute("EntityMentionType");
            int headStart = cHead.getStartSpan();
            int headEnd = cHead.getEndSpan();

            // Note: unlike BIOReader, the tagging schema is "BIOLU" here.
            if (headStart + 1 == headEnd) {
                token2tags[headStart] = "U-" + tagSuffix;
            } else {
                token2tags[headStart] = "B-" + tagSuffix;
                for (int i = headStart + 1; i < headEnd - 1; i++) {
                    token2tags[i] = "I-" + tagSuffix;
                }
                token2tags[headEnd - 1] = "L-" + tagSuffix;
            }
        }

        for (int i = 0; i < token2tags.length; i++) {
            Constituent curToken = tokenView.getConstituentsCoveringToken(i).get(0);
            Constituent newToken = curToken.cloneForNewView("BIO");

            if (token2tags[i].equals("O")) {
                newToken.addAttribute("BIO", token2tags[i]);
            } else {
                // Non-"O" tags were encoded above as "<prefix>-<type>,<mention type>".
                String[] group = token2tags[i].split(",");
                newToken.addAttribute("BIO", group[0]);
                newToken.addAttribute("EntityMentionType", group[1]);
            }

            newToken.addAttribute("GAZ", ((FlatGazetteers) gazetteers).annotateConstituent(newToken, false));
            newToken.addAttribute("BC", brownClusters.getPrefixesCombined(newToken.toString()));

            // Tokens containing "http" get placeholder WordNet features instead of a lookup.
            if (!newToken.toString().contains("http")) {
                newToken.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(wordNet, newToken));
                newToken.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(wordNet, newToken));
            } else {
                newToken.addAttribute("WORDNETTAG", ",");
                newToken.addAttribute("WORDNETHYM", ",");
            }

            newToken.addAttribute("isTraining", _mode.contains("TRAIN") ? "true" : "false");
            bioView.addConstituent(newToken);
        }

        ta.addView("BIO", bioView);
        for (Constituent c : bioView) {
            ret.add(c);
        }
    }
    return ret;
}
Aggregations