Use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters in project cogcomp-nlp by CogComp.
The class BIOReader, method getTokensFromTAs.
private List<Constituent> getTokensFromTAs() throws InvalidPortException, InvalidEndpointException, DatastoreException, IOException, JWNLException {
    List<Constituent> ret = new ArrayList<>();
    WordNetManager wordNet = null;
    Gazetteers gazetteers = null;
    BrownClusters brownClusters = null;
    // Download gazetteers from the CogComp Datastore and initialize the shared resources.
    Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
    File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
    gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
    // Three Brown cluster corpora, each with prefix threshold 5 and lowercasing disabled.
    Vector<String> bcs = new Vector<>();
    bcs.add("brown-clusters" + File.separator + "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
    bcs.add("brown-clusters" + File.separator + "brownBllipClusters");
    bcs.add("brown-clusters" + File.separator + "brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt");
    Vector<Integer> bcst = new Vector<>();
    bcst.add(5);
    bcst.add(5);
    bcst.add(5);
    Vector<Boolean> bcsl = new Vector<>();
    bcsl.add(false);
    bcsl.add(false);
    bcsl.add(false);
    brownClusters = BrownClusters.get(bcs, bcst, bcsl);
    WordNetManager.loadConfigAsClasspathResource(true);
    wordNet = WordNetManager.getInstance();
    // Pick the mention view that matches the corpus mode.
    String mentionViewName = "";
    if (_mode.equals("ACE05")) {
        mentionViewName = ViewNames.MENTION_ACE;
    } else if (_mode.equals("ERE")) {
        mentionViewName = ViewNames.MENTION_ERE;
    } else if (_mode.equals("ColumnFormat")) {
        mentionViewName = "MENTIONS";
    } else {
        // Note: mentionViewName stays empty, so ta.getView() below will fail for an undefined mode.
        System.out.println("No actions for undefined mode");
    }
    for (TextAnnotation ta : taList) {
        View tokenView = ta.getView(ViewNames.TOKENS);
        View mentionView = ta.getView(mentionViewName);
        View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f);
        // Default every token to the "outside" tag.
        String[] token2tags = new String[tokenView.getConstituents().size()];
        for (int i = 0; i < token2tags.length; i++) {
            token2tags[i] = "O";
        }
        for (Constituent c : mentionView.getConstituents()) {
            if (!_type.equals("ALL")) {
                String excludeType = _type;
                if (_type.startsWith("SPE_")) {
                    excludeType = _type.substring(4);
                }
                if (!c.getAttribute("EntityMentionType").equals(excludeType)) {
                    continue;
                }
            }
            Constituent cHead = ACEReader.getEntityHeadForConstituent(c, ta, "HEAD");
            if (_mode.equals("ERE")) {
                c.addAttribute("EntityType", c.getLabel());
            }
            if (cHead == null) {
                continue;
            }
            if (c.getAttribute("EntityType").equals("VEH") || c.getAttribute("EntityType").equals("WEA")) {
                // Intentionally a no-op: skipping vehicle/weapon mentions is currently disabled.
                // continue;
            }
            if (_isBIO) {
                // Plain BIO tagging over the head span.
                token2tags[cHead.getStartSpan()] = "B-" + c.getAttribute("EntityType") + "," + c.getAttribute("EntityMentionType");
                for (int i = cHead.getStartSpan() + 1; i < cHead.getEndSpan(); i++) {
                    token2tags[i] = "I-" + c.getAttribute("EntityType") + "," + c.getAttribute("EntityMentionType");
                }
            } else {
                // BIOLU tagging: single-token heads get "U", multi-token heads end with "L".
                if (cHead.getStartSpan() + 1 == cHead.getEndSpan()) {
                    token2tags[cHead.getStartSpan()] = "U-" + c.getAttribute("EntityType") + "," + c.getAttribute("EntityMentionType");
                } else {
                    token2tags[cHead.getStartSpan()] = "B-" + c.getAttribute("EntityType") + "," + c.getAttribute("EntityMentionType");
                    for (int i = cHead.getStartSpan() + 1; i < cHead.getEndSpan() - 1; i++) {
                        token2tags[i] = "I-" + c.getAttribute("EntityType") + "," + c.getAttribute("EntityMentionType");
                    }
                    token2tags[cHead.getEndSpan() - 1] = "L-" + c.getAttribute("EntityType") + "," + c.getAttribute("EntityMentionType");
                }
            }
        }
        // Materialize one constituent per token, carrying the tag and the lexical features.
        for (int i = 0; i < token2tags.length; i++) {
            Constituent curToken = tokenView.getConstituentsCoveringToken(i).get(0);
            Constituent newToken = curToken.cloneForNewView("BIO");
            if (token2tags[i].equals("O")) {
                newToken.addAttribute("BIO", token2tags[i]);
            } else {
                String[] group = token2tags[i].split(",");
                String tag = group[0];
                String eml = group[1];
                newToken.addAttribute("BIO", tag);
                newToken.addAttribute("EntityMentionType", eml);
            }
            newToken.addAttribute("GAZ", ((FlatGazetteers) gazetteers).annotateConstituent(newToken, _isBIO));
            newToken.addAttribute("BC", brownClusters.getPrefixesCombined(newToken.toString()));
            // Skip WordNet lookups for URL-like tokens.
            if (!newToken.toString().contains("http")) {
                newToken.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(wordNet, newToken));
                newToken.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(wordNet, newToken));
            } else {
                newToken.addAttribute("WORDNETTAG", ",");
                newToken.addAttribute("WORDNETHYM", ",");
            }
            if (_binary_indicator.equals("TRAIN")) {
                newToken.addAttribute("isTraining", "true");
            } else {
                newToken.addAttribute("isTraining", "false");
            }
            bioView.addConstituent(newToken);
        }
        ta.addView("BIO", bioView);
        for (Constituent c : bioView) {
            ret.add(c);
        }
    }
    return ret;
}
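The Datastore download, gazetteer construction, and three-corpus BrownClusters setup above are repeated verbatim in every snippet in this section. A minimal sketch of how that boilerplate could be factored into a shared helper; the class name MentionResources is a hypothetical illustration, not part of cogcomp-nlp, and the import paths are inferred from the snippets in this section.

import java.io.File;
import java.util.Vector;
import edu.illinois.cs.cogcomp.core.constants.Language;
import edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator;
import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters;
import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers;
import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.GazetteersFactory;
import org.cogcomp.Datastore;

// Hypothetical helper: centralizes the resource setup duplicated across the readers in this section.
public final class MentionResources {

    // Builds the same three Brown cluster corpora used throughout this section,
    // each with a prefix threshold of 5 and lowercasing disabled.
    public static BrownClusters loadBrownClusters() {
        Vector<String> paths = new Vector<>();
        paths.add("brown-clusters" + File.separator + "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
        paths.add("brown-clusters" + File.separator + "brownBllipClusters");
        paths.add("brown-clusters" + File.separator + "brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt");
        Vector<Integer> thresholds = new Vector<>();
        Vector<Boolean> lowercase = new Vector<>();
        for (int i = 0; i < paths.size(); i++) {
            thresholds.add(5);
            lowercase.add(false);
        }
        return BrownClusters.get(paths, thresholds, lowercase);
    }

    // Fetches the gazetteer directory from the CogComp Datastore, exactly as in the snippets above.
    public static Gazetteers loadGazetteers() throws Exception {
        Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File dir = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
        return GazetteersFactory.get(5, dir.getPath() + File.separator + "gazetteers", true, Language.English);
    }
}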
Use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters in project cogcomp-nlp by CogComp.
The class ExtentReader, method getPairs.
public List<Relation> getPairs() {
    List<Relation> ret = new ArrayList<>();
    WordNetManager wordNet = null;
    Gazetteers gazetteers = null;
    BrownClusters brownClusters = null;
    try {
        // Same resource setup as BIOReader: WordNet, gazetteers from the Datastore, three Brown cluster corpora.
        WordNetManager.loadConfigAsClasspathResource(true);
        wordNet = WordNetManager.getInstance();
        Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
        gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
        Vector<String> bcs = new Vector<>();
        bcs.add("brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
        bcs.add("brown-clusters/brownBllipClusters");
        bcs.add("brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt");
        Vector<Integer> bcst = new Vector<>();
        bcst.add(5);
        bcst.add(5);
        bcst.add(5);
        Vector<Boolean> bcsl = new Vector<>();
        bcsl.add(false);
        bcsl.add(false);
        bcsl.add(false);
        brownClusters = BrownClusters.get(bcs, bcst, bcsl);
    } catch (Exception e) {
        e.printStackTrace();
    }
    for (TextAnnotation ta : taList) {
        // Documents with ids starting with "bn"/"nw" come from ACE; everything else from ERE.
        String mentionViewName = ViewNames.MENTION_ERE;
        if (ta.getId().startsWith("bn") || ta.getId().startsWith("nw")) {
            mentionViewName = ViewNames.MENTION_ACE;
        }
        View mentionView = ta.getView(mentionViewName);
        View tokenView = ta.getView(ViewNames.TOKENS);
        for (Constituent mention : mentionView) {
            Constituent head = ACEReader.getEntityHeadForConstituent(mention, ta, "HEADS");
            if (head == null) {
                continue;
            }
            if (!head.hasAttribute("EntityType")) {
                head.addAttribute("EntityType", head.getLabel());
            }
            ExtentTester.addHeadAttributes(head, gazetteers, brownClusters, wordNet);
            // Positive examples: every extent token outside the head attaches to the head.
            for (int i = mention.getStartSpan(); i < mention.getEndSpan(); i++) {
                if (i >= head.getStartSpan() && i < head.getEndSpan()) {
                    continue;
                }
                Constituent curToken = tokenView.getConstituentsCoveringToken(i).get(0);
                ExtentTester.addExtentAttributes(curToken, gazetteers, brownClusters, wordNet);
                Relation R = new Relation("true", curToken, head, 1.0f);
                ret.add(R);
            }
            // Negative examples: the tokens immediately before and after the extent.
            if (mention.getStartSpan() > 0) {
                Constituent curToken = tokenView.getConstituentsCoveringToken(mention.getStartSpan() - 1).get(0);
                ExtentTester.addExtentAttributes(curToken, gazetteers, brownClusters, wordNet);
                Relation falseR = new Relation("false", curToken, head, 1.0f);
                ret.add(falseR);
            }
            if (mention.getEndSpan() < tokenView.getEndSpan()) {
                Constituent curToken = tokenView.getConstituentsCoveringToken(mention.getEndSpan()).get(0);
                ExtentTester.addExtentAttributes(curToken, gazetteers, brownClusters, wordNet);
                Relation falseR = new Relation("false", curToken, head, 1.0f);
                ret.add(falseR);
            }
        }
    }
    return ret;
}
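Each Relation produced above pairs a candidate token with a mention head: label "true" for extent tokens outside the head, "false" for the first token beyond either extent boundary. A short usage sketch that tallies this label distribution; the single-argument ExtentReader constructor and the fold path follow their use in testExtentOnPredictedHead below.

// Sketch: count positive vs. negative extent pairs from one training fold.
ExtentReader reader = new ExtentReader("data/partition_with_dev/train/0");
int positives = 0;
int negatives = 0;
for (Relation r : reader.getPairs()) {
    if ("true".equals(r.getRelationName())) {
        positives++;  // token inside the gold extent
    } else {
        negatives++;  // token just outside the extent boundary
    }
}
System.out.println("extent pairs: " + positives + " true, " + negatives + " false");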
Use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters in project cogcomp-nlp by CogComp.
The class ExtentTester, method testExtentOnPredictedHead.
public static void testExtentOnPredictedHead() throws InvalidPortException, InvalidEndpointException, DatastoreException, IOException, JWNLException {
    WordNetManager wordNet = null;
    Gazetteers gazetteers = null;
    BrownClusters brownClusters = null;
    try {
        // Same resource setup as the readers above.
        WordNetManager.loadConfigAsClasspathResource(true);
        wordNet = WordNetManager.getInstance();
        Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
        gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
        Vector<String> bcs = new Vector<>();
        bcs.add("brown-clusters" + File.separator + "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
        bcs.add("brown-clusters" + File.separator + "brownBllipClusters");
        bcs.add("brown-clusters" + File.separator + "brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt");
        Vector<Integer> bcst = new Vector<>();
        bcst.add(5);
        bcst.add(5);
        bcst.add(5);
        Vector<Boolean> bcsl = new Vector<>();
        bcsl.add(false);
        bcsl.add(false);
        bcsl.add(false);
        brownClusters = BrownClusters.get(bcs, bcst, bcsl);
    } catch (Exception e) {
        e.printStackTrace();
    }
    int total_mention_predicted = 0;
    int total_mention_labeled = 0;
    int total_mention_head_correct = 0;
    int total_mention_extent_correct = 0;
    // Five-fold evaluation: train head and extent classifiers on each training split,
    // then score on the corresponding eval split.
    for (int i = 0; i < 5; i++) {
        // One head classifier per mention type: named, nominal, pronominal.
        BIOReader h_train_parser_nam = new BIOReader("data/partition_with_dev/train/" + i, "ACE05-TRAIN", "NAM", false);
        BIOReader h_train_parser_nom = new BIOReader("data/partition_with_dev/train/" + i, "ACE05-TRAIN", "NOM", false);
        BIOReader h_train_parser_pro = new BIOReader("data/partition_with_dev/train/" + i, "ACE05-TRAIN", "PRO", false);
        bio_classifier_nam h_classifier_nam = BIOTester.train_nam_classifier(h_train_parser_nam);
        bio_classifier_nom h_classifier_nom = BIOTester.train_nom_classifier(h_train_parser_nom);
        bio_classifier_pro h_classifier_pro = BIOTester.train_pro_classifier(h_train_parser_pro);
        Learner[] h_candidates = new Learner[3];
        h_candidates[0] = h_classifier_nam;
        h_candidates[1] = h_classifier_nom;
        h_candidates[2] = h_classifier_pro;
        ExtentReader e_train_parser = new ExtentReader("data/partition_with_dev/train/" + i);
        extent_classifier e_classifier = train_extent_classifier(e_train_parser);
        BIOReader test_parser = new BIOReader("data/partition_with_dev/eval/" + i, "ACE05-EVAL", "ALL", false);
        test_parser.reset();
        String preBIOLevel1 = "";
        String preBIOLevel2 = "";
        List<Constituent> predictedHeads = new ArrayList<>();
        List<Constituent> predictedMentions = new ArrayList<>();
        // Predict heads token by token, feeding the two previous tags as context features.
        for (Object example = test_parser.next(); example != null; example = test_parser.next()) {
            ((Constituent) example).addAttribute("preBIOLevel1", preBIOLevel1);
            ((Constituent) example).addAttribute("preBIOLevel2", preBIOLevel2);
            Pair<String, Integer> h_prediction = BIOTester.joint_inference((Constituent) example, h_candidates);
            String bioTag = h_prediction.getFirst();
            if (bioTag.startsWith("B") || bioTag.startsWith("U")) {
                Constituent predictMention = BIOTester.getConstituent((Constituent) example, h_candidates[h_prediction.getSecond()], false);
                predictedHeads.add(predictMention);
            }
            preBIOLevel2 = preBIOLevel1;
            preBIOLevel1 = bioTag;
        }
        // Expand each predicted head into a full mention with the extent classifier.
        for (Constituent head : predictedHeads) {
            Constituent mention = getFullMention(e_classifier, head, gazetteers, brownClusters, wordNet);
            predictedMentions.add(mention);
        }
        List<Constituent> goldMentions = new ArrayList<>();
        ACEReader aceReader = null;
        try {
            aceReader = new ACEReader("data/partition_with_dev/eval/" + i, false);
        } catch (Exception e) {
            e.printStackTrace();
        }
        for (TextAnnotation ta : aceReader) {
            goldMentions.addAll(ta.getView(ViewNames.MENTION_ACE).getConstituents());
        }
        total_mention_labeled += goldMentions.size();
        total_mention_predicted += predictedMentions.size();
        // A prediction scores a head match when its head span equals a gold head span in the
        // same document, and an extent match when the full spans also agree.
        for (Constituent p : predictedMentions) {
            Constituent ph = getPredictedMentionHead(p);
            for (Constituent g : goldMentions) {
                if (!p.getTextAnnotation().getText().equals(g.getTextAnnotation().getText())) {
                    continue;
                }
                Constituent gh = ACEReader.getEntityHeadForConstituent(g, g.getTextAnnotation(), "TESTG");
                try {
                    if (ph.getStartSpan() == gh.getStartSpan() && ph.getEndSpan() == gh.getEndSpan()) {
                        total_mention_head_correct++;
                        if (g.getStartSpan() == p.getStartSpan() && g.getEndSpan() == p.getEndSpan()) {
                            total_mention_extent_correct++;
                        }
                        break;
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
    System.out.println("Total labeled mention: " + total_mention_labeled);
    System.out.println("Total predicted mention: " + total_mention_predicted);
    System.out.println("Total head correct: " + total_mention_head_correct);
    System.out.println("Total extent correct: " + total_mention_extent_correct);
}
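The method prints raw counts only; precision, recall, and F1 over heads (or, analogously, extents) follow directly from the four counters. A minimal sketch, assuming the counters are in scope and nonzero:

// Sketch: derive standard metrics from the counters printed above.
double precision = (double) total_mention_head_correct / total_mention_predicted;
double recall = (double) total_mention_head_correct / total_mention_labeled;
double f1 = 2 * precision * recall / (precision + recall);
System.out.printf("head P=%.2f%% R=%.2f%% F1=%.2f%%%n", 100 * precision, 100 * recall, 100 * f1);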
Use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters in project cogcomp-nlp by CogComp.
The class ExtentTester, method testExtentOnGoldHead.
public static void testExtentOnGoldHead() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException {
    int labeled = 0;
    int correct = 0;
    POSAnnotator posAnnotator = null;
    WordNetManager wordNet = null;
    Gazetteers gazetteers = null;
    BrownClusters brownClusters = null;
    try {
        // Same resource setup as above, plus a POS annotator for the combined reader.
        WordNetManager.loadConfigAsClasspathResource(true);
        wordNet = WordNetManager.getInstance();
        posAnnotator = new POSAnnotator();
        Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
        gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
        Vector<String> bcs = new Vector<>();
        bcs.add("brown-clusters" + File.separator + "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
        bcs.add("brown-clusters" + File.separator + "brownBllipClusters");
        bcs.add("brown-clusters" + File.separator + "brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt");
        Vector<Integer> bcst = new Vector<>();
        bcst.add(5);
        bcst.add(5);
        bcst.add(5);
        Vector<Boolean> bcsl = new Vector<>();
        bcsl.add(false);
        bcsl.add(false);
        bcsl.add(false);
        brownClusters = BrownClusters.get(bcs, bcst, bcsl);
    } catch (Exception e) {
        e.printStackTrace();
    }
    // Only the first fold is evaluated here.
    for (int i = 0; i < 1; i++) {
        ExtentReader train_parser = new ExtentReader("data/partition_with_dev/train/" + i, "COMBINED-ALL-TRAIN-" + i);
        extent_classifier classifier = train_extent_classifier(train_parser);
        BIOCombinedReader bioCombinedReader = null;
        try {
            bioCombinedReader = new BIOCombinedReader(i, "ALL-EVAL", "ALL", true);
        } catch (Exception e) {
            e.printStackTrace();
        }
        for (Object ota = bioCombinedReader.next(); ota != null; ota = bioCombinedReader.next()) {
            TextAnnotation ta = (TextAnnotation) ota;
            try {
                ta.addView(posAnnotator);
            } catch (Exception e) {
                e.printStackTrace();
            }
            // Documents with ids starting with "bn"/"nw" come from ACE; everything else from ERE.
            String mentionViewName = ViewNames.MENTION_ERE;
            if (ta.getId().startsWith("bn") || ta.getId().startsWith("nw")) {
                mentionViewName = ViewNames.MENTION_ACE;
            }
            View mentionView = ta.getView(mentionViewName);
            for (Constituent mention : mentionView.getConstituents()) {
                Constituent head = ACEReader.getEntityHeadForConstituent(mention, ta, "HEADS");
                if (head == null) {
                    continue;
                }
                labeled++;
                // Predict the full extent from the gold head and compare spans exactly.
                Constituent predictedFullMention = getFullMention(classifier, head, gazetteers, brownClusters, wordNet);
                if (predictedFullMention.getStartSpan() == mention.getStartSpan() && predictedFullMention.getEndSpan() == mention.getEndSpan()) {
                    correct++;
                } else {
                    System.out.println("Gold: " + mention.toString());
                    System.out.println("Predicted: " + predictedFullMention.toString());
                }
            }
        }
    }
    System.out.println("Labeled: " + labeled);
    System.out.println("Correct: " + correct);
    System.out.println("Correctness: " + (double) correct * 100.0 / (double) labeled);
}
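getFullMention itself is not shown in this section, but the training signal from ExtentReader.getPairs ("true" for tokens inside the extent, "false" for the first token past each boundary) suggests a boundary-growing decoder. The following is a speculative sketch of that idea only, not the actual cogcomp-nlp implementation; the method name growExtent, the "EXTENT_SKETCH" view name, and the attach predicate (a stand-in for the trained extent_classifier's decision on one token-head pair) are all hypothetical.

import java.util.function.BiPredicate;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.View;

// Speculative sketch: grow the extent outward from the head while the
// classifier keeps attaching the adjacent token to the head.
static Constituent growExtent(Constituent head, View tokenView,
                              BiPredicate<Constituent, Constituent> attach) {
    int left = head.getStartSpan();
    int right = head.getEndSpan();
    // Extend left while the classifier accepts the previous token.
    while (left > 0 && attach.test(tokenView.getConstituentsCoveringToken(left - 1).get(0), head)) {
        left--;
    }
    // Extend right while the classifier accepts the next token.
    while (right < tokenView.getEndSpan() && attach.test(tokenView.getConstituentsCoveringToken(right).get(0), head)) {
        right++;
    }
    return new Constituent(head.getLabel(), "EXTENT_SKETCH", head.getTextAnnotation(), left, right);
}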
Use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters in project cogcomp-nlp by CogComp.
The class BIOCombinedReader, method getTokensFromTAs.
private List<Constituent> getTokensFromTAs() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException {
    List<Constituent> ret = new ArrayList<>();
    WordNetManager wordNet = null;
    Gazetteers gazetteers = null;
    BrownClusters brownClusters = null;
    // Same resource setup as BIOReader.
    Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
    File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
    gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
    Vector<String> bcs = new Vector<>();
    bcs.add("brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
    bcs.add("brown-clusters/brownBllipClusters");
    bcs.add("brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt");
    Vector<Integer> bcst = new Vector<>();
    bcst.add(5);
    bcst.add(5);
    bcst.add(5);
    Vector<Boolean> bcsl = new Vector<>();
    bcsl.add(false);
    bcsl.add(false);
    bcsl.add(false);
    brownClusters = BrownClusters.get(bcs, bcst, bcsl);
    WordNetManager.loadConfigAsClasspathResource(true);
    wordNet = WordNetManager.getInstance();
    for (TextAnnotation ta : currentTas) {
        View tokenView = ta.getView(ViewNames.TOKENS);
        // Documents with ids starting with "bn"/"nw" come from ACE; everything else from ERE.
        String mentionViewName = "";
        if (ta.getId().startsWith("bn") || ta.getId().startsWith("nw")) {
            mentionViewName = ViewNames.MENTION_ACE;
        } else {
            mentionViewName = ViewNames.MENTION_ERE;
        }
        View mentionView = ta.getView(mentionViewName);
        View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f);
        // Default every token to the "outside" tag.
        String[] token2tags = new String[tokenView.getConstituents().size()];
        for (int i = 0; i < token2tags.length; i++) {
            token2tags[i] = "O";
        }
        for (Constituent c : mentionView.getConstituents()) {
            // Unless reading all types, keep only mentions of the requested mention type.
            if (!_type.equals("ALL") && !c.getAttribute("EntityMentionType").equals(_type)) {
                continue;
            }
            Constituent cHead = ACEReader.getEntityHeadForConstituent(c, ta, "HEAD");
            if (!c.hasAttribute("EntityType")) {
                c.addAttribute("EntityType", c.getLabel());
            }
            if (cHead == null) {
                continue;
            }
            if (c.getAttribute("EntityType").equals("VEH") || c.getAttribute("EntityType").equals("WEA")) {
                // Intentionally a no-op: skipping vehicle/weapon mentions is currently disabled.
                // continue;
            }
            // Collapse all entity types into a single "MENTION" label.
            c.addAttribute("EntityType", "MENTION");
            // Note: unlike BIOReader, the tagging schema here is always BIOLU.
            if (cHead.getStartSpan() + 1 == cHead.getEndSpan()) {
                token2tags[cHead.getStartSpan()] = "U-" + c.getAttribute("EntityType") + "," + c.getAttribute("EntityMentionType");
            } else {
                token2tags[cHead.getStartSpan()] = "B-" + c.getAttribute("EntityType") + "," + c.getAttribute("EntityMentionType");
                for (int i = cHead.getStartSpan() + 1; i < cHead.getEndSpan() - 1; i++) {
                    token2tags[i] = "I-" + c.getAttribute("EntityType") + "," + c.getAttribute("EntityMentionType");
                }
                token2tags[cHead.getEndSpan() - 1] = "L-" + c.getAttribute("EntityType") + "," + c.getAttribute("EntityMentionType");
            }
        }
        // Materialize one constituent per token, carrying the tag and the lexical features.
        for (int i = 0; i < token2tags.length; i++) {
            Constituent curToken = tokenView.getConstituentsCoveringToken(i).get(0);
            Constituent newToken = curToken.cloneForNewView("BIO");
            if (token2tags[i].equals("O")) {
                newToken.addAttribute("BIO", token2tags[i]);
            } else {
                String[] group = token2tags[i].split(",");
                String tag = group[0];
                String eml = group[1];
                newToken.addAttribute("BIO", tag);
                newToken.addAttribute("EntityMentionType", eml);
            }
            newToken.addAttribute("GAZ", ((FlatGazetteers) gazetteers).annotateConstituent(newToken, false));
            newToken.addAttribute("BC", brownClusters.getPrefixesCombined(newToken.toString()));
            // Skip WordNet lookups for URL-like tokens.
            if (!newToken.toString().contains("http")) {
                newToken.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(wordNet, newToken));
                newToken.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(wordNet, newToken));
            } else {
                newToken.addAttribute("WORDNETTAG", ",");
                newToken.addAttribute("WORDNETHYM", ",");
            }
            if (_mode.contains("TRAIN")) {
                newToken.addAttribute("isTraining", "true");
            } else {
                newToken.addAttribute("isTraining", "false");
            }
            bioView.addConstituent(newToken);
        }
        ta.addView("BIO", bioView);
        for (Constituent c : bioView) {
            ret.add(c);
        }
    }
    return ret;
}
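Downstream consumers read the "BIO" attribute written above and must reassemble head spans from the BIOLU tags. A minimal decoding sketch, assuming tokens arrive in document order; the method name decodeSpans is hypothetical, while the tag prefixes and the "BIO" attribute match the code above, and IntPair is cogcomp-core's integer span pair.

import java.util.ArrayList;
import java.util.List;
import edu.illinois.cs.cogcomp.core.datastructures.IntPair;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent;

// Sketch: reassemble head spans from the BIOLU tags stored in the "BIO" attribute.
static List<IntPair> decodeSpans(List<Constituent> bioTokens) {
    List<IntPair> spans = new ArrayList<>();
    int start = -1;
    for (Constituent token : bioTokens) {
        String tag = token.getAttribute("BIO");
        if (tag.startsWith("U")) {
            // Single-token mention head.
            spans.add(new IntPair(token.getStartSpan(), token.getEndSpan()));
            start = -1;
        } else if (tag.startsWith("B")) {
            start = token.getStartSpan();
        } else if (tag.startsWith("L") && start >= 0) {
            // Close the span opened by the last "B" tag.
            spans.add(new IntPair(start, token.getEndSpan()));
            start = -1;
        } else if (tag.startsWith("O")) {
            start = -1;
        }
        // "I" tokens simply continue the open span.
    }
    return spans;
}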