Use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers in project cogcomp-nlp by CogComp.
Class BIOReader, method getTokensFromTAs.
/**
 * Builds a token-level "BIO" view for every {@code TextAnnotation} in {@code taList}, attaches
 * it to that annotation, and returns the tokens of all those views in iteration order.
 *
 * <p>Each output token is a clone (for the "BIO" view) of the first TOKENS-view constituent
 * covering that token index, and carries these attributes:
 * <ul>
 *   <li>{@code BIO} - {@code "O"}, or {@code B-/I-} (when {@code _isBIO} is set) or
 *       {@code B-/I-/L-/U-} (otherwise) followed by the mention's {@code EntityType};</li>
 *   <li>{@code EntityMentionType} - only on tagged (non-"O") tokens;</li>
 *   <li>{@code GAZ}, {@code BC}, {@code WORDNETTAG}, {@code WORDNETHYM} - gazetteer,
 *       Brown-cluster and WordNet features;</li>
 *   <li>{@code isTraining} - {@code "true"} iff {@code _binary_indicator} equals
 *       {@code "TRAIN"}, else {@code "false"}.</li>
 * </ul>
 *
 * <p>Tags are written on the span of each mention's head as returned by
 * {@code ACEReader.getEntityHeadForConstituent}; mentions without a head are skipped. When
 * {@code _type} is not {@code "ALL"}, only mentions whose {@code EntityMentionType} equals
 * {@code _type} (with a leading {@code "SPE_"} removed) are used. Mentions are processed in
 * view order, so a later mention overwrites an earlier one on shared tokens.
 *
 * <p>The tag and the mention level are kept in two parallel arrays. They used to be joined
 * with "," and split again, which threw {@code ArrayIndexOutOfBoundsException} for an empty
 * mention level (String.split drops trailing empty strings) and mis-split values containing
 * a comma.
 *
 * @return the constituents of every "BIO" view created by this call
 * @throws NullPointerException if a mention with a head has no {@code EntityType} attribute
 * @throws InvalidPortException propagated unchanged from the calls made in this method
 * @throws InvalidEndpointException propagated unchanged from the calls made in this method
 * @throws DatastoreException propagated unchanged from the calls made in this method
 * @throws IOException propagated unchanged from the calls made in this method
 * @throws JWNLException propagated unchanged from the calls made in this method
 */
private List<Constituent> getTokensFromTAs() throws InvalidPortException, InvalidEndpointException, DatastoreException, IOException, JWNLException {
    List<Constituent> ret = new ArrayList<>();

    // --- Feature resources: gazetteers, Brown clusters, WordNet (loaded in this order). ---
    Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
    File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
    // annotateConstituent is only reachable through FlatGazetteers, so cast once here instead
    // of once per token; a wrong implementation type now fails before any annotation is touched.
    FlatGazetteers gazetteers = (FlatGazetteers) GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);

    String[] clusterFiles = {
        "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt",
        "brownBllipClusters",
        "brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt"
    };
    // BrownClusters.get is called with Vectors, so Vector is kept. The second and third vectors
    // hold one entry per cluster file - presumably a frequency threshold and a lowercase flag;
    // confirm against BrownClusters.get.
    Vector<String> bcs = new Vector<>();
    Vector<Integer> bcst = new Vector<>();
    Vector<Boolean> bcsl = new Vector<>();
    for (String clusterFile : clusterFiles) {
        bcs.add("brown-clusters" + File.separator + clusterFile);
        bcst.add(5);
        bcsl.add(false);
    }
    BrownClusters brownClusters = BrownClusters.get(bcs, bcst, bcsl);

    WordNetManager.loadConfigAsClasspathResource(true);
    WordNetManager wordNet = WordNetManager.getInstance();

    // --- Which view holds the gold mentions for this corpus mode. ---
    String mentionViewName = "";
    if (_mode.equals("ACE05")) {
        mentionViewName = ViewNames.MENTION_ACE;
    } else if (_mode.equals("ERE")) {
        mentionViewName = ViewNames.MENTION_ERE;
    } else if (_mode.equals("ColumnFormat")) {
        mentionViewName = "MENTIONS";
    } else {
        // NOTE(review): the view name stays empty here, so the getView call below is made
        // with "" for every annotation - confirm that this is the intended outcome.
        System.out.println("No actions for undefined mode");
    }

    for (TextAnnotation ta : taList) {
        View tokenView = ta.getView(ViewNames.TOKENS);
        View mentionView = ta.getView(mentionViewName);
        View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f);

        // Parallel per-token arrays: bioTags[i] is the tag, mentionLevels[i] the mention level
        // of the mention that tagged token i (null while the token is still "O").
        int tokenCount = tokenView.getConstituents().size();
        String[] bioTags = new String[tokenCount];
        String[] mentionLevels = new String[tokenCount];
        for (int i = 0; i < tokenCount; i++) {
            bioTags[i] = "O";
        }

        for (Constituent c : mentionView.getConstituents()) {
            if (!_type.equals("ALL")) {
                // "SPE_<level>" and "<level>" select the same mention level.
                String wantedLevel = _type.startsWith("SPE_") ? _type.substring(4) : _type;
                if (!c.getAttribute("EntityMentionType").equals(wantedLevel)) {
                    continue;
                }
            }
            Constituent cHead = ACEReader.getEntityHeadForConstituent(c, ta, "HEAD");
            if (_mode.equals("ERE")) {
                // In ERE mode the entity type is taken from the mention's label.
                c.addAttribute("EntityType", c.getLabel());
            }
            if (cHead == null) {
                continue;
            }

            // The removed no-op VEH/WEA check dereferenced this attribute, so a missing
            // EntityType already raised a NullPointerException here; keep that, with a message.
            String entityType = java.util.Objects.requireNonNull(
                    c.getAttribute("EntityType"), "mention has no EntityType attribute");
            // A missing mention level is stored as the literal "null", exactly what the former
            // string concatenation produced, so downstream attribute values do not change.
            String mentionLevel = String.valueOf(c.getAttribute("EntityMentionType"));

            int start = cHead.getStartSpan();
            int end = cHead.getEndSpan();
            if (_isBIO) {
                bioTags[start] = "B-" + entityType;
                mentionLevels[start] = mentionLevel;
                for (int i = start + 1; i < end; i++) {
                    bioTags[i] = "I-" + entityType;
                    mentionLevels[i] = mentionLevel;
                }
            } else if (start + 1 == end) {
                // Single-token head.
                bioTags[start] = "U-" + entityType;
                mentionLevels[start] = mentionLevel;
            } else {
                bioTags[start] = "B-" + entityType;
                mentionLevels[start] = mentionLevel;
                for (int i = start + 1; i < end - 1; i++) {
                    bioTags[i] = "I-" + entityType;
                    mentionLevels[i] = mentionLevel;
                }
                bioTags[end - 1] = "L-" + entityType;
                mentionLevels[end - 1] = mentionLevel;
            }
        }

        for (int i = 0; i < tokenCount; i++) {
            Constituent curToken = tokenView.getConstituentsCoveringToken(i).get(0);
            Constituent newToken = curToken.cloneForNewView("BIO");
            newToken.addAttribute("BIO", bioTags[i]);
            if (!bioTags[i].equals("O")) {
                newToken.addAttribute("EntityMentionType", mentionLevels[i]);
            }
            newToken.addAttribute("GAZ", gazetteers.annotateConstituent(newToken, _isBIO));
            newToken.addAttribute("BC", brownClusters.getPrefixesCombined(newToken.toString()));
            if (!newToken.toString().contains("http")) {
                newToken.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(wordNet, newToken));
                newToken.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(wordNet, newToken));
            } else {
                // Tokens containing "http" skip the WordNet lookup and get a placeholder value.
                newToken.addAttribute("WORDNETTAG", ",");
                newToken.addAttribute("WORDNETHYM", ",");
            }
            newToken.addAttribute("isTraining", _binary_indicator.equals("TRAIN") ? "true" : "false");
            bioView.addConstituent(newToken);
        }

        ta.addView("BIO", bioView);
        for (Constituent c : bioView) {
            ret.add(c);
        }
    }
    return ret;
}
Use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers in project cogcomp-nlp by CogComp.
Class ExampleUsage, method SemEvalAnnotate.
/**
 * Example: classifies the relation between two hand-picked mentions of a fixed sentence with
 * the SemEval relation classifier and prints the predicted label to standard output.
 *
 * <p>Steps: build a {@code TextAnnotation} from the sentence, add POS, chunker and Stanford
 * dependency views, fetch the SEMEVAL model directory and the gazetteers through the
 * {@code Datastore}, add an "RE_ANNOTATED" token view carrying WordNet attributes, then
 * classify a relation between the two mention constituents.
 *
 * <p>If any preparation step fails, the stack trace is printed and the method returns without
 * classifying. Previously execution continued after the catch block and dereferenced a
 * {@code gazetteers} reference that could still be {@code null}.
 */
public static void SemEvalAnnotate() {
    String text = "People have been moving back into downtown.";
    String corpus = "semeval";
    String textId = "001";

    // Create a TextAnnotation from the raw text.
    TextAnnotationBuilder stab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation ta = stab.createTextAnnotation(corpus, textId, text);

    POSAnnotator cogcompPos = new POSAnnotator();
    ChunkerAnnotator chunker = new ChunkerAnnotator(true);
    chunker.initialize(new ChunkerConfigurator().getDefaultConfig());

    Properties stanfordProps = new Properties();
    stanfordProps.put("annotators", "pos, parse");
    // NOTE(review): this stores a Boolean, and java.util.Properties#getProperty returns null
    // for non-String values. If the Stanford annotators read the flag through getProperty it
    // is ignored. Left unchanged because switching to "true" could alter parser output -
    // confirm before changing.
    stanfordProps.put("parse.originalDependencies", true);
    stanfordProps.put("parse.maxlen", Stanford331Configurator.STFRD_MAX_SENTENCE_LENGTH);
    stanfordProps.put("parse.maxtime", Stanford331Configurator.STFRD_TIME_PER_SENTENCE);
    POSTaggerAnnotator stanfordPos = new POSTaggerAnnotator("pos", stanfordProps);
    ParserAnnotator stanfordParser = new ParserAnnotator("parse", stanfordProps);
    StanfordDepHandler stanfordDepHandler = new StanfordDepHandler(stanfordPos, stanfordParser);

    // Both are assigned inside the try block; the catch block returns, so they are definitely
    // assigned (and non-stale) wherever they are read below.
    String modelPath;
    FlatGazetteers gazetteers;
    try {
        ta.addView(cogcompPos);
        chunker.addView(ta);
        stanfordDepHandler.addView(ta);

        Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File model = ds.getDirectory("org.cogcomp.re", "SEMEVAL", 1.1, false);
        modelPath = model.getPath();
        File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
        gazetteers = (FlatGazetteers) GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);

        WordNetManager.loadConfigAsClasspathResource(true);
        WordNetManager wordnet = WordNetManager.getInstance();

        // Copy every token into a new view, keeping its attributes and adding WordNet ones.
        View annotatedTokenView = new SpanLabelView("RE_ANNOTATED", ta);
        for (Constituent co : ta.getView(ViewNames.TOKENS).getConstituents()) {
            Constituent c = co.cloneForNewView("RE_ANNOTATED");
            for (String s : co.getAttributeKeys()) {
                c.addAttribute(s, co.getAttribute(s));
            }
            c.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(wordnet, c));
            c.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(wordnet, c));
            annotatedTokenView.addConstituent(c);
        }
        ta.addView("RE_ANNOTATED", annotatedTokenView);
    } catch (Exception e) {
        e.printStackTrace();
        // The model path, the gazetteers or the views may be missing at this point, so the
        // classification below cannot produce a meaningful result; stop here.
        return;
    }

    // Two single-token mentions at token spans [0, 1) and [6, 7) - presumably "People" and
    // "downtown" under this tokenizer.
    Constituent source = new Constituent("first", "Mention", ta, 0, 1);
    Constituent target = new Constituent("second", "Mention", ta, 6, 7);
    source.addAttribute("GAZ", gazetteers.annotatePhrase(source));
    target.addAttribute("GAZ", gazetteers.annotatePhrase(target));
    Relation relation = new Relation("TEST", source, target, 1.0f);

    // Model files are <modelPath>/SEMEVAL/SEMEVAL.lc and <modelPath>/SEMEVAL/SEMEVAL.lex.
    String prefix = modelPath + File.separator + "SEMEVAL" + File.separator + "SEMEVAL";
    semeval_relation_classifier classifier = new semeval_relation_classifier(prefix + ".lc", prefix + ".lex");
    String tag = classifier.discreteValue(relation);
    System.out.println(tag);
}
Use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers in project cogcomp-nlp by CogComp.
Class BIOCombinedReader, method getTokensFromTAs.
/**
 * Builds a token-level "BIO" view for every {@code TextAnnotation} in {@code currentTas},
 * attaches it to that annotation, and returns the tokens of all those views in iteration
 * order.
 *
 * <p>The mention view is chosen per document: {@code ViewNames.MENTION_ACE} when the document
 * id starts with "bn" or "nw", {@code ViewNames.MENTION_ERE} otherwise. Unlike BIOReader, the
 * tagging schema is always B-/I-/L-/U-. Before tagging, each used mention has its
 * {@code EntityType} attribute set to its label if absent and then to {@code "MENTION"}; both
 * writes go to the mention constituent itself.
 *
 * <p>Each output token is a clone (for the "BIO" view) of the first TOKENS-view constituent
 * covering that token index, with attributes {@code BIO}, {@code EntityMentionType} (tagged
 * tokens only), {@code GAZ}, {@code BC}, {@code WORDNETTAG}, {@code WORDNETHYM} and
 * {@code isTraining} ({@code "true"} iff {@code _mode} contains "TRAIN").
 *
 * <p>The tag and the mention level are kept in two parallel arrays. They used to be joined
 * with "," and split again, which threw {@code ArrayIndexOutOfBoundsException} for an empty
 * mention level (String.split drops trailing empty strings) and mis-split values containing
 * a comma.
 *
 * @return the constituents of every "BIO" view created by this call
 * @throws InvalidPortException propagated unchanged from the calls made in this method
 * @throws InvalidEndpointException propagated unchanged from the calls made in this method
 * @throws IOException propagated unchanged from the calls made in this method
 * @throws JWNLException propagated unchanged from the calls made in this method
 * @throws DatastoreException propagated unchanged from the calls made in this method
 */
private List<Constituent> getTokensFromTAs() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException {
    List<Constituent> ret = new ArrayList<>();

    // --- Feature resources: gazetteers, Brown clusters, WordNet (loaded in this order). ---
    Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
    File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
    // annotateConstituent is only reachable through FlatGazetteers, so cast once here instead
    // of once per token; a wrong implementation type now fails before any annotation is touched.
    FlatGazetteers gazetteers = (FlatGazetteers) GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);

    String[] clusterFiles = {
        "brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt",
        "brown-clusters/brownBllipClusters",
        "brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt"
    };
    // BrownClusters.get is called with Vectors, so Vector is kept. The second and third vectors
    // hold one entry per cluster file - presumably a frequency threshold and a lowercase flag;
    // confirm against BrownClusters.get.
    Vector<String> bcs = new Vector<>();
    Vector<Integer> bcst = new Vector<>();
    Vector<Boolean> bcsl = new Vector<>();
    for (String clusterFile : clusterFiles) {
        bcs.add(clusterFile);
        bcst.add(5);
        bcsl.add(false);
    }
    BrownClusters brownClusters = BrownClusters.get(bcs, bcst, bcsl);

    WordNetManager.loadConfigAsClasspathResource(true);
    WordNetManager wordNet = WordNetManager.getInstance();

    for (TextAnnotation ta : currentTas) {
        View tokenView = ta.getView(ViewNames.TOKENS);

        // Documents whose id starts with "bn" or "nw" use the ACE mention view, all others ERE.
        String docId = ta.getId();
        boolean usesAceView = docId.startsWith("bn") || docId.startsWith("nw");
        String mentionViewName = usesAceView ? ViewNames.MENTION_ACE : ViewNames.MENTION_ERE;
        View mentionView = ta.getView(mentionViewName);

        // The generator name is BIOReader's class name, unchanged from the previous code.
        View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f);

        // Parallel per-token arrays: bioTags[i] is the tag, mentionLevels[i] the mention level
        // of the mention that tagged token i (null while the token is still "O").
        int tokenCount = tokenView.getConstituents().size();
        String[] bioTags = new String[tokenCount];
        String[] mentionLevels = new String[tokenCount];
        for (int i = 0; i < tokenCount; i++) {
            bioTags[i] = "O";
        }

        for (Constituent c : mentionView.getConstituents()) {
            if (!_type.equals("ALL") && !c.getAttribute("EntityMentionType").equals(_type)) {
                continue;
            }
            Constituent cHead = ACEReader.getEntityHeadForConstituent(c, ta, "HEAD");
            if (!c.hasAttribute("EntityType")) {
                c.addAttribute("EntityType", c.getLabel());
            }
            if (cHead == null) {
                continue;
            }

            // Every used mention is tagged with the same type. The value is read back from the
            // constituent rather than hard-coded so the tag always matches the stored attribute.
            c.addAttribute("EntityType", "MENTION");
            String entityType = c.getAttribute("EntityType");
            // A missing mention level is stored as the literal "null", exactly what the former
            // string concatenation produced, so downstream attribute values do not change.
            String mentionLevel = String.valueOf(c.getAttribute("EntityMentionType"));

            int start = cHead.getStartSpan();
            int end = cHead.getEndSpan();
            if (start + 1 == end) {
                // Single-token head.
                bioTags[start] = "U-" + entityType;
                mentionLevels[start] = mentionLevel;
            } else {
                bioTags[start] = "B-" + entityType;
                mentionLevels[start] = mentionLevel;
                for (int i = start + 1; i < end - 1; i++) {
                    bioTags[i] = "I-" + entityType;
                    mentionLevels[i] = mentionLevel;
                }
                bioTags[end - 1] = "L-" + entityType;
                mentionLevels[end - 1] = mentionLevel;
            }
        }

        for (int i = 0; i < tokenCount; i++) {
            Constituent curToken = tokenView.getConstituentsCoveringToken(i).get(0);
            Constituent newToken = curToken.cloneForNewView("BIO");
            newToken.addAttribute("BIO", bioTags[i]);
            if (!bioTags[i].equals("O")) {
                newToken.addAttribute("EntityMentionType", mentionLevels[i]);
            }
            newToken.addAttribute("GAZ", gazetteers.annotateConstituent(newToken, false));
            newToken.addAttribute("BC", brownClusters.getPrefixesCombined(newToken.toString()));
            if (!newToken.toString().contains("http")) {
                newToken.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(wordNet, newToken));
                newToken.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(wordNet, newToken));
            } else {
                // Tokens containing "http" skip the WordNet lookup and get a placeholder value.
                newToken.addAttribute("WORDNETTAG", ",");
                newToken.addAttribute("WORDNETHYM", ",");
            }
            newToken.addAttribute("isTraining", _mode.contains("TRAIN") ? "true" : "false");
            bioView.addConstituent(newToken);
        }

        ta.addView("BIO", bioView);
        for (Constituent c : bioView) {
            ret.add(c);
        }
    }
    return ret;
}
Aggregations