Search in sources :

Example 1 with FlatGazetteers

use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers in project cogcomp-nlp by CogComp.

the class BIOReader method getTokensFromTAs.

/**
 * Builds a "BIO" view for every {@code TextAnnotation} in {@code taList} and returns the
 * constituents of all those views in one list.
 *
 * <p>For each annotation, the head span of every selected mention (as returned by
 * {@code ACEReader.getEntityHeadForConstituent}) is tagged, and every other token is tagged "O".
 * When {@code _isBIO} is set the tags are B-/I-; otherwise they are B-/I-/L-/U-. Each token is
 * then cloned into the "BIO" view and given the attributes "BIO", "EntityMentionType" (tagged
 * tokens only), "GAZ", "BC", "WORDNETTAG", "WORDNETHYM" and "isTraining".
 *
 * <p>The mention view is chosen from {@code _mode} ("ACE05", "ERE" or "ColumnFormat"); any other
 * mode prints a message and leaves the view name empty.
 *
 * @return the constituents of every "BIO" view created, in iteration order
 */
private List<Constituent> getTokensFromTAs() throws InvalidPortException, InvalidEndpointException, DatastoreException, IOException, JWNLException {
    List<Constituent> bioTokens = new ArrayList<>();

    // Gazetteers are fetched through the datastore.
    Datastore datastore = new Datastore(new ResourceConfigurator().getDefaultConfig());
    File gazetteerDir = datastore.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
    Gazetteers gazetteers = GazetteersFactory.get(5, gazetteerDir.getPath() + File.separator + "gazetteers", true, Language.English);

    // Three Brown-cluster resources, each registered with the same (5, false) settings.
    String[] brownFileNames = {
        "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt",
        "brownBllipClusters",
        "brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt"
    };
    Vector<String> brownPaths = new Vector<>();
    Vector<Integer> brownThresholds = new Vector<>();
    Vector<Boolean> brownFlags = new Vector<>();
    for (String fileName : brownFileNames) {
        brownPaths.add("brown-clusters" + File.separator + fileName);
        brownThresholds.add(5);
        brownFlags.add(false);
    }
    BrownClusters brownClusters = BrownClusters.get(brownPaths, brownThresholds, brownFlags);

    WordNetManager.loadConfigAsClasspathResource(true);
    WordNetManager wordNet = WordNetManager.getInstance();

    // Pick the mention view that matches the configured corpus mode.
    String mentionViewName;
    if (_mode.equals("ACE05")) {
        mentionViewName = ViewNames.MENTION_ACE;
    } else if (_mode.equals("ERE")) {
        mentionViewName = ViewNames.MENTION_ERE;
    } else if (_mode.equals("ColumnFormat")) {
        mentionViewName = "MENTIONS";
    } else {
        mentionViewName = "";
        System.out.println("No actions for undefined mode");
    }

    for (TextAnnotation ta : taList) {
        View tokens = ta.getView(ViewNames.TOKENS);
        View mentions = ta.getView(mentionViewName);
        View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f);

        // One tag per token; everything starts outside any mention.
        String[] tags = new String[tokens.getConstituents().size()];
        java.util.Arrays.fill(tags, "O");

        for (Constituent mention : mentions.getConstituents()) {
            if (!_type.equals("ALL")) {
                // A "SPE_" prefix on the configured type is stripped before comparing.
                String wantedType = _type.startsWith("SPE_") ? _type.substring(4) : _type;
                if (!mention.getAttribute("EntityMentionType").equals(wantedType)) {
                    continue;
                }
            }
            Constituent head = ACEReader.getEntityHeadForConstituent(mention, ta, "HEAD");
            if (_mode.equals("ERE")) {
                mention.addAttribute("EntityType", mention.getLabel());
            }
            if (head == null) {
                continue;
            }
            // The VEH/WEA filter is disabled: the check is still evaluated but skips nothing.
            if (mention.getAttribute("EntityType").equals("VEH") || mention.getAttribute("EntityType").equals("WEA")) {
            // continue;
            }

            // Every tag of this mention shares "-<EntityType>,<EntityMentionType>".
            String suffix = "-" + mention.getAttribute("EntityType") + "," + mention.getAttribute("EntityMentionType");
            int start = head.getStartSpan();
            int end = head.getEndSpan();
            if (_isBIO) {
                tags[start] = "B" + suffix;
                for (int i = start + 1; i < end; i++) {
                    tags[i] = "I" + suffix;
                }
            } else if (start + 1 == end) {
                tags[start] = "U" + suffix;
            } else {
                tags[start] = "B" + suffix;
                for (int i = start + 1; i < end - 1; i++) {
                    tags[i] = "I" + suffix;
                }
                tags[end - 1] = "L" + suffix;
            }
        }

        for (int i = 0; i < tags.length; i++) {
            Constituent bioToken = tokens.getConstituentsCoveringToken(i).get(0).cloneForNewView("BIO");
            if ("O".equals(tags[i])) {
                bioToken.addAttribute("BIO", tags[i]);
            } else {
                // Split "<tag>,<mention type>" back into its two parts.
                String[] parts = tags[i].split(",");
                String bioTag = parts[0];
                String mentionType = parts[1];
                bioToken.addAttribute("BIO", bioTag);
                bioToken.addAttribute("EntityMentionType", mentionType);
            }
            bioToken.addAttribute("GAZ", ((FlatGazetteers) gazetteers).annotateConstituent(bioToken, _isBIO));
            bioToken.addAttribute("BC", brownClusters.getPrefixesCombined(bioToken.toString()));
            // Tokens containing "http" get placeholder WordNet features instead of a lookup.
            if (bioToken.toString().contains("http")) {
                bioToken.addAttribute("WORDNETTAG", ",");
                bioToken.addAttribute("WORDNETHYM", ",");
            } else {
                bioToken.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(wordNet, bioToken));
                bioToken.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(wordNet, bioToken));
            }
            bioToken.addAttribute("isTraining", _binary_indicator.equals("TRAIN") ? "true" : "false");
            bioView.addConstituent(bioToken);
        }

        ta.addView("BIO", bioView);
        for (Constituent c : bioView) {
            bioTokens.add(c);
        }
    }
    return bioTokens;
}
Also used : ArrayList(java.util.ArrayList) ResourceConfigurator(edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator) WordNetManager(edu.illinois.cs.cogcomp.edison.utilities.WordNetManager) Gazetteers(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers) FlatGazetteers(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers) Datastore(org.cogcomp.Datastore) BrownClusters(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters) File(java.io.File) Vector(java.util.Vector)

Example 2 with FlatGazetteers

use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers in project cogcomp-nlp by CogComp.

the class ExampleUsage method SemEvalAnnotate.

/**
 * Example: classifies the relation between two hand-picked mentions of a fixed sentence with the
 * SemEval relation classifier and prints the predicted tag to standard output.
 *
 * <p>The sentence is tokenized and annotated with POS, chunker and Stanford dependency views; a
 * "RE_ANNOTATED" view is added that copies every token together with its WordNet features. The
 * two mentions cover tokens [0, 1) and [6, 7) and receive a "GAZ" attribute from the gazetteers.
 *
 * <p>Failures while loading resources or adding views are printed via
 * {@code printStackTrace()}. If the gazetteers were never loaded, the method reports that on
 * standard error and returns, rather than failing later with a {@code NullPointerException}.
 */
public static void SemEvalAnnotate() {
    String text = "People have been moving back into downtown.";
    String corpus = "semeval";
    String textId = "001";
    // Create a TextAnnotation From Text
    TextAnnotationBuilder stab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation ta = stab.createTextAnnotation(corpus, textId, text);
    POSAnnotator pos_annotator = new POSAnnotator();
    ChunkerAnnotator chunker = new ChunkerAnnotator(true);
    chunker.initialize(new ChunkerConfigurator().getDefaultConfig());
    // NOTE(review): values stored with put() that are not Strings are not returned by
    // Properties.getProperty(); confirm the Stanford annotators read these settings as intended.
    Properties stanfordProps = new Properties();
    stanfordProps.put("annotators", "pos, parse");
    stanfordProps.put("parse.originalDependencies", true);
    stanfordProps.put("parse.maxlen", Stanford331Configurator.STFRD_MAX_SENTENCE_LENGTH);
    stanfordProps.put("parse.maxtime", Stanford331Configurator.STFRD_TIME_PER_SENTENCE);
    POSTaggerAnnotator posAnnotator = new POSTaggerAnnotator("pos", stanfordProps);
    ParserAnnotator parseAnnotator = new ParserAnnotator("parse", stanfordProps);
    StanfordDepHandler stanfordDepHandler = new StanfordDepHandler(posAnnotator, parseAnnotator);
    String modelPath = "";
    FlatGazetteers gazetteers = null;
    try {
        ta.addView(pos_annotator);
        chunker.addView(ta);
        stanfordDepHandler.addView(ta);
        Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File model = ds.getDirectory("org.cogcomp.re", "SEMEVAL", 1.1, false);
        modelPath = model.getPath();
        File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
        gazetteers = (FlatGazetteers) GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
        WordNetManager.loadConfigAsClasspathResource(true);
        WordNetManager wordnet = WordNetManager.getInstance();
        // Copy every token (with its existing attributes) into a new view and add WordNet features.
        View annotatedTokenView = new SpanLabelView("RE_ANNOTATED", ta);
        for (Constituent co : ta.getView(ViewNames.TOKENS).getConstituents()) {
            Constituent c = co.cloneForNewView("RE_ANNOTATED");
            for (String s : co.getAttributeKeys()) {
                c.addAttribute(s, co.getAttribute(s));
            }
            c.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(wordnet, c));
            c.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(wordnet, c));
            annotatedTokenView.addConstituent(c);
        }
        ta.addView("RE_ANNOTATED", annotatedTokenView);
    } catch (Exception e) {
        e.printStackTrace();
    }
    // If setup failed before the gazetteers were assigned, the calls below would dereference null.
    if (gazetteers == null) {
        System.err.println("SemEvalAnnotate: gazetteers could not be loaded; skipping relation classification.");
        return;
    }
    Constituent source = new Constituent("first", "Mention", ta, 0, 1);
    Constituent target = new Constituent("second", "Mention", ta, 6, 7);
    source.addAttribute("GAZ", gazetteers.annotatePhrase(source));
    target.addAttribute("GAZ", gazetteers.annotatePhrase(target));
    Relation relation = new Relation("TEST", source, target, 1.0f);
    // The classifier's model and lexicon files share the prefix <modelPath>/SEMEVAL/SEMEVAL.
    String prefix = modelPath + File.separator + "SEMEVAL" + File.separator + "SEMEVAL";
    semeval_relation_classifier classifier = new semeval_relation_classifier(prefix + ".lc", prefix + ".lex");
    String tag = classifier.discreteValue(relation);
    System.out.println(tag);
}
Also used : ChunkerConfigurator(edu.illinois.cs.cogcomp.chunker.main.ChunkerConfigurator) TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) ParserAnnotator(edu.stanford.nlp.pipeline.ParserAnnotator) POSAnnotator(edu.illinois.cs.cogcomp.pos.POSAnnotator) ResourceConfigurator(edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator) Properties(java.util.Properties) LbjGen.semeval_relation_classifier(org.cogcomp.re.LbjGen.semeval_relation_classifier) FlatGazetteers(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers) ChunkerAnnotator(edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator) POSTaggerAnnotator(edu.stanford.nlp.pipeline.POSTaggerAnnotator) WordNetManager(edu.illinois.cs.cogcomp.edison.utilities.WordNetManager) Datastore(org.cogcomp.Datastore) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) File(java.io.File) StanfordDepHandler(edu.illinois.cs.cogcomp.pipeline.handlers.StanfordDepHandler)

Example 3 with FlatGazetteers

use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers in project cogcomp-nlp by CogComp.

the class BIOCombinedReader method getTokensFromTAs.

/**
 * Builds a "BIO" view for every {@code TextAnnotation} in {@code currentTas} and returns the
 * constituents of all those views in one list.
 *
 * <p>The mention view is chosen per document: ids starting with "bn" or "nw" use
 * {@code ViewNames.MENTION_ACE}, all others use {@code ViewNames.MENTION_ERE}. The
 * "EntityType" attribute of every selected mention is set to "MENTION" before tagging, and the
 * head span of the mention is tagged with B-/I-/L-/U- tags. Every other token is tagged "O".
 * Each token is then cloned into the "BIO" view and given the attributes "BIO",
 * "EntityMentionType" (tagged tokens only), "GAZ", "BC", "WORDNETTAG", "WORDNETHYM" and
 * "isTraining".
 *
 * @return the constituents of every "BIO" view created, in iteration order
 */
private List<Constituent> getTokensFromTAs() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException {
    List<Constituent> bioTokens = new ArrayList<>();

    // Gazetteers are fetched through the datastore.
    Datastore datastore = new Datastore(new ResourceConfigurator().getDefaultConfig());
    File gazetteerDir = datastore.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
    Gazetteers gazetteers = GazetteersFactory.get(5, gazetteerDir.getPath() + File.separator + "gazetteers", true, Language.English);

    // Three Brown-cluster resources, each registered with the same (5, false) settings.
    String[] brownResources = {
        "brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt",
        "brown-clusters/brownBllipClusters",
        "brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt"
    };
    Vector<String> brownPaths = new Vector<>();
    Vector<Integer> brownThresholds = new Vector<>();
    Vector<Boolean> brownFlags = new Vector<>();
    for (String resource : brownResources) {
        brownPaths.add(resource);
        brownThresholds.add(5);
        brownFlags.add(false);
    }
    BrownClusters brownClusters = BrownClusters.get(brownPaths, brownThresholds, brownFlags);

    WordNetManager.loadConfigAsClasspathResource(true);
    WordNetManager wordNet = WordNetManager.getInstance();

    for (TextAnnotation ta : currentTas) {
        View tokens = ta.getView(ViewNames.TOKENS);
        // The document id prefix decides which mention view is read.
        boolean useAceView = ta.getId().startsWith("bn") || ta.getId().startsWith("nw");
        String mentionViewName = useAceView ? ViewNames.MENTION_ACE : ViewNames.MENTION_ERE;
        View mentions = ta.getView(mentionViewName);
        View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f);

        // One tag per token; everything starts outside any mention.
        String[] tags = new String[tokens.getConstituents().size()];
        java.util.Arrays.fill(tags, "O");

        for (Constituent mention : mentions.getConstituents()) {
            if (!_type.equals("ALL") && !mention.getAttribute("EntityMentionType").equals(_type)) {
                continue;
            }
            Constituent head = ACEReader.getEntityHeadForConstituent(mention, ta, "HEAD");
            if (!mention.hasAttribute("EntityType")) {
                mention.addAttribute("EntityType", mention.getLabel());
            }
            if (head == null) {
                continue;
            }
            // The VEH/WEA filter is disabled: the check is still evaluated but skips nothing.
            if (mention.getAttribute("EntityType").equals("VEH") || mention.getAttribute("EntityType").equals("WEA")) {
            // continue;
            }
            mention.addAttribute("EntityType", "MENTION");

            // Unlike BIOReader, the tagging schema here is always "BIOLU".
            String suffix = "-" + mention.getAttribute("EntityType") + "," + mention.getAttribute("EntityMentionType");
            int start = head.getStartSpan();
            int end = head.getEndSpan();
            if (start + 1 == end) {
                tags[start] = "U" + suffix;
            } else {
                tags[start] = "B" + suffix;
                for (int i = start + 1; i < end - 1; i++) {
                    tags[i] = "I" + suffix;
                }
                tags[end - 1] = "L" + suffix;
            }
        }

        for (int i = 0; i < tags.length; i++) {
            Constituent bioToken = tokens.getConstituentsCoveringToken(i).get(0).cloneForNewView("BIO");
            if ("O".equals(tags[i])) {
                bioToken.addAttribute("BIO", tags[i]);
            } else {
                // Split "<tag>,<mention type>" back into its two parts.
                String[] parts = tags[i].split(",");
                String bioTag = parts[0];
                String mentionType = parts[1];
                bioToken.addAttribute("BIO", bioTag);
                bioToken.addAttribute("EntityMentionType", mentionType);
            }
            bioToken.addAttribute("GAZ", ((FlatGazetteers) gazetteers).annotateConstituent(bioToken, false));
            bioToken.addAttribute("BC", brownClusters.getPrefixesCombined(bioToken.toString()));
            // Tokens containing "http" get placeholder WordNet features instead of a lookup.
            if (bioToken.toString().contains("http")) {
                bioToken.addAttribute("WORDNETTAG", ",");
                bioToken.addAttribute("WORDNETHYM", ",");
            } else {
                bioToken.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(wordNet, bioToken));
                bioToken.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(wordNet, bioToken));
            }
            bioToken.addAttribute("isTraining", _mode.contains("TRAIN") ? "true" : "false");
            bioView.addConstituent(bioToken);
        }

        ta.addView("BIO", bioView);
        for (Constituent c : bioView) {
            bioTokens.add(c);
        }
    }
    return bioTokens;
}
Also used : ResourceConfigurator(edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator) WordNetManager(edu.illinois.cs.cogcomp.edison.utilities.WordNetManager) Gazetteers(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers) FlatGazetteers(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers) Datastore(org.cogcomp.Datastore) BrownClusters(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters)

Aggregations

ResourceConfigurator (edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator)3 WordNetManager (edu.illinois.cs.cogcomp.edison.utilities.WordNetManager)3 FlatGazetteers (edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers)3 Datastore (org.cogcomp.Datastore)3 BrownClusters (edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters)2 Gazetteers (edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers)2 File (java.io.File)2 TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder)1 ChunkerAnnotator (edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator)1 ChunkerConfigurator (edu.illinois.cs.cogcomp.chunker.main.ChunkerConfigurator)1 StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)1 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)1 StanfordDepHandler (edu.illinois.cs.cogcomp.pipeline.handlers.StanfordDepHandler)1 POSAnnotator (edu.illinois.cs.cogcomp.pos.POSAnnotator)1 POSTaggerAnnotator (edu.stanford.nlp.pipeline.POSTaggerAnnotator)1 ParserAnnotator (edu.stanford.nlp.pipeline.ParserAnnotator)1 ArrayList (java.util.ArrayList)1 Properties (java.util.Properties)1 Vector (java.util.Vector)1 LbjGen.semeval_relation_classifier (org.cogcomp.re.LbjGen.semeval_relation_classifier)1