Search in sources :

Example 1 with Gazetteers

use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers in project cogcomp-nlp by CogComp.

the class BIOReader method getTokensFromTAs.

/**
 * Builds the token-level "BIO" view for every {@code TextAnnotation} in {@code taList} and
 * returns the constituents of all those views.
 *
 * <p>For each annotation every token starts with the tag "O". The head of each selected
 * mention (as returned by {@code ACEReader.getEntityHeadForConstituent}) then overwrites the
 * tags of the tokens it spans: B/I tags when {@code _isBIO} is set, B/I/L/U tags otherwise.
 * Every token is cloned into the new view and receives the attributes "BIO",
 * "EntityMentionType" (tagged tokens only), "GAZ", "BC", "WORDNETTAG", "WORDNETHYM" and
 * "isTraining". The finished view is added to the annotation under the name "BIO".
 *
 * <p>Mention selection is driven by {@code _type}: "ALL" keeps every mention; any other value
 * keeps only mentions whose "EntityMentionType" equals it, after a leading "SPE_" has been
 * stripped from {@code _type}.
 *
 * <p>Checked exceptions raised by the calls in this method are propagated unchanged.
 *
 * @return the constituents of every generated "BIO" view, in the order the views yield them
 */
private List<Constituent> getTokensFromTAs() throws InvalidPortException, InvalidEndpointException, DatastoreException, IOException, JWNLException {
    List<Constituent> ret = new ArrayList<>();

    // Feature resources: loaded once per call and shared by every annotation below.
    Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
    File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
    Gazetteers gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
    // BrownClusters.get receives three equally long vectors: the cluster file paths plus one
    // integer and one boolean per path.
    // NOTE(review): the meaning of the 5 / false values is not visible here - confirm against BrownClusters.
    Vector<String> bcs = new Vector<>();
    bcs.add("brown-clusters" + File.separator + "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
    bcs.add("brown-clusters" + File.separator + "brownBllipClusters");
    bcs.add("brown-clusters" + File.separator + "brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt");
    Vector<Integer> bcst = new Vector<>();
    Vector<Boolean> bcsl = new Vector<>();
    for (int k = 0; k < bcs.size(); k++) {
        bcst.add(5);
        bcsl.add(false);
    }
    BrownClusters brownClusters = BrownClusters.get(bcs, bcst, bcsl);
    WordNetManager.loadConfigAsClasspathResource(true);
    WordNetManager wordNet = WordNetManager.getInstance();

    // The corpus mode decides which view holds the gold mentions.
    String mentionViewName = "";
    if (_mode.equals("ACE05")) {
        mentionViewName = ViewNames.MENTION_ACE;
    } else if (_mode.equals("ERE")) {
        mentionViewName = ViewNames.MENTION_ERE;
    } else if (_mode.equals("ColumnFormat")) {
        mentionViewName = "MENTIONS";
    } else {
        // No view name is chosen; the view lookup below is still attempted with "".
        System.out.println("No actions for undefined mode");
    }

    for (TextAnnotation ta : taList) {
        View tokenView = ta.getView(ViewNames.TOKENS);
        View mentionView = ta.getView(mentionViewName);
        View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f);

        // One slot per token; "O" unless a mention head claims the token below.
        String[] token2tags = new String[tokenView.getConstituents().size()];
        for (int i = 0; i < token2tags.length; i++) {
            token2tags[i] = "O";
        }

        for (Constituent c : mentionView.getConstituents()) {
            if (!_type.equals("ALL")) {
                // "SPE_<X>" selects the same mention type as plain "<X>".
                String wantedType = _type.startsWith("SPE_") ? _type.substring(4) : _type;
                // Constant-first comparison: a mention without the attribute is simply skipped.
                if (!wantedType.equals(c.getAttribute("EntityMentionType"))) {
                    continue;
                }
            }
            Constituent cHead = ACEReader.getEntityHeadForConstituent(c, ta, "HEAD");
            if (_mode.equals("ERE")) {
                // In ERE mode the constituent label is copied into the "EntityType" attribute.
                c.addAttribute("EntityType", c.getLabel());
            }
            if (cHead == null) {
                continue;
            }
            // Tag payload "<EntityType>,<EntityMentionType>"; split apart again in the token loop.
            String label = c.getAttribute("EntityType") + "," + c.getAttribute("EntityMentionType");
            writeHeadTags(token2tags, cHead.getStartSpan(), cHead.getEndSpan(), label, _isBIO);
        }

        for (int i = 0; i < token2tags.length; i++) {
            Constituent curToken = tokenView.getConstituentsCoveringToken(i).get(0);
            Constituent newToken = curToken.cloneForNewView("BIO");
            if (token2tags[i].equals("O")) {
                newToken.addAttribute("BIO", token2tags[i]);
            } else {
                // Limit 2 keeps a trailing empty mention type instead of dropping it, which
                // would otherwise make group[1] fail with ArrayIndexOutOfBoundsException.
                String[] group = token2tags[i].split(",", 2);
                newToken.addAttribute("BIO", group[0]);
                newToken.addAttribute("EntityMentionType", group[1]);
            }
            newToken.addAttribute("GAZ", ((FlatGazetteers) gazetteers).annotateConstituent(newToken, _isBIO));
            newToken.addAttribute("BC", brownClusters.getPrefixesCombined(newToken.toString()));
            if (!newToken.toString().contains("http")) {
                newToken.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(wordNet, newToken));
                newToken.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(wordNet, newToken));
            } else {
                // Tokens containing "http" get placeholder WordNet attributes instead of a lookup.
                newToken.addAttribute("WORDNETTAG", ",");
                newToken.addAttribute("WORDNETHYM", ",");
            }
            newToken.addAttribute("isTraining", _binary_indicator.equals("TRAIN") ? "true" : "false");
            bioView.addConstituent(newToken);
        }
        ta.addView("BIO", bioView);
        for (Constituent c : bioView) {
            ret.add(c);
        }
    }
    return ret;
}

/**
 * Writes the tags of one mention head covering tokens {@code [start, end)} into
 * {@code token2tags}, overwriting whatever those slots held before.
 *
 * @param token2tags per-token tag array to update
 * @param start index of the first head token
 * @param end index one past the last head token
 * @param label text appended after the "B-", "I-", "L-" or "U-" prefix
 * @param bioScheme {@code true} for B/I tags only; {@code false} for B/I/L/U tags
 */
private void writeHeadTags(String[] token2tags, int start, int end, String label, boolean bioScheme) {
    if (bioScheme) {
        token2tags[start] = "B-" + label;
        for (int i = start + 1; i < end; i++) {
            token2tags[i] = "I-" + label;
        }
        return;
    }
    if (start + 1 == end) {
        // Single-token head.
        token2tags[start] = "U-" + label;
        return;
    }
    token2tags[start] = "B-" + label;
    for (int i = start + 1; i < end - 1; i++) {
        token2tags[i] = "I-" + label;
    }
    token2tags[end - 1] = "L-" + label;
}
Also used : ArrayList(java.util.ArrayList) ResourceConfigurator(edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator) WordNetManager(edu.illinois.cs.cogcomp.edison.utilities.WordNetManager) Gazetteers(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers) FlatGazetteers(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers) Datastore(org.cogcomp.Datastore) BrownClusters(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters) File(java.io.File) Vector(java.util.Vector)

Example 2 with Gazetteers

use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers in project cogcomp-nlp by CogComp.

the class ExtentReader method getPairs.

/**
 * Produces token/head training pairs for every mention of every {@code TextAnnotation} in
 * {@code taList}.
 *
 * <p>Mentions are read from the ACE mention view when the annotation id starts with "bn" or
 * "nw", and from the ERE mention view otherwise. For each mention that has a head:
 * <ul>
 *   <li>every token inside the mention span but outside the head span yields a relation
 *       labelled "true" from that token to the head;</li>
 *   <li>the token directly before the mention and the token at the mention's end index, when
 *       those positions exist, each yield a relation labelled "false".</li>
 * </ul>
 *
 * <p>Resource loading is best effort: any exception is printed and the method carries on with
 * whichever of the WordNet, gazetteer and Brown-cluster handles are still {@code null}.
 *
 * @return the generated relations, in generation order
 */
public List<Relation> getPairs() {
    List<Relation> pairs = new ArrayList<>();
    WordNetManager wordNet = null;
    Gazetteers gazetteers = null;
    BrownClusters brownClusters = null;
    try {
        WordNetManager.loadConfigAsClasspathResource(true);
        wordNet = WordNetManager.getInstance();
        Datastore datastore = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File gazetteersRoot = datastore.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
        String gazetteersPath = gazetteersRoot.getPath() + File.separator + "gazetteers";
        gazetteers = GazetteersFactory.get(5, gazetteersPath, true, Language.English);
        String[] clusterNames = {
            "brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt",
            "brown-clusters/brownBllipClusters",
            "brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt"
        };
        // Three equally long vectors: a path, an integer and a flag for each cluster file.
        Vector<String> clusterFiles = new Vector<>();
        Vector<Integer> clusterNumbers = new Vector<>();
        Vector<Boolean> clusterFlags = new Vector<>();
        for (String clusterName : clusterNames) {
            clusterFiles.add(clusterName);
            clusterNumbers.add(5);
            clusterFlags.add(false);
        }
        brownClusters = BrownClusters.get(clusterFiles, clusterNumbers, clusterFlags);
    } catch (Exception e) {
        e.printStackTrace();
    }
    for (TextAnnotation ta : taList) {
        boolean useAceView = ta.getId().startsWith("bn") || ta.getId().startsWith("nw");
        View mentionView = ta.getView(useAceView ? ViewNames.MENTION_ACE : ViewNames.MENTION_ERE);
        View tokenView = ta.getView(ViewNames.TOKENS);
        for (Constituent mention : mentionView) {
            Constituent head = ACEReader.getEntityHeadForConstituent(mention, ta, "HEADS");
            if (head == null) {
                continue;
            }
            if (!head.hasAttribute("EntityType")) {
                head.addAttribute("EntityType", head.getLabel());
            }
            ExtentTester.addHeadAttributes(head, gazetteers, brownClusters, wordNet);

            // Positive examples: mention tokens that are not part of the head.
            for (int tokenIdx = mention.getStartSpan(); tokenIdx < mention.getEndSpan(); tokenIdx++) {
                boolean outsideHead = tokenIdx < head.getStartSpan() || tokenIdx >= head.getEndSpan();
                if (outsideHead) {
                    Constituent inner = tokenView.getConstituentsCoveringToken(tokenIdx).get(0);
                    ExtentTester.addExtentAttributes(inner, gazetteers, brownClusters, wordNet);
                    pairs.add(new Relation("true", inner, head, 1.0f));
                }
            }

            // Negative example: the token just left of the mention, if there is one.
            if (mention.getStartSpan() > 0) {
                Constituent before = tokenView.getConstituentsCoveringToken(mention.getStartSpan() - 1).get(0);
                ExtentTester.addExtentAttributes(before, gazetteers, brownClusters, wordNet);
                pairs.add(new Relation("false", before, head, 1.0f));
            }

            // Negative example: the token just right of the mention, if there is one.
            if (mention.getEndSpan() < tokenView.getEndSpan()) {
                Constituent after = tokenView.getConstituentsCoveringToken(mention.getEndSpan()).get(0);
                ExtentTester.addExtentAttributes(after, gazetteers, brownClusters, wordNet);
                pairs.add(new Relation("false", after, head, 1.0f));
            }
        }
    }
    return pairs;
}
Also used : ArrayList(java.util.ArrayList) ResourceConfigurator(edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator) DatastoreException(org.cogcomp.DatastoreException) JWNLException(net.didion.jwnl.JWNLException) IOException(java.io.IOException) InvalidEndpointException(io.minio.errors.InvalidEndpointException) InvalidPortException(io.minio.errors.InvalidPortException) WordNetManager(edu.illinois.cs.cogcomp.edison.utilities.WordNetManager) Gazetteers(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers) Datastore(org.cogcomp.Datastore) BrownClusters(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters) File(java.io.File) Vector(java.util.Vector)

Example 3 with Gazetteers

use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers in project cogcomp-nlp by CogComp.

the class ExtentTester method testExtentOnPredictedHead.

/**
 * Runs a five-partition evaluation of mention extents built on top of predicted heads and
 * prints the accumulated counts to standard output.
 *
 * <p>For each partition index 0..4 the method trains three head classifiers (NAM, NOM, PRO)
 * and one extent classifier on {@code data/partition_with_dev/train/<i>}, predicts heads on
 * {@code data/partition_with_dev/eval/<i>}, expands every predicted head with
 * {@code getFullMention}, and compares the result against the ACE mention view read from the
 * same eval directory. A predicted mention counts as "head correct" when its head span
 * equals a gold head span in an annotation with identical text, and additionally as
 * "extent correct" when the full mention spans are equal too.
 *
 * <p>Loading WordNet, gazetteers and Brown clusters is best effort: an exception is printed
 * and the run continues with whichever handles are still {@code null}.
 *
 * @throws IllegalStateException if the gold-standard {@code ACEReader} for a partition cannot
 *         be created (the original exception is attached as the cause)
 */
public static void testExtentOnPredictedHead() throws InvalidPortException, InvalidEndpointException, DatastoreException, IOException, JWNLException {
    WordNetManager wordNet = null;
    Gazetteers gazetteers = null;
    BrownClusters brownClusters = null;
    try {
        WordNetManager.loadConfigAsClasspathResource(true);
        wordNet = WordNetManager.getInstance();
        Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
        gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
        Vector<String> bcs = new Vector<>();
        bcs.add("brown-clusters" + File.separator + "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
        bcs.add("brown-clusters" + File.separator + "brownBllipClusters");
        bcs.add("brown-clusters" + File.separator + "brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt");
        Vector<Integer> bcst = new Vector<>();
        Vector<Boolean> bcsl = new Vector<>();
        for (int k = 0; k < bcs.size(); k++) {
            bcst.add(5);
            bcsl.add(false);
        }
        brownClusters = BrownClusters.get(bcs, bcst, bcsl);
    } catch (Exception e) {
        // Kept as best effort: the failure is reported and the evaluation is still attempted.
        e.printStackTrace();
    }

    int totalPredicted = 0;
    int totalLabeled = 0;
    int totalHeadCorrect = 0;
    int totalExtentCorrect = 0;

    for (int i = 0; i < 5; i++) {
        String trainDir = "data/partition_with_dev/train/" + i;
        String evalDir = "data/partition_with_dev/eval/" + i;

        // One head classifier per mention type, consulted jointly at prediction time.
        BIOReader h_train_parser_nam = new BIOReader(trainDir, "ACE05-TRAIN", "NAM", false);
        BIOReader h_train_parser_nom = new BIOReader(trainDir, "ACE05-TRAIN", "NOM", false);
        BIOReader h_train_parser_pro = new BIOReader(trainDir, "ACE05-TRAIN", "PRO", false);
        bio_classifier_nam h_classifier_nam = BIOTester.train_nam_classifier(h_train_parser_nam);
        bio_classifier_nom h_classifier_nom = BIOTester.train_nom_classifier(h_train_parser_nom);
        bio_classifier_pro h_classifier_pro = BIOTester.train_pro_classifier(h_train_parser_pro);
        Learner[] h_candidates = new Learner[] {h_classifier_nam, h_classifier_nom, h_classifier_pro};

        ExtentReader e_train_parser = new ExtentReader(trainDir);
        extent_classifier e_classifier = train_extent_classifier(e_train_parser);

        BIOReader test_parser = new BIOReader(evalDir, "ACE05-EVAL", "ALL", false);
        test_parser.reset();

        // The two previously predicted tags are fed to the next token as attributes.
        String preBIOLevel1 = "";
        String preBIOLevel2 = "";
        List<Constituent> predictedHeads = new ArrayList<>();
        List<Constituent> predictedMentions = new ArrayList<>();
        for (Object example = test_parser.next(); example != null; example = test_parser.next()) {
            Constituent token = (Constituent) example;
            token.addAttribute("preBIOLevel1", preBIOLevel1);
            token.addAttribute("preBIOLevel2", preBIOLevel2);
            Pair<String, Integer> h_prediction = BIOTester.joint_inference(token, h_candidates);
            String bioTag = h_prediction.getFirst();
            // A "B" or "U" tag opens a new head; the winning classifier rebuilds its span.
            if (bioTag.startsWith("B") || bioTag.startsWith("U")) {
                Constituent predictMention = BIOTester.getConstituent(token, h_candidates[h_prediction.getSecond()], false);
                predictedHeads.add(predictMention);
            }
            preBIOLevel2 = preBIOLevel1;
            preBIOLevel1 = bioTag;
        }
        for (Constituent head : predictedHeads) {
            predictedMentions.add(getFullMention(e_classifier, head, gazetteers, brownClusters, wordNet));
        }

        // Without the gold reader nothing can be scored, so fail with the real cause instead
        // of printing it and then hitting a NullPointerException in the loop below.
        ACEReader aceReader;
        try {
            aceReader = new ACEReader(evalDir, false);
        } catch (Exception e) {
            throw new IllegalStateException("Cannot read gold ACE annotations from " + evalDir, e);
        }
        List<Constituent> goldMentions = new ArrayList<>();
        for (TextAnnotation ta : aceReader) {
            goldMentions.addAll(ta.getView(ViewNames.MENTION_ACE).getConstituents());
        }

        totalLabeled += goldMentions.size();
        totalPredicted += predictedMentions.size();
        for (Constituent p : predictedMentions) {
            Constituent ph = getPredictedMentionHead(p);
            for (Constituent g : goldMentions) {
                // Only mentions from an annotation with the same text can match.
                if (!p.getTextAnnotation().getText().equals(g.getTextAnnotation().getText())) {
                    continue;
                }
                Constituent gh = ACEReader.getEntityHeadForConstituent(g, g.getTextAnnotation(), "TESTG");
                if (ph == null || gh == null) {
                    // No head on one side: nothing to compare for this pair.
                    continue;
                }
                if (ph.getStartSpan() == gh.getStartSpan() && ph.getEndSpan() == gh.getEndSpan()) {
                    totalHeadCorrect++;
                    if (g.getStartSpan() == p.getStartSpan() && g.getEndSpan() == p.getEndSpan()) {
                        totalExtentCorrect++;
                    }
                    // Each predicted mention is credited against at most one gold mention.
                    break;
                }
            }
        }
    }
    System.out.println("Total labeled mention: " + totalLabeled);
    System.out.println("Total predicted mention: " + totalPredicted);
    System.out.println("Total head correct: " + totalHeadCorrect);
    System.out.println("Total extent correct: " + totalExtentCorrect);
}
Also used : ACEReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ACEReader) ArrayList(java.util.ArrayList) WordNetManager(edu.illinois.cs.cogcomp.edison.utilities.WordNetManager) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Vector(java.util.Vector) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) ResourceConfigurator(edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator) DatastoreException(org.cogcomp.DatastoreException) JWNLException(net.didion.jwnl.JWNLException) InvalidEndpointException(io.minio.errors.InvalidEndpointException) IOException(java.io.IOException) InvalidPortException(io.minio.errors.InvalidPortException) Learner(edu.illinois.cs.cogcomp.lbjava.learn.Learner) Gazetteers(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers) FlatGazetteers(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers) Datastore(org.cogcomp.Datastore) BrownClusters(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters) File(java.io.File)

Example 4 with Gazetteers

use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers in project cogcomp-nlp by CogComp.

the class ExtentTester method testExtentOnGoldHead.

/**
 * Evaluates the extent classifier on gold mention heads and prints the result to standard
 * output.
 *
 * <p>The loop currently covers partition index 0 only. An extent classifier is trained on
 * {@code data/partition_with_dev/train/<i>}; every {@code TextAnnotation} delivered by the
 * evaluation {@code BIOCombinedReader} is given a POS view, and for each mention with a head
 * the span predicted by {@code getFullMention} is compared with the gold mention span.
 * Mismatches are printed as "Gold:" / "Predicted:" pairs, followed by the totals and the
 * percentage of exact matches (NaN when no mention had a head).
 *
 * <p>Loading WordNet, the POS annotator, gazetteers and Brown clusters is best effort: an
 * exception is printed and the run continues with whichever handles are still {@code null}.
 *
 * @throws IllegalStateException if the evaluation {@code BIOCombinedReader} cannot be created
 *         (the original exception is attached as the cause)
 */
public static void testExtentOnGoldHead() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException {
    int labeled = 0;
    int correct = 0;
    POSAnnotator posAnnotator = null;
    WordNetManager wordNet = null;
    Gazetteers gazetteers = null;
    BrownClusters brownClusters = null;
    try {
        WordNetManager.loadConfigAsClasspathResource(true);
        wordNet = WordNetManager.getInstance();
        posAnnotator = new POSAnnotator();
        Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
        gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
        Vector<String> bcs = new Vector<>();
        bcs.add("brown-clusters" + File.separator + "brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
        bcs.add("brown-clusters" + File.separator + "brownBllipClusters");
        bcs.add("brown-clusters" + File.separator + "brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt");
        Vector<Integer> bcst = new Vector<>();
        Vector<Boolean> bcsl = new Vector<>();
        for (int k = 0; k < bcs.size(); k++) {
            bcst.add(5);
            bcsl.add(false);
        }
        brownClusters = BrownClusters.get(bcs, bcst, bcsl);
    } catch (Exception e) {
        // Kept as best effort: the failure is reported and the evaluation is still attempted.
        e.printStackTrace();
    }
    for (int i = 0; i < 1; i++) {
        ExtentReader train_parser = new ExtentReader("data/partition_with_dev/train/" + i, "COMBINED-ALL-TRAIN-" + i);
        extent_classifier classifier = train_extent_classifier(train_parser);

        // Without the reader there is nothing to evaluate, so fail with the real cause
        // instead of printing it and then hitting a NullPointerException in the loop below.
        BIOCombinedReader bioCombinedReader;
        try {
            bioCombinedReader = new BIOCombinedReader(i, "ALL-EVAL", "ALL", true);
        } catch (Exception e) {
            throw new IllegalStateException("Cannot create the evaluation BIOCombinedReader for partition " + i, e);
        }

        for (Object ota = bioCombinedReader.next(); ota != null; ota = bioCombinedReader.next()) {
            TextAnnotation ta = (TextAnnotation) ota;
            try {
                ta.addView(posAnnotator);
            } catch (Exception e) {
                // A failed POS annotation is reported; the annotation is still evaluated.
                e.printStackTrace();
            }
            // Ids starting with "bn" or "nw" use the ACE mention view, all others the ERE view.
            String mentionViewName = ViewNames.MENTION_ERE;
            if (ta.getId().startsWith("bn") || ta.getId().startsWith("nw")) {
                mentionViewName = ViewNames.MENTION_ACE;
            }
            View mentionView = ta.getView(mentionViewName);
            for (Constituent mention : mentionView.getConstituents()) {
                Constituent head = ACEReader.getEntityHeadForConstituent(mention, ta, "HEADS");
                if (head == null) {
                    // Mentions without a head are not counted at all.
                    continue;
                }
                labeled++;
                Constituent predictedFullMention = getFullMention(classifier, head, gazetteers, brownClusters, wordNet);
                if (predictedFullMention.getStartSpan() == mention.getStartSpan() && predictedFullMention.getEndSpan() == mention.getEndSpan()) {
                    correct++;
                } else {
                    System.out.println("Gold: " + mention.toString());
                    System.out.println("Predicted: " + predictedFullMention.toString());
                }
            }
        }
    }
    System.out.println("Labeled: " + labeled);
    System.out.println("Correct: " + correct);
    System.out.println("Correctness: " + (double) correct * 100.0 / (double) labeled);
}
Also used : POSAnnotator(edu.illinois.cs.cogcomp.pos.POSAnnotator) ResourceConfigurator(edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator) View(edu.illinois.cs.cogcomp.core.datastructures.textannotation.View) DatastoreException(org.cogcomp.DatastoreException) JWNLException(net.didion.jwnl.JWNLException) InvalidEndpointException(io.minio.errors.InvalidEndpointException) IOException(java.io.IOException) InvalidPortException(io.minio.errors.InvalidPortException) WordNetManager(edu.illinois.cs.cogcomp.edison.utilities.WordNetManager) Gazetteers(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers) FlatGazetteers(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers) Datastore(org.cogcomp.Datastore) BrownClusters(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) File(java.io.File) Vector(java.util.Vector) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 5 with Gazetteers

use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers in project cogcomp-nlp by CogComp.

the class BIOCombinedReader method getTokensFromTAs.

/**
 * Builds the token-level "BIO" view for every {@code TextAnnotation} in {@code currentTas}
 * and returns the constituents of all those views.
 *
 * <p>Mentions come from the ACE mention view when the annotation id starts with "bn" or "nw"
 * and from the ERE mention view otherwise. Unless {@code _type} is "ALL", only mentions whose
 * "EntityMentionType" equals {@code _type} are used. Every token starts with the tag "O";
 * the head of each selected mention then overwrites the tags of the tokens it spans using the
 * B/I/L/U scheme.
 *
 * <p>Side effect on the input: every selected mention that has a head gets its "EntityType"
 * attribute set to "MENTION", so all generated tags read "B-MENTION", "I-MENTION",
 * "L-MENTION" or "U-MENTION". A mention that lacks "EntityType" first receives its label as
 * that attribute, which is what remains when the mention has no head.
 *
 * <p>Each token is cloned into the new view and receives the attributes "BIO",
 * "EntityMentionType" (tagged tokens only), "GAZ", "BC", "WORDNETTAG", "WORDNETHYM" and
 * "isTraining" ("true" when {@code _mode} contains "TRAIN"). The finished view is added to
 * the annotation under the name "BIO".
 *
 * <p>Checked exceptions raised by the calls in this method are propagated unchanged.
 *
 * @return the constituents of every generated "BIO" view, in the order the views yield them
 */
private List<Constituent> getTokensFromTAs() throws InvalidPortException, InvalidEndpointException, IOException, JWNLException, DatastoreException {
    List<Constituent> ret = new ArrayList<>();

    // Feature resources: loaded once per call and shared by every annotation below.
    Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
    File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
    Gazetteers gazetteers = GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
    // BrownClusters.get receives three equally long vectors: the cluster file paths plus one
    // integer and one boolean per path.
    // NOTE(review): the meaning of the 5 / false values is not visible here - confirm against BrownClusters.
    Vector<String> bcs = new Vector<>();
    bcs.add("brown-clusters/brown-english-wikitext.case-intact.txt-c1000-freq10-v3.txt");
    bcs.add("brown-clusters/brownBllipClusters");
    bcs.add("brown-clusters/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt");
    Vector<Integer> bcst = new Vector<>();
    Vector<Boolean> bcsl = new Vector<>();
    for (int k = 0; k < bcs.size(); k++) {
        bcst.add(5);
        bcsl.add(false);
    }
    BrownClusters brownClusters = BrownClusters.get(bcs, bcst, bcsl);
    WordNetManager.loadConfigAsClasspathResource(true);
    WordNetManager wordNet = WordNetManager.getInstance();

    for (TextAnnotation ta : currentTas) {
        View tokenView = ta.getView(ViewNames.TOKENS);
        boolean useAceView = ta.getId().startsWith("bn") || ta.getId().startsWith("nw");
        String mentionViewName = useAceView ? ViewNames.MENTION_ACE : ViewNames.MENTION_ERE;
        View mentionView = ta.getView(mentionViewName);
        View bioView = new SpanLabelView("BIO", BIOReader.class.getCanonicalName(), ta, 1.0f);

        // One slot per token; "O" unless a mention head claims the token below.
        String[] token2tags = new String[tokenView.getConstituents().size()];
        for (int i = 0; i < token2tags.length; i++) {
            token2tags[i] = "O";
        }

        for (Constituent c : mentionView.getConstituents()) {
            // Constant-first comparison: a mention without the attribute is simply skipped.
            if (!_type.equals("ALL") && !_type.equals(c.getAttribute("EntityMentionType"))) {
                continue;
            }
            Constituent cHead = ACEReader.getEntityHeadForConstituent(c, ta, "HEAD");
            if (!c.hasAttribute("EntityType")) {
                c.addAttribute("EntityType", c.getLabel());
            }
            if (cHead == null) {
                continue;
            }
            // All entity types are collapsed into the single class "MENTION".
            c.addAttribute("EntityType", "MENTION");
            // Tag payload "<EntityType>,<EntityMentionType>"; split apart again in the token loop.
            String label = c.getAttribute("EntityType") + "," + c.getAttribute("EntityMentionType");
            int start = cHead.getStartSpan();
            int end = cHead.getEndSpan();
            /**
             * @Note that unlike BIOReader, the tagging schema is set to "BIOLU" here
             */
            if (start + 1 == end) {
                // Single-token head.
                token2tags[start] = "U-" + label;
            } else {
                token2tags[start] = "B-" + label;
                for (int i = start + 1; i < end - 1; i++) {
                    token2tags[i] = "I-" + label;
                }
                token2tags[end - 1] = "L-" + label;
            }
        }

        for (int i = 0; i < token2tags.length; i++) {
            Constituent curToken = tokenView.getConstituentsCoveringToken(i).get(0);
            Constituent newToken = curToken.cloneForNewView("BIO");
            if (token2tags[i].equals("O")) {
                newToken.addAttribute("BIO", token2tags[i]);
            } else {
                // Limit 2 keeps a trailing empty mention type instead of dropping it, which
                // would otherwise make group[1] fail with ArrayIndexOutOfBoundsException.
                String[] group = token2tags[i].split(",", 2);
                newToken.addAttribute("BIO", group[0]);
                newToken.addAttribute("EntityMentionType", group[1]);
            }
            newToken.addAttribute("GAZ", ((FlatGazetteers) gazetteers).annotateConstituent(newToken, false));
            newToken.addAttribute("BC", brownClusters.getPrefixesCombined(newToken.toString()));
            if (!newToken.toString().contains("http")) {
                newToken.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(wordNet, newToken));
                newToken.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(wordNet, newToken));
            } else {
                // Tokens containing "http" get placeholder WordNet attributes instead of a lookup.
                newToken.addAttribute("WORDNETTAG", ",");
                newToken.addAttribute("WORDNETHYM", ",");
            }
            newToken.addAttribute("isTraining", _mode.contains("TRAIN") ? "true" : "false");
            bioView.addConstituent(newToken);
        }
        ta.addView("BIO", bioView);
        for (Constituent c : bioView) {
            ret.add(c);
        }
    }
    return ret;
}
Also used : ResourceConfigurator(edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator) WordNetManager(edu.illinois.cs.cogcomp.edison.utilities.WordNetManager) Gazetteers(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers) FlatGazetteers(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers) Datastore(org.cogcomp.Datastore) BrownClusters(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters)

Aggregations

ResourceConfigurator (edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator)5 WordNetManager (edu.illinois.cs.cogcomp.edison.utilities.WordNetManager)5 BrownClusters (edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.BrownClusters)5 Gazetteers (edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers)5 Datastore (org.cogcomp.Datastore)5 FlatGazetteers (edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.FlatGazetteers)4 File (java.io.File)4 Vector (java.util.Vector)4 InvalidEndpointException (io.minio.errors.InvalidEndpointException)3 InvalidPortException (io.minio.errors.InvalidPortException)3 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 JWNLException (net.didion.jwnl.JWNLException)3 DatastoreException (org.cogcomp.DatastoreException)3 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)2 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)2 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)1 Learner (edu.illinois.cs.cogcomp.lbjava.learn.Learner)1 ACEReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ACEReader)1 POSAnnotator (edu.illinois.cs.cogcomp.pos.POSAnnotator)1