Search in sources :

Example 1 with OccurrenceCounter

use of edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.OccurrenceCounter in project cogcomp-nlp by CogComp.

the class BuildEvaluationFiles method appendToEvaluationFile.

/**
 * Aligns the tokens of a gold bracket-annotated file with the tokens of a tagged file and
 * appends one line per gold token to the phrase-level and token-level evaluation outputs.
 * <p>
 * Both files are read with {@code BracketFileReader.parseBracketsAnnotatedText}. Every word is
 * passed through {@code cleanPunctuation}; words that end up empty are dropped together with
 * their tags. The two token streams are then walked character by character (lower-cased), so a
 * gold token may span several tagged tokens and vice versa. The predicted label of a gold token
 * is the tag of the tagged token aligned with the gold token's first character.
 * <p>
 * {@code outPhrase} receives {@code "<word> <goldTag> <predictedTag>"}. {@code outToken} receives
 * the same line with everything up to and including the first {@code '-'} removed from both
 * tags. No blank separator lines are written between sentences.
 * <p>
 * If the two streams cannot be aligned (a character differs, or the tagged file runs out of
 * words before the gold file does), a warning is logged, both outputs are closed and the JVM is
 * terminated with a non-zero exit status.
 *
 * @param goldFile path of the gold bracket-annotated file
 * @param taggedFile path of the tagged (system output) bracket-annotated file
 * @param outPhrase output receiving the lines with full tags
 * @param outToken output receiving the lines with prefix-stripped tags
 */
public static void appendToEvaluationFile(String goldFile, String taggedFile, OutFile outPhrase, OutFile outToken) {
    ParametersForLbjCode cp = new ParametersForLbjCode();
    Vector<String> rawGoldTags = new Vector<>();
    Vector<String> rawGoldWords = new Vector<>();
    BracketFileReader.parseBracketsAnnotatedText(goldFile, rawGoldTags, rawGoldWords, cp);
    Vector<String> goldTags = new Vector<>();
    Vector<String> goldWords = new Vector<>();
    keepNonEmptyCleanedWords(rawGoldWords, rawGoldTags, goldWords, goldTags);
    Vector<String> rawResTags = new Vector<>();
    Vector<String> rawResWords = new Vector<>();
    BracketFileReader.parseBracketsAnnotatedText(taggedFile, rawResTags, rawResWords, cp);
    Vector<String> resTags = new Vector<>();
    Vector<String> resWords = new Vector<>();
    keepNonEmptyCleanedWords(rawResWords, rawResTags, resWords, resTags);
    int gWordId = 0, gCharId = 0;
    int tWordId = 0, tCharId = 0;
    while (gWordId < goldWords.size()) {
        if (tWordId >= resWords.size()) {
            // tWordId only advances inside the inner loop (where it is re-checked), so this
            // is reached when the tagged file produced no usable words at all.
            logger.warn("the tagged file has no words to align with the gold file when building evaluation files  exiting");
            abortEvaluation(outPhrase, outToken);
            return;
        }
        String gw = goldWords.elementAt(gWordId).toLowerCase();
        String rw = resWords.elementAt(tWordId).toLowerCase();
        OccurrenceCounter resTagsForCurrentToken = new OccurrenceCounter();
        while (gCharId < gw.length()) {
            if (tCharId >= rw.length()) {
                // current tagged word is used up: continue with the next one
                tWordId++;
                tCharId = 0;
                if (tWordId >= resWords.size()) {
                    logger.warn("the tagged file ran out of words when building evaluation files");
                    logger.warn("the unmatched gold word was '" + gw + "'  exiting");
                    abortEvaluation(outPhrase, outToken);
                    return;
                }
                rw = resWords.elementAt(tWordId).toLowerCase();
            }
            if (gw.charAt(gCharId) != rw.charAt(tCharId)) {
                logger.warn("mismatched characters when building evaluation files");
                logger.warn("the words were '" + gw + "' and: '" + rw + "'  exiting");
                logger.warn("the characters were '" + gw.charAt(gCharId) + "' and: '" + rw.charAt(tCharId) + "'  exiting");
                abortEvaluation(outPhrase, outToken);
                return;
            }
            if (gCharId == 0) {
                // only the tagged token under the gold token's first character votes
                resTagsForCurrentToken.addToken(resTags.elementAt(tWordId));
            }
            gCharId++;
            tCharId++;
        }
        // pick the most frequent recorded tag; on ties the one iterated last wins
        String maxLabel = "";
        int maxCount = 0;
        for (Iterator<String> iter = resTagsForCurrentToken.getTokensIterator(); iter.hasNext(); ) {
            String s = iter.next();
            if (maxCount <= resTagsForCurrentToken.getCount(s)) {
                maxCount = (int) resTagsForCurrentToken.getCount(s);
                maxLabel = s;
            }
        }
        outPhrase.println(goldWords.elementAt(gWordId) + " " + goldTags.elementAt(gWordId) + " " + maxLabel);
        // token-level output: strip everything up to and including the first '-' from both tags
        String g = goldTags.elementAt(gWordId);
        if (g.indexOf('-') > -1)
            g = g.substring(g.indexOf('-') + 1);
        if (maxLabel.indexOf('-') > -1)
            maxLabel = maxLabel.substring(maxLabel.indexOf('-') + 1);
        outToken.println(goldWords.elementAt(gWordId) + " " + g + " " + maxLabel);
        gWordId++;
        gCharId = 0;
    }
}

/** Copies every word that is non-empty after {@code cleanPunctuation}, and its tag, into the kept vectors. */
private static void keepNonEmptyCleanedWords(Vector<String> words, Vector<String> tags, Vector<String> keptWords, Vector<String> keptTags) {
    for (int i = 0; i < words.size(); i++) {
        String cleaned = cleanPunctuation(words.elementAt(i));
        if (cleaned.length() > 0) {
            keptWords.addElement(cleaned);
            keptTags.addElement(tags.elementAt(i));
        }
    }
}

/** Closes both evaluation outputs and terminates the JVM with a failure exit status. */
private static void abortEvaluation(OutFile outPhrase, OutFile outToken) {
    outToken.close();
    outPhrase.close();
    // non-zero status so that calling scripts can detect the failed alignment
    System.exit(1);
}
Also used : Hashtable(java.util.Hashtable) Vector(java.util.Vector) OccurrenceCounter(edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.OccurrenceCounter) ParametersForLbjCode(edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode)

Example 2 with OccurrenceCounter

use of edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.OccurrenceCounter in project cogcomp-nlp by CogComp.

the class TwoLayerPredictionAggregationFeatures method setLevel1AggregationFeatures.

/**
 * Recomputes the level-1 aggregation features of {@code word} from its context.
 * <p>
 * Up to 1000 tokens to the left and up to 1000 tokens to the right (following the
 * {@code previousIgnoreSentenceBoundary} / {@code nextIgnoreSentenceBoundary} links) are scanned.
 * Two kinds of evidence are counted:
 * <ul>
 * <li>tokens with exactly the same form as {@code word}, recorded with their label
 * ({@code neLabel} when {@code useGoldData} is set, {@code neTypeLevel1} otherwise);</li>
 * <li>entities other than the word's own entity, recorded by how their form relates to the
 * word form and to the current entity's form (exact match, prefix, suffix, substring, each
 * also in a case-insensitive variant).</li>
 * </ul>
 * Random noise is injected through {@code parameters.level1AggregationRandomGenerator}: gold
 * labels of same-form tokens are replaced by a random label, entities are skipped according to
 * {@code parameters.omissionRate}, and entity types are replaced by a random different type. The
 * noise rate is {@code parameters.randomNoiseLevel} for evidence on the right and half of that
 * for evidence on the left.
 * <p>
 * The collected counts are divided by the largest count and stored on the word as
 * {@code NEWord.RealFeature}s.
 *
 * @param word the token whose level-1 aggregation features are rebuilt
 * @param useGoldData if true, gold entities and labels are used as evidence; otherwise
 *        predicted entities and level-1 predictions are used
 */
private static void setLevel1AggregationFeatures(NEWord word, boolean useGoldData) {
    ParametersForLbjCode parameters = word.params;
    // this used to be hard-coded to 0.1
    double omissionRate = parameters.omissionRate;
    // this used to be hard-coded to 0.2 for right direction and 0.1 for left
    // now this is approximated by halving the rate set in the properties
    double noiseRate = parameters.randomNoiseLevel;
    String wordForm = word.form;
    String wordFormLC = wordForm.toLowerCase();
    word.resetLevel1AggregationFeatures();
    NamedEntity currentNE = word.predictedEntity;
    // these counters will keep the distribution of the features around the current word
    OccurrenceCounter featuresCounts = new OccurrenceCounter();
    if (useGoldData)
        currentNE = word.goldEntity;
    HashMap<NamedEntity, Boolean> confidentEntitiesInTheArea = new HashMap<>();
    HashMap<NamedEntity, Boolean> confidentEntitiesInTheAreaLeft = new HashMap<>();
    HashMap<NamedEntity, Boolean> confidentEntitiesInTheAreaRight = new HashMap<>();
    // scan the left context
    NEWord w = word.previousIgnoreSentenceBoundary;
    for (int i = 0; i < 1000 && w != null; i++) {
        if (useGoldData && w.goldEntity != null && (!w.goldEntity.equals(currentNE))) {
            confidentEntitiesInTheArea.put(w.goldEntity, true);
            confidentEntitiesInTheAreaLeft.put(w.goldEntity, true);
        }
        if (w.predictedEntity != null && (!w.predictedEntity.equals(currentNE)) && !useGoldData) {
            confidentEntitiesInTheArea.put(w.predictedEntity, true);
            confidentEntitiesInTheAreaLeft.put(w.predictedEntity, true);
        }
        if (w != word && w.form.equals(wordForm)) {
            if (useGoldData) {
                // we're typically better with entities to the left....
                if (parameters.level1AggregationRandomGenerator.nextDouble() < (noiseRate / 2))
                    featuresCounts.addToken("leftTokenLevel" + parameters.level1AggregationRandomGenerator.randomLabel());
                else
                    featuresCounts.addToken("leftTokenLevel" + w.neLabel);
            } else {
                featuresCounts.addToken("leftTokenLevel" + w.neTypeLevel1);
            }
        }
        w = w.previousIgnoreSentenceBoundary;
    }
    // scan the right context
    w = word.nextIgnoreSentenceBoundary;
    for (int i = 0; i < 1000 && w != null; i++) {
        if (useGoldData && w.goldEntity != null && (!w.goldEntity.equals(currentNE))) {
            confidentEntitiesInTheArea.put(w.goldEntity, true);
            confidentEntitiesInTheAreaRight.put(w.goldEntity, true);
        }
        if (w.predictedEntity != null && (!w.predictedEntity.equals(currentNE)) && !useGoldData) {
            confidentEntitiesInTheArea.put(w.predictedEntity, true);
            confidentEntitiesInTheAreaRight.put(w.predictedEntity, true);
        }
        if (w != word && w.form.equals(wordForm)) {
            if (useGoldData) {
                if (parameters.level1AggregationRandomGenerator.nextDouble() < noiseRate)
                    featuresCounts.addToken("rightTokenLevel" + parameters.level1AggregationRandomGenerator.randomLabel());
                else
                    featuresCounts.addToken("rightTokenLevel" + w.neLabel);
            } else {
                featuresCounts.addToken("rightTokenLevel" + w.neTypeLevel1);
            }
        }
        w = w.nextIgnoreSentenceBoundary;
    }
    for (NamedEntity ne : confidentEntitiesInTheArea.keySet()) {
        String neForm = ne.form;
        String neFormLC = neForm.toLowerCase();
        // check if we should just omit this NE
        if (parameters.level1AggregationRandomGenerator.nextDouble() > omissionRate) {
            // this is if the direction is right. If the direction is left- we have to modify
            // this
            String direction = Direction.RIGHT.toString();
            // The noise rate is chosen per entity. It must not be written back to noiseRate:
            // that would make every entity visited after the first left-side one use the
            // reduced rate, so the outcome would depend on the map's iteration order.
            double entityNoiseRate = noiseRate;
            // please be careful with updating the direction values
            if (confidentEntitiesInTheAreaLeft.containsKey(ne)) {
                direction = Direction.LEFT.toString();
                // we're typically better with entities to the left....
                entityNoiseRate = noiseRate / 2;
            }
            String neType = ne.type;
            if (parameters.level1AggregationRandomGenerator.nextDouble() < entityNoiseRate) {
                // replace the type with a random one that is neither "O" nor the real type
                String randomLabelType = parameters.level1AggregationRandomGenerator.randomType();
                while (randomLabelType.equalsIgnoreCase("O") || randomLabelType.equals(neType)) randomLabelType = parameters.level1AggregationRandomGenerator.randomType();
                neType = randomLabelType;
            }
            if ((!confidentEntitiesInTheAreaLeft.containsKey(ne)) && (!confidentEntitiesInTheAreaRight.containsKey(ne)))
                throw new IllegalArgumentException("Fatal error: the NE is neither on the left or the right?!");
            boolean neEqWord = neForm.equals(wordForm);
            boolean neEqWordLC = neFormLC.equals(wordFormLC);
            boolean neStartsWithWord = neForm.startsWith(wordForm);
            boolean neStartsWithWordLC = neFormLC.startsWith(wordFormLC);
            boolean neEndsWithWord = neForm.endsWith(wordForm);
            boolean neEndsWithWordLC = neFormLC.endsWith(wordFormLC);
            boolean neContainsWord = neForm.contains(wordForm);
            if (currentNE != null) {
                // the word is part of an entity: relate that entity's form to the nearby one
                String curNEForm = currentNE.form;
                String curNEFormLC = curNEForm.toLowerCase();
                if (curNEForm.length() > 3) {
                    boolean neEqCurNE = neForm.equals(curNEForm);
                    boolean neEqCurNELC = neFormLC.equals(curNEFormLC);
                    boolean neStartsWithCurNE = neForm.startsWith(curNEForm);
                    boolean neStartsWithCurNELC = neFormLC.startsWith(curNEFormLC);
                    boolean neEndsWithCurNE = neForm.endsWith(curNEForm);
                    boolean neEndsWithCurNELC = neFormLC.endsWith(curNEFormLC);
                    boolean neContainsCurNE = neForm.contains(curNEForm);
                    if (neEqCurNE)
                        featuresCounts.addToken(direction + "NE_Also_Exact_Match_NE_Type:\t" + neType);
                    if ((!neEqCurNE) && (!neStartsWithCurNE) && (!neEndsWithCurNE) && neContainsCurNE)
                        featuresCounts.addToken(direction + "NE_Also_Substring_In_NE_Type:\t" + neType);
                    if ((!neEqCurNE) && neStartsWithCurNE)
                        featuresCounts.addToken(direction + "NE_Also_Starts_NE_Type:\t" + neType);
                    if ((!neEqCurNE) && neEndsWithCurNE)
                        featuresCounts.addToken(direction + "NE_Also_Ends_NE_Type:\t" + neType);
                    // the _IC (ignore case) variants fire only when the case-sensitive one did not
                    if ((!neEqCurNE) && neEqCurNELC)
                        featuresCounts.addToken(direction + "NE_Also_Exact_Match_NE_Type_IC:\t" + neType);
                    if ((!((!neEqCurNE) && (!neStartsWithCurNE) && (!neEndsWithCurNE) && neContainsCurNE)) && ((!neEqCurNELC) && (!neStartsWithCurNELC) && (!neEndsWithCurNELC) && neFormLC.contains(curNEFormLC)))
                        featuresCounts.addToken(direction + "NE_Also_Substring_In_NE_Type_IC:\t" + neType);
                    if ((!((!neEqCurNE) && neStartsWithCurNE)) && (!neEqCurNELC) && neStartsWithCurNELC)
                        featuresCounts.addToken(direction + "NE_Also_Starts_NE_Type_IC:\t" + neType);
                    if ((!((!neEqCurNE) && neEndsWithCurNE)) && (!neEqCurNELC) && neEndsWithCurNELC)
                        featuresCounts.addToken(direction + "NE_Also_Ends_NE_Type_IC:\t" + neType);
                }
                // relate the bare word form to the nearby entity, so that an entity containing
                // e.g. "Bank" is able to say something about the word "Bank"
                if (wordForm.length() > 3) {
                    if (neEqWord)
                        featuresCounts.addToken(direction + "labeledTokenExactMatchInExpression:\t" + neType);
                    if ((!neEqWord) && (!neStartsWithWord) && (!neEndsWithWord) && neContainsWord)
                        featuresCounts.addToken(direction + "labeledTokenSubstringInExpression:\t" + neType);
                    if ((!neEqWord) && neStartsWithWord)
                        featuresCounts.addToken(direction + "labeledTokenStartsExpression:\t" + neType);
                    // NOTE(review): this is the labeled branch, yet the feature below is named
                    // "unlabeled..." (its _IC sibling is "labeledTokenEndsExpression_IC"). Looks
                    // like a copy/paste slip; kept unchanged because feature names may be
                    // stored in trained models - confirm before renaming.
                    if ((!neEqWord) && neEndsWithWord)
                        featuresCounts.addToken(direction + "unlabeledTokenEndsExpression:\t" + neType);
                    if ((!neEqWord) && neEqWordLC)
                        featuresCounts.addToken(direction + "labeledTokenExactMatchInExpression_IC:\t" + neType);
                    if ((!((!neEqWord) && (!neStartsWithWord) && (!neEndsWithWord) && neContainsWord)) && ((!neEqWordLC) && (!neStartsWithWordLC) && (!neEndsWithWordLC) && neFormLC.contains(wordFormLC)))
                        featuresCounts.addToken(direction + "labeledTokenSubstringInExpression_IC:\t" + neType);
                    if ((!((!neEqWord) && neStartsWithWord)) && ((!neEqWordLC) && neStartsWithWordLC))
                        featuresCounts.addToken(direction + "labeledTokenStartsExpression_IC:\t" + neType);
                    if ((!((!neEqWord) && neEndsWithWord)) && ((!neEqWordLC) && neEndsWithWordLC))
                        featuresCounts.addToken(direction + "labeledTokenEndsExpression_IC:\t" + neType);
                }
            } else {
                // this form is not a part of named entity
                if (wordForm.length() > 3) {
                    if (neEqWord)
                        featuresCounts.addToken(direction + "unlabeledTokenExactMatchInExpression:\t" + neType);
                    if ((!neEqWord) && (!neStartsWithWord) && (!neEndsWithWord) && neContainsWord)
                        featuresCounts.addToken(direction + "unlabeledTokenSubstringInExpression:\t" + neType);
                    if ((!neEqWord) && neStartsWithWord)
                        featuresCounts.addToken(direction + "unlabeledTokenStartsExpression:\t" + neType);
                    if ((!neEqWord) && neEndsWithWord)
                        featuresCounts.addToken(direction + "unlabeledTokenEndsExpression:\t" + neType);
                    if ((!neEqWord) && neEqWordLC)
                        featuresCounts.addToken(direction + "unlabeledTokenExactMatchInExpression_IC:\t" + neType);
                    if ((!((!neEqWord) && (!neStartsWithWord) && (!neEndsWithWord) && neContainsWord)) && ((!neEqWordLC) && (!neStartsWithWordLC) && (!neEndsWithWordLC) && neFormLC.contains(wordFormLC)))
                        featuresCounts.addToken(direction + "unlabeledTokenSubstringInExpression_IC:\t" + neType);
                    if ((!((!neEqWord) && neStartsWithWord)) && ((!neEqWordLC) && neStartsWithWordLC))
                        featuresCounts.addToken(direction + "unlabeledTokenStartsExpression_IC:\t" + neType);
                    if ((!((!neEqWord) && neEndsWithWord)) && ((!neEqWordLC) && neEndsWithWordLC))
                        featuresCounts.addToken(direction + "unlabeledTokenEndsExpression_IC:\t" + neType);
                }
            }
        }
    }
    // normalize every count by the largest one
    double max = -1;
    for (Iterator<String> i = featuresCounts.getTokensIterator(); i.hasNext(); ) {
        String s = i.next();
        if (max < featuresCounts.getCount(s))
            max = featuresCounts.getCount(s);
    }
    // guard against dividing by zero
    if (max == 0)
        max = 1;
    ArrayList<NEWord.RealFeature> newag = word.resetLevel1AggregationFeatures();
    for (Iterator<String> i = featuresCounts.getTokensIterator(); i.hasNext(); ) {
        String s = i.next();
        newag.add(new NEWord.RealFeature(featuresCounts.getCount(s) / max, s));
    }
}
Also used : HashMap(java.util.HashMap) OccurrenceCounter(edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.OccurrenceCounter) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord) NamedEntity(edu.illinois.cs.cogcomp.ner.LbjTagger.NamedEntity) ParametersForLbjCode(edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode)

Example 3 with OccurrenceCounter

use of edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.OccurrenceCounter in project cogcomp-nlp by CogComp.

the class MemoryEfficientNB method toBeKept.

/**
 * Decides whether a token sequence should be kept.
 * <p>
 * A sequence is kept when it has at least {@code minLen} tokens and the number of distinct
 * tokens that are keys of {@code coolWords}, divided by the counter's {@code uniqueTokens}
 * value, is at least {@code minRatio}.
 *
 * @param tokens the token sequence to examine
 * @param coolWords table whose keys are the words of interest; the values are not read
 * @param minRatio minimum required ratio of distinct cool words to unique tokens
 * @param minLen minimum required number of tokens
 * @return true if both the length and the ratio requirement are met
 */
public static boolean toBeKept(Vector<String> tokens, Hashtable<String, Integer> coolWords, double minRatio, int minLen) {
    OccurrenceCounter tokenCounts = new OccurrenceCounter();
    // used as a set: the keys are the distinct cool words seen so far
    Hashtable<String, Boolean> coolSeen = new Hashtable<>(tokens.size() * 2);
    for (String token : tokens) {
        tokenCounts.addToken(token);
        if (coolWords.containsKey(token)) {
            // re-inserting an already present key leaves the table unchanged
            coolSeen.put(token, true);
        }
    }
    if (tokens.size() < minLen) {
        return false;
    }
    double coolRatio = ((double) coolSeen.size()) / ((double) tokenCounts.uniqueTokens);
    return coolRatio >= minRatio;
}
Also used : Hashtable(java.util.Hashtable) OccurrenceCounter(edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.OccurrenceCounter)

Aggregations

OccurrenceCounter (edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.OccurrenceCounter)3 ParametersForLbjCode (edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode)2 Hashtable (java.util.Hashtable)2 NEWord (edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)1 NamedEntity (edu.illinois.cs.cogcomp.ner.LbjTagger.NamedEntity)1 HashMap (java.util.HashMap)1 Vector (java.util.Vector)1