Search in sources :

Example 1 with CharacteristicWords

use of edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords in project cogcomp-nlp by CogComp.

the class TitleTextNormalizer method normalizeCase.

/**
 * Rewrites the surface form of every token that occurs in a sentence for which
 * {@code mixedCase(sentence)} is false (for CoNLL data these are essentially the titles),
 * replacing it with a case-normalized spelling.
 *
 * <p>Does nothing unless {@code ParametersForLbjCode.currentParameters.normalizeTitleText}
 * is set. For each token to normalize, {@code isCaseNormalized} is set and both
 * {@code normalizedForm} and {@code form} are overwritten. The spelling is chosen as follows:
 * <ol>
 * <li>the literal token {@code "A"} always becomes {@code "a"};</li>
 * <li>if {@code normalizedMostLinkableExpression} is set and contains the token
 * (case-insensitively), the matching span of that expression is used, upgraded to the top
 * capitalized spelling seen nearby when the span itself starts lowercase;</li>
 * <li>otherwise: a lowercased occurrence nearby wins, then the top capitalized spelling seen
 * nearby, then the entry in {@code lowercasedToNormalizedTokensMap}, and finally the token
 * is left as it is.</li>
 * </ol>
 * A token with no {@code previous} link (sentence-initial) gets its first letter upper-cased.
 *
 * @param data the documents whose sentences are inspected and modified in place
 */
public static void normalizeCase(Data data) {
    if (!ParametersForLbjCode.currentParameters.normalizeTitleText)
        return;
    // NOTE(review): init() presumably populates lowercasedToNormalizedTokensMap - confirm.
    if (lowercasedToNormalizedTokensMap == null)
        init();
    // Tokens from sentences that are NOT mixed-case: these are the ones we rewrite.
    HashMap<NEWord, Boolean> wordsToNormalize = new HashMap<>();
    // Tokens from mixed-case sentences: these serve as casing evidence for their neighbours.
    HashMap<NEWord, Boolean> wordsInMixedCaseSentences = new HashMap<>();
    for (int docid = 0; docid < data.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
        for (LinkedVector sentence : sentences) {
            if (mixedCase(sentence)) {
                // The first word of a sentence is skipped on purpose: its capitalization
                // says nothing about how the word is normally written.
                for (int j = 1; j < sentence.size(); j++) {
                    wordsInMixedCaseSentences.put((NEWord) sentence.get(j), true);
                }
            } else {
                for (int j = 0; j < sentence.size(); j++) {
                    wordsToNormalize.put((NEWord) sentence.get(j), true);
                }
            }
        }
    }
    // NOTE(review): w.form is mutated while w is a key of wordsToNormalize; this is only safe
    // if NEWord does not derive hashCode/equals from form - confirm.
    for (NEWord w : wordsToNormalize.keySet()) {
        w.isCaseNormalized = true;
        if (w.form.equals("A")) {
            w.normalizedForm = "a";
            w.form = w.normalizedForm;
            continue;
        }
        // Lowercased spellings seen near w in mixed-case sentences (sentence starts excluded).
        HashMap<String, Boolean> lowercasedForms = new HashMap<>();
        // For every capitalized spelling seen near w (e.g. "McLaren", "MCLAREN"), keyed by its
        // lowercased form, the ranked list of spellings; the top one is used for normalization.
        HashMap<String, CharacteristicWords> uppercasedFormsInMixedCaseNonSentenceStart = new HashMap<>();
        getNeighborhoodWordStatistics(w, wordsInMixedCaseSentences, uppercasedFormsInMixedCaseNonSentenceStart, lowercasedForms);
        String key = w.form.toLowerCase();

        // Locate the token inside the linkable expression, if there is one. The span is only
        // usable when the token was actually found and fits inside the original (non-lowercased)
        // string; otherwise substring() would throw.
        String linkable = w.normalizedMostLinkableExpression;
        int start = (linkable == null) ? -1 : linkable.toLowerCase().indexOf(key);
        boolean spanUsable = start >= 0 && start + w.form.length() <= linkable.length();

        String normalized;
        if (spanUsable) {
            String span = linkable.substring(start, start + w.form.length());
            String spanKey = span.toLowerCase();
            if (!span.isEmpty() && Character.isLowerCase(span.charAt(0)) && uppercasedFormsInMixedCaseNonSentenceStart.containsKey(spanKey))
                normalized = uppercasedFormsInMixedCaseNonSentenceStart.get(spanKey).topWords.elementAt(0);
            else
                normalized = span;
        } else if (lowercasedForms.containsKey(key)) {
            normalized = key;
        } else if (uppercasedFormsInMixedCaseNonSentenceStart.containsKey(key)) {
            normalized = uppercasedFormsInMixedCaseNonSentenceStart.get(key).topWords.elementAt(0);
        } else if (lowercasedToNormalizedTokensMap.containsKey(key)) {
            normalized = lowercasedToNormalizedTokensMap.get(key);
        } else {
            // No evidence at all: keep the token unchanged.
            normalized = w.form;
        }

        // A token without a predecessor starts a sentence, so it is always capitalized.
        if (w.previous == null && !normalized.isEmpty() && Character.isLowerCase(normalized.charAt(0)))
            normalized = Character.toUpperCase(normalized.charAt(0)) + normalized.substring(1);
        w.normalizedForm = normalized;
        w.form = w.normalizedForm;
    }
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) HashMap(java.util.HashMap) CharacteristicWords(edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 2 with CharacteristicWords

use of edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords in project cogcomp-nlp by CogComp.

the class PredictionsToProbabilities method getAndSetPredictionConfidences.

/**
 * Converts the raw scores the learner assigns to {@code w} into a normalized confidence
 * distribution (a soft-max over the scores) and records the result on the word.
 *
 * <p>The label with the highest raw score (the first one on ties) is written to
 * {@code w.neTypeLevel1} or {@code w.neTypeLevel2}, and the distribution to the matching
 * {@code predictionConfidencesLevel*Classifier} field, depending on {@code predictionType}.
 * Any other {@code predictionType} leaves {@code w} untouched.
 *
 * @param c the learner used to score {@code w}; must not be null
 * @param w the word to score and annotate
 * @param predictionType selects which pair of prediction fields on {@code w} is updated
 * @return one entry per label, each paired with its normalized confidence
 * @throws NullPointerException if {@code c} is null
 * @throws ArrayIndexOutOfBoundsException if the learner returns no scores for {@code w}
 */
public static CharacteristicWords getAndSetPredictionConfidences(SparseNetworkLearner c, NEWord w, NEWord.LabelToLookAt predictionType) {
    if (null == c) {
        logger.error("ERROR: PredictionsToProbabilities.CharacteristicWords(): null learner.");
        // Fail here with a clear message instead of on the dereference below.
        throw new NullPointerException("getAndSetPredictionConfidences(): null learner");
    }
    Score[] scores = c.scores(w).toArray();
    if (logger.isDebugEnabled()) {
        logger.debug("## {}.getAndSetPredictionConfidences(): c.scores: {}", NAME, c.scores(w));
    }

    // Find the best-scoring label; the strict comparison keeps the first one on ties.
    double maxScore = scores[0].score;
    String maxLabel = scores[0].value;
    for (int i = 1; i < scores.length; i++) {
        if (maxScore < scores[i].score) {
            maxScore = scores[i].score;
            maxLabel = scores[i].value;
        }
    }

    // Soft-max. Shifting by the maximum (rather than the minimum) yields the same distribution
    // mathematically, but every exponent is <= 0, so exp() cannot overflow to Infinity and turn
    // the normalized values into NaN when the score range is large.
    double[] confidences = new double[scores.length];
    double sum = 0;
    for (int i = 0; i < scores.length; i++) {
        confidences[i] = Math.exp(scores[i].score - maxScore);
        sum += confidences[i];
    }
    // sum is >= 1 for finite scores; the check only skips normalization for NaN input.
    if (sum > 0) {
        for (int i = 0; i < confidences.length; i++) {
            confidences[i] /= sum;
        }
    }

    CharacteristicWords res = new CharacteristicWords(scores.length);
    for (int i = 0; i < scores.length; i++) {
        res.addElement(scores[i].value, confidences[i]);
    }
    if (predictionType.equals(NEWord.LabelToLookAt.PredictionLevel1Tagger)) {
        w.neTypeLevel1 = maxLabel;
        w.predictionConfidencesLevel1Classifier = res;
    }
    if (predictionType.equals(NEWord.LabelToLookAt.PredictionLevel2Tagger)) {
        w.neTypeLevel2 = maxLabel;
        w.predictionConfidencesLevel2Classifier = res;
    }
    return res;
}
Also used : Score(edu.illinois.cs.cogcomp.lbjava.classify.Score) CharacteristicWords(edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords)

Example 3 with CharacteristicWords

use of edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords in project cogcomp-nlp by CogComp.

the class TitleTextNormalizer method getNeighborhoodWordStatistics.

/**
 * Gathers casing evidence from the tokens surrounding {@code word}.
 *
 * <p>Walks up to {@code WindowSize} tokens backwards and then up to {@code WindowSize} tokens
 * forwards from {@code word}, following the {@code previousIgnoreSentenceBoundary} /
 * {@code nextIgnoreSentenceBoundary} links. A neighbour contributes only if it is a key of
 * {@code wordsInMixedCasedSentences} and has a non-null {@code previous} link, i.e. it does
 * not begin a sentence.
 *
 * <p>The first two parameters are inputs; the last two are output maps filled by this method.
 *
 * @param word the token whose neighbourhood is inspected
 * @param wordsInMixedCasedSentences tokens that are allowed to contribute evidence
 * @param uppercasedFormsInMixedCaseNonSentenceStart output: lowercased form mapped to the
 *        spellings seen that start with an upper-case letter
 * @param lowecasedForms output: lowercased forms of neighbours that start with a lower-case
 *        letter
 */
public static void getNeighborhoodWordStatistics(NEWord word, HashMap<NEWord, Boolean> wordsInMixedCasedSentences, HashMap<String, CharacteristicWords> uppercasedFormsInMixedCaseNonSentenceStart, HashMap<String, Boolean> lowecasedForms) {
    NEWord neighbor = word.previousIgnoreSentenceBoundary;
    for (int count = 0; neighbor != null && count < WindowSize; count++) {
        recordNeighborSpelling(neighbor, wordsInMixedCasedSentences, uppercasedFormsInMixedCaseNonSentenceStart, lowecasedForms);
        neighbor = neighbor.previousIgnoreSentenceBoundary;
    }
    neighbor = word.nextIgnoreSentenceBoundary;
    for (int count = 0; neighbor != null && count < WindowSize; count++) {
        recordNeighborSpelling(neighbor, wordsInMixedCasedSentences, uppercasedFormsInMixedCaseNonSentenceStart, lowecasedForms);
        neighbor = neighbor.nextIgnoreSentenceBoundary;
    }
}

/** Records the spelling of a single neighbouring token in the two output maps, if it qualifies. */
private static void recordNeighborSpelling(NEWord neighbor, HashMap<NEWord, Boolean> wordsInMixedCasedSentences, HashMap<String, CharacteristicWords> uppercasedFormsInMixedCaseNonSentenceStart, HashMap<String, Boolean> lowecasedForms) {
    // Words that begin a sentence are capitalized regardless of how they are normally written,
    // so they must not be counted.
    if (!wordsInMixedCasedSentences.containsKey(neighbor) || neighbor.previous == null)
        return;
    String form = neighbor.form;
    // An empty token has no first character to classify.
    if (form.isEmpty())
        return;
    String key = form.toLowerCase();
    char first = form.charAt(0);
    if (Character.isUpperCase(first)) {
        CharacteristicWords topSpellings = uppercasedFormsInMixedCaseNonSentenceStart.get(key);
        if (topSpellings == null) {
            topSpellings = new CharacteristicWords(5);
            uppercasedFormsInMixedCaseNonSentenceStart.put(key, topSpellings);
        }
        topSpellings.addElement(form, 1);
    }
    if (Character.isLowerCase(first))
        lowecasedForms.put(key, true);
}
Also used : CharacteristicWords(edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 4 with CharacteristicWords

use of edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords in project cogcomp-nlp by CogComp.

the class MemoryEfficientNB method getTopPmiWords.

/**
 * Builds the union of the top PMI words of every class.
 *
 * <p>For each class index below {@code classesN} the per-class overload of
 * {@code getTopPmiWords} is invoked, its result is logged at info level, and every word in
 * its {@code topWords} list is added to the returned table with the value 1.
 *
 * <p>Per the original note, the word score is
 * {@code score(w) = max{ P(w,c) / (P(w)P(c)) } = max{ P(w|c) / P(w) }}.
 *
 * @param maxWordsPerClass forwarded unchanged to the per-class overload
 * @param confThres forwarded unchanged to the per-class overload
 * @param minAppThres forwarded unchanged to the per-class overload
 * @return a table whose keys are the selected words; every value is 1
 */
public Hashtable<String, Integer> getTopPmiWords(int maxWordsPerClass, double confThres, int minAppThres) {
    Hashtable<String, Integer> selected = new Hashtable<>();
    int classIdx = 0;
    while (classIdx < classesN) {
        CharacteristicWords perClass = this.getTopPmiWords(classIdx, maxWordsPerClass, confThres, minAppThres);
        logger.info(perClass.toString());
        for (int k = 0; k < perClass.topWords.size(); k++) {
            String word = perClass.topWords.elementAt(k);
            if (selected.containsKey(word)) {
                continue;
            }
            selected.put(word, 1);
        }
        classIdx++;
    }
    return selected;
}
Also used : Hashtable(java.util.Hashtable) CharacteristicWords(edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords)

Aggregations

CharacteristicWords (edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords)4 NEWord (edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)2 Score (edu.illinois.cs.cogcomp.lbjava.classify.Score)1 LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)1 HashMap (java.util.HashMap)1 Hashtable (java.util.Hashtable)1