Search in sources :

Example 1 with NEWord

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.

the class Decoder method annotateBIO_AllLevelsWithTaggers.

/**
 * Annotates {@code data} by greedy decoding with the level-1 tagger and, when enabled, the
 * level-2 tagger.
 *
 * <p>The second pass runs only when {@code taggerLevel2} is non-null and the configured
 * {@code featuresToUse} contains the key {@code "PredictionsLevel1"}. Otherwise every word's
 * {@code neTypeLevel2} is set to its {@code neTypeLevel1}. Pass {@code taggerLevel2 = null} to
 * use a single level of inference.
 *
 * @param data the documents to annotate; existing predictions are cleared first
 * @param taggerLevel1 the first-level tagger
 * @param taggerLevel2 the second-level tagger, or {@code null} for single-level inference
 * @throws Exception declared by the method; the visible code does not show which call raises it
 */
protected static void annotateBIO_AllLevelsWithTaggers(Data data, NETaggerLevel1 taggerLevel1, NETaggerLevel2 taggerLevel2) throws Exception {
    clearPredictions(data);
    NETaggerLevel1.isTraining = false;
    NETaggerLevel2.isTraining = false;

    // First pass: greedy level-1 decoding, chunk-representation change to BIO, then pruning of
    // predictions below the configured level-1 confidence threshold.
    GreedyDecoding.annotateGreedy(data, taggerLevel1, 1);
    TextChunkRepresentationManager.changeChunkRepresentation(
            ParametersForLbjCode.currentParameters.taggingEncodingScheme,
            TextChunkRepresentationManager.EncodingScheme.BIO,
            data,
            NEWord.LabelToLookAt.PredictionLevel1Tagger);
    PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(
            data,
            ParametersForLbjCode.currentParameters.minConfidencePredictionsLevel1,
            NEWord.LabelToLookAt.PredictionLevel1Tagger);

    // The second pass is gated on the "PredictionsLevel1" feature (it previously checked
    // for 'PatternFeatures').
    boolean secondPassEnabled =
            ParametersForLbjCode.currentParameters.featuresToUse.containsKey("PredictionsLevel1")
                    && taggerLevel2 != null;

    if (!secondPassEnabled) {
        // Single-level inference: the level-2 label of every word mirrors its level-1 label.
        for (int d = 0; d < data.documents.size(); d++) {
            for (LinkedVector sentence : data.documents.get(d).sentences) {
                for (int t = 0; t < sentence.size(); t++) {
                    NEWord token = (NEWord) sentence.get(t);
                    token.neTypeLevel2 = token.neTypeLevel1;
                }
            }
        }
        return;
    }

    // Second pass: build aggregation features from the level-1 output, decode with the
    // level-2 tagger, prune, and change the level-2 chunk representation to BIO.
    PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(
            data, 0.0, NEWord.LabelToLookAt.PredictionLevel1Tagger);
    TwoLayerPredictionAggregationFeatures.setLevel1AggregationFeatures(data, false);
    GreedyDecoding.annotateGreedy(data, taggerLevel2, 2);
    PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(
            data,
            ParametersForLbjCode.currentParameters.minConfidencePredictionsLevel2,
            NEWord.LabelToLookAt.PredictionLevel2Tagger);
    TextChunkRepresentationManager.changeChunkRepresentation(
            ParametersForLbjCode.currentParameters.taggingEncodingScheme,
            TextChunkRepresentationManager.EncodingScheme.BIO,
            data,
            NEWord.LabelToLookAt.PredictionLevel2Tagger);
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 2 with NEWord

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.

the class TaggedDataReader method readFolder.

/**
 * Reads every regular file directly inside {@code path} (skipping {@code .DS_Store}) with
 * {@code readFile} and returns the resulting documents.
 *
 * <p>When {@code sortLexicallyFilesInFolders} is set, file names are sorted first so the
 * document order is deterministic. When {@code treatAllFilesInFolderAsOneBigDocument} is set,
 * the last word of each document is linked to the first word of the following document through
 * the {@code nextIgnoreSentenceBoundary} / {@code previousIgnoreSentenceBoundary} fields.
 *
 * @param path the folder to read
 * @param format the data format, passed through to {@code readFile}
 * @return the documents read, in listing (or sorted) order
 * @throws IllegalArgumentException if {@code path} cannot be listed (it is not a directory, or
 *         an I/O error occurred)
 * @throws Exception declared by the method; presumably propagated from {@code readFile}
 */
public static Vector<NERDocument> readFolder(String path, String format) throws Exception {
    String[] files = (new File(path)).list();
    if (files == null) {
        // File.list() returns null instead of throwing; fail with a message that names the
        // path rather than a bare NullPointerException further down.
        throw new IllegalArgumentException(
                "Cannot list files in '" + path + "': not a directory or an I/O error occurred");
    }
    // sort the files so we can get deterministic order.
    if (ParametersForLbjCode.currentParameters.sortLexicallyFilesInFolders) {
        Arrays.sort(files);
    }

    Vector<NERDocument> res = new Vector<>();
    for (String file1 : files) {
        String file = path + "/" + file1;
        if ((new File(file)).isFile() && (!file1.equals(".DS_Store"))) {
            res.addElement(readFile(file, format, file1));
        }
    }

    if (ParametersForLbjCode.currentParameters.treatAllFilesInFolderAsOneBigDocument) {
        // connecting sentence boundaries between each pair of adjacent documents
        for (int i = 0; i < res.size() - 1; i++) {
            ArrayList<LinkedVector> ss1 = res.elementAt(i).sentences;
            ArrayList<LinkedVector> ss2 = res.elementAt(i + 1).sentences;
            if (ss1.isEmpty() || ss2.isEmpty()) {
                continue;
            }
            LinkedVector lastSentence1 = ss1.get(ss1.size() - 1);
            LinkedVector firstSentence2 = ss2.get(0);
            if (lastSentence1.size() > 0 && firstSentence2.size() > 0) {
                NEWord lastWord1 = (NEWord) lastSentence1.get(lastSentence1.size() - 1);
                NEWord firstWord2 = (NEWord) firstSentence2.get(0);
                lastWord1.nextIgnoreSentenceBoundary = firstWord2;
                firstWord2.previousIgnoreSentenceBoundary = lastWord1;
            }
        }
    }
    return res;
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) NERDocument(edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument) Vector(java.util.Vector) File(java.io.File) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 3 with NEWord

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.

the class FlatGazetteers method annotate.

/**
 * Adds gazetteer-match features to {@code w} and, for multi-word matches, to the words that
 * follow it.
 *
 * <p>Expressions of one to five words starting at {@code w} are looked up in every dictionary,
 * both case-sensitively and (lowercased) in the ignore-case dictionaries. A single-word match
 * adds {@code "U-" + name}; a multi-word match adds {@code "B-"}, {@code "I-"} and
 * {@code "L-"} features to the first, inner and last words of the span. Ignore-case matches
 * carry an {@code "(IC)"} suffix. If {@code w.normalizedMostLinkableExpression} is set, it is
 * also looked up and a {@code "Normalized_Expression_Gaz_Match"} feature is added per match.
 *
 * @param w the word at which candidate expressions start; its {@code gazetteers} list is
 *        created if absent
 */
public void annotate(NEWord w) {
    if (w.gazetteers == null)
        w.gazetteers = new ArrayList<>();

    // default inactive
    if (w.normalizedMostLinkableExpression != null) {
        for (int j = 0; j < dictionaries.size(); j++) {
            if (dictionaries.get(j).contains(w.normalizedMostLinkableExpression)) {
                // "(*)" marks matches where the normalized form differs from the surface form.
                if (w.normalizedForm != null && !w.normalizedForm.equalsIgnoreCase(w.form))
                    w.gazetteers.add("Normalized_Expression_Gaz_Match(*)" + dictNames.get(j));
                else
                    w.gazetteers.add("Normalized_Expression_Gaz_Match" + dictNames.get(j));
            }
        }
    }

    // endWord is the first word AFTER the current expression (exclusive end of the span).
    NEWord endWord = (NEWord) (w.next);
    String expression = w.form;
    boolean changeEnd = true;
    // i is the number of extra words appended to w.form so far (span length is i + 1).
    for (int i = 0; i < 5 && changeEnd; i++) {
        changeEnd = false;
        String lowercasedExpression = expression.toLowerCase();
        for (int j = 0; j < dictionaries.size(); j++) {
            if (dictionaries.get(j).contains(expression)) {
                addSpanGazetteerFeatures(w, endWord, i, dictNames.get(j));
            }
            if (dictionariesIgnoreCase.get(j).contains(lowercasedExpression)) {
                addSpanGazetteerFeatures(w, endWord, i, dictNames.get(j) + "(IC)");
            }
        }
        // Extend the expression by one word, if there is one; otherwise the loop ends.
        if (endWord != null) {
            expression += " " + endWord.form;
            endWord = (NEWord) endWord.next;
            changeEnd = true;
        }
    }
}

/**
 * Adds U/B/I/L gazetteer features for a matched span that starts at {@code start} and ends
 * just before {@code endWord}.
 *
 * @param start first word of the matched span
 * @param endWord first word after the span (may be {@code null} at the end of the chain)
 * @param extraWords number of words in the span beyond the first (0 for a single-word match)
 * @param dictLabel dictionary name, including any suffix, appended to the feature prefix
 */
private static void addSpanGazetteerFeatures(NEWord start, NEWord endWord, int extraWords, String dictLabel) {
    if (start.gazetteers == null)
        start.gazetteers = new ArrayList<>();
    if (extraWords == 0) {
        start.gazetteers.add("U-" + dictLabel);
        return;
    }
    int loc = 0;
    for (NEWord temp = start; temp != endWord; temp = (NEWord) temp.next) {
        if (temp.gazetteers == null) {
            temp.gazetteers = new ArrayList<>();
        }
        if (loc == 0) {
            temp.gazetteers.add("B-" + dictLabel);
        } else if (loc < extraWords) {
            temp.gazetteers.add("I-" + dictLabel);
        } else if (loc == extraWords) {
            temp.gazetteers.add("L-" + dictLabel);
        }
        loc++;
    }
}
Also used : ArrayList(java.util.ArrayList) MyString(edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.MyString) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 4 with NEWord

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.

the class TitleTextNormalizer method normalizeCase.

/**
 * Normalizes the case of words that occur in sentences for which {@code mixedCase(sentence)}
 * is false, overwriting each such word's {@code form} with the chosen {@code normalizedForm}
 * and setting {@code isCaseNormalized}.
 *
 * <p>For a word without a {@code normalizedMostLinkableExpression}, the normalized form is
 * chosen in this order: the lowercased form if the neighborhood statistics contain it as a
 * lowercased form; else the top word recorded for it among uppercased forms in mixed-case,
 * non-sentence-start positions; else the entry in {@code lowercasedToNormalizedTokensMap};
 * else the form unchanged. For a word with a {@code normalizedMostLinkableExpression}, the
 * matching substring of that expression is used (falling back to the word's own form when the
 * expression does not contain it). A word with no {@code previous} word gets its first
 * character uppercased.
 *
 * @param data the documents whose words are normalized in place
 */
public static void normalizeCase(Data data) {
    if (lowercasedToNormalizedTokensMap == null)
        init();
    // wordsToNormalize holds the words of non-mixed-case sentences (for CoNLL data, basically
    // the words from the titles); wordsInMixedCaseSentences holds the words of mixed-case
    // sentences, used as evidence by getNeighborhoodWordStatistics.
    HashMap<NEWord, Boolean> wordsToNormalize = new HashMap<>();
    HashMap<NEWord, Boolean> wordsInMixedCaseSentences = new HashMap<>();
    for (int docid = 0; docid < data.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
        for (LinkedVector sentence : sentences) {
            if (mixedCase(sentence)) {
                // note that the first word of a sentence is excluded here on purpose!!!
                for (int j = 1; j < sentence.size(); j++) {
                    wordsInMixedCaseSentences.put((NEWord) sentence.get(j), true);
                }
            } else {
                // normalization
                for (int j = 0; j < sentence.size(); j++) {
                    wordsToNormalize.put(((NEWord) sentence.get(j)), true);
                }
            }
        }
    }
    // NOTE(review): HashMap key iteration order is unspecified; if
    // getNeighborhoodWordStatistics reads forms this loop has already rewritten, results may
    // depend on that order -- confirm.
    for (NEWord w : wordsToNormalize.keySet()) {
        w.isCaseNormalized = true;
        if (w.form.equals("A")) {
            w.normalizedForm = "a";
            w.form = w.normalizedForm;
        } else {
            // the hashmap below remembers the words that appeared lowercased in the document
            HashMap<String, Boolean> lowercasedForms = new HashMap<>();
            // note that this MUST EXCLUDE the words that start a sentence!!!!
            // for each mixed-case string in mixed-case sentences, such as "McLaren"
            // we're keeping all the ways to write them out. E.g. McLaren MCLAREN etc.
            // Eventually, we'll normalize to the most common spelling in the document
            HashMap<String, CharacteristicWords> uppercasedFormsInMixedCaseNonSentenceStart = new HashMap<>();
            getNeighborhoodWordStatistics(w, wordsInMixedCaseSentences, uppercasedFormsInMixedCaseNonSentenceStart, lowercasedForms);
            // w.originalForm=w.form; // this can cause all sorts of problems!!!
            String key = w.form.toLowerCase();
            if (w.normalizedMostLinkableExpression == null) {
                if (lowercasedForms.containsKey(key)) {
                    w.normalizedForm = key;
                } else if (uppercasedFormsInMixedCaseNonSentenceStart.containsKey(key)) {
                    w.normalizedForm = uppercasedFormsInMixedCaseNonSentenceStart.get(key).topWords.elementAt(0);
                } else if (lowercasedToNormalizedTokensMap.containsKey(key)) {
                    w.normalizedForm = lowercasedToNormalizedTokensMap.get(key);
                } else {
                    // .toLowerCase();
                    w.normalizedForm = w.form;
                }
            } else {
                String expression = w.normalizedMostLinkableExpression;
                int start = expression.toLowerCase().indexOf(key);
                String normalizedForm;
                if (start >= 0 && start + w.form.length() <= expression.length()) {
                    normalizedForm = expression.substring(start, start + w.form.length());
                } else {
                    // The expression does not contain this word (or lowercasing shifted the
                    // offsets): keep the surface form instead of failing in substring().
                    normalizedForm = w.form;
                }
                if (!normalizedForm.isEmpty()
                        && Character.isLowerCase(normalizedForm.charAt(0))
                        && uppercasedFormsInMixedCaseNonSentenceStart.containsKey(normalizedForm.toLowerCase())) {
                    w.normalizedForm = uppercasedFormsInMixedCaseNonSentenceStart.get(normalizedForm.toLowerCase()).topWords.elementAt(0);
                } else {
                    w.normalizedForm = normalizedForm;
                }
            }
            // A word with no predecessor gets its first character uppercased; the isEmpty
            // guard avoids charAt(0) on an empty form.
            if (w.previous == null && !w.normalizedForm.isEmpty() && Character.isLowerCase(w.normalizedForm.charAt(0))) {
                w.normalizedForm = Character.toUpperCase(w.normalizedForm.charAt(0)) + w.normalizedForm.substring(1);
            }
            w.form = w.normalizedForm;
        }
    }
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) HashMap(java.util.HashMap) CharacteristicWords(edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 5 with NEWord

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.

the class Decoder method nullifyTaggerCachedFields.

/*
 * Lbj does some pretty annoying caching. We need this method for the beamsearch and the
 * viterbi.
 *
 * Builds a chain of seven linked dummy words, all labelled "O", and classifies the middle
 * one with the given tagger.
 */
public static void nullifyTaggerCachedFields(SparseNetworkLearner tagger) {
    NEWord seed = new NEWord(new Word("lala1"), null, "O");
    seed.parts = new String[0];

    // Seven dummy words, each built from the seed word.
    NEWord[] chain = new NEWord[7];
    for (int k = 0; k < chain.length; k++) {
        chain[k] = new NEWord(seed, null, "O");
    }

    // Link neighbours in both directions, for both the regular and the
    // ignore-sentence-boundary pointers. Only elements 1..6 get a fresh parts array here.
    for (int k = 1; k < chain.length; k++) {
        NEWord left = chain[k - 1];
        NEWord right = chain[k];
        right.parts = new String[0];
        right.previous = left;
        right.previousIgnoreSentenceBoundary = left;
        left.next = right;
        left.nextIgnoreSentenceBoundary = right;
    }

    for (NEWord dummy : chain) {
        dummy.neTypeLevel2 = "O";
        dummy.neTypeLevel1 = "O";
    }

    // Classify the middle word, which has three neighbours on each side.
    tagger.classify(chain[3]);
}
Also used : NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord) Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word)

Aggregations

NEWord (edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)18 LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)12 NERDocument (edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument)3 ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 Vector (java.util.Vector)3 Word (edu.illinois.cs.cogcomp.lbjava.nlp.Word)2 CharacteristicWords (edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords)2 File (java.io.File)2 Sentence (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)1 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)1 Data (edu.illinois.cs.cogcomp.ner.LbjTagger.Data)1 NamedEntity (edu.illinois.cs.cogcomp.ner.LbjTagger.NamedEntity)1 ParametersForLbjCode (edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode)1 MyString (edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.MyString)1 OccurrenceCounter (edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.OccurrenceCounter)1 THashMap (gnu.trove.map.hash.THashMap)1 IOException (java.io.IOException)1 Hashtable (java.util.Hashtable)1