Search in sources :

Example 6 with NEWord

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.

In class WordTopicAndLayoutFeatures, method addDatasets:

/**
 * Classifies each document contained in {@code sentences} and records the resulting label for
 * every word of that document in {@code wordToTopicIdMap}.
 *
 * <p>Words are accumulated sentence by sentence. A document is treated as finished when the last
 * word of a sentence has no {@code nextIgnoreSentenceBoundary}; at that point the accumulated
 * text is optionally lowercased, tokenized, classified with {@code nb}, logged (label name plus
 * at most the first 400 characters of the text), and the accumulators are reset.
 *
 * <p>Note: this assumes that the data is split by documents. So if we choose to ignore the
 * document boundaries, we're in trouble!
 *
 * @param sentences the sentences to process, in document order
 * @param lowercaseData whether to lowercase the document text before tokenizing it
 * @param confidenceThreshold passed through to {@code nb.classify}
 * @throws Exception if {@code nb} or {@code map} is {@code null}
 */
public static void addDatasets(Vector<LinkedVector> sentences, boolean lowercaseData, double confidenceThreshold) throws Exception {
    if (nb == null || map == null)
        throw new Exception("Topic classifier not initialized!!!");
    // A StringBuilder avoids the quadratic cost of re-copying the whole document text on every
    // token, which is what repeated String concatenation in this loop did.
    StringBuilder documentBuffer = new StringBuilder();
    Vector<NEWord> docWords = new Vector<>();
    for (int sid = 0; sid < sentences.size(); sid++) {
        LinkedVector s = sentences.elementAt(sid);
        for (int i = 0; i < s.size(); i++) {
            NEWord word = (NEWord) s.get(i);
            documentBuffer.append(" ").append(word.originalForm).append(" ");
            docWords.addElement(word);
        }
        // NOTE(review): an empty sentence makes this index -1; presumably callers never pass
        // empty sentences here - confirm.
        if (((NEWord) s.get(s.size() - 1)).nextIgnoreSentenceBoundary == null) {
            // this is the last sentence in the document- move on!
            String documentText = documentBuffer.toString();
            if (lowercaseData)
                documentText = documentText.toLowerCase();
            Document doc = new Document(InFile.tokenize(documentText, "\n\t -.,?<>;':\"[]{}\\|`~!@#$%^&*()_+=-0987654321`~"), -1);
            int label = nb.classify(doc, confidenceThreshold);
            logger.info("*********************\n" + labelnames[label + 1] + "\n*********************\n" + documentText.substring(0, Math.min(documentText.length(), 400)));
            for (int i = 0; i < docWords.size(); i++) wordToTopicIdMap.put(docWords.elementAt(i), label);
            // start accumulating the next document from scratch
            documentBuffer.setLength(0);
            docWords = new Vector<>();
        }
    }
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Vector(java.util.Vector) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 7 with NEWord

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.

In class TaggedDataWriter, method toColumnsFormat:

/**
 * Renders all sentences of all documents in {@code data} as tab-separated column text.
 *
 * <p>Each word becomes one line holding its prediction for {@code labelType}, its index within
 * the sentence and its {@code form}, padded with constant placeholder columns. Every sentence is
 * followed by a blank line. A {@code -DOCSTART-} line is emitted before any sentence whose first
 * word has no {@code previousIgnoreSentenceBoundary}.
 *
 * @param data the documents to render
 * @param labelType selects which prediction of each word is written
 * @return the rendered text
 */
private static String toColumnsFormat(Data data, NEWord.LabelToLookAt labelType) {
    final int documentCount = data.documents.size();
    // rough pre-sizing: about 1000 characters per document
    StringBuilder out = new StringBuilder(documentCount * 1000);
    for (int docIndex = 0; docIndex < documentCount; docIndex++) {
        final int sentenceCount = data.documents.get(docIndex).sentences.size();
        for (int sentIndex = 0; sentIndex < sentenceCount; sentIndex++) {
            LinkedVector sentence = data.documents.get(docIndex).sentences.get(sentIndex);
            NEWord firstWord = (NEWord) sentence.get(0);
            // no predecessor across sentence boundaries: this sentence opens a document
            if (firstWord.previousIgnoreSentenceBoundary == null) {
                out.append("O	0	0	O	-X-	-DOCSTART-	x	x	0\n\n");
            }
            final int wordCount = sentence.size();
            for (int pos = 0; pos < wordCount; pos++) {
                NEWord word = (NEWord) sentence.get(pos);
                out.append(word.getPrediction(labelType));
                out.append("\t0\t");
                out.append(pos);
                out.append("\tO\tO\t");
                out.append(word.form);
                out.append("\tx\tx\t0\n");
            }
            // blank line terminates the sentence
            out.append("\n");
        }
    }
    return out.toString();
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 8 with NEWord

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.

In class TaggedDataReader, method connectSentenceBoundaries:

/**
 * Fills in {@code previousIgnoreSentenceBoundary} and {@code nextIgnoreSentenceBoundary} for
 * every word so that the words of all sentences form one chain.
 *
 * <p>Inside a sentence the links mirror {@code previous} / {@code next}. In addition, the first
 * word of each sentence is linked back to the last word of the closest preceding non-empty
 * sentence, and that word is linked forward to it. Empty sentences are skipped, so a neighbouring
 * empty sentence is never indexed into.
 *
 * @param sentences the sentences to connect, in order; modified in place through their words
 */
public static void connectSentenceBoundaries(ArrayList<LinkedVector> sentences) {
    // last word of the most recent non-empty sentence seen so far (null before the first one)
    NEWord previousLast = null;
    for (int i = 0; i < sentences.size(); i++) {
        LinkedVector sentence = sentences.get(i);
        int size = sentence.size();
        if (size == 0)
            continue;
        // within the sentence the boundary-ignoring links equal the ordinary ones
        for (int j = 0; j < size; j++) {
            NEWord w = (NEWord) sentence.get(j);
            w.previousIgnoreSentenceBoundary = (NEWord) w.previous;
            w.nextIgnoreSentenceBoundary = (NEWord) w.next;
        }
        // connecting sentence boundaries
        if (previousLast != null) {
            NEWord first = (NEWord) sentence.get(0);
            first.previousIgnoreSentenceBoundary = previousLast;
            previousLast.nextIgnoreSentenceBoundary = first;
        }
        previousLast = (NEWord) sentence.get(size - 1);
    }
}
Also used : NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 9 with NEWord

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.

In class BrownClusters, method printOovData:

/**
 * Gathers out-of-vocabulary information for {@code data} against each map in
 * {@code wordToPathByResource}.
 *
 * <p>All sentences are flattened into one list; the distinct word forms and their lowercased
 * variants are collected. Then, per resource, the forms missing from the resource are collected,
 * as well as the lowercased forms that are missing both as-is and lowercased. Within this method
 * the collected maps are neither returned nor written anywhere.
 *
 * @param data the documents whose words are inspected
 */
public final void printOovData(Data data) {
    // flatten every sentence of every document into a single list
    ArrayList<LinkedVector> allSentences = new ArrayList<>();
    for (int docIndex = 0; docIndex < data.documents.size(); docIndex++) {
        for (int sentIndex = 0; sentIndex < data.documents.get(docIndex).sentences.size(); sentIndex++) {
            allSentences.add(data.documents.get(docIndex).sentences.get(sentIndex));
        }
    }

    // distinct forms, case-sensitive and lowercased
    HashMap<String, Boolean> seenForms = new HashMap<>();
    HashMap<String, Boolean> seenFormsLowercased = new HashMap<>();
    for (LinkedVector sentence : allSentences) {
        for (int pos = 0; pos < sentence.size(); pos++) {
            String token = ((NEWord) sentence.get(pos)).form;
            seenForms.put(token, true);
            seenFormsLowercased.put(token.toLowerCase(), true);
        }
    }

    for (THashMap<String, String> resource : wordToPathByResource) {
        HashMap<String, Boolean> missingExact = new HashMap<>();
        HashMap<String, Boolean> missingEvenLowercased = new HashMap<>();
        for (LinkedVector sentence : allSentences) {
            for (int pos = 0; pos < sentence.size(); pos++) {
                String token = ((NEWord) sentence.get(pos)).form;
                if (resource.containsKey(token)) {
                    continue;
                }
                missingExact.put(token, true);
                // only count it as missing after lowercasing if that variant is absent too
                if (!resource.containsKey(token.toLowerCase())) {
                    missingEvenLowercased.put(token.toLowerCase(), true);
                }
            }
        }
    }
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) HashMap(java.util.HashMap) THashMap(gnu.trove.map.hash.THashMap) ArrayList(java.util.ArrayList) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 10 with NEWord

use of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp.

In class TitleTextNormalizer, method getNeighborhoodWordStatistics:

/**
 * Collects casing statistics from the neighbours of {@code word}: up to {@code WindowSize} words
 * before it and up to {@code WindowSize} words after it, following the
 * {@code previousIgnoreSentenceBoundary} / {@code nextIgnoreSentenceBoundary} links.
 *
 * <p>The first 2 parameters must be passed in filled; the last 2 are where the answers are kept.
 *
 * @param word the word whose neighbourhood is scanned
 * @param wordsInMixedCasedSentences only neighbours that are keys of this map are counted
 * @param uppercasedFormsInMixedCaseNonSentenceStart output: lowercased form mapped to the
 *        spellings seen for neighbours whose first character is uppercase
 * @param lowecasedForms output: lowercased forms of neighbours whose first character is lowercase
 */
public static void getNeighborhoodWordStatistics(NEWord word, HashMap<NEWord, Boolean> wordsInMixedCasedSentences, HashMap<String, CharacteristicWords> uppercasedFormsInMixedCaseNonSentenceStart, HashMap<String, Boolean> lowecasedForms) {
    // walk backwards from the word
    NEWord temp = word.previousIgnoreSentenceBoundary;
    for (int count = 0; temp != null && count < WindowSize; count++) {
        recordNeighborCasing(temp, wordsInMixedCasedSentences, uppercasedFormsInMixedCaseNonSentenceStart, lowecasedForms);
        temp = temp.previousIgnoreSentenceBoundary;
    }
    // walk forwards from the word
    temp = word.nextIgnoreSentenceBoundary;
    for (int count = 0; temp != null && count < WindowSize; count++) {
        recordNeighborCasing(temp, wordsInMixedCasedSentences, uppercasedFormsInMixedCaseNonSentenceStart, lowecasedForms);
        temp = temp.nextIgnoreSentenceBoundary;
    }
}

/** Records the casing of a single neighbour into the two output maps, if it qualifies. */
private static void recordNeighborCasing(NEWord neighbor, HashMap<NEWord, Boolean> wordsInMixedCasedSentences, HashMap<String, CharacteristicWords> uppercasedFormsInMixedCaseNonSentenceStart, HashMap<String, Boolean> lowecasedForms) {
    // we don't want to take into statistics words that begin sentences
    if (!wordsInMixedCasedSentences.containsKey(neighbor) || neighbor.previous == null)
        return;
    String w = neighbor.form;
    // an empty form has no first character to inspect
    if (w.isEmpty())
        return;
    String key = w.toLowerCase();
    char first = w.charAt(0);
    if (Character.isUpperCase(first)) {
        CharacteristicWords topSpellings = uppercasedFormsInMixedCaseNonSentenceStart.get(key);
        if (topSpellings == null) {
            // first uppercase sighting of this form: start a new spelling list
            topSpellings = new CharacteristicWords(5);
            uppercasedFormsInMixedCaseNonSentenceStart.put(key, topSpellings);
        }
        topSpellings.addElement(w, 1);
    }
    if (Character.isLowerCase(first))
        lowecasedForms.put(key, true);
}
Also used : CharacteristicWords(edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Aggregations

NEWord (edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)18 LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)12 NERDocument (edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument)3 ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 Vector (java.util.Vector)3 Word (edu.illinois.cs.cogcomp.lbjava.nlp.Word)2 CharacteristicWords (edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords)2 File (java.io.File)2 Sentence (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)1 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)1 Data (edu.illinois.cs.cogcomp.ner.LbjTagger.Data)1 NamedEntity (edu.illinois.cs.cogcomp.ner.LbjTagger.NamedEntity)1 ParametersForLbjCode (edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode)1 MyString (edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.MyString)1 OccurrenceCounter (edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.OccurrenceCounter)1 THashMap (gnu.trove.map.hash.THashMap)1 IOException (java.io.IOException)1 Hashtable (java.util.Hashtable)1