Search in sources :

Example 6 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class WordsToTokens method convert.

/**
 * Builds a {@link LinkedVector} of {@link Token}s from a {@link LinkedVector} of
 * {@link Word}s, producing one token per word in the same order.
 *
 * @param v A {@link LinkedVector} of {@link Word}s; may be <code>null</code>.
 * @return <code>null</code> when <code>v</code> is <code>null</code>, <code>v</code>
 *         itself when it is empty, and otherwise a new {@link LinkedVector} holding
 *         the {@link Token}s created for the input {@link Word}s.
 **/
public static LinkedVector convert(LinkedVector v) {
    if (v == null) {
        return null;
    }
    if (v.size() == 0) {
        return v;
    }

    // Walk the chain of words through their "next" links, starting at the first
    // element of the vector, and grow a parallel chain of tokens.
    Word current = (Word) v.get(0);
    Token tail = new Token(current, null, null);
    current = (Word) current.next;
    while (current != null) {
        Token appended = new Token(current, tail, null);
        tail.next = appended;
        tail = appended;
        current = (Word) current.next;
    }

    // The result is built from the last token of the chain.
    // NOTE(review): presumably LinkedVector follows the "previous" links back to
    // the head of the chain -- confirm against the LinkedVector constructor.
    return new LinkedVector(tail);
}
Also used : Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)

Example 7 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class Decoder method annotateBIO_AllLevelsWithTaggers.

/**
 * Annotates {@code data} with the level-1 tagger and, when a level-2 tagger is supplied
 * and the "PredictionsLevel1" feature is enabled, with the level-2 tagger as well.
 * Predictions of each level that ran are converted to the BIO encoding and pruned by
 * that level's minimum confidence.
 *
 * Use taggerLevel2=null if you want to use only one level of inference; in that case
 * (or when "PredictionsLevel1" is not among the features in use) the level-1 label of
 * every word is copied into its level-2 label.
 *
 * @param data the documents to annotate; their words are modified in place
 * @param taggerLevel1 the first-level tagger
 * @param taggerLevel2 the second-level tagger, or null for single-level inference
 * @throws Exception propagated from the decoding and re-encoding helpers
 */
protected static void annotateBIO_AllLevelsWithTaggers(Data data, NETaggerLevel1 taggerLevel1, NETaggerLevel2 taggerLevel2) throws Exception {
    clearPredictions(data);
    NETaggerLevel1.isTraining = false;
    NETaggerLevel2.isTraining = false;

    // ---- level 1: decode, convert to BIO, prune low-confidence predictions ----
    GreedyDecoding.annotateGreedy(data, taggerLevel1, 1);
    TextChunkRepresentationManager.changeChunkRepresentation(
            ParametersForLbjCode.currentParameters.taggingEncodingScheme,
            TextChunkRepresentationManager.EncodingScheme.BIO,
            data,
            NEWord.LabelToLookAt.PredictionLevel1Tagger);
    PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(
            data,
            ParametersForLbjCode.currentParameters.minConfidencePredictionsLevel1,
            NEWord.LabelToLookAt.PredictionLevel1Tagger);

    // The second level is driven by the 'PredictionsLevel1' feature
    // (previously this checked whether the features included 'PatternFeatures').
    boolean secondLevelEnabled =
            ParametersForLbjCode.currentParameters.featuresToUse.containsKey("PredictionsLevel1");

    if (taggerLevel2 == null || !secondLevelEnabled) {
        // Single-level inference: the level-2 label simply mirrors the level-1 label.
        for (int d = 0; d < data.documents.size(); d++) {
            ArrayList<LinkedVector> docSentences = data.documents.get(d).sentences;
            for (LinkedVector tokens : docSentences) {
                for (int pos = 0; pos < tokens.size(); pos++) {
                    NEWord word = (NEWord) tokens.get(pos);
                    word.neTypeLevel2 = word.neTypeLevel1;
                }
            }
        }
        return;
    }

    // ---- level 2: annotate with patterns, decode, prune, convert to BIO ----
    PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(
            data,
            0.0,
            NEWord.LabelToLookAt.PredictionLevel1Tagger);
    TwoLayerPredictionAggregationFeatures.setLevel1AggregationFeatures(data, false);
    GreedyDecoding.annotateGreedy(data, taggerLevel2, 2);
    PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(
            data,
            ParametersForLbjCode.currentParameters.minConfidencePredictionsLevel2,
            NEWord.LabelToLookAt.PredictionLevel2Tagger);
    TextChunkRepresentationManager.changeChunkRepresentation(
            ParametersForLbjCode.currentParameters.taggingEncodingScheme,
            TextChunkRepresentationManager.EncodingScheme.BIO,
            data,
            NEWord.LabelToLookAt.PredictionLevel2Tagger);
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 8 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class AnnotatedDocument method init.

/**
 * Derives {@code taggedLine} and {@code labels} from the level-2 predictions stored on
 * the words of {@code data}.
 *
 * {@code taggedLine} is the text of all sentences with every entity wrapped as
 * "[TYPE word word] "; {@code labels} maps each entity type to the surface strings of
 * the entities of that type, in order of appearance.
 */
public void init() {
    HashMap<String, ArrayList<String>> entitiesByTag = new HashMap<>();
    StringBuilder tagged = new StringBuilder();

    for (int docIdx = 0; docIdx < data.documents.size(); docIdx++) {
        ArrayList<LinkedVector> sentences = data.documents.get(docIdx).sentences;
        for (LinkedVector sentence : sentences) {
            int length = sentence.size();

            // Snapshot the level-2 label and the surface form of every word.
            String[] wordLabels = new String[length];
            String[] forms = new String[length];
            for (int k = 0; k < length; k++) {
                NEWord word = (NEWord) sentence.get(k);
                wordLabels[k] = word.neTypeLevel2;
                forms[k] = word.form;
            }

            boolean insideEntity = false;
            StringBuilder currentEntity = null;
            String currentTag = null;

            for (int k = 0; k < length; k++) {
                String label = wordLabels[k];

                // An entity starts on "B-X", or on an "I-X" whose predecessor does
                // not end with the same type X.
                boolean startsEntity = label.startsWith("B-");
                if (!startsEntity && k > 0 && label.startsWith("I-")) {
                    startsEntity = !wordLabels[k - 1].endsWith(label.substring(2));
                }
                if (startsEntity) {
                    currentTag = label.substring(2);
                    tagged.append("[").append(currentTag).append(" ");
                    currentEntity = new StringBuilder();
                    insideEntity = true;
                }

                tagged.append(forms[k]).append(" ");
                if (!insideEntity) {
                    continue;
                }
                currentEntity.append(forms[k]).append(" ");

                // The entity ends at the end of the sentence, or when the next label
                // is "B-...", is "O", or carries a type different from this one.
                boolean endsEntity;
                if (k == length - 1) {
                    endsEntity = true;
                } else {
                    String nextLabel = wordLabels[k + 1];
                    endsEntity = nextLabel.startsWith("B-")
                            || nextLabel.equals("O")
                            || (nextLabel.indexOf('-') > -1 && !label.endsWith(nextLabel.substring(2)));
                }
                if (!endsEntity) {
                    continue;
                }

                // Drop the space that follows the last word before closing the bracket.
                tagged = new StringBuilder(tagged.toString().trim());
                tagged.append("] ");
                insideEntity = false;

                String surface = currentEntity.toString().trim();
                ArrayList<String> bucket = entitiesByTag.get(currentTag);
                if (bucket == null) {
                    bucket = new ArrayList<>();
                    entitiesByTag.put(currentTag, bucket);
                }
                bucket.add(surface);
            }
        }
    }

    taggedLine = tagged.toString();
    labels = entitiesByTag;
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList)

Example 9 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class BracketFileReader method parseTextWithBrackets.

/**
 * Turns bracket-annotated text into an {@link NERDocument}.
 *
 * The annotated text is first split into bracket tokens and their labels, then the
 * plain token text is sentence-split and tokenized, and finally the bracket labels are
 * aligned onto the tokenized tokens so that each output word carries a label.
 *
 * @param annotatedText the bracket-annotated text; may include newlines
 * @param docname the name given to the resulting document
 * @return the document built from the labelled sentences; a document with no sentences
 *         when the text consists only of spaces, newlines and tabs
 * @throws Exception if the bracket tokens cannot be aligned with the tokenized tokens
 */
public static NERDocument parseTextWithBrackets(String annotatedText, String docname) throws Exception {
    String withoutBlanks = annotatedText.replace(" ", "").replace("\n", "").replace("\t", "");
    if (withoutBlanks.isEmpty()) {
        return new NERDocument(new ArrayList<LinkedVector>(), docname);
    }

    // Tokens and labels as they appear in the bracket annotation.
    // The tokens can include newlines!!!!
    Vector<String> rawTokens = new Vector<>();
    Vector<String> rawLabels = new Vector<>();
    parseBracketsAnnotatedText(annotatedText, rawLabels, rawTokens);

    StringBuilder rawText = new StringBuilder(rawTokens.size() * 20);
    for (String token : rawTokens) {
        rawText.append(token).append(" ");
    }

    // The tokens below will have no newline characters.
    Vector<Vector<String>> tokenizedSentences = PlainTextReader.sentenceSplitAndTokenizeText(rawText.toString());

    // Now the bracket tokens have to be aligned to the sentence split and tokenized
    // tokens. There are two issues to be careful with:
    // 1) the bracket tokens may have newline characters as individual tokens, the
    //    others will not;
    // 2) the tokenized/sentence split tokens may be bracket tokens broken into
    //    separate tokens.
    Vector<String> flatTokens = new Vector<>();
    for (Vector<String> sentenceTokens : tokenizedSentences) {
        flatTokens.addAll(sentenceTokens);
    }

    // One label per flat token, filled in by the alignment loop.
    Vector<String> flatLabels = new Vector<>();
    StringBuilder rawSoFar = new StringBuilder(rawTokens.size() * 20);
    StringBuilder tokenizedSoFar = new StringBuilder(flatTokens.size() * 20);
    int flatPos = 0;
    for (int rawPos = 0; rawPos < rawTokens.size(); rawPos++) {
        String rawToken = rawTokens.elementAt(rawPos);
        if (rawToken.equals("\n")) {
            // Newline tokens have no counterpart among the tokenized tokens.
            continue;
        }

        rawSoFar.append(" ").append(rawToken);
        String label = rawLabels.elementAt(rawPos);
        flatLabels.addElement(label);
        tokenizedSoFar.append(" ").append(flatTokens.elementAt(flatPos));
        flatPos++;

        // While the two texts differ, the bracket token was broken into several
        // tokenized tokens: glue the pieces on (without a separating space) and give
        // each piece a continuation label.
        while (flatPos < flatTokens.size() && !rawSoFar.toString().equals(tokenizedSoFar.toString())) {
            String pieceLabel = label.startsWith("B-") ? "I-" + label.substring(2) : label;
            flatLabels.addElement(pieceLabel);
            tokenizedSoFar.append(flatTokens.elementAt(flatPos));
            flatPos++;
        }

        if (!rawSoFar.toString().equals(tokenizedSoFar.toString())) {
            throw new Exception("Error aligning raw brackets tokens to token/sentence split tokens\nBrackets token text till now:\n" + rawSoFar + "\nTokenized text till now:\n" + tokenizedSoFar);
        }
    }

    // Ok, we're done, just building the output sentences.
    ArrayList<LinkedVector> result = new ArrayList<>();
    int cursor = 0;
    for (Vector<String> sentenceTokens : tokenizedSentences) {
        LinkedVector sentence = new LinkedVector();
        for (int k = 0; k < sentenceTokens.size(); k++, cursor++) {
            NEWord.addTokenToSentence(sentence, flatTokens.elementAt(cursor), flatLabels.elementAt(cursor));
        }
        result.add(sentence);
    }
    return new NERDocument(result, docname);
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) ArrayList(java.util.ArrayList) NERDocument(edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument) Vector(java.util.Vector) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)

Example 10 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class ExpressiveFeaturesAnnotator method annotate.

/**
 * Annotates the words of {@code data} with the features selected in
 * {@link ParametersForLbjCode#currentParameters}: optional title-case normalization,
 * gazetteer matches, context aggregation, and the predictions of the auxiliary models.
 *
 * Do not worry about the brown clusters and word embeddings, this stuff is added on the
 * fly in the .lbj feature generators; here only their OOV statistics are printed.
 *
 * A null {@code featuresToUse} is treated as "no optional features enabled". While an
 * auxiliary model runs it is temporarily installed as the current parameters; the
 * original parameters are restored afterwards even if the auxiliary model throws.
 *
 * @param data the documents to annotate; their words are modified in place
 * @throws Exception propagated from the annotators and the auxiliary-model decoding
 */
public static void annotate(Data data) throws Exception {
    /*
     * must be after the linkability has been initialized!!!
     */
    if (ParametersForLbjCode.currentParameters.normalizeTitleText) {
        // logger.info("Normalizing text case ...");
        TitleTextNormalizer.normalizeCase(data);
    }
    // Every featuresToUse lookup is guarded against a null map, consistently with the
    // gazetteer check further down.
    if (ParametersForLbjCode.currentParameters.featuresToUse != null
            && ParametersForLbjCode.currentParameters.featuresToUse.containsKey("BrownClusterPaths")) {
        // logger.info("Brown clusters OOV statistics:");
        BrownClusters.get().printOovData(data);
    }
    if (ParametersForLbjCode.currentParameters.featuresToUse != null
            && ParametersForLbjCode.currentParameters.featuresToUse.containsKey("WordEmbeddings")) {
        // logger.info("Word Embeddings OOV statistics:");
        WordEmbeddings.printOovData(data);
    }
    // annotating with Gazetteers;
    if (ParametersForLbjCode.currentParameters.featuresToUse != null
            && ParametersForLbjCode.currentParameters.featuresToUse.containsKey("GazetteersFeatures")) {
        // The three passes below are kept separate on purpose: all gazetteer lists are
        // created first, then all words are annotated, then all lists are sorted.
        // NOTE(review): presumably annotating one word can add entries to neighbouring
        // words' lists, which is why the passes must not be merged -- confirm.

        // first make sure the gazetteers arrays are inited for each word.
        for (int docid = 0; docid < data.documents.size(); docid++) {
            ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
            for (LinkedVector sentence : sentences) {
                for (int j = 0; j < sentence.size(); j++) {
                    NEWord ww = (NEWord) sentence.get(j);
                    if (ww.gazetteers == null) {
                        ww.gazetteers = new ArrayList<>();
                    }
                }
            }
        }
        Gazetteers gaz = GazetteersFactory.get();
        for (int docid = 0; docid < data.documents.size(); docid++) {
            ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
            for (LinkedVector vector : sentences) {
                for (int j = 0; j < vector.size(); j++) {
                    gaz.annotate((NEWord) vector.get(j));
                }
            }
        }
        // sort the gazetteers.
        for (int docid = 0; docid < data.documents.size(); docid++) {
            ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
            for (LinkedVector vector : sentences) {
                for (int j = 0; j < vector.size(); j++) {
                    Collections.sort(((NEWord) vector.get(j)).gazetteers);
                }
            }
        }
    }
    // annotating the nonlocal features;
    for (int docid = 0; docid < data.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
        for (LinkedVector vector : sentences) {
            for (int j = 0; j < vector.size(); j++) {
                ContextAggregation.annotate((NEWord) vector.get(j));
            }
        }
    }
    /*
     * Note that this piece of code must be the last!!! Here we are adding as features the
     * predictions of the aux models
     */
    for (int i = 0; i < ParametersForLbjCode.currentParameters.auxiliaryModels.size(); i++) {
        ParametersForLbjCode currentModel = ParametersForLbjCode.currentParameters;
        // Temporarily install the auxiliary model as the global current parameters.
        ParametersForLbjCode.currentParameters = currentModel.auxiliaryModels.elementAt(i);
        try {
            Decoder.annotateDataBIO(data, (NETaggerLevel1) ParametersForLbjCode.currentParameters.taggerLevel1, (NETaggerLevel2) ParametersForLbjCode.currentParameters.taggerLevel2);
            Vector<Data> v = new Vector<>();
            v.addElement(data);
            NETesterMultiDataset.printAllTestResultsAsOneDataset(v, false);
            TextChunkRepresentationManager.changeChunkRepresentation(TextChunkRepresentationManager.EncodingScheme.BIO, TextChunkRepresentationManager.EncodingScheme.BILOU, data, NEWord.LabelToLookAt.PredictionLevel1Tagger);
            TextChunkRepresentationManager.changeChunkRepresentation(TextChunkRepresentationManager.EncodingScheme.BIO, TextChunkRepresentationManager.EncodingScheme.BILOU, data, NEWord.LabelToLookAt.PredictionLevel2Tagger);
            // addAuxiliaryClassifierFeatures(data, "aux_model_" + i);
        } finally {
            // Always put the original parameters back, so a failing auxiliary model
            // cannot leave the global state pointing at the wrong model.
            ParametersForLbjCode.currentParameters = currentModel;
        }
    }
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) ArrayList(java.util.ArrayList) Vector(java.util.Vector) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)

Aggregations

LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)46 Word (edu.illinois.cs.cogcomp.lbjava.nlp.Word)9 NEWord (edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)9 ArrayList (java.util.ArrayList)8 Vector (java.util.Vector)8 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)3 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)3 SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter)3 NERDocument (edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument)3 File (java.io.File)3 HashMap (java.util.HashMap)3 Sentence (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)2 Token (edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token)2 OutFile (edu.illinois.cs.cogcomp.ner.IO.OutFile)2 Matcher (java.util.regex.Matcher)2 ChunkLabel (edu.illinois.cs.cogcomp.chunker.main.lbjava.ChunkLabel)1 Chunker (edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker)1 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)1 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)1 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)1