
Example 26 with LinkedVector

Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp: the class NETesterMultiDataset, method dumpFeaturesLabeledData.

/**
     * NB: assuming column format
     */
public static void dumpFeaturesLabeledData(String testDatapath, String outDatapath) throws Exception {
    FeaturesLevel1SharedWithLevel2 features1 = new FeaturesLevel1SharedWithLevel2();
    FeaturesLevel2 features2 = new FeaturesLevel2();
    NETaggerLevel1 taggerLevel1 = new NETaggerLevel1(ParametersForLbjCode.currentParameters.pathToModelFile + ".level1", ParametersForLbjCode.currentParameters.pathToModelFile + ".level1.lex");
    NETaggerLevel2 taggerLevel2 = new NETaggerLevel2(ParametersForLbjCode.currentParameters.pathToModelFile + ".level2", ParametersForLbjCode.currentParameters.pathToModelFile + ".level2.lex");
    File f = new File(testDatapath);
    Vector<String> inFiles = new Vector<>();
    Vector<String> outFiles = new Vector<>();
    if (f.isDirectory()) {
        String[] files = f.list();
        for (String file : files) if (!file.startsWith(".")) {
            inFiles.addElement(testDatapath + "/" + file);
            outFiles.addElement(outDatapath + "/" + file);
        }
    } else {
        inFiles.addElement(testDatapath);
        outFiles.addElement(outDatapath);
    }
    for (int fileId = 0; fileId < inFiles.size(); fileId++) {
        Data testData = new Data(inFiles.elementAt(fileId), inFiles.elementAt(fileId), "-c", new String[] {}, new String[] {});
        ExpressiveFeaturesAnnotator.annotate(testData);
        Decoder.annotateDataBIO(testData, taggerLevel1, taggerLevel2);
        OutFile out = new OutFile(outFiles.elementAt(fileId));
        for (int docid = 0; docid < testData.documents.size(); docid++) {
            ArrayList<LinkedVector> sentences = testData.documents.get(docid).sentences;
            for (LinkedVector sentence : sentences) {
                for (int j = 0; j < sentence.size(); j++) {
                    NEWord w = (NEWord) sentence.get(j);
                    out.print(w.neLabel + "\t" + w.form + "\t");
                    FeatureVector fv1 = features1.classify(w);
                    FeatureVector fv2 = features2.classify(w);
                    for (int k = 0; k < fv1.size(); k++) {
                        String s = fv1.getFeature(k).toString();
                        out.print(" " + s.substring(s.indexOf(':') + 1, s.length()));
                    }
                    for (int k = 0; k < fv2.size(); k++) {
                        String s = fv2.getFeature(k).toString();
                        out.print(" " + s.substring(s.indexOf(':') + 1, s.length()));
                    }
                    out.println("");
                }
                out.println("");
            }
        }
        out.close();
    }
}
Also used : FeatureVector(edu.illinois.cs.cogcomp.lbjava.classify.FeatureVector) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) File(java.io.File) OutFile(edu.illinois.cs.cogcomp.ner.IO.OutFile) Vector(java.util.Vector)
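
For orientation, here is a minimal, hypothetical invocation of the dump routine above. The file paths and the demo class are made up, the import path is assumed from the package layout seen elsewhere on this page, and the trained .level1/.level2 models referenced by ParametersForLbjCode.currentParameters must already be configured.

import edu.illinois.cs.cogcomp.ner.LbjTagger.NETesterMultiDataset;

public class DumpFeaturesDemo {
    public static void main(String[] args) throws Exception {
        // Hypothetical paths: a column-format test file in, a feature dump out.
        // Assumes ParametersForLbjCode.currentParameters already points at
        // trained level-1/level-2 model files (see the constructor calls above).
        String testFile = "Data/conll.test";
        String featureDump = "out/conll.test.features";
        NETesterMultiDataset.dumpFeaturesLabeledData(testFile, featureDump);
    }
}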

Example 27 with LinkedVector

Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp: the class NETesterMultiDataset, method reportPredictions.

public static void reportPredictions(Data dataSet, TestDiscrete resultsTokenLevel1, TestDiscrete resultsTokenLevel2, TestDiscrete resultsPhraseLevel1, TestDiscrete resultsPhraseLevel2, TestDiscrete resultsByBILOU, TestDiscrete resultsSegmentation) {
    NELabel labeler = new NELabel();
    Data dataCloneWithanonymizedLabels = new Data();
    for (int docid = 0; docid < dataSet.documents.size(); docid++) {
        ArrayList<LinkedVector> originalSentences = dataSet.documents.get(docid).sentences;
        ArrayList<LinkedVector> clonedSentences = new ArrayList<>();
        for (LinkedVector originalSentence : originalSentences) {
            LinkedVector sentence = new LinkedVector();
            for (int j = 0; j < originalSentence.size(); j++) {
                NEWord originalW = (NEWord) originalSentence.get(j);
                NEWord w = new NEWord(new Word(originalW.form), null, null);
                w.neLabel = originalW.neLabel;
                if (w.neLabel.indexOf('-') > -1 && dataSet.labelsToIgnoreForEvaluation.contains(w.neLabel.substring(2)))
                    w.neLabel = "O";
                w.neTypeLevel1 = originalW.neTypeLevel1;
                if (w.neLabel.indexOf('-') > -1 && dataSet.labelsToAnonymizeForEvaluation.contains(w.neLabel.substring(2))) {
                    w.neLabel = w.neLabel.substring(0, 2) + "ENTITY";
                // logger.info("replace!!!");
                }
                w.neTypeLevel1 = originalW.neTypeLevel1;
                if (w.neTypeLevel1.indexOf('-') > -1 && dataSet.labelsToIgnoreForEvaluation.contains(w.neTypeLevel1.substring(2)))
                    w.neTypeLevel1 = "O";
                if (w.neTypeLevel1.indexOf('-') > -1 && dataSet.labelsToAnonymizeForEvaluation.contains(w.neTypeLevel1.substring(2)))
                    w.neTypeLevel1 = w.neTypeLevel1.substring(0, 2) + "ENTITY";
                w.neTypeLevel2 = originalW.neTypeLevel2;
                if (w.neTypeLevel2.indexOf('-') > -1 && dataSet.labelsToIgnoreForEvaluation.contains(w.neTypeLevel2.substring(2)))
                    w.neTypeLevel2 = "O";
                if (w.neTypeLevel2.indexOf('-') > -1 && dataSet.labelsToAnonymizeForEvaluation.contains(w.neTypeLevel2.substring(2)))
                    w.neTypeLevel2 = w.neTypeLevel2.substring(0, 2) + "ENTITY";
                sentence.add(w);
            }
            clonedSentences.add(sentence);
        }
        NERDocument clonedDoc = new NERDocument(clonedSentences, "fake" + docid);
        dataCloneWithanonymizedLabels.documents.add(clonedDoc);
    }
    for (int docid = 0; docid < dataCloneWithanonymizedLabels.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = dataCloneWithanonymizedLabels.documents.get(docid).sentences;
        for (LinkedVector vector : sentences) {
            int N = vector.size();
            String[] predictionsLevel1 = new String[N], predictionsLevel2 = new String[N], labels = new String[N];
            for (int i = 0; i < N; ++i) {
                predictionsLevel1[i] = ((NEWord) vector.get(i)).neTypeLevel1;
                predictionsLevel2[i] = ((NEWord) vector.get(i)).neTypeLevel2;
                labels[i] = labeler.discreteValue(vector.get(i));
                String pLevel1 = predictionsLevel1[i];
                String pLevel2 = predictionsLevel2[i];
                if (pLevel1.indexOf('-') > -1)
                    pLevel1 = pLevel1.substring(2);
                if (pLevel2.indexOf('-') > -1)
                    pLevel2 = pLevel2.substring(2);
                String l = labels[i];
                if (l.indexOf('-') > -1)
                    l = l.substring(2);
                resultsTokenLevel1.reportPrediction(pLevel1, l);
                resultsTokenLevel2.reportPrediction(pLevel2, l);
            }
            // getting phrase level accuracy level1
            for (int i = 0; i < N; ++i) {
                String p = "O", l = "O";
                int pEnd = -1, lEnd = -1;
                if (predictionsLevel1[i].startsWith("B-") || predictionsLevel1[i].startsWith("I-") && (i == 0 || !predictionsLevel1[i - 1].endsWith(predictionsLevel1[i].substring(2)))) {
                    p = predictionsLevel1[i].substring(2);
                    pEnd = i;
                    while (pEnd + 1 < N && predictionsLevel1[pEnd + 1].equals("I-" + p)) ++pEnd;
                }
                if (labels[i].startsWith("B-")) {
                    l = labels[i].substring(2);
                    lEnd = i;
                    while (lEnd + 1 < N && labels[lEnd + 1].equals("I-" + l)) ++lEnd;
                }
                if (!p.equals("O") || !l.equals("O")) {
                    if (pEnd == lEnd)
                        resultsPhraseLevel1.reportPrediction(p, l);
                    else {
                        if (!p.equals("O"))
                            resultsPhraseLevel1.reportPrediction(p, "O");
                        if (!l.equals("O"))
                            resultsPhraseLevel1.reportPrediction("O", l);
                    }
                }
            }
            // getting phrase level accuracy level2
            for (int i = 0; i < N; ++i) {
                String p = "O", l = "O";
                int pEnd = -1, lEnd = -1;
                if (predictionsLevel2[i].startsWith("B-") || predictionsLevel2[i].startsWith("I-") && (i == 0 || !predictionsLevel2[i - 1].endsWith(predictionsLevel2[i].substring(2)))) {
                    p = predictionsLevel2[i].substring(2);
                    pEnd = i;
                    while (pEnd + 1 < N && predictionsLevel2[pEnd + 1].equals("I-" + p)) ++pEnd;
                }
                if (labels[i].startsWith("B-")) {
                    l = labels[i].substring(2);
                    lEnd = i;
                    while (lEnd + 1 < N && labels[lEnd + 1].equals("I-" + l)) ++lEnd;
                }
                if (!p.equals("O") || !l.equals("O")) {
                    if (pEnd == lEnd)
                        resultsPhraseLevel2.reportPrediction(p, l);
                    else {
                        if (!p.equals("O"))
                            resultsPhraseLevel2.reportPrediction(p, "O");
                        if (!l.equals("O"))
                            resultsPhraseLevel2.reportPrediction("O", l);
                    }
                }
            }
        }
    }
    TextChunkRepresentationManager.changeChunkRepresentation(TextChunkRepresentationManager.EncodingScheme.BIO, TextChunkRepresentationManager.EncodingScheme.BILOU, dataCloneWithanonymizedLabels, NEWord.LabelToLookAt.GoldLabel);
    TextChunkRepresentationManager.changeChunkRepresentation(TextChunkRepresentationManager.EncodingScheme.BIO, TextChunkRepresentationManager.EncodingScheme.BILOU, dataCloneWithanonymizedLabels, NEWord.LabelToLookAt.PredictionLevel2Tagger);
    for (int docid = 0; docid < dataCloneWithanonymizedLabels.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = dataCloneWithanonymizedLabels.documents.get(docid).sentences;
        for (LinkedVector sentence : sentences) for (int j = 0; j < sentence.size(); j++) {
            NEWord w = (NEWord) sentence.get(j);
            String bracketTypePrediction = w.neTypeLevel2;
            if (bracketTypePrediction.indexOf('-') > 0)
                bracketTypePrediction = bracketTypePrediction.substring(0, 1);
            String bracketTypeLabel = w.neLabel;
            if (bracketTypeLabel.indexOf('-') > 0)
                bracketTypeLabel = bracketTypeLabel.substring(0, 1);
            resultsByBILOU.reportPrediction(w.neTypeLevel2, w.neLabel);
            resultsSegmentation.reportPrediction(bracketTypePrediction, bracketTypeLabel);
        }
    }
}
Also used : Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) ArrayList(java.util.ArrayList)
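
A sketch of how the six TestDiscrete accumulators expected by reportPredictions might be wired up and read back. The evaluate method and annotatedData are hypothetical names, and the addNull/printPerformance calls are assumptions about LBJava's TestDiscrete API rather than something shown in this example.

import edu.illinois.cs.cogcomp.lbjava.classify.TestDiscrete;
import edu.illinois.cs.cogcomp.ner.LbjTagger.Data;
import edu.illinois.cs.cogcomp.ner.LbjTagger.NETesterMultiDataset;

public class ReportPredictionsDemo {
    // annotatedData is assumed to have been run through
    // ExpressiveFeaturesAnnotator.annotate(...) and Decoder.annotateDataBIO(...)
    // so that neTypeLevel1/neTypeLevel2 are populated on every NEWord.
    static void evaluate(Data annotatedData) {
        TestDiscrete tokenL1 = new TestDiscrete(), tokenL2 = new TestDiscrete();
        TestDiscrete phraseL1 = new TestDiscrete(), phraseL2 = new TestDiscrete();
        TestDiscrete byBILOU = new TestDiscrete(), segmentation = new TestDiscrete();
        // Treat "O" as the null label so it does not inflate the scores
        // (addNull is an assumed TestDiscrete method).
        phraseL1.addNull("O");
        phraseL2.addNull("O");
        NETesterMultiDataset.reportPredictions(annotatedData, tokenL1, tokenL2,
                phraseL1, phraseL2, byBILOU, segmentation);
        // printPerformance is assumed to print per-label precision/recall/F1.
        phraseL2.printPerformance(System.out);
    }
}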

Example 28 with LinkedVector

Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp: the class NEWord, method splitWord.

/*
     * Used for some tokenization schemes.
     */
private static Vector<NEWord> splitWord(NEWord word) {
    String[] sentence = { word.form + " " };
    Parser parser = new WordSplitter(new SentenceSplitter(sentence));
    LinkedVector words = (LinkedVector) parser.next();
    Vector<NEWord> res = new Vector<>();
    if (words == null) {
        res.add(word);
        return res;
    }
    String label = word.neLabel;
    for (int i = 0; i < words.size(); i++) {
        if (label.contains("B-") && i > 0)
            label = "I-" + label.substring(2);
        NEWord w = new NEWord(new Word(((Word) words.get(i)).form), null, label);
        res.addElement(w);
    }
    return res;
}
Also used : Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) SentenceSplitter(edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter) WordSplitter(edu.illinois.cs.cogcomp.lbjava.nlp.WordSplitter) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Parser(edu.illinois.cs.cogcomp.lbjava.parse.Parser) Vector(java.util.Vector)
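
The splitWord helper above is private, but the tokenization pipeline it relies on can be exercised directly. The sketch below mirrors the SentenceSplitter/WordSplitter usage from the example; the input string and demo class are made up.

import edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter;
import edu.illinois.cs.cogcomp.lbjava.nlp.Word;
import edu.illinois.cs.cogcomp.lbjava.nlp.WordSplitter;
import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector;
import edu.illinois.cs.cogcomp.lbjava.parse.Parser;

public class WordSplitDemo {
    public static void main(String[] args) {
        // Each call to parser.next() yields one sentence as a LinkedVector of Words.
        String[] raw = { "a score of 28-7 in overtime " };
        Parser parser = new WordSplitter(new SentenceSplitter(raw));
        LinkedVector words = (LinkedVector) parser.next();
        if (words != null) {
            for (int i = 0; i < words.size(); i++) {
                Word w = (Word) words.get(i);
                System.out.println(w.form);
            }
        }
    }
}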

Example 29 with LinkedVector

Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp: the class NERAnnotator, method addView.

/**
     * Generates the view representing the list of extracted entities and adds it to the
     * {@link TextAnnotation}.
     */
@Override
public void addView(TextAnnotation ta) {
    // convert this data structure into one the NER package can deal with.
    ArrayList<LinkedVector> sentences = new ArrayList<>();
    String[] tokens = ta.getTokens();
    int[] tokenindices = new int[tokens.length];
    int tokenIndex = 0;
    int neWordIndex = 0;
    for (int i = 0; i < ta.getNumberOfSentences(); i++) {
        Sentence sentence = ta.getSentence(i);
        String[] wtoks = sentence.getTokens();
        LinkedVector words = new LinkedVector();
        for (String w : wtoks) {
            if (w.length() > 0) {
                NEWord.addTokenToSentence(words, w, "unlabeled");
                tokenindices[neWordIndex] = tokenIndex;
                neWordIndex++;
            } else {
                logger.error("Bad (zero length) token.");
            }
            tokenIndex++;
        }
        if (words.size() > 0)
            sentences.add(words);
    }
    // Do the annotation.
    Data data = new Data(new NERDocument(sentences, "input"));
    try {
        ExpressiveFeaturesAnnotator.annotate(data);
        Decoder.annotateDataBIO(data, t1, t2);
    } catch (Exception e) {
        logger.error("Cannot annotate the text, the exception was: ", e);
        return;
    }
    // now we have the parsed entities, construct the view object.
    ArrayList<LinkedVector> nerSentences = data.documents.get(0).sentences;
    SpanLabelView nerView = new SpanLabelView(getViewName(), ta);
    // the data always has a single document
    // each LinkedVector in data corresponds to a sentence.
    int tokenoffset = 0;
    for (LinkedVector vector : nerSentences) {
        boolean open = false;
        // there should be a 1:1 mapping btw sentence tokens in record and words/predictions
        // from NER.
        int startIndex = -1;
        String label = null;
        for (int j = 0; j < vector.size(); j++, tokenoffset++) {
            NEWord neWord = (NEWord) (vector.get(j));
            String prediction = neWord.neTypeLevel2;
            // inefficient, use enums, or nominalized indexes for this sort of thing.
            if (prediction.startsWith("B-")) {
                startIndex = tokenoffset;
                label = prediction.substring(2);
                open = true;
            } else if (j > 0) {
                String previous_prediction = ((NEWord) vector.get(j - 1)).neTypeLevel2;
                if (prediction.startsWith("I-") && (!previous_prediction.endsWith(prediction.substring(2)))) {
                    startIndex = tokenoffset;
                    label = prediction.substring(2);
                    open = true;
                }
            }
            if (open) {
                boolean close = false;
                if (j == vector.size() - 1) {
                    close = true;
                } else {
                    String next_prediction = ((NEWord) vector.get(j + 1)).neTypeLevel2;
                    if (next_prediction.startsWith("B-"))
                        close = true;
                    if (next_prediction.equals("O"))
                        close = true;
                    if (next_prediction.indexOf('-') > -1 && (!prediction.endsWith(next_prediction.substring(2))))
                        close = true;
                }
                if (close) {
                    int s = tokenindices[startIndex];
                    /**
                         * MS: fixed bug. Originally, e was set using tokenindices[tokenoffset], but
                         * tokenoffset can reach tokens.length, which exceeds the array bounds.
                         * Constituent constructor requires one-past-the-end token indexing,
                         * requiring e > s. Hence the complicated setting of endIndex/e below.
                         */
                    int endIndex = Math.min(tokenoffset + 1, tokens.length - 1);
                    int e = tokenindices[endIndex];
                    if (e <= s)
                        e = s + 1;
                    nerView.addSpanLabel(s, e, label, 1d);
                    open = false;
                }
            }
        }
    }
    ta.addView(viewName, nerView);
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) ArrayList(java.util.ArrayList) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) IOException(java.io.IOException) Sentence(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)
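
Once addView has run, the extracted entities can be read back from the resulting view. The sketch below uses standard cogcomp-core accessors (getView, getConstituents, getLabel, getStartSpan, getEndSpan); getSurfaceForm and the method/parameter names are assumptions, not part of the example above.

import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;

public class NerViewDemo {
    // viewName would typically be the value returned by NERAnnotator.getViewName().
    static void printEntities(TextAnnotation ta, String viewName) {
        SpanLabelView ner = (SpanLabelView) ta.getView(viewName);
        for (Constituent c : ner.getConstituents()) {
            // Token span is [start, end), matching the one-past-the-end
            // convention noted in the comment inside addView above.
            System.out.println(c.getLabel() + "\t[" + c.getStartSpan() + ", "
                    + c.getEndSpan() + ")\t" + c.getSurfaceForm());
        }
    }
}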

Example 30 with LinkedVector

Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp: the class TaggedDataWriter, method toBracketsFormat.

/*
     * labelType=NEWord.GoldLabel/NEWord.PredictionLevel2Tagger/NEWord.PredictionLevel1Tagger
     * 
     * Note : the only reason this function is public is because we want to be able to use it in the
     * demo and insert html tags into the string
     */
public static String toBracketsFormat(Data data, NEWord.LabelToLookAt labelType) {
    StringBuilder res = new StringBuilder(data.documents.size() * 1000);
    for (int did = 0; did < data.documents.size(); did++) {
        for (int i = 0; i < data.documents.get(did).sentences.size(); i++) {
            LinkedVector vector = data.documents.get(did).sentences.get(i);
            boolean open = false;
            String[] predictions = new String[vector.size()];
            String[] words = new String[vector.size()];
            for (int j = 0; j < vector.size(); j++) {
                predictions[j] = null;
                if (labelType == NEWord.LabelToLookAt.PredictionLevel2Tagger)
                    predictions[j] = ((NEWord) vector.get(j)).neTypeLevel2;
                if (labelType == NEWord.LabelToLookAt.PredictionLevel1Tagger)
                    predictions[j] = ((NEWord) vector.get(j)).neTypeLevel1;
                if (labelType == NEWord.LabelToLookAt.GoldLabel)
                    predictions[j] = ((NEWord) vector.get(j)).neLabel;
                words[j] = ((NEWord) vector.get(j)).form;
            }
            for (int j = 0; j < vector.size(); j++) {
                if (predictions[j].startsWith("B-") || (j > 0 && predictions[j].startsWith("I-") && (!predictions[j - 1].endsWith(predictions[j].substring(2))))) {
                    res.append("[").append(predictions[j].substring(2)).append(" ");
                    open = true;
                }
                res.append(words[j]).append(" ");
                if (open) {
                    boolean close = false;
                    if (j == vector.size() - 1) {
                        close = true;
                    } else {
                        if (predictions[j + 1].startsWith("B-"))
                            close = true;
                        if (predictions[j + 1].equals("O"))
                            close = true;
                        if (predictions[j + 1].indexOf('-') > -1 && (!predictions[j].endsWith(predictions[j + 1].substring(2))))
                            close = true;
                    }
                    if (close) {
                        res.append(" ] ");
                        open = false;
                    }
                }
            }
            res.append("\n");
        }
    }
    return res.toString();
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)
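
A short, hypothetical call site for toBracketsFormat: it assumes annotatedData has already been tagged so that level-2 predictions are available, and that TaggedDataWriter lives in the same LbjTagger package as the other NER classes listed on this page.

import edu.illinois.cs.cogcomp.ner.LbjTagger.Data;
import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord;
import edu.illinois.cs.cogcomp.ner.LbjTagger.TaggedDataWriter;

public class BracketsDemo {
    static void printBrackets(Data annotatedData) {
        // Render the level-2 predictions as "[TYPE token token ... ]" brackets.
        String brackets = TaggedDataWriter.toBracketsFormat(
                annotatedData, NEWord.LabelToLookAt.PredictionLevel2Tagger);
        System.out.println(brackets);
    }
}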

Aggregations

LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector): 46 usages
Word (edu.illinois.cs.cogcomp.lbjava.nlp.Word): 9 usages
NEWord (edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord): 9 usages
ArrayList (java.util.ArrayList): 8 usages
Vector (java.util.Vector): 8 usages
IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair): 3 usages
Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence): 3 usages
SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter): 3 usages
NERDocument (edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument): 3 usages
File (java.io.File): 3 usages
HashMap (java.util.HashMap): 3 usages
Sentence (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence): 2 usages
Token (edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token): 2 usages
OutFile (edu.illinois.cs.cogcomp.ner.IO.OutFile): 2 usages
Matcher (java.util.regex.Matcher): 2 usages
ChunkLabel (edu.illinois.cs.cogcomp.chunker.main.lbjava.ChunkLabel): 1 usage
Chunker (edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker): 1 usage
Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair): 1 usage
SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView): 1 usage
TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation): 1 usage