Search in sources :

Example 6 with Word

use of edu.illinois.cs.cogcomp.lbjava.nlp.Word in project cogcomp-nlp by CogComp.

the class LBJavaUtils method recordToLBJTokens.

/**
     * Builds the flat list of LBJ tokens for a record so it can be handed to LBJ classifiers.
     * Words and tokens are chained through their previous/next links within one sentence only;
     * the chain is restarted at every sentence. Double quotes are rewritten to alternating
     * {@code ``} / {@code ''} (the alternation also restarts with each sentence) and brackets to
     * their {@code -LRB-} style escape forms. If the record has a non-empty POS view, its labels
     * are copied onto the words, consumed one per token in order across the whole record.
     *
     * @param record the annotated text; its TOKENS and SENTENCE views are read
     * @return all tokens of the record, in order
     */
public static List<Token> recordToLBJTokens(TextAnnotation record) {
    List<Token> result = new LinkedList<>();
    List<List<String>> sentences = tokensAsStrings(record.getView(ViewNames.TOKENS).getConstituents(), record.getView(ViewNames.SENTENCE).getConstituents(), record.getText());
    List<Constituent> posTags = record.hasView(ViewNames.POS) ? record.getView(ViewNames.POS).getConstituents() : null;
    boolean havePosTags = posTags != null && !posTags.isEmpty();
    // Position of the next unused POS constituent; it keeps advancing across sentences.
    int nextTag = 0;
    for (List<String> sentence : sentences) {
        boolean nextQuoteOpens = true;
        Word previousWord = null;
        Token previousToken = null;
        for (String rawToken : sentence) {
            String form;
            switch (rawToken) {
                case "\"":
                    form = nextQuoteOpens ? "``" : "''";
                    nextQuoteOpens = !nextQuoteOpens;
                    break;
                case "(":
                    form = "-LRB-";
                    break;
                case ")":
                    form = "-RRB-";
                    break;
                case "{":
                    form = "-LCB-";
                    break;
                case "}":
                    form = "-RCB-";
                    break;
                case "[":
                    form = "-LSB-";
                    break;
                case "]":
                    form = "-RSB-";
                    break;
                default:
                    form = rawToken;
                    break;
            }
            Word word = new Word(form, previousWord);
            if (havePosTags) {
                word.partOfSpeech = posTags.get(nextTag++).getLabel();
            }
            Token current = new Token(word, previousToken, "");
            result.add(current);
            // Complete the forward link of the preceding token in this sentence.
            if (previousToken != null) {
                previousToken.next = current;
            }
            previousWord = word;
            previousToken = current;
        }
    }
    return result;
}
Also used : Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) Token(edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token) List(java.util.List) LinkedList(java.util.LinkedList) ArrayList(java.util.ArrayList) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 7 with Word

use of edu.illinois.cs.cogcomp.lbjava.nlp.Word in project cogcomp-nlp by CogComp.

the class MikheevLearner method learn.

/**
     * Updates the suffix statistics with one training example.
     * <p>
     * The example contributes only if its form has at least five characters and its last three
     * characters are all letters. The counts go into one of three tables: capitalized words with
     * no previous word, other capitalized words, and non-capitalized words (of which those
     * containing a hyphen are skipped). The lower-cased three-letter suffix is always counted;
     * the four-character suffix is counted as well when the lower-cased form has at least six
     * characters and the fourth character from the end is a letter.
     *
     * @param example An example of the desired learned classifier's behavior; it is cast to
     *        {@code Word} once the form has passed the length and letter checks.
     **/
public void learn(Object example) {
    String form = extractor.discreteValue(example);
    String label = labeler.discreteValue(example);
    int length = form.length();
    if (length < 5) {
        return;
    }
    // Every one of the last three characters has to be a letter.
    for (int i = length - 3; i < length; ++i) {
        if (!Character.isLetter(form.charAt(i))) {
            return;
        }
    }
    Word word = (Word) example;
    HashMap<String, TreeMap<String, Integer>> counts;
    if (word.capitalized) {
        if (word.previous == null) {
            counts = firstCapitalized;
        } else {
            counts = notFirstCapitalized;
        }
    } else if (form.contains("-")) {
        // Hyphenated, non-capitalized forms are not counted.
        return;
    } else {
        counts = table;
    }
    String lowered = form.toLowerCase();
    int loweredLength = lowered.length();
    increment(counts, lowered.substring(loweredLength - 3), label);
    boolean longEnough = loweredLength >= 6;
    if (longEnough && Character.isLetter(lowered.charAt(loweredLength - 4))) {
        increment(counts, lowered.substring(loweredLength - 4), label);
    }
}
Also used : Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word)

Example 8 with Word

use of edu.illinois.cs.cogcomp.lbjava.nlp.Word in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testWhitespaceBehavior.

/**
     * Exercises the splitter and the stateful tokenizer on text in which every XML tag has been
     * overwritten by the same number of spaces, so that character offsets into the cleaned text
     * line up with offsets into the original file. Tokens whose form is exactly "Sun" must carry
     * offsets that coincide with a regex match of {@code \w*Sun\w*} in the cleaned text.
     */
@Test
public void testWhitespaceBehavior() {
    String rawText = null;
    try {
        rawText = LineIO.slurp(INFILE);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    // Replace each XML tag by a run of spaces of identical length.
    Matcher tagMatcher = Pattern.compile("(<[^>\\r\\n]+>)").matcher(rawText);
    StringBuilder blanked = new StringBuilder();
    int copiedUpTo = 0;
    while (tagMatcher.find()) {
        int tagStart = tagMatcher.start();
        int tagEnd = tagMatcher.end();
        blanked.append(rawText.substring(copiedUpTo, tagStart));
        int padding = tagEnd - tagStart;
        while (padding > 0) {
            blanked.append(" ");
            --padding;
        }
        copiedUpTo = tagEnd;
    }
    blanked.append(rawText.substring(copiedUpTo));
    String cleanText = blanked.toString();

    // Reference spans: every regex occurrence of a word containing "Sun".
    Set<IntPair> sunSpans = new HashSet<>();
    Matcher sunMatcher = Pattern.compile("\\w*Sun\\w*").matcher(cleanText);
    while (sunMatcher.find()) {
        sunSpans.add(new IntPair(sunMatcher.start(), sunMatcher.end()));
    }

    // Offsets reported by the LBJava sentence splitter for the first sentence.
    SentenceSplitter splitter = new SentenceSplitter(new String[] { cleanText });
    Sentence firstSentence = splitter.splitAll()[0];
    LinkedVector words = firstSentence.wordSplit();
    // NOTE(review): this loop reads element 0 on every pass, so only the first word of the
    // sentence is ever checked. Index i was presumably intended; before switching, confirm
    // that Word.start/Word.end follow the same (start, exclusive end) convention as the
    // regex spans, otherwise the assertion below would start failing.
    for (int i = 0; i < words.size(); ++i) {
        Word firstWord = (Word) words.get(0);
        if ("Sun".equals(firstWord.form)) {
            assertTrue(sunSpans.contains(new IntPair(firstWord.start, firstWord.end)));
        }
    }

    // Offsets reported by the stateful tokenizer over the whole cleaned text.
    StatefulTokenizer statefulTokenizer = new StatefulTokenizer();
    Tokenizer.Tokenization tokenInfo = statefulTokenizer.tokenizeTextSpan(cleanText);
    assertEquals(tokenInfo.getCharacterOffsets().length, tokenInfo.getTokens().length);
    int tokenCount = tokenInfo.getTokens().length;
    for (int i = 0; i < tokenCount; ++i) {
        String tok = tokenInfo.getTokens()[i];
        if (!tok.equals("Sun")) {
            continue;
        }
        IntPair span = tokenInfo.getCharacterOffsets()[i];
        boolean spanIsKnown = sunSpans.contains(span);
        if (!spanIsKnown) {
            // Print what the offsets actually point at before the assertion fails.
            String origTextSubstring = cleanText.substring(span.getFirst(), span.getSecond());
            System.err.println("ERROR: tokenizer has form '" + tok + "', but offsets refer to substring '" + origTextSubstring + "'.");
        }
        assertTrue(spanIsKnown);
    }
    TextAnnotation statefulTa = new TextAnnotation("test", "test", cleanText, tokenInfo.getCharacterOffsets(), tokenInfo.getTokens(), tokenInfo.getSentenceEndTokenIndexes());
    assertNotNull(statefulTa);
}
Also used : Pattern(java.util.regex.Pattern) Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) SentenceSplitter(edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Matcher(java.util.regex.Matcher) FileNotFoundException(java.io.FileNotFoundException) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Sentence(edu.illinois.cs.cogcomp.lbjava.nlp.Sentence) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 9 with Word

use of edu.illinois.cs.cogcomp.lbjava.nlp.Word in project cogcomp-nlp by CogComp.

the class NETesterMultiDataset method reportPredictions.

/**
     * Scores the predictions stored in a data set against its gold labels and feeds the outcomes
     * into the supplied result collectors.
     * <p>
     * The data set is first copied word by word; in the copy, the gold label and both prediction
     * levels are masked according to {@code dataSet.labelsToIgnoreForEvaluation} (replaced by
     * "O") and {@code dataSet.labelsToAnonymizeForEvaluation} (type replaced by "ENTITY"). The
     * copy is then scored per sentence at token level and at phrase level for both prediction
     * levels. Finally the copy's gold labels and level-2 predictions are converted from BIO to
     * BILOU and reported both with full labels and with the bracket character only.
     *
     * @param dataSet the documents to score; it is read but its words are not modified
     * @param resultsTokenLevel1 receives token-level outcomes for level-1 predictions
     * @param resultsTokenLevel2 receives token-level outcomes for level-2 predictions
     * @param resultsPhraseLevel1 receives phrase-level outcomes for level-1 predictions
     * @param resultsPhraseLevel2 receives phrase-level outcomes for level-2 predictions
     * @param resultsByBILOU receives the BILOU-encoded level-2 prediction / gold label pairs
     * @param resultsSegmentation receives the pairs reduced to their leading bracket character
     */
public static void reportPredictions(Data dataSet, TestDiscrete resultsTokenLevel1, TestDiscrete resultsTokenLevel2, TestDiscrete resultsPhraseLevel1, TestDiscrete resultsPhraseLevel2, TestDiscrete resultsByBILOU, TestDiscrete resultsSegmentation) {
    NELabel labeler = new NELabel();
    Data dataCloneWithanonymizedLabels = new Data();
    for (int docid = 0; docid < dataSet.documents.size(); docid++) {
        ArrayList<LinkedVector> originalSentences = dataSet.documents.get(docid).sentences;
        ArrayList<LinkedVector> clonedSentences = new ArrayList<>();
        for (LinkedVector originalSentence : originalSentences) {
            LinkedVector sentence = new LinkedVector();
            for (int j = 0; j < originalSentence.size(); j++) {
                NEWord originalW = (NEWord) originalSentence.get(j);
                NEWord w = new NEWord(new Word(originalW.form), null, null);
                w.neLabel = maskLabelForEvaluation(originalW.neLabel, dataSet);
                w.neTypeLevel1 = maskLabelForEvaluation(originalW.neTypeLevel1, dataSet);
                w.neTypeLevel2 = maskLabelForEvaluation(originalW.neTypeLevel2, dataSet);
                sentence.add(w);
            }
            clonedSentences.add(sentence);
        }
        NERDocument clonedDoc = new NERDocument(clonedSentences, "fake" + docid);
        dataCloneWithanonymizedLabels.documents.add(clonedDoc);
    }
    for (int docid = 0; docid < dataCloneWithanonymizedLabels.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = dataCloneWithanonymizedLabels.documents.get(docid).sentences;
        for (LinkedVector vector : sentences) {
            int N = vector.size();
            String[] predictionsLevel1 = new String[N];
            String[] predictionsLevel2 = new String[N];
            String[] labels = new String[N];
            // Token level: compare entity types with the two-character prefix stripped.
            for (int i = 0; i < N; ++i) {
                NEWord w = (NEWord) vector.get(i);
                predictionsLevel1[i] = w.neTypeLevel1;
                predictionsLevel2[i] = w.neTypeLevel2;
                labels[i] = labeler.discreteValue(vector.get(i));
                String l = stripPrefix(labels[i]);
                resultsTokenLevel1.reportPrediction(stripPrefix(predictionsLevel1[i]), l);
                resultsTokenLevel2.reportPrediction(stripPrefix(predictionsLevel2[i]), l);
            }
            // Phrase level: level 1 first, then level 2, as separate passes over the sentence.
            reportPhraseLevelPredictions(predictionsLevel1, labels, resultsPhraseLevel1);
            reportPhraseLevelPredictions(predictionsLevel2, labels, resultsPhraseLevel2);
        }
    }
    TextChunkRepresentationManager.changeChunkRepresentation(TextChunkRepresentationManager.EncodingScheme.BIO, TextChunkRepresentationManager.EncodingScheme.BILOU, dataCloneWithanonymizedLabels, NEWord.LabelToLookAt.GoldLabel);
    TextChunkRepresentationManager.changeChunkRepresentation(TextChunkRepresentationManager.EncodingScheme.BIO, TextChunkRepresentationManager.EncodingScheme.BILOU, dataCloneWithanonymizedLabels, NEWord.LabelToLookAt.PredictionLevel2Tagger);
    for (int docid = 0; docid < dataCloneWithanonymizedLabels.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = dataCloneWithanonymizedLabels.documents.get(docid).sentences;
        for (LinkedVector sentence : sentences) {
            for (int j = 0; j < sentence.size(); j++) {
                NEWord w = (NEWord) sentence.get(j);
                // Keep only the leading bracket character when the label has a "-" past index 0.
                String bracketTypePrediction = w.neTypeLevel2;
                if (bracketTypePrediction.indexOf('-') > 0)
                    bracketTypePrediction = bracketTypePrediction.substring(0, 1);
                String bracketTypeLabel = w.neLabel;
                if (bracketTypeLabel.indexOf('-') > 0)
                    bracketTypeLabel = bracketTypeLabel.substring(0, 1);
                resultsByBILOU.reportPrediction(w.neTypeLevel2, w.neLabel);
                resultsSegmentation.reportPrediction(bracketTypePrediction, bracketTypeLabel);
            }
        }
    }
}

/**
     * Applies the data set's evaluation masks to one label: a label whose type (the text after
     * the two-character prefix) is in {@code labelsToIgnoreForEvaluation} becomes "O"; otherwise
     * a label whose type is in {@code labelsToAnonymizeForEvaluation} keeps its prefix and gets
     * the type "ENTITY". Labels without a '-' are returned unchanged.
     */
private static String maskLabelForEvaluation(String label, Data dataSet) {
    String masked = label;
    if (masked.indexOf('-') > -1 && dataSet.labelsToIgnoreForEvaluation.contains(masked.substring(2)))
        masked = "O";
    if (masked.indexOf('-') > -1 && dataSet.labelsToAnonymizeForEvaluation.contains(masked.substring(2)))
        masked = masked.substring(0, 2) + "ENTITY";
    return masked;
}

/** Drops the first two characters of a label that contains a '-'; other labels pass through. */
private static String stripPrefix(String label) {
    return label.indexOf('-') > -1 ? label.substring(2) : label;
}

/**
     * Reports phrase-level outcomes for one sentence. A predicted phrase starts at a "B-" tag,
     * or at an "I-" tag that is first in the sentence or whose predecessor does not end with the
     * same type; a gold phrase starts at a "B-" label. Each phrase extends over the following
     * "I-" tags of the same type. When a predicted and a gold phrase start at the same position
     * and end at the same position they are reported as one pair; otherwise each phrase is
     * reported against "O".
     */
private static void reportPhraseLevelPredictions(String[] predictions, String[] labels, TestDiscrete results) {
    int n = predictions.length;
    for (int i = 0; i < n; ++i) {
        String p = "O", l = "O";
        int pEnd = -1, lEnd = -1;
        String prediction = predictions[i];
        boolean opensPredictedPhrase = prediction.startsWith("B-")
                || (prediction.startsWith("I-") && (i == 0 || !predictions[i - 1].endsWith(prediction.substring(2))));
        if (opensPredictedPhrase) {
            p = prediction.substring(2);
            pEnd = i;
            while (pEnd + 1 < n && predictions[pEnd + 1].equals("I-" + p)) ++pEnd;
        }
        if (labels[i].startsWith("B-")) {
            l = labels[i].substring(2);
            lEnd = i;
            while (lEnd + 1 < n && labels[lEnd + 1].equals("I-" + l)) ++lEnd;
        }
        if (!p.equals("O") || !l.equals("O")) {
            if (pEnd == lEnd)
                results.reportPrediction(p, l);
            else {
                if (!p.equals("O"))
                    results.reportPrediction(p, "O");
                if (!l.equals("O"))
                    results.reportPrediction("O", l);
            }
        }
    }
}
Also used : Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) ArrayList(java.util.ArrayList)

Example 10 with Word

use of edu.illinois.cs.cogcomp.lbjava.nlp.Word in project cogcomp-nlp by CogComp.

the class NEWord method splitWord.

/*
     * Used for some tokenization schemes.
     *
     * Re-tokenizes the form of a single word with the LBJava sentence/word splitters and returns
     * one new NEWord per resulting piece. The pieces carry the original word's label, except
     * that a label starting with "B-" is turned into the matching "I-" label for every piece
     * after the first. If the splitter yields nothing, the original word is returned as the only
     * element. A null label is passed through unchanged to every piece.
     */
private static Vector<NEWord> splitWord(NEWord word) {
    String[] sentence = { word.form + " " };
    Parser parser = new WordSplitter(new SentenceSplitter(sentence));
    LinkedVector words = (LinkedVector) parser.next();
    Vector<NEWord> res = new Vector<>();
    if (words == null) {
        res.add(word);
        return res;
    }
    String label = word.neLabel;
    for (int i = 0; i < words.size(); i++) {
        // substring(2) only makes sense when "B-" is the prefix, so test for the prefix rather
        // than for an occurrence anywhere in the label; unlabeled (null) words are left alone.
        if (i > 0 && label != null && label.startsWith("B-"))
            label = "I-" + label.substring(2);
        NEWord w = new NEWord(new Word(((Word) words.get(i)).form), null, label);
        res.addElement(w);
    }
    return res;
}
Also used : Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) SentenceSplitter(edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) WordSplitter(edu.illinois.cs.cogcomp.lbjava.nlp.WordSplitter) Vector(java.util.Vector) Parser(edu.illinois.cs.cogcomp.lbjava.parse.Parser)

Aggregations

Word (edu.illinois.cs.cogcomp.lbjava.nlp.Word)15 LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)9 SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter)5 WordSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.WordSplitter)3 Parser (edu.illinois.cs.cogcomp.lbjava.parse.Parser)3 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)2 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)2 NEWord (edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)2 ArrayList (java.util.ArrayList)2 Chunker (edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker)1 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)1 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)1 Classifier (edu.illinois.cs.cogcomp.lbjava.classify.Classifier)1 PlainToTokenParser (edu.illinois.cs.cogcomp.lbjava.nlp.seg.PlainToTokenParser)1 Token (edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token)1 POSTagger (edu.illinois.cs.cogcomp.pos.lbjava.POSTagger)1 BufferedReader (java.io.BufferedReader)1 FileNotFoundException (java.io.FileNotFoundException)1 HashSet (java.util.HashSet)1 LinkedList (java.util.LinkedList)1