Search in sources :

Example 21 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class WordTopicAndLayoutFeatures method addDatasets.

/**
 * Classifies every document contained in {@code sentences} with the topic classifier and
 * records the predicted label for each word of that document in {@code wordToTopicIdMap}.
 *
 * <p>Note: this assumes that the data is split by documents. A document is considered finished
 * at the first sentence whose last word has {@code nextIgnoreSentenceBoundary == null}; if
 * document boundaries are ignored upstream, the grouping done here will be wrong.
 *
 * @param sentences the sentences of one or more documents, in order
 * @param lowercaseData if {@code true}, the document text is lowercased before it is tokenized
 *        and classified
 * @param confidenceThreshold threshold handed unchanged to {@code nb.classify}
 * @throws Exception if {@code nb} or {@code map} has not been initialized
 */
public static void addDatasets(Vector<LinkedVector> sentences, boolean lowercaseData, double confidenceThreshold) throws Exception {
    if (nb == null || map == null)
        throw new Exception("Topic classifier not initialized!!!");
    // A builder avoids re-copying the whole document text for every appended word.
    StringBuilder documentText = new StringBuilder();
    Vector<NEWord> docWords = new Vector<>();
    for (int sid = 0; sid < sentences.size(); sid++) {
        LinkedVector s = sentences.elementAt(sid);
        if (s.size() == 0) {
            // An empty sentence contributes no words and has no last word to inspect;
            // skip it instead of indexing position -1 below.
            continue;
        }
        for (int i = 0; i < s.size(); i++) {
            NEWord word = (NEWord) s.get(i);
            documentText.append(" ").append(word.originalForm).append(" ");
            docWords.addElement(word);
        }
        if (((NEWord) s.get(s.size() - 1)).nextIgnoreSentenceBoundary == null) {
            // this is the last sentence in the document- move on!
            String text = documentText.toString();
            if (lowercaseData)
                text = text.toLowerCase();
            Document doc = new Document(InFile.tokenize(text, "\n\t -.,?<>;':\"[]{}\\|`~!@#$%^&*()_+=-0987654321`~"), -1);
            int label = nb.classify(doc, confidenceThreshold);
            // Only the first 400 characters of the document are logged.
            logger.info("*********************\n" + labelnames[label + 1] + "\n*********************\n" + text.substring(0, Math.min(text.length(), 400)));
            for (NEWord word : docWords) {
                wordToTopicIdMap.put(word, label);
            }
            // Start accumulating the next document.
            documentText.setLength(0);
            docWords.clear();
        }
    }
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Vector(java.util.Vector) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 22 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class ReferenceUtils method createNerDataStructuresForText.

/**
 * Converts a {@link TextAnnotation} into the {@link Data} structure used by the NER code.
 *
 * <p>Each sentence of {@code ta} becomes one {@link LinkedVector} whose words are added via
 * {@code NEWord.addTokenToSentence} with the label {@code "unlabeled"}. Sentences that end up
 * with no words are omitted. The sentences are wrapped in a single {@link NERDocument} named
 * {@code "input"}.
 *
 * @param ta the text annotation whose sentences and tokens are converted
 * @return a {@link Data} instance holding one document built from {@code ta}
 * @throws IllegalStateException if any token has length zero
 */
public Data createNerDataStructuresForText(TextAnnotation ta) {
    ArrayList<LinkedVector> sentences = new ArrayList<>();
    for (int i = 0; i < ta.getNumberOfSentences(); i++) {
        Sentence sentence = ta.getSentence(i);
        LinkedVector words = new LinkedVector();
        for (String w : sentence.getTokens()) {
            if (w.length() == 0) {
                throw new IllegalStateException("Bad (zero length) token.");
            }
            NEWord.addTokenToSentence(words, w, "unlabeled");
        }
        if (words.size() > 0)
            sentences.add(words);
    }
    // Wrap the sentences in a single document; no tagging happens here.
    return new Data(new NERDocument(sentences, "input"));
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) ArrayList(java.util.ArrayList) Data(edu.illinois.cs.cogcomp.ner.LbjTagger.Data) NERDocument(edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument) Sentence(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)

Example 23 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class TaggedDataWriter method toColumnsFormat.

/**
 * Renders {@code data} as column-formatted text, one line per word.
 *
 * <p>A {@code -DOCSTART-} line followed by a blank line is written before any sentence whose
 * first word has {@code previousIgnoreSentenceBoundary == null}. Every sentence is followed by
 * a blank line.
 *
 * @param data the documents to serialize
 * @param labelType selects which prediction {@code NEWord.getPrediction} returns for each word
 * @return the column-formatted text for all documents
 */
private static String toColumnsFormat(Data data, NEWord.LabelToLookAt labelType) {
    StringBuilder out = new StringBuilder(data.documents.size() * 1000);
    int docIndex = 0;
    while (docIndex < data.documents.size()) {
        for (LinkedVector sentence : data.documents.get(docIndex).sentences) {
            NEWord head = (NEWord) sentence.get(0);
            if (head.previousIgnoreSentenceBoundary == null) {
                // The first word of a document has no predecessor: emit the document marker.
                out.append("O	0	0	O	-X-	-DOCSTART-	x	x	0\n\n");
            }
            int position = 0;
            while (position < sentence.size()) {
                NEWord current = (NEWord) sentence.get(position);
                out.append(current.getPrediction(labelType));
                out.append("\t0\t");
                out.append(position);
                out.append("\tO\tO\t");
                out.append(current.form);
                out.append("\tx\tx\t0\n");
                position++;
            }
            // Blank line terminates the sentence.
            out.append("\n");
        }
        docIndex++;
    }
    return out.toString();
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 24 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class StatefullTokenizerTest method testWhitespaceBehavior.

/**
 * Test Splitter behavior on text with leading/trailing whitespace. Example is use case where
 * xml markup has been replaced with whitespace of equal span.
 *
 * <p>The test blanks out every xml tag in the input file, records the character spans of all
 * matches of {@code \w*Sun\w*} in the cleaned text, and then checks that every token with the
 * form {@code "Sun"} produced by the {@code SentenceSplitter} word split (first sentence only)
 * and by the {@code StatefulTokenizer} carries offsets that are one of those spans.
 */
@Test
public void testWhitespaceBehavior() {
    String origText = null;
    try {
        origText = LineIO.slurp(INFILE);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    // Replace each xml tag with the same number of spaces so character offsets are preserved.
    Pattern xmlTagPattern = Pattern.compile("(<[^>\\r\\n]+>)");
    Matcher xmlMatcher = xmlTagPattern.matcher(origText);
    StringBuilder cleanTextBldr = new StringBuilder();
    int lastAppendedCharOffset = 0;
    while (xmlMatcher.find()) {
        int start = xmlMatcher.start();
        int end = xmlMatcher.end();
        cleanTextBldr.append(origText.substring(lastAppendedCharOffset, start));
        for (int i = start; i < end; ++i) {
            cleanTextBldr.append(" ");
        }
        lastAppendedCharOffset = end;
    }
    cleanTextBldr.append(origText.substring(lastAppendedCharOffset));
    String cleanText = cleanTextBldr.toString();
    // Collect the expected character spans of every word containing "Sun".
    Pattern sun = Pattern.compile("\\w*Sun\\w*");
    Matcher sunMatcher = sun.matcher(cleanText);
    Set<IntPair> sunSpans = new HashSet<>();
    while (sunMatcher.find()) {
        sunSpans.add(new IntPair(sunMatcher.start(), sunMatcher.end()));
    }
    // check token offsets in tokens returned by SentenceSplitter
    SentenceSplitter splitter = new SentenceSplitter(new String[] { cleanText });
    Sentence[] sents = splitter.splitAll();
    Sentence s = sents[0];
    LinkedVector words = s.wordSplit();
    for (int i = 0; i < words.size(); ++i) {
        // Inspect the i-th word; previously index 0 was read on every iteration, so only the
        // first word of the sentence was ever checked.
        // NOTE(review): the expected spans use an exclusive end offset (Matcher.end()) —
        // confirm Word.end follows the same convention.
        Word word = (Word) words.get(i);
        if ("Sun".equals(word.form)) {
            IntPair tokenCharOffsets = new IntPair(word.start, word.end);
            assertTrue(sunSpans.contains(tokenCharOffsets));
        }
    }
    // check token offsets in tokens returned by StatefulTokenizer
    StatefulTokenizer statefulTokenizer = new StatefulTokenizer();
    Tokenizer.Tokenization tokenInfo = statefulTokenizer.tokenizeTextSpan(cleanText);
    assertEquals(tokenInfo.getCharacterOffsets().length, tokenInfo.getTokens().length);
    for (int i = 0; i < tokenInfo.getTokens().length; ++i) {
        String tok = tokenInfo.getTokens()[i];
        if (tok.equals("Sun")) {
            IntPair tokCharOffsets = tokenInfo.getCharacterOffsets()[i];
            if (!sunSpans.contains(tokCharOffsets)) {
                // Print what the offsets actually cover before the assertion below fails.
                String origTextSubstring = cleanText.substring(tokCharOffsets.getFirst(), tokCharOffsets.getSecond());
                System.err.println("ERROR: tokenizer has form '" + tok + "', but offsets refer to substring '" + origTextSubstring + "'.");
            }
            assertTrue(sunSpans.contains(tokCharOffsets));
        }
    }
    TextAnnotation statefulTa = new TextAnnotation("test", "test", cleanText, tokenInfo.getCharacterOffsets(), tokenInfo.getTokens(), tokenInfo.getSentenceEndTokenIndexes());
    assertNotNull(statefulTa);
}
Also used : Pattern(java.util.regex.Pattern) Word(edu.illinois.cs.cogcomp.lbjava.nlp.Word) SentenceSplitter(edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter) LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Matcher(java.util.regex.Matcher) FileNotFoundException(java.io.FileNotFoundException) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Sentence(edu.illinois.cs.cogcomp.lbjava.nlp.Sentence) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 25 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class NETagPlain method tagData.

/**
 * Annotates {@code data} with both tagger levels and renders the result as bracketed text,
 * e.g. {@code [TYPE word word] word}.
 *
 * <p>The level-2 prediction of each word ({@code neTypeLevel2}) decides where brackets open and
 * close: a bracket opens on a {@code B-} label, or on an {@code I-} label whose type differs
 * from the previous label, and closes at the end of the sentence or when the next label is
 * {@code B-}, {@code O}, or carries a different type.
 *
 * @param data the data to annotate and render; it is modified by the annotation calls
 * @param tagger1 level-1 tagger passed to {@code Decoder.annotateDataBIO}
 * @param tagger2 level-2 tagger passed to {@code Decoder.annotateDataBIO}
 * @return the bracketed text of all documents, concatenated
 * @throws Exception propagated from the annotation calls
 */
public static String tagData(Data data, NETaggerLevel1 tagger1, NETaggerLevel2 tagger2) throws Exception {
    ExpressiveFeaturesAnnotator.annotate(data);
    Decoder.annotateDataBIO(data, tagger1, tagger2);
    StringBuilder res = new StringBuilder();
    for (int docid = 0; docid < data.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
        for (LinkedVector vector : sentences) {
            boolean open = false;
            String[] predictions = new String[vector.size()];
            String[] words = new String[vector.size()];
            for (int j = 0; j < vector.size(); j++) {
                NEWord word = (NEWord) vector.get(j);
                predictions[j] = word.neTypeLevel2;
                words[j] = word.form;
            }
            for (int j = 0; j < vector.size(); j++) {
                if (predictions[j].startsWith("B-") || (j > 0 && predictions[j].startsWith("I-") && (!predictions[j - 1].endsWith(predictions[j].substring(2))))) {
                    res.append("[").append(predictions[j].substring(2)).append(" ");
                    open = true;
                }
                res.append(words[j]).append(" ");
                if (open) {
                    boolean close = false;
                    if (j == vector.size() - 1) {
                        close = true;
                    } else {
                        if (predictions[j + 1].startsWith("B-"))
                            close = true;
                        if (predictions[j + 1].equals("O"))
                            close = true;
                        if (predictions[j + 1].indexOf('-') > -1 && (!predictions[j].endsWith(predictions[j + 1].substring(2))))
                            close = true;
                    }
                    if (close) {
                        // SWM: makes the output a little cleaner
                        // Trim the buffer in place (same rule as String.trim(): drop chars
                        // <= ' ' at both ends) instead of copying the whole output per entity.
                        int end = res.length();
                        while (end > 0 && res.charAt(end - 1) <= ' ') {
                            end--;
                        }
                        res.setLength(end);
                        int start = 0;
                        while (start < end && res.charAt(start) <= ' ') {
                            start++;
                        }
                        if (start > 0) {
                            res.delete(0, start);
                        }
                        res.append("] ");
                        open = false;
                    }
                }
            }
        }
    }
    return res.toString();
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)

Aggregations

LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)46 Word (edu.illinois.cs.cogcomp.lbjava.nlp.Word)9 NEWord (edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)9 ArrayList (java.util.ArrayList)8 Vector (java.util.Vector)8 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)3 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)3 SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter)3 NERDocument (edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument)3 File (java.io.File)3 HashMap (java.util.HashMap)3 Sentence (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)2 Token (edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token)2 OutFile (edu.illinois.cs.cogcomp.ner.IO.OutFile)2 Matcher (java.util.regex.Matcher)2 ChunkLabel (edu.illinois.cs.cogcomp.chunker.main.lbjava.ChunkLabel)1 Chunker (edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker)1 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)1 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)1 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)1