Search in sources :

Example 41 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class BrownClusters method printOovData.

/**
 * Walks every word of every sentence in {@code data} and gathers, for each
 * map in {@code wordToPathByResource}, the word forms that the map does not
 * contain (case-sensitively, and also after lowercasing).
 *
 * <p>NOTE(review): despite the name, nothing is printed or returned by the
 * code below; all collected maps are local and are discarded when the method
 * returns. The reporting statements appear to have been disabled.
 *
 * @param data the documents whose sentences are scanned.
 */
public final void printOovData(Data data) {
    // Flatten the sentences of all documents into one list so they can be
    // re-scanned once per resource below.
    ArrayList<LinkedVector> allSentences = new ArrayList<>();
    for (int d = 0; d < data.documents.size(); d++) {
        for (LinkedVector docSentence : data.documents.get(d).sentences) {
            allSentences.add(docSentence);
        }
    }

    // Distinct word forms, as written and lowercased (maps are used as sets).
    HashMap<String, Boolean> seenForms = new HashMap<>();
    HashMap<String, Boolean> seenFormsIgnoreCase = new HashMap<>();
    for (LinkedVector words : allSentences) {
        int position = 0;
        while (position < words.size()) {
            String surface = ((NEWord) words.get(position)).form;
            seenForms.put(surface, true);
            seenFormsIgnoreCase.put(surface.toLowerCase(), true);
            position++;
        }
    }
    /*
         * System.out.println("Data statistics:");
         * System.out.println("\t\t- Total tokens with repetitions ="+ totalTokens);
         * System.out.println("\t\t- Total unique tokens  ="+ tokensHash.size());
         * System.out.println("\t\t- Total unique tokens ignore case ="+ tokensHashIC.size());
         */
    for (THashMap<String, String> resource : wordToPathByResource) {
        HashMap<String, Boolean> missingExact = new HashMap<>();
        HashMap<String, Boolean> missingEvenLowercased = new HashMap<>();
        for (LinkedVector words : allSentences) {
            for (int k = 0; k < words.size(); k++) {
                String surface = ((NEWord) words.get(k)).form;
                if (resource.containsKey(surface)) {
                    // Known as written: not out-of-vocabulary in either sense.
                    continue;
                }
                missingExact.put(surface, true);
                String lowered = surface.toLowerCase();
                if (!resource.containsKey(lowered)) {
                    missingEvenLowercased.put(lowered, true);
                }
            }
        }
    }
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) HashMap(java.util.HashMap) THashMap(gnu.trove.map.hash.THashMap) ArrayList(java.util.ArrayList) NEWord(edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)

Example 42 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class TwoLayerPredictionAggregationFeatures method setLevel1AggregationFeatures.

// are we aggregating to the right or to the left
/**
 * Runs the per-word overload of {@code setLevel1AggregationFeatures} on every
 * word of every sentence of every document in {@code data}, in document,
 * sentence and word order. A debug message is logged before and after.
 *
 * @param data the documents to process.
 * @param useGoldData passed through unchanged to the per-word overload.
 */
public static void setLevel1AggregationFeatures(Data data, boolean useGoldData) {
    logger.debug("Extracting features for level 2 inference");
    int docIndex = 0;
    while (docIndex < data.documents.size()) {
        for (LinkedVector sentence : data.documents.get(docIndex).sentences) {
            for (int pos = 0; pos < sentence.size(); pos++) {
                NEWord word = (NEWord) sentence.get(pos);
                setLevel1AggregationFeatures(word, useGoldData);
            }
        }
        docIndex++;
    }
    logger.debug("Done - Extracting features for level 2 inference");
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)

Example 43 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class POSBracketToVector method parsePOSBracketForm.

/**
 * Given a single line of textual input (containing all and only the words
 * in a single sentence), this method parses and returns a
 * <code>LinkedVector</code> of <code>Word</code>s.
 *
 * <p>The trimmed line is split on single spaces and the tokens are consumed
 * in pairs. For each pair, the first token without its first character and
 * the second token without its last character are handed to the
 * <code>Word</code> constructor (presumably the two halves of a bracketed
 * tag/word pair -- confirm against the class documentation). Each
 * <code>Word</code> also receives the inclusive character offsets of its
 * pair within <code>line</code>.
 *
 * <p>Offsets are computed on the original <code>line</code>, so leading
 * whitespace shifts them accordingly, and the last pair ends at the last
 * non-whitespace character of the line even when no space follows it.
 *
 * @param line A single line of text.
 * @return A <code>LinkedVector</code> representing the input text; an empty
 *         <code>LinkedVector</code> if the line is empty or blank.
 * @throws ArrayIndexOutOfBoundsException if the line does not contain an
 *         even number of space-separated tokens.
 **/
public static LinkedVector parsePOSBracketForm(String line) {
    String trimmed = line.trim();
    String[] tokens = trimmed.split(" ");
    if (tokens.length == 0 || tokens.length == 1 && (tokens[0] == null || tokens[0].length() == 0))
        return new LinkedVector();
    // trimmed is non-empty and starts with a non-whitespace character, so its
    // first occurrence in line is exactly where the content begins.
    int first = line.indexOf(trimmed);
    // One past the last non-whitespace character of line.
    int limit = first + trimmed.length();
    int spaceIndex = endOfBracketPair(line, first, limit);
    Word w = new Word(tokens[1].substring(0, tokens[1].length() - 1), tokens[0].substring(1), first, spaceIndex - 1);
    for (int i = 2; i < tokens.length; i += 2) {
        int start = spaceIndex + 1;
        spaceIndex = endOfBracketPair(line, start, limit);
        w.next = new Word(tokens[i + 1].substring(0, tokens[i + 1].length() - 1), tokens[i].substring(1), w, start, spaceIndex - 1);
        w = (Word) w.next;
    }
    // The vector is built from the last word of the chain.
    return new LinkedVector(w);
}

/**
 * Returns the index just past the two-token pair that starts at
 * <code>from</code>: the position of the second space at or after
 * <code>from</code>, or <code>limit</code> when the pair is the last one on
 * the line (<code>String.indexOf</code> reports -1, or only trailing
 * whitespace remains).
 */
private static int endOfBracketPair(String line, int from, int limit) {
    // Space separating the two tokens of the pair.
    int space = line.indexOf(' ', from);
    // Space following the pair, if any.
    space = line.indexOf(' ', space + 1);
    return (space == -1 || space > limit) ? limit : space;
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)

Example 44 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class Sentence method wordSplit.

/**
 * Creates and returns a <code>LinkedVector</code> representation of this
 * sentence in which every <code>LinkedChild</code> is a <code>Word</code>.
 * Offset information is respected and propagated.
 *
 * <p>The method records character indexes of <code>text</code> through
 * <code>myAdd</code>, always as an (end of one word, start of the next)
 * pair. The collected indexes are then sorted and consumed two at a time as
 * inclusive (first, last) positions of each word. Every index is shifted by
 * <code>start</code> before it is passed to the <code>Word</code>
 * constructor.
 *
 * <p>NOTE(review): <code>myAdd</code>, <code>partOfURL</code> and all
 * <code>p...</code> patterns are defined outside this method; the comments
 * below describe only how they are used here. The final pairing assumes the
 * list holds an even number (at least two) of indexes -- confirm how
 * <code>myAdd</code> treats duplicates and how empty text is handled.
 *
 * @return A <code>LinkedVector</code> representation of this sentence.
 * @see Word
 **/
public LinkedVector wordSplit() {
    LinkedList<Integer> boundaries = new LinkedList<>();
    // Whitespace always signals a word boundary: the character before the
    // run ends a word and the character after the run starts one.
    Matcher m = pSpaces.matcher(text);
    while (m.find()) {
        myAdd(boundaries, m.start() - 1);
        myAdd(boundaries, m.end());
    }
    // If the last recorded index is at or past the end of the text (the text
    // ends in whitespace), drop it; otherwise the last character of the text
    // ends the final word.
    if (boundaries.size() > 0 && boundaries.getLast() >= text.length())
        boundaries.removeLast();
    else
        myAdd(boundaries, text.length() - 1);
    // Likewise, a first index of -1 means the text starts with whitespace and
    // is dropped; otherwise index 0 starts the first word.
    if (boundaries.size() > 1 && boundaries.getFirst() == -1)
        boundaries.removeFirst();
    else
        myAdd(boundaries, 0);
    // Commas are separate words unless they're part of a number.
    for (int i = text.indexOf(','); i != -1; i = text.indexOf(',', i + 1)) {
        // Split immediately before the comma (never between two commas).
        if (i > 0 && text.charAt(i - 1) != ',' && (pNoSpaceOrDigit.matcher(text.substring(i - 1, i)).find() || i + 1 == text.length() && pDigit.matcher(text.substring(i - 1, i)).find() || i + 1 < text.length() && pDigitCommaNoDigit.matcher(text.substring(i - 1, i + 2)).find())) {
            myAdd(boundaries, i - 1);
            myAdd(boundaries, i);
        }
        // Split immediately after the comma.
        if (i + 1 < text.length() && (pNoSpaceOrDigit.matcher(text.substring(i + 1, i + 2)).find() || i == 0 && pDigit.matcher(text.substring(i + 1, i + 2)).find() || i > 0 && pNoDigitCommaDigit.matcher(text.substring(i - 1, i + 2)).find())) {
            myAdd(boundaries, i);
            myAdd(boundaries, i + 1);
        }
    }
    // Apostrophes. Three independent decisions are made per apostrophe:
    //  - letter + "n't" not followed by a letter or another apostrophe
    //    ("can't", "won't", "shouldn't", "aren't"): split before the 'n';
    //  - otherwise split immediately before the apostrophe when it is
    //    followed by another apostrophe (and the previous character matches
    //    pApostropheMask), follows an 's' and is not followed by a letter or
    //    apostrophe, starts one of 'd 'm 's 'll 're 've after a letter, or
    //    starts 's after a letter and a period;
    //  - split immediately after the apostrophe when the next character
    //    matches pApostropheMask and is either not a letter or the previous
    //    character is also an apostrophe.
    for (int i = text.indexOf('\''); i != -1; i = text.indexOf('\'', i + 1)) {
        if (i - 1 > 0 && Character.isLetter(text.charAt(i - 2)) && text.charAt(i - 1) == 'n' && i + 1 < text.length() && text.charAt(i + 1) == 't' && (i + 2 == text.length() || !Character.isLetter(text.charAt(i + 2)) && text.charAt(i + 2) != '\'')) {
            myAdd(boundaries, i - 2);
            myAdd(boundaries, i - 1);
        } else if (i > 0 && (pApostropheMask.matcher(text.substring(i - 1, i)).find() && i + 1 < text.length() && text.charAt(i + 1) == '\'' || text.charAt(i - 1) == 's' && (i + 1 == text.length() || !Character.isLetter(text.charAt(i + 1)) && text.charAt(i + 1) != '\'') || Character.isLetter(text.charAt(i - 1)) && (i + 1 < text.length() && (i + 2 == text.length() || !Character.isLetter(text.charAt(i + 2)) && text.charAt(i + 2) != '\'') && (text.charAt(i + 1) == 'd' || text.charAt(i + 1) == 'm' || text.charAt(i + 1) == 's') || i + 2 < text.length() && (i + 3 == text.length() || !Character.isLetter(text.charAt(i + 3)) && text.charAt(i + 3) != '\'') && (text.substring(i + 1, i + 3).equals("ll") || text.substring(i + 1, i + 3).equals("re") || text.substring(i + 1, i + 3).equals("ve"))) || text.charAt(i - 1) == '.' && i - 1 > 0 && Character.isLetter(text.charAt(i - 2)) && i + 1 < text.length() && (i + 2 == text.length() || !Character.isLetter(text.charAt(i + 2)) && text.charAt(i + 2) != '\'') && text.charAt(i + 1) == 's')) {
            myAdd(boundaries, i - 1);
            myAdd(boundaries, i);
        }
        if (i + 1 < text.length() && pApostropheMask.matcher(text.substring(i + 1, i + 2)).find() && (!Character.isLetter(text.charAt(i + 1)) || i > 0 && text.charAt(i - 1) == '\'')) {
            myAdd(boundaries, i);
            myAdd(boundaries, i + 1);
        }
    }
    // Colons are split from a neighbouring character that matches pColonMask,
    // unless the five characters centred on the colon match pColonSeparator,
    // the colon is the one in "tp://" / "TP://", or partOfURL(i) holds.
    for (int i = text.indexOf(':'); i != -1; i = text.indexOf(':', i + 1)) if (!(i >= 2 && i + 2 < text.length() && pColonSeparator.matcher(text.substring(i - 2, i + 3)).find() || i > 2 && i + 2 < text.length() && (text.substring(i - 2, i + 3).equals("tp://") || text.substring(i - 2, i + 3).equals("TP://")) || partOfURL(i))) {
        if (i >= 1 && pColonMask.matcher(text.substring(i - 1, i)).find()) {
            myAdd(boundaries, i - 1);
            myAdd(boundaries, i);
        }
        if (i + 1 < text.length() && pColonMask.matcher(text.substring(i + 1, i + 2)).find()) {
            myAdd(boundaries, i);
            myAdd(boundaries, i + 1);
        }
    }
    // Slashes are split from a neighbouring character that matches
    // pSlashMask, unless the five characters centred on the slash match
    // pSlashSeparator, the slash is either slash of "tp://" / "TP://", or
    // partOfURL(i) holds.
    for (int i = text.indexOf('/'); i != -1; i = text.indexOf('/', i + 1)) if (!(i >= 2 && i + 2 < text.length() && pSlashSeparator.matcher(text.substring(i - 2, i + 3)).find() || i > 3 && i + 1 < text.length() && (text.substring(i - 3, i + 2).equals("tp://") || text.substring(i - 3, i + 2).equals("TP://")) || i > 4 && (text.substring(i - 4, i + 1).equals("tp://") || text.substring(i - 4, i + 1).equals("TP://")) || partOfURL(i))) {
        if (i >= 1 && pSlashMask.matcher(text.substring(i - 1, i)).find()) {
            myAdd(boundaries, i - 1);
            myAdd(boundaries, i);
        }
        if (i + 1 < text.length() && pSlashMask.matcher(text.substring(i + 1, i + 2)).find()) {
            myAdd(boundaries, i);
            myAdd(boundaries, i + 1);
        }
    }
    // Dashes are split from a neighbouring character that matches pDashMask,
    // unless they are part of some useful structure like a compound word, a
    // number, or a URL (tested here via pDashSeparator, pNegative1 /
    // pNegative2 and partOfURL(i) respectively -- pattern contents are not
    // visible in this method).
    for (int i = text.indexOf('-'); i != -1; i = text.indexOf('-', i + 1)) if (!(i + 1 < text.length() && i >= 1 && pDashSeparator.matcher(text.substring(i - 1, i + 2)).find() || (i + 2 < text.length() && (i == 0 && pNegative1.matcher(text.substring(i, i + 3)).find() || i > 0 && pNegative2.matcher(text.substring(i - 1, i + 3)).find())) || partOfURL(i))) {
        if (i >= 1 && pDashMask.matcher(text.substring(i - 1, i)).find()) {
            myAdd(boundaries, i - 1);
            myAdd(boundaries, i);
        }
        if (i + 1 < text.length() && pDashMask.matcher(text.substring(i + 1, i + 2)).find()) {
            myAdd(boundaries, i);
            myAdd(boundaries, i + 1);
        }
    }
    // Dollar signs are split from a neighbouring character that matches
    // pDollarMask, unless pMoney1 (at the start of the text) or pMoney2
    // (elsewhere) matches around the sign, or it is part of a URL.
    for (int i = text.indexOf('$'); i != -1; i = text.indexOf('$', i + 1)) if (!(i == 0 && i + 2 < text.length() && pMoney1.matcher(text.substring(i, i + 3)).find() || i > 0 && i + 2 < text.length() && pMoney2.matcher(text.substring(i - 1, i + 3)).find() || partOfURL(i))) {
        if (i >= 1 && pDollarMask.matcher(text.substring(i - 1, i)).find()) {
            myAdd(boundaries, i - 1);
            myAdd(boundaries, i);
        }
        if (i + 1 < text.length() && pDollarMask.matcher(text.substring(i + 1, i + 2)).find()) {
            myAdd(boundaries, i);
            myAdd(boundaries, i + 1);
        }
    }
    // Three or more consecutive periods form their own word.
    for (int i = text.indexOf('.'); i != -1; i = text.indexOf('.', i + 1)) {
        // Split before this period when pBeforeElipsis matches the previous
        // character plus the next three.
        if (i > 0 && i + 2 < text.length() && pBeforeElipsis.matcher(text.substring(i - 1, i + 3)).find()) {
            myAdd(boundaries, i - 1);
            myAdd(boundaries, i);
        }
        // Split after this period when pAfterElipsis matches the three
        // characters ending here plus the next one.
        if (i >= 2 && i + 1 < text.length() && pAfterElipsis.matcher(text.substring(i - 2, i + 2)).find()) {
            myAdd(boundaries, i);
            myAdd(boundaries, i + 1);
        }
    }
    // If the last occurrence of a period in the sentence comes after all
    // occurrences of letters and digits, it is an end of sentence marker
    // which constitutes its own word, unless it appears immediately after two
    // other periods.
    int period = text.lastIndexOf('.');
    if (period != -1) {
        boolean endOfSentence = true;
        // Stops at the first letter or digit found after the last period.
        for (int i = period + 1; i < text.length() && endOfSentence; ++i) endOfSentence = !Character.isLetterOrDigit(text.charAt(i));
        if (endOfSentence) {
            // NOTE(review): the neighbour tests below use pDollarMask rather
            // than a period-specific pattern -- confirm this is intended.
            if (period >= 1 && (text.charAt(period - 1) != '.' || period == 1 || text.charAt(period - 2) != '.') && pDollarMask.matcher(text.substring(period - 1, period)).find()) {
                myAdd(boundaries, period - 1);
                myAdd(boundaries, period);
            }
            if (period + 1 < text.length() && (period == 0 || text.charAt(period - 1) != '.' || period == 1 || text.charAt(period - 2) != '.') && pDollarMask.matcher(text.substring(period + 1, period + 2)).find()) {
                myAdd(boundaries, period);
                myAdd(boundaries, period + 1);
            }
        } else
            // Not an end-of-sentence period: reset so the check against
            // `period` in the punctuation loop below never matches.
            period = -1;
    }
    // All other punctuation marks constitute their own words, unless they
    // appear immediately after themselves (consecutive identical punctuation
    // marks form a single word) or are part of a URL.
    m = pPunctuation.matcher(text);
    while (m.find()) if (!partOfURL(m.start())) {
        // Split after this mark only when the next character is a different
        // pPunctuation character and is not the end-of-sentence period.
        if (m.start() + 1 < text.length() && text.charAt(m.start()) != text.charAt(m.start() + 1) && m.start() + 1 != period && pPunctuation.matcher(text.substring(m.start() + 1, m.start() + 2)).find()) {
            myAdd(boundaries, m.start());
            myAdd(boundaries, m.start() + 1);
        }
    }
    // Every pPunctuation2 and pPunctuation3 match that is not part of a URL
    // gets a split recorded between the match's first character and the
    // character after it.
    m = pPunctuation2.matcher(text);
    while (m.find()) if (!partOfURL(m.start())) {
        myAdd(boundaries, m.start());
        myAdd(boundaries, m.start() + 1);
    }
    m = pPunctuation3.matcher(text);
    while (m.find()) if (!partOfURL(m.start())) {
        myAdd(boundaries, m.start());
        myAdd(boundaries, m.start() + 1);
    }
    // Now we just have to create the LinkedVector.
    // Copy the boundaries into a primitive array and sort them so that
    // consecutive entries form (first index, last index) pairs.
    Integer[] temp = boundaries.toArray(new Integer[boundaries.size()]);
    int[] I = new int[temp.length];
    for (int i = 0; i < I.length; ++i) I[i] = temp[i];
    Arrays.sort(I);
    // NOTE(review): I[0] and I[1] are read unconditionally, and the loop
    // reads I[i + 1]; this assumes an even count of at least two entries.
    Word w = new Word(text.substring(I[0], I[1] + 1), I[0] + start, I[1] + start);
    for (int i = 2; i < I.length; i += 2) {
        w.next = new Word(text.substring(I[i], I[i + 1] + 1), w, I[i] + start, I[i + 1] + start);
        w = (Word) w.next;
    }
    // Clear inURL before returning (presumably state used by partOfURL --
    // verify against its definition).
    inURL = null;
    // The vector is built from the last word of the chain.
    return new LinkedVector(w);
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Matcher(java.util.regex.Matcher) LinkedList(java.util.LinkedList)

Example 45 with LinkedVector

use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.

the class StringArraysToWords method convert.

/**
 * Builds a {@link LinkedVector} of {@link Word}s from an array of
 * <code>String</code>s, one {@link Word} per array element, linked in array
 * order.
 *
 * @param a An array of <code>String</code>s.
 * @return <code>null</code> if <code>a</code> is <code>null</code>, an empty
 * {@link LinkedVector} if <code>a</code> has no elements, and otherwise a
 * {@link LinkedVector} of {@link Word}s corresponding to the input
 * <code>String</code>s.
 **/
public static LinkedVector convert(String[] a) {
    if (a == null) {
        return null;
    }
    if (a.length == 0) {
        return new LinkedVector();
    }
    // The first word has no predecessor; every later word is created with
    // the current tail as its predecessor and then becomes the new tail.
    Word tail = new Word(a[0]);
    int index = 1;
    while (index < a.length) {
        Word appended = new Word(a[index], null, tail);
        tail.next = appended;
        tail = appended;
        index++;
    }
    // The vector is built from the last word of the chain.
    return new LinkedVector(tail);
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)

Aggregations

LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)46 Word (edu.illinois.cs.cogcomp.lbjava.nlp.Word)9 NEWord (edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord)9 ArrayList (java.util.ArrayList)8 Vector (java.util.Vector)8 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)3 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)3 SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter)3 NERDocument (edu.illinois.cs.cogcomp.ner.LbjTagger.NERDocument)3 File (java.io.File)3 HashMap (java.util.HashMap)3 Sentence (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence)2 Token (edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token)2 OutFile (edu.illinois.cs.cogcomp.ner.IO.OutFile)2 Matcher (java.util.regex.Matcher)2 ChunkLabel (edu.illinois.cs.cogcomp.chunker.main.lbjava.ChunkLabel)1 Chunker (edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker)1 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)1 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)1 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)1