Search in sources :

Example 1 with PennTreebankPOSReader

use of edu.illinois.cs.cogcomp.nlp.corpusreaders.PennTreebankPOSReader in project cogcomp-nlp by CogComp.

The method buildTableHelper of the class POSBaseLineCounter.

/**
 * Fills the counting table from a single source corpus file by tallying every
 * form-POS pair that occurs in it.
 *
 * <p>The file is read with a {@link PennTreebankPOSReader} constructed from this
 * counter's {@code corpusName}. For each token of each resulting text annotation,
 * the token's surface form and its label in the {@code ViewNames.POS} view are
 * handed to {@code count}.
 *
 * @param fileName file name of the source corpus
 */
private void buildTableHelper(String fileName) {
    PennTreebankPOSReader corpusReader = new PennTreebankPOSReader(this.corpusName);
    corpusReader.readFile(fileName);
    for (TextAnnotation annotation : corpusReader.getTextAnnotations()) {
        int position = 0;
        while (position < annotation.size()) {
            // Read the surface form first, then the POS label, matching the
            // order in which the two lookups have always been performed.
            String surfaceForm = annotation.getToken(position);
            SpanLabelView posView = (SpanLabelView) annotation.getView(ViewNames.POS);
            String posTag = posView.getLabel(position);
            count(surfaceForm, posTag);
            position++;
        }
    }
}
Also used : PennTreebankPOSReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.PennTreebankPOSReader) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)

Example 2 with PennTreebankPOSReader

use of edu.illinois.cs.cogcomp.nlp.corpusreaders.PennTreebankPOSReader in project cogcomp-nlp by CogComp.

The method buildTableHelper of the class POSMikheevCounter.

/**
 * Builds the suffix tables from a single source corpus file by counting the number
 * of times that each suffix-POS association appears in it.
 *
 * <p>A token contributes only if it is at least five characters long and its last
 * three characters are all letters. The table that receives the counts depends on
 * the token:
 * <ul>
 *   <li>capitalized and first in its sentence: {@code firstCapitalized};</li>
 *   <li>capitalized elsewhere: {@code notFirstCapitalized};</li>
 *   <li>not capitalized and without a hyphen: {@code table};</li>
 *   <li>not capitalized and containing a hyphen: the token is skipped.</li>
 * </ul>
 * The lower-cased three-character suffix is always counted. The four-character
 * suffix is counted as well when the token has at least six characters and the
 * fourth character from the end is a letter.
 *
 * @param fileName file name of the source corpus
 * @throws Exception declared for callers; the statements in this method do not show
 *         which call raises it (presumably corpus reading -- confirm)
 */
private void buildTableHelper(String fileName) throws Exception {
    PennTreebankPOSReader reader = new PennTreebankPOSReader(this.corpusName);
    reader.readFile(fileName);
    List<TextAnnotation> tas = reader.getTextAnnotations();
    for (TextAnnotation ta : tas) {
        for (int tokenId = 0; tokenId < ta.size(); tokenId++) {
            String form = ta.getToken(tokenId);
            String tag = ((SpanLabelView) ta.getView(ViewNames.POS)).getLabel(tokenId);

            // Too short to yield a meaningful suffix.
            if (form.length() < 5) {
                continue;
            }

            // The three-character suffix must consist of letters only.
            boolean allLetters = true;
            for (int i = form.length() - 3; i < form.length() && allLetters; ++i) {
                allLetters = Character.isLetter(form.charAt(i));
            }
            if (!allLetters) {
                continue;
            }

            // Pick the table that matches the token's capitalization and position.
            HashMap<String, TreeMap<String, Integer>> t;
            if (WordHelpers.isCapitalized(ta, tokenId)) {
                int headOfSentence = ta.getSentence(ta.getSentenceId(tokenId)).getStartSpan();
                t = (tokenId == headOfSentence) ? firstCapitalized : notFirstCapitalized;
            } else {
                if (form.contains("-")) {
                    // Skip just this token. A `return` here would abandon the rest
                    // of the corpus at the first lower-case hyphenated word.
                    continue;
                }
                t = table;
            }

            form = form.toLowerCase();
            count(t, form.substring(form.length() - 3), tag);
            if (form.length() >= 6 && Character.isLetter(form.charAt(form.length() - 4))) {
                count(t, form.substring(form.length() - 4), tag);
            }
        }
    }
}
Also used : PennTreebankPOSReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.PennTreebankPOSReader) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) SpanLabelView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView) TreeMap(java.util.TreeMap)

Aggregations

TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)2 PennTreebankPOSReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.PennTreebankPOSReader)2 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)1 TreeMap (java.util.TreeMap)1