Search in sources :

Example 1 with Token

use of edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token in project cogcomp-nlp by CogComp.

The method classify of the class PreviousTags.

/**
 * Produces one feature for each of the (at most two) tokens that precede the
 * given token in its sentence.
 *
 * Each feature's identifier is the token's offset relative to the example
 * ("-2" or "-1"). Its value is the token's gold {@code label} while
 * {@code Chunker.isTraining} is set, and the {@code __Chunker} prediction
 * for that token otherwise.
 *
 * @param __example the example to classify; cast to {@code Token}
 * @return the features for the preceding tokens, earliest token first
 */
public FeatureVector classify(Object __example) {
    Token word = (Token) __example;
    FeatureVector __result = new FeatureVector();

    // Step back over at most two predecessors, tracking the (negative) offset.
    int offset = 0;
    Token cursor = word;
    while (offset > -2 && cursor.previous != null) {
        cursor = (Token) cursor.previous;
        --offset;
    }

    // Walk forward again, emitting one feature per token until we are back at the example.
    while (cursor != word) {
        String __id = "" + (offset++);
        String __value = Chunker.isTraining ? cursor.label : __Chunker.discreteValue(cursor);
        __result.addFeature(new DiscretePrimitiveStringFeature(this.containingPackage, this.name, __id, __value, valueIndexOf(__value), (short) 0));
        cursor = (Token) cursor.next;
    }
    return __result;
}
Also used : FeatureVector(edu.illinois.cs.cogcomp.lbjava.classify.FeatureVector) DiscretePrimitiveStringFeature(edu.illinois.cs.cogcomp.lbjava.classify.DiscretePrimitiveStringFeature) Token(edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token)

Example 2 with Token

use of edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token in project cogcomp-nlp by CogComp.

The method classify of the class SOPrevious.

/**
 * Produces three conjunctive features from the chunk labels and POS tags of
 * the (at most two) tokens preceding the given token.
 *
 * <ul>
 * <li>"ll": first collected label joined with the second collected label</li>
 * <li>"lt1": first collected label joined with the second collected POS tag</li>
 * <li>"lt2": second collected label joined with the third collected POS tag</li>
 * </ul>
 *
 * Labels come from the token's gold {@code label} while
 * {@code Chunker.isTraining} is set, and from {@code __Chunker} otherwise.
 * POS tags always come from {@code __POSTagger}; the example's own tag is
 * stored right after those of its predecessors. Slots that are never filled
 * (fewer than two predecessors) remain {@code null} and therefore appear as
 * the text "null" in the feature values.
 *
 * @param __example the example to classify; cast to {@code Token}
 * @return a vector holding the three features, in the order listed above
 */
public FeatureVector classify(Object __example) {
    Token word = (Token) __example;
    FeatureVector __result = new FeatureVector();

    // Step back over at most two predecessors.
    Token cursor = word;
    int steps = 0;
    while (steps < 2 && cursor.previous != null) {
        cursor = (Token) cursor.previous;
        steps++;
    }

    String[] tags = new String[3];
    String[] labels = new String[2];
    int slot = 0;
    while (cursor != word) {
        tags[slot] = __POSTagger.discreteValue(cursor);
        labels[slot] = Chunker.isTraining ? cursor.label : __Chunker.discreteValue(cursor);
        slot++;
        cursor = (Token) cursor.next;
    }
    // The cursor is now the example itself; record its POS tag as well.
    tags[slot] = __POSTagger.discreteValue(cursor);

    String[] ids = { "ll", "lt1", "lt2" };
    String[] values = {
        labels[0] + "_" + labels[1],
        labels[0] + "_" + tags[1],
        labels[1] + "_" + tags[2]
    };
    for (int k = 0; k < ids.length; k++) {
        String __id = ids[k];
        String __value = values[k];
        __result.addFeature(new DiscretePrimitiveStringFeature(this.containingPackage, this.name, __id, __value, valueIndexOf(__value), (short) 0));
    }
    return __result;
}
Also used : FeatureVector(edu.illinois.cs.cogcomp.lbjava.classify.FeatureVector) DiscretePrimitiveStringFeature(edu.illinois.cs.cogcomp.lbjava.classify.DiscretePrimitiveStringFeature) Token(edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token)

Example 3 with Token

use of edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token in project cogcomp-nlp by CogComp.

The method next of the class Reuters2003Parser.

/**
 * Produces the next object parsed from the input file; in this case, that object is guaranteed
 * to be a <code>LinkedVector</code> populated by <code>Token</code>s representing a sentence.
 *
 * Rows with fewer than two columns, and rows whose column 4 is "-X-", are skipped before the
 * sentence starts. The sentence then runs until the parent parser returns <code>null</code> or
 * an empty row. Column 3 is used as each token's label: an "I" tag on the first token is always
 * rewritten to a "B" tag, and an "I" tag on a later token is rewritten to "B" when the previous
 * token's label does not end with the text following the tag's two-character prefix.
 *
 * @return the sentence as a <code>LinkedVector</code>, or <code>null</code> when no further
 *         rows are available
 **/
public Object next() {
    // Skip ahead to the first row that starts a sentence.
    String[] fields;
    do {
        fields = (String[]) super.next();
    } while (fields != null && (fields.length < 2 || fields[4].equals("-X-")));

    if (fields == null) {
        return null;
    }

    if (fields[3].charAt(0) == 'I') {
        fields[3] = "B" + fields[3].substring(1);
    }
    Token tail = new Token(new Word(fields[5], fields[4]), null, fields[3]);
    String previousTag = fields[3];

    fields = (String[]) super.next();
    while (fields != null && fields.length > 0) {
        boolean inside = fields[3].charAt(0) == 'I';
        if (inside && !previousTag.endsWith(fields[3].substring(2))) {
            fields[3] = "B" + fields[3].substring(1);
        }
        Token appended = new Token(new Word(fields[5], fields[4]), tail, fields[3]);
        tail.next = appended;
        tail = appended;
        previousTag = fields[3];
        fields = (String[]) super.next();
    }
    return new LinkedVector(tail);
}
Also used : LinkedVector(edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector) Token(edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token)

Example 4 with Token

use of edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token in project cogcomp-nlp by CogComp.

The method addView of the class ChunkerAnnotator.

/**
 * Runs the chunker over every token of the record and attaches the resulting chunks as the
 * {@code ViewNames.SHALLOW_PARSE} view.
 *
 * Each token's chunk tag is read from {@code lbjtoken.type} after {@code tagger.discreteValue}
 * has been called on it. Tags starting with 'B' open a chunk, tags starting with 'B' or 'O'
 * close the chunk in progress, and tags starting with 'I' continue it. An 'I' tag seen while no
 * chunk is in progress is rewritten to a 'B' tag so that it opens one.
 *
 * @param record the text annotation to annotate; it must already contain the token, sentence
 *        and POS views
 * @throws AnnotatorException if any of the required views is missing
 * @throws IllegalArgumentException if a token ends up with a null or empty tag, or with a 'B'
 *         or 'I' tag shorter than three characters
 */
@Override
public void addView(TextAnnotation record) throws AnnotatorException {
    if (!record.hasView(tokensfield) || !record.hasView(sentencesfield) || !record.hasView(posfield)) {
        String msg = "Record must be tokenized, sentence split, and POS-tagged first.";
        logger.error(msg);
        throw new AnnotatorException(msg);
    }
    List<Constituent> tags = record.getView(posfield).getConstituents();
    List<Token> lbjTokens = LBJavaUtils.recordToLBJTokens(record);
    View chunkView = new SpanLabelView(ViewNames.SHALLOW_PARSE, this.NAME, record, 1.0);
    int currentChunkStart = 0;
    int currentChunkEnd = 0;
    // Label of the chunk in progress; null means "no chunk in progress". It must start out
    // null: an empty-string start value would be mistaken for an open chunk and emitted as a
    // constituent with an empty label as soon as the second token is reached.
    String clabel = null;
    Constituent previous = null;
    int tcounter = 0;
    for (Token lbjtoken : lbjTokens) {
        Constituent current = tags.get(tcounter);
        // The return value is ignored; the tag is read from lbjtoken.type below.
        // NOTE(review): presumably the tagger stores its prediction in lbjtoken.type -- confirm.
        tagger.discreteValue(lbjtoken);
        logger.debug("{} {}", lbjtoken.toString(), (null == lbjtoken.type) ? "NULL" : lbjtoken.type);
        // Fail with a descriptive message instead of a bare NullPointerException or
        // StringIndexOutOfBoundsException from the charAt calls below.
        if (null == lbjtoken.type || lbjtoken.type.isEmpty())
            throw new IllegalArgumentException("Chunker produced no label for token '" + lbjtoken + "'");
        char prefix = lbjtoken.type.charAt(0);
        // Both tag kinds are followed by substring(2) below, so both need a separator and a label.
        if ((prefix == 'I' || prefix == 'B') && lbjtoken.type.length() < 3)
            throw new IllegalArgumentException("Chunker word label '" + lbjtoken.type + "' is too short!");
        // what happens if we see an Inside tag -- even if it doesn't follow a Before tag
        if (prefix == 'I') {
            if (null == clabel) {
                // No chunk in progress (start of the text, or we just saw an Outside tag):
                // modify lbjToken.type for later ifs so that this token opens a chunk.
                lbjtoken.type = "B" + lbjtoken.type.substring(1);
            } else if (clabel.length() >= 3 && !clabel.equals(lbjtoken.type.substring(2))) {
                // The Inside tag names a different label than the chunk in progress.
                // NOTE(review): the length() >= 3 guard means two-letter labels never take
                // this branch -- confirm that this is intended.
                lbjtoken.type = "B" + lbjtoken.type.substring(1);
            }
        }
        if ((lbjtoken.type.charAt(0) == 'B' || lbjtoken.type.charAt(0) == 'O') && clabel != null) {
            if (previous != null) {
                // Close the chunk in progress at the end of the previous token.
                currentChunkEnd = previous.getEndSpan();
                Constituent label = new Constituent(clabel, ViewNames.SHALLOW_PARSE, record, currentChunkStart, currentChunkEnd);
                chunkView.addConstituent(label);
                clabel = null;
            }
        }
        if (lbjtoken.type.charAt(0) == 'B') {
            currentChunkStart = current.getStartSpan();
            clabel = lbjtoken.type.substring(2);
        }
        previous = current;
        tcounter++;
    }
    // Flush a chunk that is still open when the tokens run out.
    if (clabel != null && null != previous) {
        currentChunkEnd = previous.getEndSpan();
        Constituent label = new Constituent(clabel, ViewNames.SHALLOW_PARSE, record, currentChunkStart, currentChunkEnd);
        chunkView.addConstituent(label);
    }
    record.addView(ViewNames.SHALLOW_PARSE, chunkView);
}
Also used : AnnotatorException(edu.illinois.cs.cogcomp.annotation.AnnotatorException) Token(edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token)

Example 5 with Token

use of edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token in project cogcomp-nlp by CogComp.

The method classify of the class suffixFeatures.

/**
 * Produces lower-cased suffix features for words that are treated as unknown.
 *
 * While {@code POSTaggerUnknown.isTraining} is set, a word is unknown when
 * {@code __baselineTarget.observedCount} for its form is at most
 * {@code POSLabeledUnknownWordParser.threshold}; otherwise it is unknown when
 * {@code __baselineTarget} predicts "UNKNOWN" for it. For an unknown word
 * longer than three characters whose last character is a letter, the
 * one-character suffix is emitted. The two-character suffix follows when the
 * second-to-last character is a letter, and the three-character suffix when,
 * in addition, the word is longer than four characters and the third-to-last
 * character is a letter. Every feature has the suffix as its identifier and
 * the value "true".
 *
 * @param __example the example to classify; cast to {@code Token}
 * @return the suffix features, shortest suffix first; empty when none apply
 */
public FeatureVector classify(Object __example) {
    Token w = (Token) __example;
    FeatureVector __result = new FeatureVector();
    final String form = w.form;
    final int length = form.length();

    boolean unknown;
    if (POSTaggerUnknown.isTraining) {
        unknown = __baselineTarget.observedCount(form) <= POSLabeledUnknownWordParser.threshold;
    } else {
        unknown = __baselineTarget.discreteValue(w).equals("UNKNOWN");
    }

    if (!unknown || length <= 3 || !Character.isLetter(form.charAt(length - 1))) {
        return __result;
    }
    final String __value = "true";
    String __id = form.substring(length - 1).toLowerCase();
    __result.addFeature(new DiscretePrimitiveStringFeature(this.containingPackage, this.name, __id, __value, valueIndexOf(__value), (short) 0));

    if (!Character.isLetter(form.charAt(length - 2))) {
        return __result;
    }
    __id = form.substring(length - 2).toLowerCase();
    __result.addFeature(new DiscretePrimitiveStringFeature(this.containingPackage, this.name, __id, __value, valueIndexOf(__value), (short) 0));

    if (length <= 4 || !Character.isLetter(form.charAt(length - 3))) {
        return __result;
    }
    __id = form.substring(length - 3).toLowerCase();
    __result.addFeature(new DiscretePrimitiveStringFeature(this.containingPackage, this.name, __id, __value, valueIndexOf(__value), (short) 0));
    return __result;
}
Also used : Token(edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token)

Aggregations

Token (edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token)19 DiscretePrimitiveStringFeature (edu.illinois.cs.cogcomp.lbjava.classify.DiscretePrimitiveStringFeature)8 FeatureVector (edu.illinois.cs.cogcomp.lbjava.classify.FeatureVector)5 Parser (edu.illinois.cs.cogcomp.lbjava.parse.Parser)4 Feature (edu.illinois.cs.cogcomp.lbjava.classify.Feature)3 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)2 SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter)2 WordSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.WordSplitter)2 PlainToTokenParser (edu.illinois.cs.cogcomp.lbjava.nlp.seg.PlainToTokenParser)2 LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)2 Test (org.junit.Test)2 Chunker (edu.illinois.cs.cogcomp.chunker.main.lbjava.Chunker)1 CoNLL2000Parser (edu.illinois.cs.cogcomp.chunker.utils.CoNLL2000Parser)1 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)1 Word (edu.illinois.cs.cogcomp.lbjava.nlp.Word)1 POSBracketToToken (edu.illinois.cs.cogcomp.lbjava.nlp.seg.POSBracketToToken)1 ChildrenFromVectors (edu.illinois.cs.cogcomp.lbjava.parse.ChildrenFromVectors)1 POSTagger (edu.illinois.cs.cogcomp.pos.lbjava.POSTagger)1 ArrayList (java.util.ArrayList)1 LinkedList (java.util.LinkedList)1