Search in sources :

Example 1 with DataInstance

use of edu.stanford.nlp.patterns.DataInstance in project CoreNLP by stanfordnlp.

In the class AnnotatedTextReader, method parseColumnFile:

/**
 * Reads column-formatted data from {@code reader} and groups the tokens into sentences.
 *
 * <p>Tokens are obtained from a {@code CoNLLDocumentReaderAndWriter} initialised with
 * {@code entitySubclassification = "noprefix"} and {@code retainEntitySubclassification = false}.
 * A sentence is closed when a token's word equals {@code CoNLLDocumentReaderAndWriter.BOUNDARY}
 * or {@code "-DOCSTART-"}, and again at the end of every token list produced by the reader's
 * iterator. The marker tokens themselves are never stored, and empty sentences are skipped.
 *
 * <p>For every stored token the index annotation is set to its 1-based position in the sentence,
 * and the value, text and original-text annotations are set to the token's word.
 *
 * @param reader source of the column-formatted text
 * @param categoriesAllowed not read by this method
 * @param setClassForTheseLabels optional (may be {@code null}) map from an answer label to the
 *     annotation key under which that label is additionally stored on the token
 * @param setGoldClass when {@code true}, the answer label is also copied into the gold-answer
 *     annotation
 * @param sentIDprefix prefix of the generated sentence ids, which have the form
 *     {@code sentIDprefix + "-" + n} with {@code n} counting stored sentences from 0
 * @return map from generated sentence id to the surface {@code DataInstance} built from that
 *     sentence's tokens
 */
public static Map<String, DataInstance> parseColumnFile(BufferedReader reader, Set<String> categoriesAllowed, Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels, boolean setGoldClass, String sentIDprefix) {
    CoNLLDocumentReaderAndWriter columnReader = new CoNLLDocumentReaderAndWriter();
    SeqClassifierFlags readerFlags = new SeqClassifierFlags(new Properties());
    readerFlags.entitySubclassification = "noprefix";
    readerFlags.retainEntitySubclassification = false;
    columnReader.init(readerFlags);

    Map<String, DataInstance> sentencesById = new HashMap<>();
    // Number of sentences stored so far; doubles as the numeric suffix of the next sentence id.
    int sentencesStored = 0;
    for (Iterator<List<CoreLabel>> docs = columnReader.getIterator(reader); docs.hasNext(); ) {
        List<CoreLabel> sentence = new ArrayList<>();
        for (CoreLabel token : docs.next()) {
            String word = token.word();
            boolean atBoundary = word.equals(CoNLLDocumentReaderAndWriter.BOUNDARY) || word.equals("-DOCSTART-");
            if (atBoundary) {
                // Close the sentence collected so far (if any); the marker token is dropped.
                if (!sentence.isEmpty()) {
                    sentencesById.put(sentIDprefix + "-" + sentencesStored, DataInstance.getNewSurfaceInstance(sentence));
                    sentencesStored++;
                    sentence = new ArrayList<>();
                }
                continue;
            }
            // The token is appended at the end of this iteration, so its 1-based index is size + 1.
            token.set(CoreAnnotations.IndexAnnotation.class, sentence.size() + 1);
            token.set(CoreAnnotations.ValueAnnotation.class, word);
            String label = token.get(CoreAnnotations.AnswerAnnotation.class);
            assert label != null : "label cannot be null";
            token.set(CoreAnnotations.TextAnnotation.class, word);
            token.set(CoreAnnotations.OriginalTextAnnotation.class, word);
            if (setGoldClass) {
                token.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
            }
            if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label)) {
                token.set(setClassForTheseLabels.get(label), label);
            }
            sentence.add(token);
        }
        // Store whatever is left when the token list ends without a trailing boundary marker.
        if (!sentence.isEmpty()) {
            sentencesById.put(sentIDprefix + "-" + sentencesStored, DataInstance.getNewSurfaceInstance(sentence));
            sentencesStored++;
        }
    }
    return sentencesById;
}
Also used : DataInstance(edu.stanford.nlp.patterns.DataInstance) SeqClassifierFlags(edu.stanford.nlp.sequences.SeqClassifierFlags) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoNLLDocumentReaderAndWriter(edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter)

Example 2 with DataInstance

use of edu.stanford.nlp.patterns.DataInstance in project CoreNLP by stanfordnlp.

In the class LearnImportantFeatures, method sample:

// public void getDecisionTree(Map<String, List<CoreLabel>> sents,
// List<Pair<String, Integer>> chosen, Counter<String> weights, String
// wekaOptions) {
// RVFDataset<String, String> dataset = new RVFDataset<String, String>();
// for (Pair<String, Integer> d : chosen) {
// CoreLabel l = sents.get(d.first).get(d.second());
// String w = l.word();
// Integer num = this.clusterIds.get(w);
// if (num == null)
// num = -1;
// double wt = weights.getCount("Cluster-" + num);
// String label;
// if (l.get(answerClass).toString().equals(answerLabel))
// label = answerLabel;
// else
// label = "O";
// Counter<String> feat = new ClassicCounter<String>();
// feat.setCount("DIST", wt);
// dataset.add(new RVFDatum<String, String>(feat, label));
// }
// WekaDatumClassifierFactory wekaFactory = new
// WekaDatumClassifierFactory("weka.classifiers.trees.J48", wekaOptions);
// WekaDatumClassifier classifier = wekaFactory.trainClassifier(dataset);
// Classifier cls = classifier.getClassifier();
// J48 j48decisiontree = (J48) cls;
// System.out.println(j48decisiontree.toSummaryString());
// System.out.println(j48decisiontree.toString());
//
// }
/**
 * Walks every token of every sentence in {@code sents} and selects some of them as data points.
 *
 * <p>A token is selected when the first of these conditions holds, checked in this order:
 * <ol>
 *   <li>its {@code answerClass} annotation equals {@code answerLabel};</li>
 *   <li>its {@code answerClass} annotation is not {@code "O"} or its lower-cased word is in
 *       {@code negativeWords}, and {@code getRandomBoolean(r, perSelectNeg)} returns true;</li>
 *   <li>{@code getRandomBoolean(r, perSelectRand)} returns true.</li>
 * </ol>
 * For each selected token, the pair (sentence id, token index) is appended to {@code chosen} and
 * the datum returned by {@code getDatum(sent, i)} is added to {@code dataset} together with the
 * sentence id and the token index as a string.
 *
 * <p>NOTE(review): {@code rneg} is never read; the negative-sampling draw uses {@code r}. Confirm
 * whether that is intended before switching generators, since doing so changes which tokens are
 * sampled.
 *
 * @param sents sentences keyed by sentence id
 * @param r random source passed to {@code getRandomBoolean} for both sampling draws
 * @param rneg not read by this method
 * @param perSelectNeg second argument of the draw in condition 2 (presumably a selection
 *     probability; confirm against {@code getRandomBoolean})
 * @param perSelectRand second argument of the draw in condition 3 (presumably a selection
 *     probability; confirm against {@code getRandomBoolean})
 * @param numrand starting value of the random-selection counter
 * @param chosen output list receiving one (sentence id, token index) pair per selected token
 * @param dataset output dataset receiving one datum per selected token
 * @return {@code numrand} plus the number of tokens selected through condition 3
 */
private int sample(Map<String, DataInstance> sents, Random r, Random rneg, double perSelectNeg, double perSelectRand, int numrand, List<Pair<String, Integer>> chosen, RVFDataset<String, String> dataset) {
    for (Entry<String, DataInstance> en : sents.entrySet()) {
        String sentId = en.getKey();
        CoreLabel[] sent = en.getValue().getTokens().toArray(new CoreLabel[0]);
        for (int i = 0; i < sent.length; i++) {
            CoreLabel l = sent[i];
            // Short-circuit order matters: it fixes how many draws are taken from r per token.
            boolean chooseThis = l.get(answerClass).equals(answerLabel)
                || ((!l.get(answerClass).equals("O") || negativeWords.contains(l.word().toLowerCase())) && getRandomBoolean(r, perSelectNeg));
            if (!chooseThis && getRandomBoolean(r, perSelectRand)) {
                // Only purely random picks count towards numrand.
                numrand++;
                chooseThis = true;
            }
            if (chooseThis) {
                // Diamond instead of the raw Pair type keeps the add() type-checked.
                chosen.add(new Pair<>(sentId, i));
                RVFDatum<String, String> d = getDatum(sent, i);
                dataset.add(d, sentId, Integer.toString(i));
            }
        }
    }
    return numrand;
}
Also used : DataInstance(edu.stanford.nlp.patterns.DataInstance) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Pair(edu.stanford.nlp.util.Pair)

Aggregations

CoreLabel (edu.stanford.nlp.ling.CoreLabel): 2, DataInstance (edu.stanford.nlp.patterns.DataInstance): 2, CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 1, CoNLLDocumentReaderAndWriter (edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter): 1, SeqClassifierFlags (edu.stanford.nlp.sequences.SeqClassifierFlags): 1, Pair (edu.stanford.nlp.util.Pair): 1