Use of edu.stanford.nlp.patterns.DataInstance in project CoreNLP by stanfordnlp.
From the class AnnotatedTextReader, method parseColumnFile.
/**
 * Reads column-formatted data through a {@link CoNLLDocumentReaderAndWriter} and groups the
 * tokens into sentences, one {@link DataInstance} per sentence.
 *
 * <p>A sentence ends at a token whose word equals {@code CoNLLDocumentReaderAndWriter.BOUNDARY}
 * or {@code "-DOCSTART-"}, or at the end of a list returned by the reader's iterator. Empty
 * sentences are skipped. Every kept token gets a 1-based {@code IndexAnnotation} within its
 * sentence, and its word copied into the value, text and original-text annotations.
 *
 * @param reader source of the column-formatted text
 * @param categoriesAllowed not read by this method
 * @param setClassForTheseLabels if non-null, a token whose answer label is a key of this map
 *     also gets that label stored under the mapped annotation class
 * @param setGoldClass when true, the answer label is also stored as the gold answer annotation
 * @param sentIDprefix prefix for sentence ids; ids have the form {@code prefix-N} where N counts
 *     up from 0 across the whole input
 * @return map from sentence id to the sentence's data instance (a {@code HashMap})
 */
public static Map<String, DataInstance> parseColumnFile(BufferedReader reader, Set<String> categoriesAllowed, Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels, boolean setGoldClass, String sentIDprefix) {
  CoNLLDocumentReaderAndWriter conllreader = new CoNLLDocumentReaderAndWriter();
  SeqClassifierFlags flags = new SeqClassifierFlags(new Properties());
  flags.entitySubclassification = "noprefix";
  flags.retainEntitySubclassification = false;
  conllreader.init(flags);
  Iterator<List<CoreLabel>> dociter = conllreader.getIterator(reader);

  // Id of the most recently stored sentence; -1 means none stored yet.
  int num = -1;
  Map<String, DataInstance> sents = new HashMap<>();
  while (dociter.hasNext()) {
    List<CoreLabel> sentcore = new ArrayList<>();
    int tokenindex = 0;
    for (CoreLabel l : dociter.next()) {
      String word = l.word();
      if (word.equals(CoNLLDocumentReaderAndWriter.BOUNDARY) || word.equals("-DOCSTART-")) {
        // Separator token: close the current sentence (if it has any tokens) and start a new one.
        if (!sentcore.isEmpty()) {
          num++;
          sents.put(sentIDprefix + "-" + num, DataInstance.getNewSurfaceInstance(sentcore));
          sentcore = new ArrayList<>();
          tokenindex = 0;
        }
        continue;
      }
      tokenindex++;
      l.set(CoreAnnotations.IndexAnnotation.class, tokenindex);
      l.set(CoreAnnotations.ValueAnnotation.class, word);
      String label = l.get(CoreAnnotations.AnswerAnnotation.class);
      assert label != null : "label cannot be null";
      l.set(CoreAnnotations.TextAnnotation.class, word);
      l.set(CoreAnnotations.OriginalTextAnnotation.class, word);
      if (setGoldClass) {
        l.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
      }
      if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label)) {
        l.set(setClassForTheseLabels.get(label), label);
      }
      sentcore.add(l);
    }
    // The token list may end without a trailing separator; store what is left.
    if (!sentcore.isEmpty()) {
      num++;
      sents.put(sentIDprefix + "-" + num, DataInstance.getNewSurfaceInstance(sentcore));
    }
  }
  return sents;
}
Use of edu.stanford.nlp.patterns.DataInstance in project CoreNLP by stanfordnlp.
From the class LearnImportantFeatures, method sample.
// public void getDecisionTree(Map<String, List<CoreLabel>> sents,
// List<Pair<String, Integer>> chosen, Counter<String> weights, String
// wekaOptions) {
// RVFDataset<String, String> dataset = new RVFDataset<String, String>();
// for (Pair<String, Integer> d : chosen) {
// CoreLabel l = sents.get(d.first).get(d.second());
// String w = l.word();
// Integer num = this.clusterIds.get(w);
// if (num == null)
// num = -1;
// double wt = weights.getCount("Cluster-" + num);
// String label;
// if (l.get(answerClass).toString().equals(answerLabel))
// label = answerLabel;
// else
// label = "O";
// Counter<String> feat = new ClassicCounter<String>();
// feat.setCount("DIST", wt);
// dataset.add(new RVFDatum<String, String>(feat, label));
// }
// WekaDatumClassifierFactory wekaFactory = new
// WekaDatumClassifierFactory("weka.classifiers.trees.J48", wekaOptions);
// WekaDatumClassifier classifier = wekaFactory.trainClassifier(dataset);
// Classifier cls = classifier.getClassifier();
// J48 j48decisiontree = (J48) cls;
// System.out.println(j48decisiontree.toSummaryString());
// System.out.println(j48decisiontree.toString());
//
// }
/**
 * Walks every token of every sentence and selects some of them as training examples, recording
 * each selection in both {@code chosen} and {@code dataset}.
 *
 * <p>A token is selected when, checked in this order:
 * <ol>
 *   <li>its {@code answerClass} value equals {@code answerLabel}; or</li>
 *   <li>its {@code answerClass} value is not {@code "O"} or its lower-cased word is in
 *       {@code negativeWords}, and {@code getRandomBoolean(r, perSelectNeg)} returns true; or</li>
 *   <li>{@code getRandomBoolean(r, perSelectRand)} returns true, in which case {@code numrand}
 *       is also incremented.</li>
 * </ol>
 * A token that qualifies for rule 2 but fails its random draw still gets the rule 3 draw.
 *
 * @param sents sentences keyed by sentence id
 * @param r random source passed to {@code getRandomBoolean} for both random rules
 * @param rneg not read by this method.
 *     NOTE(review): possibly intended for the rule 2 draw -- confirm before changing, since
 *     switching generators alters the sampling sequence
 * @param perSelectNeg second argument to {@code getRandomBoolean} for rule 2 (presumably a
 *     selection probability -- confirm against {@code getRandomBoolean})
 * @param perSelectRand second argument to {@code getRandomBoolean} for rule 3
 * @param numrand running count of tokens selected by rule 3 before this call
 * @param chosen output list; receives a (sentence id, token index) pair per selected token
 * @param dataset output dataset; receives the datum from {@code getDatum} for each selected
 *     token, together with the sentence id and the token index as a string
 * @return {@code numrand} plus the number of tokens selected by rule 3 during this call
 */
private int sample(Map<String, DataInstance> sents, Random r, Random rneg, double perSelectNeg, double perSelectRand, int numrand, List<Pair<String, Integer>> chosen, RVFDataset<String, String> dataset) {
  for (Entry<String, DataInstance> en : sents.entrySet()) {
    String sentId = en.getKey();
    CoreLabel[] sent = en.getValue().getTokens().toArray(new CoreLabel[0]);
    for (int i = 0; i < sent.length; i++) {
      CoreLabel l = sent[i];
      boolean chooseThis;
      // The order of these branches fixes how many random draws are consumed; keep it as is.
      if (l.get(answerClass).equals(answerLabel)) {
        chooseThis = true;
      } else if ((!l.get(answerClass).equals("O") || negativeWords.contains(l.word().toLowerCase())) && getRandomBoolean(r, perSelectNeg)) {
        chooseThis = true;
      } else if (getRandomBoolean(r, perSelectRand)) {
        numrand++;
        chooseThis = true;
      } else {
        chooseThis = false;
      }
      if (chooseThis) {
        // Diamond instead of the raw Pair type, so the element type is checked at compile time.
        chosen.add(new Pair<>(sentId, i));
        RVFDatum<String, String> d = getDatum(sent, i);
        dataset.add(d, sentId, Integer.toString(i));
      }
    }
  }
  return numrand;
}
Aggregations