Use of java.io.FileNotFoundException in project CoreNLP by stanfordnlp.
The class ATBArabicDataset, method build().
public void build() {
  for (File path : pathsToData) {
    if (splitFilter == null) {
      treebank.loadPath(path, treeFileExtension, false);
    } else {
      treebank.loadPath(path, splitFilter);
    }
  }
  PrintWriter outfile = null;
  PrintWriter flatFile = null;
  try {
    outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8")));
    flatFile = (makeFlatFile) ? new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(flatFileName), "UTF-8"))) : null;
    treebank.apply(new ArabicRawTreeNormalizer(outfile, flatFile));
    outputFileList.add(outFileName);
    if (makeFlatFile) {
      outputFileList.add(flatFileName);
      toStringBuffer.append(" Made flat files\n");
    }
  } catch (UnsupportedEncodingException e) {
    System.err.printf("%s: Filesystem does not support UTF-8 output\n", this.getClass().getName());
    e.printStackTrace();
  } catch (FileNotFoundException e) {
    System.err.printf("%s: Could not open %s for writing\n", this.getClass().getName(), outFileName);
  } finally {
    if (outfile != null)
      outfile.close();
    if (flatFile != null)
      flatFile.close();
  }
}
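For contrast, a minimal self-contained sketch (not CoreNLP code) of the same open-write-close pattern using try-with-resources, which makes the explicit finally block unnecessary; the file name and the line written are made up:

import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;

public class Utf8WriterDemo {
  public static void main(String[] args) {
    String outFileName = "output.txt"; // hypothetical path, stands in for outFileName above
    // try-with-resources closes the writer automatically, replacing the explicit finally block
    try (PrintWriter outfile = new PrintWriter(new BufferedWriter(
        new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8")))) {
      outfile.println("normalized tree would be written here");
    } catch (UnsupportedEncodingException e) {
      System.err.printf("%s: Filesystem does not support UTF-8 output%n", Utf8WriterDemo.class.getName());
    } catch (FileNotFoundException e) {
      System.err.printf("%s: Could not open %s for writing%n", Utf8WriterDemo.class.getName(), outFileName);
    }
  }
}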
Use of java.io.FileNotFoundException in project CoreNLP by stanfordnlp.
The class ArabicSegmenter, method evaluate().
/**
 * Evaluate accuracy when the input is gold segmented text *with* segmentation
 * markers and morphological analyses. In other words, the evaluation file has the
 * same format as the training data.
 *
 * @param pwOut Writer to which the evaluation report is printed
 */
private void evaluate(PrintWriter pwOut) {
  log.info("Starting evaluation...");
  boolean hasSegmentationMarkers = true;
  boolean hasTags = true;
  DocumentReaderAndWriter<CoreLabel> docReader = new ArabicDocumentReaderAndWriter(hasSegmentationMarkers, hasTags, hasDomainLabels, domain, tf);
  ObjectBank<List<CoreLabel>> lines = classifier.makeObjectBankFromFile(flags.testFile, docReader);
  PrintWriter tedEvalGoldTree = null, tedEvalParseTree = null;
  PrintWriter tedEvalGoldSeg = null, tedEvalParseSeg = null;
  if (tedEvalPrefix != null) {
    try {
      tedEvalGoldTree = new PrintWriter(tedEvalPrefix + "_gold.ftree");
      tedEvalGoldSeg = new PrintWriter(tedEvalPrefix + "_gold.segmentation");
      tedEvalParseTree = new PrintWriter(tedEvalPrefix + "_parse.ftree");
      tedEvalParseSeg = new PrintWriter(tedEvalPrefix + "_parse.segmentation");
    } catch (FileNotFoundException e) {
      System.err.printf("%s: %s%n", ArabicSegmenter.class.getName(), e.getMessage());
    }
  }
  Counter<String> labelTotal = new ClassicCounter<>();
  Counter<String> labelCorrect = new ClassicCounter<>();
  int total = 0;
  int correct = 0;
  for (List<CoreLabel> line : lines) {
    final String[] inputTokens = tedEvalSanitize(IOBUtils.IOBToString(line).replaceAll(":", "#pm#")).split(" ");
    final String[] goldTokens = tedEvalSanitize(IOBUtils.IOBToString(line, ":")).split(" ");
    line = classifier.classify(line);
    final String[] parseTokens = tedEvalSanitize(IOBUtils.IOBToString(line, ":")).split(" ");
    for (CoreLabel label : line) {
      // Do not evaluate labeling of whitespace
      String observation = label.get(CoreAnnotations.CharAnnotation.class);
      if (!observation.equals(IOBUtils.getBoundaryCharacter())) {
        total++;
        String hypothesis = label.get(CoreAnnotations.AnswerAnnotation.class);
        String reference = label.get(CoreAnnotations.GoldAnswerAnnotation.class);
        labelTotal.incrementCount(reference);
        if (hypothesis.equals(reference)) {
          correct++;
          labelCorrect.incrementCount(reference);
        }
      }
    }
    if (tedEvalParseSeg != null) {
      tedEvalGoldTree.printf("(root");
      tedEvalParseTree.printf("(root");
      int safeLength = inputTokens.length;
      if (inputTokens.length != goldTokens.length) {
        log.info("In generating TEDEval files: Input and gold do not have the same number of tokens");
        log.info(" (ignoring any extras)");
        log.info(" input: " + Arrays.toString(inputTokens));
        log.info(" gold: " + Arrays.toString(goldTokens));
        safeLength = Math.min(inputTokens.length, goldTokens.length);
      }
      if (inputTokens.length != parseTokens.length) {
        log.info("In generating TEDEval files: Input and parse do not have the same number of tokens");
        log.info(" (ignoring any extras)");
        log.info(" input: " + Arrays.toString(inputTokens));
        log.info(" parse: " + Arrays.toString(parseTokens));
        safeLength = Math.min(inputTokens.length, parseTokens.length);
      }
      for (int i = 0; i < safeLength; i++) {
        for (String segment : goldTokens[i].split(":")) tedEvalGoldTree.printf(" (seg %s)", segment);
        tedEvalGoldSeg.printf("%s\t%s%n", inputTokens[i], goldTokens[i]);
        for (String segment : parseTokens[i].split(":")) tedEvalParseTree.printf(" (seg %s)", segment);
        tedEvalParseSeg.printf("%s\t%s%n", inputTokens[i], parseTokens[i]);
      }
      tedEvalGoldTree.printf(")%n");
      tedEvalGoldSeg.println();
      tedEvalParseTree.printf(")%n");
      tedEvalParseSeg.println();
    }
  }
  double accuracy = ((double) correct) / ((double) total);
  accuracy *= 100.0;
  pwOut.println("EVALUATION RESULTS");
  pwOut.printf("#datums:\t%d%n", total);
  pwOut.printf("#correct:\t%d%n", correct);
  pwOut.printf("accuracy:\t%.2f%n", accuracy);
  pwOut.println("==================");
  // Output the per label accuracies
  pwOut.println("PER LABEL ACCURACIES");
  for (String refLabel : labelTotal.keySet()) {
    double nTotal = labelTotal.getCount(refLabel);
    double nCorrect = labelCorrect.getCount(refLabel);
    double acc = (nCorrect / nTotal) * 100.0;
    pwOut.printf(" %s\t%.2f%n", refLabel, acc);
  }
  if (tedEvalParseSeg != null) {
    tedEvalGoldTree.close();
    tedEvalGoldSeg.close();
    tedEvalParseTree.close();
    tedEvalParseSeg.close();
  }
}
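The bookkeeping above uses CoreNLP's Counter and ClassicCounter. A minimal sketch of the same overall and per-label accuracy computation using only java.util maps, with made-up (reference, hypothesis) label pairs standing in for the classifier output:

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class PerLabelAccuracyDemo {
  public static void main(String[] args) {
    // (reference, hypothesis) pairs; hypothetical data standing in for classified CoreLabels
    List<String[]> pairs = List.of(
        new String[] {"B", "B"}, new String[] {"I", "B"},
        new String[] {"I", "I"}, new String[] {"B", "B"});
    Map<String, Integer> labelTotal = new HashMap<>();
    Map<String, Integer> labelCorrect = new HashMap<>();
    int total = 0, correct = 0;
    for (String[] p : pairs) {
      String reference = p[0], hypothesis = p[1];
      total++;
      labelTotal.merge(reference, 1, Integer::sum);
      if (hypothesis.equals(reference)) {
        correct++;
        labelCorrect.merge(reference, 1, Integer::sum);
      }
    }
    System.out.printf("accuracy:\t%.2f%n", 100.0 * correct / total);
    for (Map.Entry<String, Integer> e : labelTotal.entrySet()) {
      int nCorrect = labelCorrect.getOrDefault(e.getKey(), 0);
      System.out.printf(" %s\t%.2f%n", e.getKey(), 100.0 * nCorrect / e.getValue());
    }
  }
}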
Use of java.io.FileNotFoundException in project CoreNLP by stanfordnlp.
The class FTBDataset, method makeSplitSet().
private Set<String> makeSplitSet(String splitFileName) {
  splitFileName = DataFilePaths.convert(splitFileName);
  Set<String> splitSet = Generics.newHashSet();
  LineNumberReader reader = null;
  try {
    reader = new LineNumberReader(new FileReader(splitFileName));
    for (String line; (line = reader.readLine()) != null; ) {
      splitSet.add(line.trim());
    }
    reader.close();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    System.err.printf("%s: Error reading %s (line %d)%n", this.getClass().getName(), splitFileName, reader.getLineNumber());
    e.printStackTrace();
  }
  return splitSet;
}
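A minimal sketch of the same split-file reading with try-with-resources, so the reader is closed even when an exception is thrown (line-number reporting omitted); "split.txt" is a hypothetical path:

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

public class SplitSetDemo {
  // Read every line of a split file into a set, trimming whitespace.
  static Set<String> makeSplitSet(String splitFileName) {
    Set<String> splitSet = new HashSet<>();
    try (BufferedReader reader = new BufferedReader(new FileReader(splitFileName))) {
      for (String line; (line = reader.readLine()) != null; ) {
        splitSet.add(line.trim());
      }
    } catch (FileNotFoundException e) {
      System.err.printf("Split file not found: %s%n", splitFileName);
    } catch (IOException e) {
      System.err.printf("Error reading %s%n", splitFileName);
    }
    return splitSet;
  }

  public static void main(String[] args) {
    System.out.println(makeSplitSet("split.txt"));
  }
}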
Use of java.io.FileNotFoundException in project CoreNLP by stanfordnlp.
The class ConfusionMatrixTSV, method main().
public static void main(String[] args) {
  if (args.length < 1) {
    System.err.printf("Usage: java %s answers_file%n", ConfusionMatrix.class.getName());
    System.exit(-1);
  }
  try {
    ConfusionMatrix<String> cm = new ConfusionMatrix<>();
    String answersFile = args[0];
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(answersFile), "UTF-8"));
    String line = br.readLine();
    for (; line != null; line = br.readLine()) {
      String[] tokens = line.split("\\s");
      if (tokens.length != 3) {
        System.err.printf("ignoring bad line");
        continue;
        //System.exit(-1);
      }
      cm.add(tokens[2], tokens[1]);
    }
    System.out.println(cm.toString());
  } catch (UnsupportedEncodingException e) {
    e.printStackTrace();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
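The answers file is read as three whitespace-separated columns per line, with the third and second columns passed to cm.add. A tiny sketch with a made-up line showing the same parsing and length check (which column holds the gold label versus the guess is not specified by the snippet, so this only mirrors the call):

public class AnswerLineDemo {
  public static void main(String[] args) {
    // Made-up line in the expected three-column, whitespace-separated format
    String line = "casa\tLUG\tORG";
    String[] tokens = line.split("\\s");
    if (tokens.length != 3) {
      System.err.println("ignoring bad line");
    } else {
      System.out.printf("would call cm.add(%s, %s)%n", tokens[2], tokens[1]);
    }
  }
}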
Use of java.io.FileNotFoundException in project CoreNLP by stanfordnlp.
The class TreeToTSV, method main().
public static void main(String[] args) {
  if (args.length < 1) {
    System.err.printf("Usage: java %s tree_file%n", TreeToTSV.class.getName());
    System.exit(-1);
  }
  String treeFile = args[0];
  try {
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
    TreeReaderFactory trf = new SpanishTreeReaderFactory();
    TreeReader tr = trf.newTreeReader(br);
    StringBuilder sb = new StringBuilder();
    String nl = System.getProperty("line.separator");
    Pattern nePattern = Pattern.compile("^grup\\.nom\\.");
    Pattern npPattern = Pattern.compile("^np0000.$");
    for (Tree tree; (tree = tr.readTree()) != null; ) {
      for (Tree t : tree) {
        if (!t.isPreTerminal())
          continue;
        char type = 'O';
        Tree grandma = t.ancestor(1, tree);
        String grandmaValue = ((CoreLabel) grandma.label()).value();
        // grup.nom.x
        if (nePattern.matcher(grandmaValue).find()) {
          type = grandmaValue.charAt(9);
        } else {
          // else check the pos for np0000x or not
          String pos = ((CoreLabel) t.label()).value();
          if (npPattern.matcher(pos).find())
            type = pos.charAt(6);
        }
        Tree wordNode = t.firstChild();
        String word = ((CoreLabel) wordNode.label()).value();
        sb.append(word).append("\t");
        switch (type) {
          case 'p':
            sb.append("PERS");
            break;
          case 'l':
            sb.append("LUG");
            break;
          case 'o':
            sb.append("ORG");
            break;
          case '0':
            sb.append("OTROS");
            break;
          default:
            sb.append("O");
        }
        sb.append(nl);
      }
      sb.append(nl);
    }
    System.out.print(sb.toString());
    tr.close();
  } catch (UnsupportedEncodingException e) {
    e.printStackTrace();
  } catch (FileNotFoundException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
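The indices charAt(9) and charAt(6) above pick out the named-entity type character: the character following the "grup.nom." prefix of the parent constituent, and the final character of an np0000x POS tag. A small sketch with made-up label values:

public class SpanishNeTypeDemo {
  public static void main(String[] args) {
    // Made-up label values in the style matched above: a "grup.nom.x" constituent
    // and an "np0000x" POS tag, where x encodes the named-entity type.
    String grandmaValue = "grup.nom.p"; // 'p' -> PERS
    String pos = "np00000";             // '0' -> OTROS
    char typeFromConstituent = grandmaValue.charAt(9); // character after "grup.nom."
    char typeFromPos = pos.charAt(6);                  // last character of np0000x
    System.out.println(typeFromConstituent + " " + typeFromPos);
  }
}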