Usage of java.io.InputStreamReader in the stanfordnlp/CoreNLP project: class SplitCanditoTrees, method readTrees.
/**
 * Reads all trees from the given treebank files into a map keyed by
 * "&lt;basename&gt;-&lt;sentenceId&gt;" (file extension stripped).
 *
 * @param filenames paths of the French XML treebank files to read
 * @return map from tree id to parsed {@code Tree}
 * @throws IOException if a file cannot be opened or read
 */
static Map<String, Tree> readTrees(String[] filenames) throws IOException {
  // TODO: perhaps we can just pass in CC_TAGSET and get rid of replacePOSTags
  // need to test that
  final TreeReaderFactory trf = new FrenchXMLTreeReaderFactory(false);
  Map<String, Tree> treeMap = Generics.newHashMap();
  for (String filename : filenames) {
    File file = new File(filename);
    // Key prefix is the filename without its extension.
    String canonicalFilename = file.getName().substring(0, file.getName().lastIndexOf('.'));
    // try-with-resources: the original leaked the reader if readTree() threw.
    // NOTE: the treebank files are Latin-1 encoded, hence ISO8859_1.
    try (FrenchXMLTreeReader tr = (FrenchXMLTreeReader) trf.newTreeReader(
             new BufferedReader(new InputStreamReader(new FileInputStream(file), "ISO8859_1")))) {
      int numTrees = 0;
      for (Tree t; (t = tr.readTree()) != null; numTrees++) {
        // Sentence id comes from the tree's root label annotation.
        String id = canonicalFilename + "-" + ((CoreLabel) t.label()).get(CoreAnnotations.SentenceIDAnnotation.class);
        treeMap.put(id, t);
      }
      System.err.printf("%s: %d trees%n", file.getName(), numTrees);
    }
  }
  return treeMap;
}
Usage of java.io.InputStreamReader in the stanfordnlp/CoreNLP project: class SpanishTokenizer, method main.
/**
 * A fast, rule-based tokenizer for Spanish based on AnCora.
 * Performs punctuation splitting and light tokenization by default.
 * <p>
 * Currently, this tokenizer does not do line splitting. It assumes that the input
 * file is delimited by the system line separator. The output will be equivalently
 * delimited.
 * </p>
 *
 * @param args command-line options; see {@code argOptionDefs()} / {@code usage()}
 */
public static void main(String[] args) {
  final Properties options = StringUtils.argsToProperties(args, argOptionDefs());
  if (options.containsKey("help")) {
    log.info(usage());
    return;
  }
  // Lexer options
  final TokenizerFactory<CoreLabel> tf = SpanishTokenizer.coreLabelFactory();
  String orthoOptions = options.containsKey("ancora") ? ANCORA_OPTIONS : "";
  if (options.containsKey("options")) {
    // BUG FIX: the original appended the whole Properties object
    // (orthoOptions + ',' + options) instead of the "options" property value,
    // producing a garbage option string like "{options=..., ...}".
    String extraOptions = options.getProperty("options");
    orthoOptions = orthoOptions.isEmpty() ? extraOptions : orthoOptions + ',' + extraOptions;
  }
  final boolean tokens = PropertiesUtils.getBool(options, "tokens", false);
  if (!tokens) {
    // Newline tokens are needed to reproduce line boundaries in the output.
    orthoOptions = orthoOptions.isEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
  }
  tf.setOptions(orthoOptions);
  // Other options
  final String encoding = options.getProperty("encoding", "UTF-8");
  final boolean toLower = PropertiesUtils.getBool(options, "lowerCase", false);
  final Locale es = new Locale("es");
  boolean onePerLine = PropertiesUtils.getBool(options, "onePerLine", false);
  // Read the file from stdin
  int nLines = 0;
  int nTokens = 0;
  final long startTime = System.nanoTime();
  try {
    Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new InputStreamReader(System.in, encoding));
    // Tracks whether a separating space must precede the next token on the line.
    boolean printSpace = false;
    while (tokenizer.hasNext()) {
      ++nTokens;
      String word = tokenizer.next().word();
      if (word.equals(SpanishLexer.NEWLINE_TOKEN)) {
        ++nLines;
        System.out.println();
        if (!onePerLine) {
          printSpace = false;
        }
      } else {
        String outputToken = toLower ? word.toLowerCase(es) : word;
        if (onePerLine) {
          System.out.println(outputToken);
        } else {
          if (printSpace) {
            System.out.print(" ");
          }
          System.out.print(outputToken);
          printSpace = true;
        }
      }
    }
  } catch (UnsupportedEncodingException e) {
    throw new RuntimeIOException("Bad character encoding", e);
  }
  long elapsedTime = System.nanoTime() - startTime;
  double linesPerSec = (double) nLines / (elapsedTime / 1e9);
  System.err.printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
}
Usage of java.io.InputStreamReader in the stanfordnlp/CoreNLP project: class ConfusionMatrixTSV, method main.
/**
 * Builds and prints a confusion matrix from a whitespace-delimited answers file.
 * Each line is expected to have three fields; field 3 is the guess and
 * field 2 the gold label (field 1 is ignored).
 *
 * @param args args[0] is the path to the answers file
 */
public static void main(String[] args) {
  if (args.length < 1) {
    System.err.printf("Usage: java %s answers_file%n", ConfusionMatrix.class.getName());
    System.exit(-1);
  }
  ConfusionMatrix<String> cm = new ConfusionMatrix<>();
  String answersFile = args[0];
  // try-with-resources: the original never closed the reader.
  try (BufferedReader br = new BufferedReader(
           new InputStreamReader(new FileInputStream(answersFile), "UTF-8"))) {
    for (String line = br.readLine(); line != null; line = br.readLine()) {
      String[] tokens = line.split("\\s");
      if (tokens.length != 3) {
        // Include the offending line and a newline (the original printed neither).
        System.err.printf("ignoring bad line: %s%n", line);
        continue;
      }
      cm.add(tokens[2], tokens[1]);
    }
    System.out.println(cm.toString());
  } catch (IOException e) {
    // IOException covers UnsupportedEncodingException and FileNotFoundException,
    // which the original caught in separate redundant clauses.
    e.printStackTrace();
  }
}
Usage of java.io.InputStreamReader in the stanfordnlp/CoreNLP project: class TreeToTSV, method main.
/**
 * Converts a Spanish treebank file to TSV named-entity format:
 * one token per line ("word\tTYPE"), blank line between sentences.
 * NE type is taken from a "grup.nom.x" ancestor label or an "np0000x" POS tag;
 * everything else is tagged "O".
 *
 * @param args args[0] is the path to the tree file
 */
public static void main(String[] args) {
  if (args.length < 1) {
    System.err.printf("Usage: java %s tree_file%n", TreeToTSV.class.getName());
    System.exit(-1);
  }
  String treeFile = args[0];
  // try-with-resources: the original leaked both readers on exception paths.
  try (BufferedReader br = new BufferedReader(
           new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
       TreeReader tr = new SpanishTreeReaderFactory().newTreeReader(br)) {
    StringBuilder sb = new StringBuilder();
    String nl = System.getProperty("line.separator");
    Pattern nePattern = Pattern.compile("^grup\\.nom\\.");
    Pattern npPattern = Pattern.compile("^np0000.$");
    for (Tree tree; (tree = tr.readTree()) != null; ) {
      for (Tree t : tree) {
        if (!t.isPreTerminal())
          continue;
        char type = 'O';
        Tree grandma = t.ancestor(1, tree);
        String grandmaValue = ((CoreLabel) grandma.label()).value();
        // grup.nom.x — the char after "grup.nom." encodes the NE type
        if (nePattern.matcher(grandmaValue).find()) {
          type = grandmaValue.charAt(9);
        } else {
          // else check the pos for np0000x or not
          String pos = ((CoreLabel) t.label()).value();
          if (npPattern.matcher(pos).find())
            type = pos.charAt(6);
        }
        Tree wordNode = t.firstChild();
        String word = ((CoreLabel) wordNode.label()).value();
        sb.append(word).append("\t");
        switch (type) {
          case 'p':
            sb.append("PERS");
            break;
          case 'l':
            sb.append("LUG");
            break;
          case 'o':
            sb.append("ORG");
            break;
          case '0':
            sb.append("OTROS");
            break;
          default:
            sb.append("O");
        }
        sb.append(nl);
      }
      sb.append(nl);
    }
    System.out.print(sb.toString());
  } catch (IOException e) {
    // IOException covers UnsupportedEncodingException and FileNotFoundException,
    // which the original caught in separate redundant clauses.
    e.printStackTrace();
  }
}
Usage of java.io.InputStreamReader in the stanfordnlp/CoreNLP project: class AbstractBatchOptimizer, method optimize.
/**
 * Runs batch optimization of {@code fn} over {@code dataset} on a background
 * training thread, returning the learned weights.
 * <p>
 * In interactive (non-quiet) mode the calling thread polls stdin so the user
 * can terminate training early by pressing a key; in quiet mode it simply
 * waits on the worker's termination barrier.
 *
 * @param dataset training examples
 * @param fn differentiable objective to optimize
 * @param initialWeights starting weight vector
 * @param l2regularization L2 regularization coefficient
 * @param convergenceDerivativeNorm derivative-norm threshold for convergence
 * @param quiet if true, suppress interactive logging and stdin polling
 * @return the final weight vector
 */
public <T> ConcatVector optimize(T[] dataset, AbstractDifferentiableFunction<T> fn, ConcatVector initialWeights, double l2regularization, double convergenceDerivativeNorm, boolean quiet) {
    if (!quiet)
        log.info("\n**************\nBeginning training\n");
    else
        log.info("[Beginning quiet training]");
    TrainingWorker<T> mainWorker = new TrainingWorker<>(dataset, fn, initialWeights, l2regularization, convergenceDerivativeNorm, quiet);
    new Thread(mainWorker).start();
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    if (!quiet) {
        log.info("NOTE: you can press any key (and maybe ENTER afterwards to jog stdin) to terminate learning early.");
        log.info("The convergence criteria are quite aggressive if left uninterrupted, and will run for a while");
        log.info("if left to their own devices.\n");
        while (true) {
            if (mainWorker.isFinished) {
                log.info("training completed without interruption");
                return mainWorker.weights;
            }
            try {
                if (br.ready()) {
                    log.info("received quit command: quitting");
                    log.info("training completed by interruption");
                    mainWorker.isFinished = true;
                    return mainWorker.weights;
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            // FIX: the original spun in a hot loop, burning a full CPU core while
            // polling. A short sleep keeps the keypress response snappy (<=100ms)
            // without the busy-wait.
            try {
                Thread.sleep(100);
            } catch (InterruptedException e) {
                throw new RuntimeInterruptedException(e);
            }
        }
    } else {
        // Quiet mode: block on the worker's barrier instead of polling.
        while (!mainWorker.isFinished) {
            synchronized (mainWorker.naturalTerminationBarrier) {
                try {
                    mainWorker.naturalTerminationBarrier.wait();
                } catch (InterruptedException e) {
                    throw new RuntimeInterruptedException(e);
                }
            }
        }
        log.info("[Quiet training complete]");
        return mainWorker.weights;
    }
}
Aggregations