Search in sources :

Example 51 with InputStreamReader

use of java.io.InputStreamReader in project CoreNLP by stanfordnlp.

the class SplitCanditoTrees method readTrees.

/**
 * Reads every tree from the given treebank files into a single map.
 *
 * <p>Each tree is stored under the key {@code <file base name>-<sentence id>}, where the base
 * name is the file name with its last extension removed (or the whole name when it has no
 * extension) and the sentence id is read from {@code CoreAnnotations.SentenceIDAnnotation} on
 * the tree's label. Entries are added with {@code Map.put}, so a repeated key replaces the
 * earlier tree. A per-file tree count is written to standard error.
 *
 * @param filenames paths of the files to read; each is decoded with the "ISO8859_1" charset
 * @return map from generated tree id to tree
 * @throws IOException propagated from opening, reading, or closing a file
 */
static Map<String, Tree> readTrees(String[] filenames) throws IOException {
    // TODO: perhaps we can just pass in CC_TAGSET and get rid of replacePOSTags
    // need to test that
    final TreeReaderFactory trf = new FrenchXMLTreeReaderFactory(false);
    Map<String, Tree> treeMap = Generics.newHashMap();
    for (String filename : filenames) {
        File file = new File(filename);
        String baseName = file.getName();
        // lastIndexOf returns -1 for a name without a dot; substring(0, -1) would throw.
        int dot = baseName.lastIndexOf('.');
        String canonicalFilename = dot >= 0 ? baseName.substring(0, dot) : baseName;
        FrenchXMLTreeReader tr = (FrenchXMLTreeReader) trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(file), "ISO8859_1")));
        int numTrees = 0;
        try {
            for (Tree t; (t = tr.readTree()) != null; numTrees++) {
                String id = canonicalFilename + "-" + ((CoreLabel) t.label()).get(CoreAnnotations.SentenceIDAnnotation.class);
                treeMap.put(id, t);
            }
        } finally {
            // Release the file handle even when a malformed file makes readTree() throw.
            tr.close();
        }
        System.err.printf("%s: %d trees%n", file.getName(), numTrees);
    }
    return treeMap;
}
Also used : FrenchXMLTreeReader(edu.stanford.nlp.trees.international.french.FrenchXMLTreeReader) InputStreamReader(java.io.InputStreamReader) FrenchXMLTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchXMLTreeReaderFactory) BufferedReader(java.io.BufferedReader) Tree(edu.stanford.nlp.trees.Tree) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) FrenchXMLTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchXMLTreeReaderFactory) File(java.io.File) FileInputStream(java.io.FileInputStream)

Example 52 with InputStreamReader

use of java.io.InputStreamReader in project CoreNLP by stanfordnlp.

the class SpanishTokenizer method main.

/**
 * A fast, rule-based tokenizer for Spanish based on AnCora.
 * Performs punctuation splitting and light tokenization by default.
 * <p>
 * Currently, this tokenizer does not do line splitting. It assumes that the input
 * file is delimited by the system line separator. The output will be equivalently
 * delimited.
 * </p>
 * <p>
 * Text is read from standard input and tokens are written to standard output; a summary
 * line with throughput is written to standard error. Options read from {@code args}:
 * {@code help}, {@code ancora}, {@code options}, {@code tokens}, {@code encoding}
 * (default "UTF-8"), {@code lowerCase} and {@code onePerLine}.
 * </p>
 *
 * @param args command-line flags, parsed with {@code StringUtils.argsToProperties}
 * @throws RuntimeIOException if the requested encoding is not supported
 */
public static void main(String[] args) {
    final Properties options = StringUtils.argsToProperties(args, argOptionDefs());
    if (options.containsKey("help")) {
        log.info(usage());
        return;
    }
    // Lexer options
    final TokenizerFactory<CoreLabel> tf = SpanishTokenizer.coreLabelFactory();
    String orthoOptions = options.containsKey("ancora") ? ANCORA_OPTIONS : "";
    if (options.containsKey("options")) {
        // Append the *value* of the "options" flag. Concatenating the Properties object itself
        // would splice its toString() dump ("{key=value, ...}") into the option string.
        final String userOptions = options.getProperty("options");
        orthoOptions = orthoOptions.isEmpty() ? userOptions : orthoOptions + ',' + userOptions;
    }
    final boolean tokens = PropertiesUtils.getBool(options, "tokens", false);
    if (!tokens) {
        // Newlines are tokenized so the output loop below can reproduce the input's line breaks.
        orthoOptions = orthoOptions.isEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
    }
    tf.setOptions(orthoOptions);
    // Other options
    final String encoding = options.getProperty("encoding", "UTF-8");
    final boolean toLower = PropertiesUtils.getBool(options, "lowerCase", false);
    final Locale es = new Locale("es");
    boolean onePerLine = PropertiesUtils.getBool(options, "onePerLine", false);
    // Read the file from stdin
    int nLines = 0;
    int nTokens = 0;
    final long startTime = System.nanoTime();
    try {
        Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new InputStreamReader(System.in, encoding));
        // True once a token has been printed on the current output line.
        boolean printSpace = false;
        while (tokenizer.hasNext()) {
            // Note: newline tokens are included in this count.
            ++nTokens;
            String word = tokenizer.next().word();
            if (word.equals(SpanishLexer.NEWLINE_TOKEN)) {
                ++nLines;
                System.out.println();
                if (!onePerLine) {
                    printSpace = false;
                }
            } else {
                String outputToken = toLower ? word.toLowerCase(es) : word;
                if (onePerLine) {
                    System.out.println(outputToken);
                } else {
                    if (printSpace) {
                        System.out.print(" ");
                    }
                    System.out.print(outputToken);
                    printSpace = true;
                }
            }
        }
    } catch (UnsupportedEncodingException e) {
        throw new RuntimeIOException("Bad character encoding", e);
    }
    long elapsedTime = System.nanoTime() - startTime;
    double linesPerSec = (double) nLines / (elapsedTime / 1e9);
    System.err.printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
}
Also used : Locale(java.util.Locale) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) InputStreamReader(java.io.InputStreamReader) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Properties(java.util.Properties) CoreLabel(edu.stanford.nlp.ling.CoreLabel)

Example 53 with InputStreamReader

use of java.io.InputStreamReader in project CoreNLP by stanfordnlp.

the class ConfusionMatrixTSV method main.

/**
 * Builds a confusion matrix from an answers file and prints it to standard output.
 *
 * <p>The file is read as UTF-8, one record per line. A line is split on single whitespace
 * characters and must yield exactly three fields; other lines are reported on standard error
 * and skipped. For each accepted line the third and second fields are passed, in that order,
 * to {@code ConfusionMatrix.add} (presumably guess then gold; confirm against that method).
 *
 * <p>I/O failures are reported with a stack trace and do not terminate the JVM abnormally.
 *
 * @param args {@code args[0]} is the path of the answers file; with no arguments a usage
 *     message is printed and the JVM exits with status -1
 */
public static void main(String[] args) {
    if (args.length < 1) {
        // NOTE(review): the usage line prints ConfusionMatrix's class name, while the page
        // attributes this main to ConfusionMatrixTSV; confirm which name is intended.
        System.err.printf("Usage: java %s answers_file%n", ConfusionMatrix.class.getName());
        System.exit(-1);
    }
    String answersFile = args[0];
    ConfusionMatrix<String> cm = new ConfusionMatrix<>();
    // try-with-resources closes the file on every exit path, including read failures.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(answersFile), "UTF-8"))) {
        for (String line = br.readLine(); line != null; line = br.readLine()) {
            String[] tokens = line.split("\\s");
            if (tokens.length != 3) {
                // Terminate the message so consecutive warnings do not run together.
                System.err.printf("ignoring bad line%n");
                continue;
            }
            cm.add(tokens[2], tokens[1]);
        }
        System.out.println(cm.toString());
    } catch (IOException e) {
        // UnsupportedEncodingException and FileNotFoundException are IOException subclasses,
        // and all three were handled identically.
        e.printStackTrace();
    }
}
Also used : ConfusionMatrix(edu.stanford.nlp.util.ConfusionMatrix) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) FileNotFoundException(java.io.FileNotFoundException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream)

Example 54 with InputStreamReader

use of java.io.InputStreamReader in project CoreNLP by stanfordnlp.

the class TreeToTSV method main.

/**
 * Converts a tree file into two-column, tab-separated output on standard output:
 * one {@code word<TAB>label} line per pre-terminal, with a blank line after each tree.
 *
 * <p>The label is chosen from a single type character. If the label of the node returned by
 * {@code t.ancestor(1, tree)} starts with {@code grup.nom.}, the character following that
 * prefix is used; otherwise, if the pre-terminal's own label matches {@code np0000?}, its last
 * character is used. The character maps to PERS ('p'), LUG ('l'), ORG ('o'), OTROS ('0'), and
 * anything else to O.
 *
 * <p>I/O failures are reported with a stack trace.
 *
 * @param args {@code args[0]} is the path of the tree file (read as UTF-8); with no arguments
 *     a usage message is printed and the JVM exits with status -1
 */
public static void main(String[] args) {
    if (args.length < 1) {
        System.err.printf("Usage: java %s tree_file%n", TreeToTSV.class.getName());
        System.exit(-1);
    }
    String treeFile = args[0];
    // try-with-resources guarantees the file handle is released even if parsing fails midway.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"))) {
        TreeReaderFactory trf = new SpanishTreeReaderFactory();
        TreeReader tr = trf.newTreeReader(br);
        StringBuilder sb = new StringBuilder();
        String nl = System.getProperty("line.separator");
        Pattern nePattern = Pattern.compile("^grup\\.nom\\.");
        Pattern npPattern = Pattern.compile("^np0000.$");
        // Index of the type character that follows the "grup.nom." prefix.
        final int neTypeIndex = "grup.nom.".length();
        for (Tree tree; (tree = tr.readTree()) != null; ) {
            for (Tree t : tree) {
                if (!t.isPreTerminal()) {
                    continue;
                }
                char type = 'O';
                Tree grandma = t.ancestor(1, tree);
                // Guard against a pre-terminal with no ancestor in this tree (assumes
                // ancestor() signals that with null; confirm against Tree.ancestor).
                String grandmaValue = grandma == null ? "" : ((CoreLabel) grandma.label()).value();
                // grup.nom.x: the length check keeps a bare "grup.nom." label from
                // triggering StringIndexOutOfBoundsException in charAt.
                if (grandmaValue.length() > neTypeIndex && nePattern.matcher(grandmaValue).find()) {
                    type = grandmaValue.charAt(neTypeIndex);
                } else {
                    // else check the pos for np0000x or not
                    String pos = ((CoreLabel) t.label()).value();
                    if (npPattern.matcher(pos).find()) {
                        type = pos.charAt(6);
                    }
                }
                Tree wordNode = t.firstChild();
                String word = ((CoreLabel) wordNode.label()).value();
                sb.append(word).append("\t");
                switch (type) {
                    case 'p':
                        sb.append("PERS");
                        break;
                    case 'l':
                        sb.append("LUG");
                        break;
                    case 'o':
                        sb.append("ORG");
                        break;
                    case '0':
                        sb.append("OTROS");
                        break;
                    default:
                        sb.append("O");
                }
                sb.append(nl);
            }
            sb.append(nl);
        }
        System.out.print(sb.toString());
        tr.close();
    } catch (IOException e) {
        // UnsupportedEncodingException and FileNotFoundException are IOException subclasses,
        // and all three were handled identically.
        e.printStackTrace();
    }
}
Also used : Pattern(java.util.regex.Pattern) InputStreamReader(java.io.InputStreamReader) FileNotFoundException(java.io.FileNotFoundException) TreeReader(edu.stanford.nlp.trees.TreeReader) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) SpanishTreeReaderFactory(edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory) CoreLabel(edu.stanford.nlp.ling.CoreLabel) BufferedReader(java.io.BufferedReader) Tree(edu.stanford.nlp.trees.Tree) SpanishTreeReaderFactory(edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory)

Example 55 with InputStreamReader

use of java.io.InputStreamReader in project CoreNLP by stanfordnlp.

the class AbstractBatchOptimizer method optimize.

/**
 * Runs training on a background thread and blocks until it finishes or is cut short.
 *
 * <p>A {@code TrainingWorker} is created from the arguments and started on a new thread.
 * In non-quiet mode this method polls both the worker's {@code isFinished} flag and standard
 * input; any pending input sets {@code isFinished} and returns the weights as they stand.
 * In quiet mode it waits on the worker's {@code naturalTerminationBarrier} until
 * {@code isFinished} becomes true.
 *
 * @param dataset the training examples, forwarded to the worker
 * @param fn the function being optimized, forwarded to the worker
 * @param initialWeights starting weights, forwarded to the worker
 * @param l2regularization forwarded to the worker
 * @param convergenceDerivativeNorm forwarded to the worker
 * @param quiet when true, suppresses the interactive prompt and stdin polling
 * @param <T> the type of a training example
 * @return the worker's {@code weights} field at the moment training ends or is interrupted
 * @throws RuntimeInterruptedException if the calling thread is interrupted while waiting
 */
public <T> ConcatVector optimize(T[] dataset, AbstractDifferentiableFunction<T> fn, ConcatVector initialWeights, double l2regularization, double convergenceDerivativeNorm, boolean quiet) {
    if (!quiet) {
        log.info("\n**************\nBeginning training\n");
    } else {
        log.info("[Beginning quiet training]");
    }
    TrainingWorker<T> mainWorker = new TrainingWorker<>(dataset, fn, initialWeights, l2regularization, convergenceDerivativeNorm, quiet);
    new Thread(mainWorker).start();
    if (!quiet) {
        // Only the interactive branch reads stdin, so the reader is created here.
        BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        log.info("NOTE: you can press any key (and maybe ENTER afterwards to jog stdin) to terminate learning early.");
        log.info("The convergence criteria are quite aggressive if left uninterrupted, and will run for a while");
        log.info("if left to their own devices.\n");
        while (true) {
            if (mainWorker.isFinished) {
                log.info("training completed without interruption");
                return mainWorker.weights;
            }
            try {
                if (br.ready()) {
                    log.info("received quit command: quitting");
                    log.info("training completed by interruption");
                    mainWorker.isFinished = true;
                    return mainWorker.weights;
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            try {
                // Pause between polls so this loop does not spin a core at 100% and
                // compete with the training thread for CPU.
                Thread.sleep(50L);
            } catch (InterruptedException e) {
                throw new RuntimeInterruptedException(e);
            }
        }
    } else {
        // Test the flag while holding the monitor: checking it before entering the
        // synchronized block lets a notify slip in between the check and wait(), after
        // which wait() would block forever.
        synchronized (mainWorker.naturalTerminationBarrier) {
            while (!mainWorker.isFinished) {
                try {
                    mainWorker.naturalTerminationBarrier.wait();
                } catch (InterruptedException e) {
                    throw new RuntimeInterruptedException(e);
                }
            }
        }
        log.info("[Quiet training complete]");
        return mainWorker.weights;
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) RuntimeInterruptedException(edu.stanford.nlp.util.RuntimeInterruptedException) BufferedReader(java.io.BufferedReader) IOException(java.io.IOException) RuntimeInterruptedException(edu.stanford.nlp.util.RuntimeInterruptedException)

Aggregations

InputStreamReader (java.io.InputStreamReader): 4861; BufferedReader (java.io.BufferedReader): 3402; IOException (java.io.IOException): 2108; InputStream (java.io.InputStream): 1272; FileInputStream (java.io.FileInputStream): 857; URL (java.net.URL): 605; ArrayList (java.util.ArrayList): 559; Reader (java.io.Reader): 518; File (java.io.File): 514; Test (org.junit.Test): 451; HttpURLConnection (java.net.HttpURLConnection): 290; ByteArrayInputStream (java.io.ByteArrayInputStream): 282; OutputStreamWriter (java.io.OutputStreamWriter): 241; FileNotFoundException (java.io.FileNotFoundException): 240; URLConnection (java.net.URLConnection): 227; HashMap (java.util.HashMap): 192; Socket (java.net.Socket): 178; OutputStream (java.io.OutputStream): 175; StringWriter (java.io.StringWriter): 148; BufferedWriter (java.io.BufferedWriter): 138