Search in sources :

Example 41 with RuntimeIOException

Use of edu.stanford.nlp.io.RuntimeIOException in the CoreNLP project by stanfordnlp.

The class is DependencyParser; the method shown is setupClassifierForTraining.

/**
 * Prepare a classifier for training with the given dataset.
 *
 * <p>Allocates the network parameters (embedding matrix {@code E}, hidden
 * layer {@code W1} / {@code b1}, output layer {@code W2}) and initializes
 * them randomly, then optionally overrides parts of them:
 * <ol>
 *   <li>word rows of {@code E} are filled from {@code embedFile} when a
 *       matching entry exists (exact match first, then lower-cased);</li>
 *   <li>if {@code preModel} is non-null, embeddings and — when all
 *       dimensions agree — the {@code W1}/{@code b1}/{@code W2} parameters
 *       are copied from that previously saved model.</li>
 * </ol>
 * Finally builds the training {@code Dataset} and assigns the
 * {@code classifier} field.
 *
 * @param trainSents training sentences
 * @param trainTrees gold dependency trees, parallel to {@code trainSents}
 * @param embedFile  path to a pre-trained word-embedding file
 * @param preModel   path to a previously trained model to warm-start from,
 *                   or {@code null} to skip warm-starting
 */
private void setupClassifierForTraining(List<CoreMap> trainSents, List<DependencyTree> trainTrees, String embedFile, String preModel) {
    // One embedding row per known word, POS tag, and dependency label.
    float[][] E = new float[knownWords.size() + knownPos.size() + knownLabels.size()][config.embeddingSize];
    float[][] W1 = new float[config.hiddenSize][config.embeddingSize * config.numTokens];
    float[] b1 = new float[config.hiddenSize];
    float[][] W2 = new float[system.numTransitions()][config.hiddenSize];
    // Randomly initialize weight matrices / vectors
    // (uniform in [-initRange, initRange)).
    Random random = Util.getRandom();
    for (int i = 0; i < W1.length; ++i) for (int j = 0; j < W1[i].length; ++j) W1[i][j] = (float) (random.nextDouble() * 2 * config.initRange - config.initRange);
    for (int i = 0; i < b1.length; ++i) b1[i] = (float) (random.nextDouble() * 2 * config.initRange - config.initRange);
    for (int i = 0; i < W2.length; ++i) for (int j = 0; j < W2[i].length; ++j) W2[i][j] = (float) (random.nextDouble() * 2 * config.initRange - config.initRange);
    // Read embeddings into `embedID`, `embeddings`
    Map<String, Integer> embedID = new HashMap<>();
    double[][] embeddings = readEmbedFile(embedFile, embedID);
    // Try to match loaded embeddings with words in dictionary
    int foundEmbed = 0;
    for (int i = 0; i < E.length; ++i) {
        int index = -1;
        // Only word rows (i < knownWords.size()) can come from the embedding
        // file; POS and label rows always take the small random init below.
        if (i < knownWords.size()) {
            String str = knownWords.get(i);
            // NOTE: exact match first, and then try lower case..
            if (embedID.containsKey(str))
                index = embedID.get(str);
            else if (embedID.containsKey(str.toLowerCase()))
                index = embedID.get(str.toLowerCase());
        }
        if (index >= 0) {
            ++foundEmbed;
            for (int j = 0; j < E[i].length; ++j) {
                E[i][j] = (float) embeddings[index][j];
            }
        } else {
            // No pre-trained vector: small uniform init in [-0.01, 0.01).
            // Commented-out lines record earlier initialization experiments.
            for (int j = 0; j < E[i].length; ++j) {
                // E[i][j] = random.nextDouble() * config.initRange * 2 - config.initRange;
                // E[i][j] = random.nextDouble() * 0.2 - 0.1;
                // E[i][j] = random.nextGaussian() * Math.sqrt(0.1);
                E[i][j] = (float) (random.nextDouble() * 0.02 - 0.01);
            }
        }
    }
    log.info("Found embeddings: " + foundEmbed + " / " + knownWords.size());
    if (preModel != null) {
        try (BufferedReader input = IOUtils.readerFromString(preModel)) {
            log.info("Loading pre-trained model file: " + preModel + " ... ");
            String s;
            // Header: six "key=value" lines giving the saved model's sizes.
            s = input.readLine();
            int nDict = Integer.parseInt(s.substring(s.indexOf('=') + 1));
            s = input.readLine();
            int nPOS = Integer.parseInt(s.substring(s.indexOf('=') + 1));
            s = input.readLine();
            int nLabel = Integer.parseInt(s.substring(s.indexOf('=') + 1));
            s = input.readLine();
            int eSize = Integer.parseInt(s.substring(s.indexOf('=') + 1));
            s = input.readLine();
            int hSize = Integer.parseInt(s.substring(s.indexOf('=') + 1));
            s = input.readLine();
            int nTokens = Integer.parseInt(s.substring(s.indexOf('=') + 1));
            // Skip one more header line — presumably the pre-computed-IDs
            // count written by the saver (cf. loadModelFile); TODO confirm.
            s = input.readLine();
            String[] splits;
            // Embedding rows: "<token> <v1> ... <v_eSize>". Copy into E only
            // for entries the current model also knows, and only when the
            // embedding sizes match; unmatched lines are still consumed to
            // keep the stream position correct.
            for (int k = 0; k < nDict; ++k) {
                s = input.readLine();
                splits = s.split(" ");
                if (wordIDs.containsKey(splits[0]) && eSize == config.embeddingSize) {
                    int index = getWordID(splits[0]);
                    for (int i = 0; i < eSize; ++i) E[index][i] = Float.parseFloat(splits[i + 1]);
                }
            }
            for (int k = 0; k < nPOS; ++k) {
                s = input.readLine();
                splits = s.split(" ");
                if (posIDs.containsKey(splits[0]) && eSize == config.embeddingSize) {
                    int index = getPosID(splits[0]);
                    for (int i = 0; i < eSize; ++i) E[index][i] = Float.parseFloat(splits[i + 1]);
                }
            }
            for (int k = 0; k < nLabel; ++k) {
                s = input.readLine();
                splits = s.split(" ");
                if (labelIDs.containsKey(splits[0]) && eSize == config.embeddingSize) {
                    int index = getLabelID(splits[0]);
                    for (int i = 0; i < eSize; ++i) E[index][i] = Float.parseFloat(splits[i + 1]);
                }
            }
            // The hidden layer can only be copied if every dimension matches.
            boolean copyLayer1 = hSize == config.hiddenSize && config.embeddingSize == eSize && config.numTokens == nTokens;
            if (copyLayer1) {
                log.info("Copying parameters W1 && b1...");
            }
            // W1 is stored column-by-column: one line per input dimension,
            // each holding hSize values. Lines must be read even when not
            // copied, so later sections start at the right stream position.
            for (int j = 0; j < eSize * nTokens; ++j) {
                s = input.readLine();
                if (copyLayer1) {
                    splits = s.split(" ");
                    for (int i = 0; i < hSize; ++i) W1[i][j] = Float.parseFloat(splits[i]);
                }
            }
            // b1: a single line of hSize values.
            s = input.readLine();
            if (copyLayer1) {
                splits = s.split(" ");
                for (int i = 0; i < hSize; ++i) b1[i] = Float.parseFloat(splits[i]);
            }
            // Output layer: the saved model has (2 * nLabel - 1) transitions;
            // copy only if that matches the current transition system.
            boolean copyLayer2 = (nLabel * 2 - 1 == system.numTransitions()) && hSize == config.hiddenSize;
            if (copyLayer2)
                log.info("Copying parameters W2...");
            // W2 is also stored column-by-column: one line per hidden unit.
            for (int j = 0; j < hSize; ++j) {
                s = input.readLine();
                if (copyLayer2) {
                    splits = s.split(" ");
                    for (int i = 0; i < nLabel * 2 - 1; ++i) W2[i][j] = Float.parseFloat(splits[i]);
                }
            }
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }
    }
    Dataset trainSet = genTrainExamples(trainSents, trainTrees);
    classifier = new Classifier(config, trainSet, E, W1, b1, W2, preComputed);
}
Also used : RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) IOException(java.io.IOException) BufferedReader(java.io.BufferedReader)

Example 42 with RuntimeIOException

use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp.

The class is DependencyParser; the method shown is loadModelFile.

/**
 * Load a serialized parser model from {@code modelFile}: rebuild the
 * vocabularies ({@code knownWords} / {@code knownPos} / {@code knownLabels}),
 * the network parameters (E, W1, b1, W2) and the pre-computed-feature-ID
 * list, construct the {@code classifier}, and initialize the parser.
 *
 * @param modelFile path (or other IOUtils-resolvable location) of the model
 * @param verbose   passed through to {@code initialize}
 * @throws RuntimeIOException if the model file cannot be read
 */
void loadModelFile(String modelFile, boolean verbose) {
    Timing t = new Timing();
    try (BufferedReader input = IOUtils.readerFromString(modelFile)) {
        // first line in newer saved models is language, legacy models don't store this
        String s = input.readLine();
        // check if language was stored
        if (isModelNewFormat(s)) {
            // set up language
            // substring(9, len-1) strips a 9-char prefix — presumably
            // "language=" — plus one trailing character; TODO confirm format.
            config.language = Config.getLanguage(s.substring(9, s.length() - 1));
            // set up tlp
            s = input.readLine();
            // substring(4) strips a 4-char prefix — presumably "tlp=".
            String tlpCanonicalName = s.substring(4);
            try {
                config.tlp = ReflectionLoading.loadByReflection(tlpCanonicalName);
                log.info("Loaded TreebankLanguagePack: " + tlpCanonicalName);
            } catch (Exception e) {
                // Non-fatal: loading continues without the reflective TLP.
                log.warn("Error: Failed to load TreebankLanguagePack: " + tlpCanonicalName);
            }
            s = input.readLine();
        }
        // Header: seven "key=value" lines with the model dimensions.
        int nDict = Integer.parseInt(s.substring(s.indexOf('=') + 1));
        s = input.readLine();
        int nPOS = Integer.parseInt(s.substring(s.indexOf('=') + 1));
        s = input.readLine();
        int nLabel = Integer.parseInt(s.substring(s.indexOf('=') + 1));
        s = input.readLine();
        int eSize = Integer.parseInt(s.substring(s.indexOf('=') + 1));
        s = input.readLine();
        int hSize = Integer.parseInt(s.substring(s.indexOf('=') + 1));
        s = input.readLine();
        int nTokens = Integer.parseInt(s.substring(s.indexOf('=') + 1));
        s = input.readLine();
        int nPreComputed = Integer.parseInt(s.substring(s.indexOf('=') + 1));
        knownWords = new ArrayList<>();
        knownPos = new ArrayList<>();
        knownLabels = new ArrayList<>();
        // Embedding matrix: one row per word, POS tag, and label, in order.
        float[][] E = new float[nDict + nPOS + nLabel][eSize];
        String[] splits;
        // `index` tracks the current row of E across all three sections.
        int index = 0;
        // Each line is "<token> <v1> ... <v_eSize>".
        for (int k = 0; k < nDict; ++k) {
            s = input.readLine();
            splits = s.split(" ");
            knownWords.add(splits[0]);
            for (int i = 0; i < eSize; ++i) E[index][i] = Float.parseFloat(splits[i + 1]);
            index = index + 1;
        }
        for (int k = 0; k < nPOS; ++k) {
            s = input.readLine();
            splits = s.split(" ");
            knownPos.add(splits[0]);
            for (int i = 0; i < eSize; ++i) E[index][i] = Float.parseFloat(splits[i + 1]);
            index = index + 1;
        }
        for (int k = 0; k < nLabel; ++k) {
            s = input.readLine();
            splits = s.split(" ");
            knownLabels.add(splits[0]);
            for (int i = 0; i < eSize; ++i) E[index][i] = Float.parseFloat(splits[i + 1]);
            index = index + 1;
        }
        // Rebuild the word/POS/label -> ID maps from the lists just loaded.
        generateIDs();
        // W1 is stored column-by-column: one line per input dimension,
        // each holding hSize values.
        float[][] W1 = new float[hSize][eSize * nTokens];
        for (int j = 0; j < W1[0].length; ++j) {
            s = input.readLine();
            splits = s.split(" ");
            for (int i = 0; i < W1.length; ++i) W1[i][j] = Float.parseFloat(splits[i]);
        }
        // b1: a single line of hSize values.
        float[] b1 = new float[hSize];
        s = input.readLine();
        splits = s.split(" ");
        for (int i = 0; i < b1.length; ++i) b1[i] = Float.parseFloat(splits[i]);
        // W2 (one row per transition, 2*nLabel-1 of them) is also stored
        // column-by-column: one line per hidden unit.
        float[][] W2 = new float[nLabel * 2 - 1][hSize];
        for (int j = 0; j < W2[0].length; ++j) {
            s = input.readLine();
            splits = s.split(" ");
            for (int i = 0; i < W2.length; ++i) W2[i][j] = Float.parseFloat(splits[i]);
        }
        // Pre-computed feature IDs: several space-separated ints per line,
        // read until nPreComputed values have been collected.
        preComputed = new ArrayList<>();
        while (preComputed.size() < nPreComputed) {
            s = input.readLine();
            splits = s.split(" ");
            for (String split : splits) {
                preComputed.add(Integer.parseInt(split));
            }
        }
        // Adopt the loaded dimensions, overriding any configured values.
        config.hiddenSize = hSize;
        config.embeddingSize = eSize;
        log.debug("Read in dep parse matrices:");
        log.debug("   E: " + E.length * E[0].length);
        log.debug("  b1: " + b1.length);
        log.debug("  W1: " + W1.length * W1[0].length);
        log.debug("  W2: " + W2.length * W2[0].length);
        classifier = new Classifier(config, E, W1, b1, W2, preComputed);
        t.report(log, "Loading depparse model: " + modelFile);
    } catch (IOException e) {
        throw new RuntimeIOException(e);
    }
    // initialize the loaded parser
    initialize(verbose);
    t.done(log, "Initializing dependency parser");
}
Also used : RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) BufferedReader(java.io.BufferedReader) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) IOException(java.io.IOException) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) IOException(java.io.IOException)

Example 43 with RuntimeIOException

use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp.

The class is Util; the method shown is writeConllFile.

/**
 * Write the given sentences and their dependency trees to {@code outFile}
 * in CoNLL format (one token per line, blank line between sentences).
 *
 * @param outFile   destination file path
 * @param sentences sentences whose tokens are written
 * @param trees     dependency trees, parallel to {@code sentences}
 * @throws RuntimeIOException if writing fails
 */
public static void writeConllFile(String outFile, List<CoreMap> sentences, List<DependencyTree> trees) {
    // try-with-resources guarantees the writer is closed (and its buffer
    // flushed) even when an exception is thrown mid-write; the original
    // code leaked the PrintWriter on failure.
    try (PrintWriter output = IOUtils.getPrintWriter(outFile)) {
        for (int i = 0; i < sentences.size(); i++) {
            CoreMap sentence = sentences.get(i);
            DependencyTree tree = trees.get(i);
            List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
            // CoNLL token indices are 1-based; columns are
            // ID FORM LEMMA CPOS POS FEATS HEAD DEPREL PHEAD PDEPREL,
            // with the tag written for both CPOS and POS and "_" elsewhere.
            for (int j = 1, size = tokens.size(); j <= size; ++j) {
                CoreLabel token = tokens.get(j - 1);
                output.printf("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_%n", j, token.word(), token.tag(), token.tag(), tree.getHead(j), tree.getLabel(j));
            }
            // Blank line terminates each sentence in CoNLL format.
            output.println();
        }
    } catch (Exception e) {
        throw new RuntimeIOException(e);
    }
}
Also used : RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException)

Example 44 with RuntimeIOException

use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp.

The class is PTBTokenizer; the method shown is ptb2Text.

/**
 * Returns a presentable version of the given PTB-tokenized text.
 * PTB tokenization splits up punctuation and does various other things
 * that makes simply joining the tokens with spaces look bad. So join
 * the tokens with space and run it through this method to produce nice
 * looking text. It's not perfect, but it works pretty well.
 * <p>
 * <b>Note:</b> If your tokens have maintained the OriginalTextAnnotation and
 * the BeforeAnnotation and the AfterAnnotation, then rather than doing
 * this you can actually precisely reconstruct the text they were made
 * from!
 *
 * @param ptbText A String in PTB3-escaped form
 * @return An approximation to the original String
 */
public static String ptb2Text(String ptbText) {
    // The input length over-estimates the output size, so the builder
    // never needs to grow.
    StringBuilder result = new StringBuilder(ptbText.length());
    PTB2TextLexer lexer = new PTB2TextLexer(new StringReader(ptbText));
    try {
        // Drain the lexer, appending each emitted piece until exhaustion.
        String piece = lexer.next();
        while (piece != null) {
            result.append(piece);
            piece = lexer.next();
        }
    } catch (IOException e) {
        // Lexing a StringReader shouldn't fail, but the lexer API is
        // declared to throw IOException; surface it unchecked.
        throw new RuntimeIOException(e);
    }
    return result.toString();
}
Also used : RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException)

Example 45 with RuntimeIOException

use of edu.stanford.nlp.io.RuntimeIOException in project CoreNLP by stanfordnlp.

The class is ReadSentimentDataset; the method shown is writeTrees.

/**
 * Write the trees with the given ids to {@code filename}, one tree
 * (in its {@code toString()} form) per line.
 *
 * @param filename output file path
 * @param trees    all available trees
 * @param treeIds  indices into {@code trees} of the trees to write, in order
 * @throws RuntimeIOException if writing fails
 */
private static void writeTrees(String filename, List<Tree> trees, List<Integer> treeIds) {
    // try-with-resources closes (and thereby flushes) the whole writer
    // chain even on failure; the original code closed only the
    // FileOutputStream and leaked both streams when an exception was
    // thrown mid-write.
    // NOTE(review): OutputStreamWriter uses the platform default charset
    // here, as the original did — consider StandardCharsets.UTF_8, but
    // that would change behavior on non-UTF-8 platforms.
    try (BufferedWriter bout = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename)))) {
        for (Integer id : treeIds) {
            bout.write(trees.get(id).toString());
            bout.write("\n");
        }
    } catch (IOException e) {
        throw new RuntimeIOException(e);
    }
}
Also used : RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) IOException(java.io.IOException) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) BufferedWriter(java.io.BufferedWriter)

Aggregations

RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException)114 IOException (java.io.IOException)61 BufferedReader (java.io.BufferedReader)22 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)12 CoreLabel (edu.stanford.nlp.ling.CoreLabel)11 File (java.io.File)9 ArrayList (java.util.ArrayList)7 Tree (edu.stanford.nlp.trees.Tree)6 CoreMap (edu.stanford.nlp.util.CoreMap)5 BufferedWriter (java.io.BufferedWriter)5 Properties (java.util.Properties)5 Timing (edu.stanford.nlp.util.Timing)4 FileNotFoundException (java.io.FileNotFoundException)4 FileOutputStream (java.io.FileOutputStream)4 ObjectOutputStream (java.io.ObjectOutputStream)4 PrintWriter (java.io.PrintWriter)4 CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations)3 Annotation (edu.stanford.nlp.pipeline.Annotation)3 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)3 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)3