Search in sources:

Example 1 with SimpleTextWriter

use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.

the class MarkovModelDisambiguator method generateTrainingCorpus.

/**
 * Generates a root (lemma) corpus and an inflectional-group (IG) corpus from a training file.
 *
 * <p>The training file is loaded through {@code DataSetLoader}. Two files are then written,
 * one after the other:
 * <ul>
 *   <li>{@code rootCorpus}: one line per sentence containing {@code <s>}, the root of every
 *       word's correct parse, and {@code </s>}, separated by single spaces.</li>
 *   <li>{@code igCorpus}: one three-token line per written IG. For each word after the first,
 *       every IG of that word is written preceded by the last IG of the two preceding
 *       positions (the sentence-start parse stands in before the first word). A closing line
 *       ending with the last IG of the sentence-end parse is written per sentence. Sentences
 *       without any word analyses are skipped in this corpus.</li>
 * </ul>
 *
 * <p>Each writer is closed in a {@code finally} block so the underlying file is released even
 * when corpus generation fails part-way.
 *
 * @param trainingFile the file holding the training data set
 * @param rootCorpus the output file for the root corpus
 * @param igCorpus the output file for the IG corpus
 * @throws IOException propagated from reading the training file or from writing/closing either
 *     corpus file
 */
public static void generateTrainingCorpus(File trainingFile, File rootCorpus, File igCorpus) throws IOException {
    DataSet trainingSet = Files.readLines(trainingFile, Charsets.UTF_8, new DataSetLoader());
    System.out.println("Amount of sentences in training set:" + trainingSet.sentences.size());
    System.out.println("Amount of tokens in training set:" + trainingSet.tokenCount());

    SimpleTextWriter rootWriter = SimpleTextWriter.keepOpenUTF8Writer(rootCorpus);
    try {
        System.out.println("Generating Lemma Corpus.");
        for (AbstractDisambiguator.SentenceData sentenceData : trainingSet) {
            List<String> roots = Lists.newArrayList("<s>");
            for (WordData word : sentenceData.allWordAnalyses) {
                WordParse parse = new WordParse(word.correctParse);
                roots.add(parse.root);
            }
            roots.add("</s>");
            rootWriter.writeLine(Joiner.on(" ").join(roots));
        }
    } finally {
        // Release the file handle even if a parse or write fails.
        rootWriter.close();
    }

    SimpleTextWriter igWriter = SimpleTextWriter.keepOpenUTF8Writer(igCorpus);
    try {
        System.out.println("Generating IG Corpus.");
        WordParse start = new WordParse(BEGIN_SENTENCE);
        WordParse end = new WordParse(END_SENTENCE);
        for (SentenceData sentenceData : trainingSet) {
            if (sentenceData.allWordAnalyses.size() == 0) {
                continue;
            }
            // Sliding window of three parses: (first, second) form the context, third is the
            // word whose IGs are emitted. The first word only ever appears as context.
            WordParse first = start;
            WordParse second = new WordParse(sentenceData.allWordAnalyses.get(0).correctParse);
            for (int i = 1; i < sentenceData.allWordAnalyses.size(); i++) {
                WordParse third = new WordParse(sentenceData.allWordAnalyses.get(i).correctParse);
                for (int j = 0; j < third.igs.size(); j++) {
                    igWriter.writeLine(first.getLastIg() + " " + second.getLastIg() + " " + third.igs.get(j));
                }
                first = second;
                second = third;
            }
            // Close the sentence with the end-of-sentence marker in the third position.
            igWriter.writeLine(first.getLastIg() + " " + second.getLastIg() + " " + end.getLastIg());
        }
    } finally {
        igWriter.close();
    }
}
Also used : SimpleTextWriter(zemberek.core.io.SimpleTextWriter)

Example 2 with SimpleTextWriter

use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.

the class Z3ModelA method generateTrainingCorpus.

/**
 * Generates a root (lemma) corpus and an inflectional-group (IG) corpus from a training file.
 *
 * <p>The training file is loaded through {@code DataSetLoader}. Two files are then written,
 * one after the other:
 * <ul>
 *   <li>{@code rootCorpus}: one line per sentence containing {@code <s>}, the root of every
 *       word's correct parse, and {@code </s>}, separated by single spaces.</li>
 *   <li>{@code igCorpus}: one three-token line per written IG. For each word after the first,
 *       every IG of that word is written preceded by the last IG of the two preceding
 *       positions (the sentence-start parse stands in before the first word). A closing line
 *       ending with the last IG of the sentence-end parse is written per sentence. Sentences
 *       with no words are skipped in this corpus.</li>
 * </ul>
 *
 * <p>Each writer is closed in a {@code finally} block so the underlying file is released even
 * when corpus generation fails part-way.
 *
 * @param trainingFile the file holding the training data set
 * @param rootCorpus the output file for the root corpus
 * @param igCorpus the output file for the IG corpus
 * @throws IOException propagated from reading the training file or from writing/closing either
 *     corpus file
 */
public static void generateTrainingCorpus(File trainingFile, File rootCorpus, File igCorpus) throws IOException {
    DataSet trainingSet = Files.readLines(trainingFile, Charsets.UTF_8, new DataSetLoader());
    System.out.println("Amount of sentences in training set:" + trainingSet.sentences.size());
    System.out.println("Amount of tokens in training set:" + trainingSet.tokenCount());

    SimpleTextWriter rootWriter = SimpleTextWriter.keepOpenUTF8Writer(rootCorpus);
    try {
        System.out.println("Generating Lemma Corpus.");
        for (SentenceData sentenceData : trainingSet) {
            List<String> roots = Lists.newArrayList("<s>");
            for (Z3WordData word : sentenceData.words) {
                Z3WordParse parse = new Z3WordParse(word.correctParse);
                roots.add(parse.root);
            }
            roots.add("</s>");
            rootWriter.writeLine(Joiner.on(" ").join(roots));
        }
    } finally {
        // Release the file handle even if a parse or write fails.
        rootWriter.close();
    }

    SimpleTextWriter igWriter = SimpleTextWriter.keepOpenUTF8Writer(igCorpus);
    try {
        System.out.println("Generating IG Corpus.");
        Z3WordParse start = new Z3WordParse(SENTENCE_START_PARSE);
        Z3WordParse end = new Z3WordParse(SENTENCE_END_PARSE);
        for (SentenceData sentenceData : trainingSet) {
            if (sentenceData.words.size() == 0) {
                continue;
            }
            // Sliding window of three parses: (first, second) form the context, third is the
            // word whose IGs are emitted. The first word only ever appears as context.
            Z3WordParse first = start;
            Z3WordParse second = new Z3WordParse(sentenceData.words.get(0).correctParse);
            for (int i = 1; i < sentenceData.words.size(); i++) {
                Z3WordParse third = new Z3WordParse(sentenceData.words.get(i).correctParse);
                for (int j = 0; j < third.igs.size(); j++) {
                    igWriter.writeLine(first.getLastIg() + " " + second.getLastIg() + " " + third.igs.get(j));
                }
                first = second;
                second = third;
            }
            // Close the sentence with the end-of-sentence marker in the third position.
            igWriter.writeLine(first.getLastIg() + " " + second.getLastIg() + " " + end.getLastIg());
        }
    } finally {
        igWriter.close();
    }
}
Also used : SimpleTextWriter(zemberek.core.io.SimpleTextWriter)

Example 3 with SimpleTextWriter

use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.

the class DisambiguatorPreprocessor method processOflazerAnalyzerOutputSak.

/**
 * Converts analyzer output read from {@code oflazerResult} into the line format written to
 * {@code out}.
 *
 * <p>The input is consumed line by line. Non-empty lines are collected as the parses of the
 * current word; an empty line flushes the collected parses through {@code processParses}. The
 * first non-empty line of a sentence emits {@code <S>}, and a line whose first tab-separated
 * field is {@code #} ends the sentence by emitting a {@code #} punctuation line followed by
 * {@code </S>}.
 *
 * <p>The output writer is closed in a {@code finally} block so the file is released even when
 * reading or processing fails part-way.
 *
 * @param oflazerResult the analyzer output file, read as UTF-8
 * @param out the file to write the converted result to
 * @throws IOException propagated from reading the input or from writing/closing the output
 */
public void processOflazerAnalyzerOutputSak(File oflazerResult, File out) throws IOException {
    SimpleTextWriter sakFile = SimpleTextWriter.keepOpenUTF8Writer(out);
    try {
        // NOTE(review): the line iterator is never closed explicitly here; confirm whether
        // LineIterator releases the underlying file once iteration is exhausted.
        LineIterator li = new SimpleTextReader(oflazerResult, "UTF-8").getLineIterator();
        boolean sentenceStarted = false;
        List<String> parses = Lists.newArrayList();
        while (li.hasNext()) {
            String line = li.next().trim();
            String word = Strings.subStringUntilFirst(line, "\t").trim();
            // Blank lines outside a sentence carry no information.
            if (line.length() == 0 && !sentenceStarted) {
                continue;
            }
            // A blank line inside a sentence terminates the current word's parse list.
            if (line.length() == 0 && parses.size() > 0) {
                sakFile.writeLine(processParses(parses));
                parses = Lists.newArrayList();
            }
            if (line.length() > 0) {
                if (parses.size() == 0) {
                    if (!sentenceStarted) {
                        sakFile.writeLine("<S>");
                    }
                    sentenceStarted = true;
                }
                if (punctuations.contains(word)) {
                    // because analyser i use does not parse punctuations. i do it myself.
                    parses.add(word + "\t" + word + "\t+Punc");
                } else if (!line.endsWith("?")) {
                    parses.add(line);
                } else if (!word.equals("#")) {
                    // Lines ending with "?" are treated as unparsed words; a parse is inferred.
                    String inferred = inferUnknownWordParse(word);
                    System.out.println("Bad word: [" + line + "] inferred to [" + inferred + "]");
                    parses.add(inferred);
                }
            }
            // "#" marks the end of a sentence; any parses collected for it are dropped and a
            // fixed punctuation line is written instead.
            if (word.equals("#")) {
                sentenceStarted = false;
                sakFile.writeLine("#\t#\t+Punc");
                sakFile.writeLine("</S>");
                parses = new ArrayList<>();
            }
        }
    } finally {
        sakFile.close();
    }
}
Also used : SimpleTextReader(zemberek.core.io.SimpleTextReader) LineIterator(zemberek.core.io.LineIterator) SimpleTextWriter(zemberek.core.io.SimpleTextWriter)

Example 4 with SimpleTextWriter

use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.

the class ConllTreebankReader method separateShortAndLongSentences.

/**
 * Splits the given sentences into a "short" and a "long" group and writes each group to a
 * CoNLL file.
 *
 * <p>A copy of {@code sentences} is sorted in ascending order of {@code items.size()}; the
 * input list itself is not modified. Walking that sorted copy, a sentence is placed in the
 * short group when its {@code lemmaCount()} is greater than 2 and fewer than
 * {@code shortAmount} short sentences have been collected so far; every other sentence
 * (including those with a lemma count of 2 or less) goes to the long group.
 *
 * <p>The groups are written to {@code shorts.conll} and {@code longs.conll}, given as relative
 * paths, with an empty line after each sentence. Each writer is closed in a {@code finally}
 * block so the file is released even when writing fails.
 *
 * @param sentences the sentences to split
 * @param shortAmount the maximum number of sentences placed in the short group
 * @throws IOException propagated from writing or closing either output file
 */
public void separateShortAndLongSentences(List<DependencySentence> sentences, int shortAmount) throws IOException {
    List<DependencySentence> sentencesToBeSorted = new ArrayList<>(sentences);
    sentencesToBeSorted.sort((o1, o2) -> Ints.compare(o1.items.size(), o2.items.size()));

    List<DependencySentence> shorts = new ArrayList<>();
    List<DependencySentence> longs = new ArrayList<>();
    for (DependencySentence dependencySentence : sentencesToBeSorted) {
        if (dependencySentence.lemmaCount() > 2 && shorts.size() < shortAmount) {
            shorts.add(dependencySentence);
        } else {
            longs.add(dependencySentence);
        }
    }

    // Open each file only once its content is known, and always close it.
    SimpleTextWriter shortWriter = SimpleTextWriter.keepOpenUTF8Writer(new File("shorts.conll"));
    try {
        for (DependencySentence dependencySentence : shorts) {
            shortWriter.writeLine(dependencySentence.getAsConnlString());
            shortWriter.writeLine();
        }
    } finally {
        shortWriter.close();
    }

    SimpleTextWriter longWriter = SimpleTextWriter.keepOpenUTF8Writer(new File("longs.conll"));
    try {
        for (DependencySentence dependencySentence : longs) {
            longWriter.writeLine(dependencySentence.getAsConnlString());
            longWriter.writeLine();
        }
    } finally {
        longWriter.close();
    }
}
Also used : ArrayList(java.util.ArrayList) File(java.io.File) SimpleTextWriter(zemberek.core.io.SimpleTextWriter)

Example 5 with SimpleTextWriter

use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.

the class Z3MarkovModelDisambiguator method generateTrainingCorpus.

/**
 * Generates a root corpus and an inflectional-group (IG) corpus from a training file, writing
 * one line per sentence to each.
 *
 * <p>The training file is loaded through {@code DataSetLoader}. For every sentence:
 * <ul>
 *   <li>the root line holds {@code <s>}, one root token per word, and {@code </s>};</li>
 *   <li>the IG line holds {@code START_IG}, one IG token per word, and {@code END_IG}.</li>
 * </ul>
 * Tokens are joined with single spaces.
 *
 * <p>Both writers are closed in {@code finally} blocks so the files are released even when
 * opening the second file or generating the corpora fails.
 *
 * @param trainingFile the file holding the training data set
 * @param rootCorpus the output file for the root corpus
 * @param igCorpus the output file for the IG corpus
 * @throws IOException propagated from reading the training file or from writing/closing either
 *     corpus file
 */
public static void generateTrainingCorpus(File trainingFile, File rootCorpus, File igCorpus) throws IOException {
    DataSet trainingSet = Files.readLines(trainingFile, Charsets.UTF_8, new DataSetLoader());
    System.out.println("Amount of sentences in training set:" + trainingSet.sentences.size());
    System.out.println("Amount of tokens in training set:" + trainingSet.tokenCount());

    SimpleTextWriter rootWriter = SimpleTextWriter.keepOpenUTF8Writer(rootCorpus);
    try {
        SimpleTextWriter igWriter = SimpleTextWriter.keepOpenUTF8Writer(igCorpus);
        try {
            for (SentenceData sentenceData : trainingSet) {
                List<String> roots = Lists.newArrayList("<s>");
                List<String> igs = Lists.newArrayList(START_IG);
                for (Z3WordData word : sentenceData.words) {
                    Z3WordParse parse = new Z3WordParse(word.correctParse);
                    int igSize = parse.igs.size();

                    // Root token: the root, optionally extended with (part of) the first IG.
                    // NOTE(review): assumes every parse has at least one IG - confirm.
                    String rootPart = parse.root;
                    String firstIg = parse.igs.get(0);
                    if (!firstIg.contains(";")) {
                        // No ";" in the first IG: append it whole.
                        rootPart += firstIg;
                    } else {
                        // With a ";": only when the text after it is exactly the literal
                        // below, append the text before it plus a closing parenthesis.
                        // Otherwise the root is used unchanged.
                        String suffixPart = Strings.subStringAfterFirst(firstIg, ";");
                        if (suffixPart.equals("A3sg+Pnon+Nom)")) {
                            rootPart += (Strings.subStringUntilFirst(firstIg, ";") + ")");
                        }
                    }
                    roots.add(rootPart);

                    // IG token: the last IG, prefixed with the second-to-last IG when that
                    // one exists and contains no ";".
                    String igPart;
                    if (igSize > 1 && !parse.igs.get(igSize - 2).contains(";")) {
                        igPart = parse.igs.get(igSize - 2) + parse.getLastIg();
                    } else {
                        igPart = parse.getLastIg();
                    }
                    igs.add(igPart);
                }
                roots.add("</s>");
                igs.add(END_IG);
                rootWriter.writeLine(Joiner.on(" ").join(roots));
                igWriter.writeLine(Joiner.on(" ").join(igs));
            }
        } finally {
            igWriter.close();
        }
    } finally {
        rootWriter.close();
    }
}
Also used : SimpleTextWriter(zemberek.core.io.SimpleTextWriter)

Aggregations

SimpleTextWriter (zemberek.core.io.SimpleTextWriter): 12, File (java.io.File): 2, ArrayList (java.util.ArrayList): 2, Histogram (zemberek.core.collections.Histogram): 2, LineIterator (zemberek.core.io.LineIterator): 2, SimpleTextReader (zemberek.core.io.SimpleTextReader): 2, BufferedReader (java.io.BufferedReader): 1, FileInputStream (java.io.FileInputStream): 1, FileOutputStream (java.io.FileOutputStream): 1, InputStreamReader (java.io.InputStreamReader): 1, Collator (java.text.Collator): 1, LinkedHashSet (java.util.LinkedHashSet): 1