Use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.
The class MarkovModelDisambiguator, method generateTrainingCorpus:
public static void generateTrainingCorpus(File trainingFile, File rootCorpus, File igCorpus) throws IOException {
  DataSet trainingSet = Files.readLines(trainingFile, Charsets.UTF_8, new DataSetLoader());
  System.out.println("Amount of sentences in training set:" + trainingSet.sentences.size());
  System.out.println("Amount of tokens in training set:" + trainingSet.tokenCount());
  SimpleTextWriter rootWriter = SimpleTextWriter.keepOpenUTF8Writer(rootCorpus);
  System.out.println("Generating Lemma Corpus.");
  // Lemma corpus: one line per sentence, roots wrapped with <s> ... </s> boundary tokens.
  for (AbstractDisambiguator.SentenceData sentenceData : trainingSet) {
    List<String> roots = Lists.newArrayList("<s>");
    for (WordData word : sentenceData.allWordAnalyses) {
      WordParse parse = new WordParse(word.correctParse);
      String rootPart = parse.root;
      roots.add(rootPart);
    }
    roots.add("</s>");
    rootWriter.writeLine(Joiner.on(" ").join(roots));
  }
  rootWriter.close();
  SimpleTextWriter igWriter = SimpleTextWriter.keepOpenUTF8Writer(igCorpus);
  System.out.println("Generating IG Corpus.");
  WordParse start = new WordParse(BEGIN_SENTENCE);
  WordParse end = new WordParse(END_SENTENCE);
  // IG corpus: for each word, write the last IGs of the two preceding words followed by each IG of the current word.
  for (SentenceData sentenceData : trainingSet) {
    if (sentenceData.allWordAnalyses.size() == 0) {
      continue;
    }
    WordParse first = start;
    WordParse second = new WordParse(sentenceData.allWordAnalyses.get(0).correctParse);
    for (int i = 1; i < sentenceData.allWordAnalyses.size(); i++) {
      WordParse third = new WordParse(sentenceData.allWordAnalyses.get(i).correctParse);
      for (int j = 0; j < third.igs.size(); j++) {
        igWriter.writeLine(first.getLastIg() + " " + second.getLastIg() + " " + third.igs.get(j));
      }
      first = second;
      second = third;
    }
    igWriter.writeLine(first.getLastIg() + " " + second.getLastIg() + " " + end.getLastIg());
  }
  igWriter.close();
}
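The method above follows the usual SimpleTextWriter pattern: open a keep-open UTF-8 writer, emit one line per sentence, then close the writer explicitly. A minimal, self-contained sketch of that pattern follows; the file name and line content are illustrative, not taken from the project.

import java.io.File;
import java.io.IOException;

import zemberek.core.io.SimpleTextWriter;

public class RootCorpusSketch {

  public static void main(String[] args) throws IOException {
    // keepOpenUTF8Writer keeps the underlying stream open across writeLine calls,
    // so the writer must be closed explicitly once the corpus is complete.
    SimpleTextWriter writer = SimpleTextWriter.keepOpenUTF8Writer(new File("root-corpus.txt"));
    writer.writeLine("<s> kitap oku </s>"); // one space-joined sentence of lemmas per line
    writer.close();
  }
}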
Use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.
The class Z3ModelA, method generateTrainingCorpus:
public static void generateTrainingCorpus(File trainingFile, File rootCorpus, File igCorpus) throws IOException {
  DataSet trainingSet = Files.readLines(trainingFile, Charsets.UTF_8, new DataSetLoader());
  System.out.println("Amount of sentences in training set:" + trainingSet.sentences.size());
  System.out.println("Amount of tokens in training set:" + trainingSet.tokenCount());
  SimpleTextWriter rootWriter = SimpleTextWriter.keepOpenUTF8Writer(rootCorpus);
  System.out.println("Generating Lemma Corpus.");
  for (SentenceData sentenceData : trainingSet) {
    List<String> roots = Lists.newArrayList("<s>");
    for (Z3WordData word : sentenceData.words) {
      Z3WordParse parse = new Z3WordParse(word.correctParse);
      roots.add(parse.root);
    }
    roots.add("</s>");
    rootWriter.writeLine(Joiner.on(" ").join(roots));
  }
  rootWriter.close();
  SimpleTextWriter igWriter = SimpleTextWriter.keepOpenUTF8Writer(igCorpus);
  System.out.println("Generating IG Corpus.");
  Z3WordParse start = new Z3WordParse(SENTENCE_START_PARSE);
  Z3WordParse end = new Z3WordParse(SENTENCE_END_PARSE);
  for (SentenceData sentenceData : trainingSet) {
    if (sentenceData.words.size() == 0) {
      continue;
    }
    Z3WordParse first = start;
    Z3WordParse second = new Z3WordParse(sentenceData.words.get(0).correctParse);
    for (int i = 1; i < sentenceData.words.size(); i++) {
      Z3WordParse third = new Z3WordParse(sentenceData.words.get(i).correctParse);
      for (int j = 0; j < third.igs.size(); j++) {
        igWriter.writeLine(first.getLastIg() + " " + second.getLastIg() + " " + third.igs.get(j));
      }
      first = second;
      second = third;
    }
    igWriter.writeLine(first.getLastIg() + " " + second.getLastIg() + " " + end.getLastIg());
  }
  igWriter.close();
}
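A hedged usage sketch of the method above. The file names are placeholders, and the import of Z3ModelA from its package in zemberek-nlp is assumed rather than shown.

import java.io.File;
import java.io.IOException;

public class GenerateZ3Corpora {

  public static void main(String[] args) throws IOException {
    // File names are illustrative; the snippet above does not prescribe them.
    Z3ModelA.generateTrainingCorpus(
        new File("train.txt"),    // hand-disambiguated training sentences
        new File("root.corpus"),  // lemma corpus output, one sentence per line
        new File("ig.corpus"));   // inflectional-group corpus output
  }
}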
Use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.
The class DisambiguatorPreprocessor, method processOflazerAnalyzerOutputSak:
public void processOflazerAnalyzerOutputSak(File oflazerResult, File out) throws IOException {
  SimpleTextWriter sakFile = SimpleTextWriter.keepOpenUTF8Writer(out);
  LineIterator li = new SimpleTextReader(oflazerResult, "UTF-8").getLineIterator();
  boolean sentenceStarted = false;
  List<String> parses = Lists.newArrayList();
  while (li.hasNext()) {
    String line = li.next().trim();
    String word = Strings.subStringUntilFirst(line, "\t").trim();
    if (line.length() == 0 && !sentenceStarted) {
      continue;
    }
    if (line.length() == 0 && parses.size() > 0) {
      sakFile.writeLine(processParses(parses));
      parses = Lists.newArrayList();
    }
    if (line.length() > 0) {
      if (parses.size() == 0) {
        if (!sentenceStarted) {
          sakFile.writeLine("<S>");
        }
        sentenceStarted = true;
      }
      if (punctuations.contains(word)) {
        // The analyzer used here does not parse punctuation, so it is handled explicitly.
        parses.add(word + "\t" + word + "\t+Punc");
      } else if (!line.endsWith("?")) {
        parses.add(line);
      } else if (!word.equals("#")) {
        String inferred = inferUnknownWordParse(word);
        System.out.println("Bad word: [" + line + "] inferred to [" + inferred + "]");
        parses.add(inferred);
      }
    }
    if (word.equals("#")) {
      sentenceStarted = false;
      sakFile.writeLine("#\t#\t+Punc");
      sakFile.writeLine("</S>");
      parses = new ArrayList<>();
    }
  }
  sakFile.close();
}
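A hedged usage sketch of the conversion above. The file names are illustrative, and a no-argument DisambiguatorPreprocessor constructor is assumed, not confirmed by the snippet.

import java.io.File;
import java.io.IOException;

public class ConvertOflazerOutput {

  public static void main(String[] args) throws IOException {
    // processOflazerAnalyzerOutputSak is an instance method, so a preprocessor is constructed first
    // (the no-arg constructor is an assumption for this sketch).
    DisambiguatorPreprocessor preprocessor = new DisambiguatorPreprocessor();
    preprocessor.processOflazerAnalyzerOutputSak(
        new File("oflazer-analysis.txt"),  // raw analyzer output: word TAB parse per line
        new File("corpus.sak"));           // output with <S> ... </S> sentence markers
  }
}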
Use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.
The class ConllTreebankReader, method separateShortAndLongSentences:
public void separateShortAndLongSentences(List<DependencySentence> sentences, int shortAmount) throws IOException {
  List<DependencySentence> sentencesToBeSorted = new ArrayList<>(sentences);
  sentencesToBeSorted.sort((o1, o2) -> Ints.compare(o1.items.size(), o2.items.size()));
  SimpleTextWriter stw = SimpleTextWriter.keepOpenUTF8Writer(new File("shorts.conll"));
  List<DependencySentence> shorts = new ArrayList<>();
  List<DependencySentence> longs = new ArrayList<>();
  for (DependencySentence dependencySentence : sentencesToBeSorted) {
    if (dependencySentence.lemmaCount() > 2 && shorts.size() < shortAmount) {
      shorts.add(dependencySentence);
    } else {
      longs.add(dependencySentence);
    }
  }
  for (DependencySentence dependencySentence : shorts) {
    stw.writeLine(dependencySentence.getAsConnlString());
    stw.writeLine();
  }
  stw.close();
  stw = SimpleTextWriter.keepOpenUTF8Writer(new File("longs.conll"));
  for (DependencySentence dependencySentence : longs) {
    stw.writeLine(dependencySentence.getAsConnlString());
    stw.writeLine();
  }
  stw.close();
}
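The CoNLL output above relies on the no-argument writeLine() overload to emit the blank line that separates sentence blocks. A minimal sketch of that pattern follows; the file name and record strings are placeholders, not real treebank content.

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import zemberek.core.io.SimpleTextWriter;

public class ConllBlockSketch {

  public static void main(String[] args) throws IOException {
    // Each element stands in for the full CoNLL representation of one sentence.
    List<String> sentenceBlocks = Arrays.asList(
        "1\tkitap\tkitap\tNoun\t_",
        "1\toku\toku\tVerb\t_");
    SimpleTextWriter writer = SimpleTextWriter.keepOpenUTF8Writer(new File("sample.conll"));
    for (String block : sentenceBlocks) {
      writer.writeLine(block); // the sentence's CoNLL rows
      writer.writeLine();      // blank line closes the sentence block
    }
    writer.close();
  }
}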
Use of zemberek.core.io.SimpleTextWriter in project zemberek-nlp by ahmetaa.
The class Z3MarkovModelDisambiguator, method generateTrainingCorpus:
public static void generateTrainingCorpus(File trainingFile, File rootCorpus, File igCorpus) throws IOException {
  DataSet trainingSet = Files.readLines(trainingFile, Charsets.UTF_8, new DataSetLoader());
  System.out.println("Amount of sentences in training set:" + trainingSet.sentences.size());
  System.out.println("Amount of tokens in training set:" + trainingSet.tokenCount());
  SimpleTextWriter rootWriter = SimpleTextWriter.keepOpenUTF8Writer(rootCorpus);
  SimpleTextWriter igWriter = SimpleTextWriter.keepOpenUTF8Writer(igCorpus);
  for (SentenceData sentenceData : trainingSet) {
    List<String> roots = Lists.newArrayList("<s>");
    List<String> igs = Lists.newArrayList(START_IG);
    for (Z3WordData word : sentenceData.words) {
      Z3WordParse parse = new Z3WordParse(word.correctParse);
      int igSize = parse.igs.size();
      String rootPart = parse.root;
      String firstIg = parse.igs.get(0);
      if (!firstIg.contains(";")) {
        rootPart += firstIg;
      } else {
        String suffixPart = Strings.subStringAfterFirst(firstIg, ";");
        if (suffixPart.equals("A3sg+Pnon+Nom)")) {
          rootPart += (Strings.subStringUntilFirst(firstIg, ";") + ")");
        }
      }
      roots.add(rootPart);
      String igPart;
      if (igSize > 1 && !parse.igs.get(igSize - 2).contains(";")) {
        igPart = parse.igs.get(igSize - 2) + parse.getLastIg();
      } else {
        igPart = parse.getLastIg();
      }
      igs.add(igPart);
    }
    roots.add("</s>");
    igs.add(END_IG);
    rootWriter.writeLine(Joiner.on(" ").join(roots));
    igWriter.writeLine(Joiner.on(" ").join(igs));
  }
  rootWriter.close();
  igWriter.close();
}
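The loop above appends the POS part of a word's first inflectional group to the root token, dropping the inflection suffix when it is the default A3sg+Pnon+Nom. A simplified illustration of that merging logic using plain String operations on a made-up parse; the real code uses the project's Z3WordParse and Strings helpers, and the exact parse format shown here is an assumption.

public class FirstIgMergeSketch {

  public static void main(String[] args) {
    // Made-up values standing in for Z3WordParse.root and the first element of Z3WordParse.igs.
    String root = "kitap";
    String firstIg = "(Noun;A3sg+Pnon+Nom)";
    String rootPart = root;
    if (!firstIg.contains(";")) {
      // No inflection part after the POS: the whole first IG is appended to the root token.
      rootPart += firstIg;
    } else {
      String suffixPart = firstIg.substring(firstIg.indexOf(';') + 1);
      if (suffixPart.equals("A3sg+Pnon+Nom)")) {
        // Default nominal inflection is dropped; only the part before ';' is kept, reclosed with ')'.
        rootPart += firstIg.substring(0, firstIg.indexOf(';')) + ")";
      }
    }
    System.out.println(rootPart); // prints kitap(Noun) for this made-up input
  }
}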