use of zemberek.core.io.SimpleTextReader in project zemberek-nlp by ahmetaa.
the class DisambiguatorPreprocessor method processOflazerAnalyzerOutputSak.
/**
 * Converts the analyzer output in {@code oflazerResult} into the "Sak" format and writes it to
 * {@code out}.
 *
 * <p>The input is read as UTF-8, one line at a time. For each non-blank line the text before the
 * first tab is treated as the word. Processing rules, as implemented below:
 * <ul>
 *   <li>Blank lines outside a sentence are skipped.</li>
 *   <li>A blank line inside a sentence flushes the parses collected so far through
 *       {@code processParses} as one output line.</li>
 *   <li>The first non-blank line of a sentence emits {@code <S>}.</li>
 *   <li>A word contained in {@code punctuations} is replaced by a synthetic {@code +Punc}
 *       parse.</li>
 *   <li>A line ending with {@code ?} (presumably the analyzer's "unknown word" marker) is replaced
 *       by the result of {@code inferUnknownWordParse}; the substitution is printed to stdout.</li>
 *   <li>The word {@code #} ends the sentence: {@code #\t#\t+Punc} and {@code </S>} are written
 *       and any parses still pending are dropped.</li>
 * </ul>
 *
 * <p>Both the writer and the line iterator are closed even if processing fails part-way.
 *
 * @param oflazerResult analyzer output file to read (UTF-8)
 * @param out destination file
 * @throws IOException if reading or writing fails
 */
public void processOflazerAnalyzerOutputSak(File oflazerResult, File out) throws IOException {
  SimpleTextWriter sakFile = SimpleTextWriter.keepOpenUTF8Writer(out);
  try {
    LineIterator li = new SimpleTextReader(oflazerResult, "UTF-8").getLineIterator();
    try {
      boolean sentenceStarted = false;
      List<String> parses = new ArrayList<>();
      while (li.hasNext()) {
        String line = li.next().trim();
        String word = Strings.subStringUntilFirst(line, "\t").trim();
        boolean blank = line.isEmpty();
        if (blank && !sentenceStarted) {
          // Padding between sentences carries no information.
          continue;
        }
        if (blank && !parses.isEmpty()) {
          // A blank line terminates the parse list of the current word.
          sakFile.writeLine(processParses(parses));
          parses = new ArrayList<>();
        }
        if (!blank) {
          if (parses.isEmpty()) {
            if (!sentenceStarted) {
              sakFile.writeLine("<S>");
            }
            sentenceStarted = true;
          }
          if (punctuations.contains(word)) {
            // The analyzer does not parse punctuation, so a parse is synthesized here.
            parses.add(word + "\t" + word + "\t+Punc");
          } else if (!line.endsWith("?")) {
            parses.add(line);
          } else if (!word.equals("#")) {
            String inferred = inferUnknownWordParse(word);
            System.out.println("Bad word: [" + line + "] inferred to [" + inferred + "]");
            parses.add(inferred);
          }
        }
        if (word.equals("#")) {
          // "#" is the sentence boundary marker; pending parses are discarded on purpose
          // (same as the original behavior).
          sentenceStarted = false;
          sakFile.writeLine("#\t#\t+Punc");
          sakFile.writeLine("</S>");
          parses = new ArrayList<>();
        }
      }
    } finally {
      li.close();
    }
  } finally {
    // Previously close() was only reached on success, leaking the file handle on any exception.
    sakFile.close();
  }
}
use of zemberek.core.io.SimpleTextReader in project zemberek-nlp by ahmetaa.
the class FloatArrays method loadFromText.
/**
 * Loads a two-dimensional float array from a UTF-8 text file with format: [1 2 3] [4 5 6]
 *
 * <p>The whole file is read into memory, every first-group match of
 * {@code FEATURE_LINES_PATTERN} becomes one row, and each row is parsed with
 * {@code FloatArrays.fromString} using a single space as the delimiter.
 *
 * @param input file to read
 * @return one float array per matched block, in match order
 * @throws IOException if the file cannot be read
 */
public static float[][] loadFromText(File input) throws IOException {
  String content = new SimpleTextReader(input, "UTF-8").asString();
  List<String> blocks = Regexps.firstGroupMatches(FEATURE_LINES_PATTERN, content);
  int rowCount = blocks.size();
  float[][] rows = new float[rowCount][];
  for (int row = 0; row < rowCount; row++) {
    rows[row] = FloatArrays.fromString(blocks.get(row), " ");
  }
  return rows;
}
use of zemberek.core.io.SimpleTextReader in project zemberek-nlp by ahmetaa.
the class ConllTreebankReader method readSentences.
/**
 * Reads dependency sentences from a CoNLL formatted file.
 *
 * <p>Lines that are blank after trimming separate sentences. Every other line is converted to a
 * {@code DependencyItem} via {@code DependencyItem.buildFromConnlLine}. Consecutive blank lines
 * do not produce empty sentences, and a trailing sentence without a closing blank line is still
 * returned.
 *
 * @param connlFile file to read
 * @return sentences in file order; empty if the file contains no item lines
 * @throws IOException if the file cannot be read
 */
public List<DependencySentence> readSentences(File connlFile) throws IOException {
  List<DependencySentence> result = new ArrayList<>();
  List<String> rawLines = new SimpleTextReader(connlFile).asStringList();
  List<DependencyItem> current = new ArrayList<>();
  for (String raw : rawLines) {
    if (!raw.trim().isEmpty()) {
      current.add(DependencyItem.buildFromConnlLine(raw));
      continue;
    }
    // Blank line: close the sentence collected so far, if any, and start a new one.
    if (!current.isEmpty()) {
      result.add(new DependencySentence(current));
    }
    current = new ArrayList<>();
  }
  // The last sentence may not be followed by a blank line.
  if (!current.isEmpty()) {
    result.add(new DependencySentence(current));
  }
  return result;
}
use of zemberek.core.io.SimpleTextReader in project zemberek-nlp by ahmetaa.
the class DisambiguatorPreprocessor method processOflazerAnalyzerOutputYuret.
/**
 * Converts the analyzer output in {@code oflazerResult} into the "Yuret" format and writes it to
 * {@code out} using the ISO-8859-9 encoding.
 *
 * <p>The output starts with a {@code <DOC>\t<DOC>} line and ends with {@code </DOC>\t</DOC>}.
 * The input is read as UTF-8, one line at a time; every occurrence of {@code AorPart} is
 * rewritten to {@code PresPart}. For each non-blank line the text before the first tab is treated
 * as the word. Processing rules, as implemented below:
 * <ul>
 *   <li>Blank lines outside a sentence are skipped.</li>
 *   <li>A blank line inside a sentence writes the parses collected so far, followed by two
 *       {@code writeLine()} calls.</li>
 *   <li>The first non-blank line of a sentence emits {@code <S>\t<S>}.</li>
 *   <li>A word contained in {@code punctuations} is replaced by a synthetic {@code +Punc}
 *       parse.</li>
 *   <li>A line ending with {@code ?} (presumably the analyzer's "unknown word" marker) is replaced
 *       by the result of {@code inferUnknownWordParse}; the substitution is printed to stdout.</li>
 *   <li>The word {@code #} ends the sentence: {@code </S>\t</S>} is written and any parses still
 *       pending are dropped.</li>
 * </ul>
 *
 * <p>Both the writer and the line iterator are closed even if processing fails part-way.
 *
 * @param oflazerResult analyzer output file to read (UTF-8)
 * @param out destination file (ISO-8859-9)
 * @throws IOException if reading or writing fails
 */
public void processOflazerAnalyzerOutputYuret(File oflazerResult, File out) throws IOException {
  SimpleTextWriter yuretFileWriter =
      SimpleTextWriter.keepOpenWriter(new FileOutputStream(out), "ISO-8859-9");
  try {
    yuretFileWriter.writeLine("<DOC>\t<DOC>");
    yuretFileWriter.writeLine();
    LineIterator li = new SimpleTextReader(oflazerResult, "UTF-8").getLineIterator();
    try {
      boolean sentenceStarted = false;
      List<String> parses = new ArrayList<>();
      while (li.hasNext()) {
        // Literal replacement; no regular expression is needed for a fixed token.
        String line = li.next().trim().replace("AorPart", "PresPart");
        String word = Strings.subStringUntilFirst(line, "\t").trim();
        boolean blank = line.isEmpty();
        if (blank && !sentenceStarted) {
          // Padding between sentences carries no information.
          continue;
        }
        if (blank && !parses.isEmpty()) {
          // A blank line terminates the parse list of the current word.
          yuretFileWriter.writeLines(parses);
          yuretFileWriter.writeLine();
          yuretFileWriter.writeLine();
          parses = new ArrayList<>();
        }
        if (!blank) {
          if (parses.isEmpty()) {
            if (!sentenceStarted) {
              yuretFileWriter.writeLine("<S>\t<S>");
              yuretFileWriter.writeLine();
            }
            sentenceStarted = true;
          }
          if (punctuations.contains(word)) {
            // The analyzer does not parse punctuation, so a parse is synthesized here.
            parses.add(word + "\t" + word + "\t+Punc");
          } else if (!line.endsWith("?")) {
            parses.add(line);
          } else if (!word.equals("#")) {
            String inferred = inferUnknownWordParse(word);
            System.out.println("Bad word: [" + line + "] inferred to [" + inferred + "]");
            parses.add(inferred);
          }
        }
        if (word.equals("#")) {
          // "#" is the sentence boundary marker; pending parses are discarded on purpose
          // (same as the original behavior).
          sentenceStarted = false;
          yuretFileWriter.writeLine("</S>\t</S>\n");
          parses = new ArrayList<>();
        }
      }
    } finally {
      li.close();
    }
    yuretFileWriter.writeLine("</DOC>\t</DOC>");
  } finally {
    // Previously close() was only reached on success, leaking the output stream on any exception.
    yuretFileWriter.close();
  }
}
Aggregations