Search in sources :

Example 1 with SimpleTextReader

use of zemberek.core.io.SimpleTextReader in project zemberek-nlp by ahmetaa.

the class DisambiguatorPreprocessor method processOflazerAnalyzerOutputSak.

/**
 * Converts analyzer output read from {@code oflazerResult} into a sentence-delimited file
 * written to {@code out} (UTF-8).
 *
 * <p>The input is read line by line. For each non-empty line the "word" is whatever
 * {@code Strings.subStringUntilFirst(line, "\t")} returns (presumably the text before the first
 * tab). Consecutive non-empty lines are collected as the parses of one token; an empty line
 * flushes the collected parses through {@code processParses} as a single output line. A line
 * whose word is {@code #} ends the current sentence.
 *
 * <p>Output markers: {@code <S>} is written when the first non-empty line of a sentence is seen;
 * {@code #\t#\t+Punc} followed by {@code </S>} is written when the {@code #} word is seen.
 *
 * @param oflazerResult analyzer output file, read as UTF-8
 * @param out destination file
 * @throws IOException if reading or writing fails; the writer is closed in either case
 */
public void processOflazerAnalyzerOutputSak(File oflazerResult, File out) throws IOException {
    SimpleTextWriter sakFile = SimpleTextWriter.keepOpenUTF8Writer(out);
    // The writer stays open for the whole run, so close it in a finally block: otherwise any
    // failure while reading or processing would leak the underlying file handle.
    try {
        LineIterator li = new SimpleTextReader(oflazerResult, "UTF-8").getLineIterator();
        boolean sentenceStarted = false;
        List<String> parses = new ArrayList<>();
        while (li.hasNext()) {
            String line = li.next().trim();
            String word = Strings.subStringUntilFirst(line, "\t").trim();
            // Blank lines outside of a sentence carry no information.
            if (line.length() == 0 && !sentenceStarted) {
                continue;
            }
            // A blank line inside a sentence terminates the parse group of the current token.
            if (line.length() == 0 && parses.size() > 0) {
                sakFile.writeLine(processParses(parses));
                parses = new ArrayList<>();
            }
            if (line.length() > 0) {
                if (parses.size() == 0) {
                    if (!sentenceStarted) {
                        sakFile.writeLine("<S>");
                    }
                    sentenceStarted = true;
                }
                if (punctuations.contains(word)) {
                    // The analyzer in use does not parse punctuation, so a parse is synthesized here.
                    parses.add(word + "\t" + word + "\t+Punc");
                } else if (!line.endsWith("?")) {
                    parses.add(line);
                } else if (!word.equals("#")) {
                    // A trailing "?" is treated as an unanalyzed word; fall back to an inferred parse.
                    String inferred = inferUnknownWordParse(word);
                    System.out.println("Bad word: [" + line + "] inferred to [" + inferred + "]");
                    parses.add(inferred);
                }
            }
            if (word.equals("#")) {
                // End of sentence: emit the terminator explicitly and drop anything still collected.
                sentenceStarted = false;
                sakFile.writeLine("#\t#\t+Punc");
                sakFile.writeLine("</S>");
                parses = new ArrayList<>();
            }
        }
    } finally {
        sakFile.close();
    }
}
Also used : SimpleTextReader(zemberek.core.io.SimpleTextReader) LineIterator(zemberek.core.io.LineIterator) SimpleTextWriter(zemberek.core.io.SimpleTextWriter)

Example 2 with SimpleTextReader

use of zemberek.core.io.SimpleTextReader in project zemberek-nlp by ahmetaa.

the class FloatArrays method loadFromText.

/**
 * Loads a two-dimensional float array from a UTF-8 text file with the format:
 * [1 2 3] [4 5 6]
 *
 * <p>The file content is read as one string; every first-group match of
 * {@code FEATURE_LINES_PATTERN} becomes one row, parsed with a single space as delimiter.
 *
 * @param input text file to read
 * @return one float array per matched block, in match order
 * @throws IOException if the file cannot be read
 */
public static float[][] loadFromText(File input) throws IOException {
    SimpleTextReader reader = new SimpleTextReader(input, "UTF-8");
    List<String> blocks = Regexps.firstGroupMatches(FEATURE_LINES_PATTERN, reader.asString());
    int rowCount = blocks.size();
    float[][] rows = new float[rowCount][];
    for (int row = 0; row < rowCount; row++) {
        rows[row] = FloatArrays.fromString(blocks.get(row), " ");
    }
    return rows;
}
Also used : SimpleTextReader(zemberek.core.io.SimpleTextReader)

Example 3 with SimpleTextReader

use of zemberek.core.io.SimpleTextReader in project zemberek-nlp by ahmetaa.

the class ConllTreebankReader method readSentences.

/**
 * Reads all lines of the given file and groups them into sentences.
 *
 * <p>Each non-blank line is converted with {@code DependencyItem.buildFromConnlLine}. A blank
 * (whitespace-only) line closes the current group; empty groups are skipped. A trailing group
 * that is not followed by a blank line is still added.
 *
 * @param connlFile file to read
 * @return the sentences in file order
 * @throws IOException if the file cannot be read
 */
public List<DependencySentence> readSentences(File connlFile) throws IOException {
    List<DependencySentence> result = new ArrayList<>();
    List<DependencyItem> current = new ArrayList<>();
    for (String rawLine : new SimpleTextReader(connlFile).asStringList()) {
        if (!rawLine.trim().isEmpty()) {
            current.add(DependencyItem.buildFromConnlLine(rawLine));
            continue;
        }
        // Blank line: finish the running group, then always start from a fresh list
        // because the finished sentence was handed the previous list instance.
        if (!current.isEmpty()) {
            result.add(new DependencySentence(current));
        }
        current = new ArrayList<>();
    }
    if (!current.isEmpty()) {
        result.add(new DependencySentence(current));
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) SimpleTextReader(zemberek.core.io.SimpleTextReader)

Example 4 with SimpleTextReader

use of zemberek.core.io.SimpleTextReader in project zemberek-nlp by ahmetaa.

the class DisambiguatorPreprocessor method processOflazerAnalyzerOutputYuret.

/**
 * Converts analyzer output read from {@code oflazerResult} into a document/sentence-delimited
 * file written to {@code out} using the ISO-8859-9 encoding.
 *
 * <p>The input is read line by line as UTF-8, with every occurrence of {@code AorPart} replaced
 * by {@code PresPart}. For each non-empty line the "word" is whatever
 * {@code Strings.subStringUntilFirst(line, "\t")} returns (presumably the text before the first
 * tab). Consecutive non-empty lines are collected as the parses of one token; an empty line
 * writes the collected parses followed by two empty lines. A line whose word is {@code #} ends
 * the current sentence.
 *
 * <p>Output markers: the file starts with {@code <DOC>\t<DOC>} and ends with
 * {@code </DOC>\t</DOC>}; each sentence is wrapped in {@code <S>\t<S>} and {@code </S>\t</S>}.
 *
 * @param oflazerResult analyzer output file, read as UTF-8
 * @param out destination file
 * @throws IOException if reading or writing fails; the writer is closed in either case
 */
public void processOflazerAnalyzerOutputYuret(File oflazerResult, File out) throws IOException {
    SimpleTextWriter yuretFileWriter = SimpleTextWriter.keepOpenWriter(new FileOutputStream(out), "ISO-8859-9");
    // The writer stays open for the whole run, so close it in a finally block: otherwise any
    // failure while reading or processing would leak the underlying file handle.
    try {
        yuretFileWriter.writeLine("<DOC>\t<DOC>");
        yuretFileWriter.writeLine();
        LineIterator li = new SimpleTextReader(oflazerResult, "UTF-8").getLineIterator();
        boolean sentenceStarted = false;
        List<String> parses = new ArrayList<>();
        while (li.hasNext()) {
            // Literal substitution; no regular expression is needed for this fixed token.
            String line = li.next().trim().replace("AorPart", "PresPart");
            String word = Strings.subStringUntilFirst(line, "\t").trim();
            // Blank lines outside of a sentence carry no information.
            if (line.length() == 0 && !sentenceStarted) {
                continue;
            }
            // A blank line inside a sentence terminates the parse group of the current token.
            if (line.length() == 0 && parses.size() > 0) {
                yuretFileWriter.writeLines(parses);
                yuretFileWriter.writeLine();
                yuretFileWriter.writeLine();
                parses = new ArrayList<>();
            }
            if (line.length() > 0) {
                if (parses.size() == 0) {
                    if (!sentenceStarted) {
                        yuretFileWriter.writeLine("<S>\t<S>");
                        yuretFileWriter.writeLine();
                    }
                    sentenceStarted = true;
                }
                if (punctuations.contains(word)) {
                    // The analyzer in use does not parse punctuation, so a parse is synthesized here.
                    parses.add(word + "\t" + word + "\t+Punc");
                } else if (!line.endsWith("?")) {
                    parses.add(line);
                } else if (!word.equals("#")) {
                    // A trailing "?" is treated as an unanalyzed word; fall back to an inferred parse.
                    String inferred = inferUnknownWordParse(word);
                    System.out.println("Bad word: [" + line + "] inferred to [" + inferred + "]");
                    parses.add(inferred);
                }
            }
            if (word.equals("#")) {
                // End of sentence: emit the closing marker and drop anything still collected.
                sentenceStarted = false;
                yuretFileWriter.writeLine("</S>\t</S>\n");
                parses = new ArrayList<>();
            }
        }
        yuretFileWriter.writeLine("</DOC>\t</DOC>");
    } finally {
        yuretFileWriter.close();
    }
}
Also used : FileOutputStream(java.io.FileOutputStream) SimpleTextReader(zemberek.core.io.SimpleTextReader) ArrayList(java.util.ArrayList) LineIterator(zemberek.core.io.LineIterator) SimpleTextWriter(zemberek.core.io.SimpleTextWriter)

Aggregations

SimpleTextReader (zemberek.core.io.SimpleTextReader): 4, ArrayList (java.util.ArrayList): 2, LineIterator (zemberek.core.io.LineIterator): 2, SimpleTextWriter (zemberek.core.io.SimpleTextWriter): 2, FileOutputStream (java.io.FileOutputStream): 1