Search in sources:

Example 1 with LineIterator

use of zemberek.core.io.LineIterator in project zemberek-nlp by ahmetaa.

the class DisambiguatorPreprocessor method processOflazerAnalyzerOutputSak.

/**
 * Reads the analyzer output in {@code oflazerResult} (as UTF-8) line by line and writes a
 * re-formatted version to {@code out}.
 *
 * <p>Behavior visible in this method:
 * <ul>
 *   <li>Each non-empty line is treated as one parse; the text before the first tab is the word.</li>
 *   <li>An empty line flushes the parses collected so far as a single output line built by
 *       {@code processParses}.</li>
 *   <li>{@code <S>} is written before the first parse of a sentence; a line whose word is
 *       {@code #} ends the sentence and writes {@code #\t#\t+Punc} followed by {@code </S>}.</li>
 *   <li>Words found in {@code punctuations} get a synthetic {@code +Punc} parse.</li>
 *   <li>Lines ending with {@code ?} are replaced with the result of
 *       {@code inferUnknownWordParse} (presumably these are words the analyzer could not
 *       parse; confirm against the analyzer's output format).</li>
 * </ul>
 *
 * @param oflazerResult analyzer output file to read
 * @param out           destination file
 * @throws IOException if reading or writing fails
 */
public void processOflazerAnalyzerOutputSak(File oflazerResult, File out) throws IOException {
    SimpleTextWriter sakFile = SimpleTextWriter.keepOpenUTF8Writer(out);
    // The writer is closed in a finally block so the output file handle is released even when
    // reading or parse processing throws.
    try {
        // NOTE(review): the line iterator is never closed explicitly here; confirm whether
        // LineIterator releases its underlying reader once exhausted.
        LineIterator li = new SimpleTextReader(oflazerResult, "UTF-8").getLineIterator();
        boolean sentenceStarted = false;
        List<String> parses = new ArrayList<>();
        while (li.hasNext()) {
            String line = li.next().trim();
            String word = Strings.subStringUntilFirst(line, "\t").trim();
            // Blank lines outside a sentence carry no information.
            if (line.length() == 0 && !sentenceStarted) {
                continue;
            }
            // A blank line inside a sentence terminates the current group of parses.
            if (line.length() == 0 && parses.size() > 0) {
                sakFile.writeLine(processParses(parses));
                parses = new ArrayList<>();
            }
            if (line.length() > 0) {
                if (parses.size() == 0) {
                    if (!sentenceStarted) {
                        sakFile.writeLine("<S>");
                    }
                    sentenceStarted = true;
                }
                if (punctuations.contains(word)) {
                    // The analyzer does not produce parses for punctuation, so one is
                    // synthesized here.
                    parses.add(word + "\t" + word + "\t+Punc");
                } else if (!line.endsWith("?")) {
                    parses.add(line);
                } else if (!word.equals("#")) {
                    String inferred = inferUnknownWordParse(word);
                    System.out.println("Bad word: [" + line + "] inferred to [" + inferred + "]");
                    parses.add(inferred);
                }
            }
            // '#' marks the end of a sentence; any pending parses are discarded.
            if (word.equals("#")) {
                sentenceStarted = false;
                sakFile.writeLine("#\t#\t+Punc");
                sakFile.writeLine("</S>");
                parses = new ArrayList<>();
            }
        }
    } finally {
        sakFile.close();
    }
}
Also used : SimpleTextReader(zemberek.core.io.SimpleTextReader) LineIterator(zemberek.core.io.LineIterator) SimpleTextWriter(zemberek.core.io.SimpleTextWriter)

Example 2 with LineIterator

use of zemberek.core.io.LineIterator in project zemberek-nlp by ahmetaa.

the class AmbiguityStats method readAll.

/**
 * Reads every line of the given file through a trimming UTF-8 reader, normalizes quotes and
 * hyphens with {@code TextUtil.normalizeQuotesHyphens}, tokenizes the result with
 * {@code lexer} and re-joins the tokens with single spaces.
 *
 * @param filename path of the file to read
 * @return one space-joined token string per input line, in file order
 * @throws IOException if the file cannot be read
 */
public List<String> readAll(String filename) throws IOException {
    List<String> result = new ArrayList<>();
    LineIterator lineIterator = SimpleTextReader.trimmingUTF8Reader(new File(filename)).getLineIterator();
    for (; lineIterator.hasNext(); ) {
        String normalized = TextUtil.normalizeQuotesHyphens(lineIterator.next());
        String joinedTokens = Joiner.on(" ").join(lexer.tokenizeToStrings(normalized));
        result.add(joinedTokens);
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) File(java.io.File) LineIterator(zemberek.core.io.LineIterator)

Example 3 with LineIterator

use of zemberek.core.io.LineIterator in project zemberek-nlp by ahmetaa.

the class SmoothLmTest method testActualData.

/**
 * Manual benchmark (ignored by default) that loads a SmoothLm model, converting it from an
 * ARPA file first when the binary model file does not exist yet, then collects
 * {@code gramCount} n-grams of size {@code order} from a corpus file and measures the time
 * needed to query the probability of each of them.
 *
 * <p>Depends on hard-coded local file paths, so it only runs on a machine that has them.
 *
 * @throws IOException if the model or corpus files cannot be read
 */
@Test
@Ignore
public void testActualData() throws IOException {
    Stopwatch sw = Stopwatch.createStarted();
    File lmFile = new File("/home/ahmetaa/data/lm/smoothnlp-test/lm1.slm");
    File tmp = new File("/tmp");
    if (!lmFile.exists()) {
        final File arpaFile = new File("/home/ahmetaa/data/lm/smoothnlp-test/lm1.arpa");
        UncompressedToSmoothLmConverter converter = new UncompressedToSmoothLmConverter(lmFile, tmp);
        converter.convertLarge(MultiFileUncompressedLm.generate(arpaFile, tmp, "utf-8", 4).dir, new UncompressedToSmoothLmConverter.NgramDataBlock(2, 1, 1), 20);
    }
    SmoothLm lm = SmoothLm.builder(lmFile).build();
    // Model load (and optional conversion) time.
    System.out.println(sw.elapsed(TimeUnit.MILLISECONDS));
    sw.reset();
    final int order = 3;
    final int gramCount = 1000000;
    int[][] ids = new int[gramCount][order];
    long[] trigrams = new long[gramCount];
    LineIterator li = SimpleTextReader.trimmingUTF8LineIterator(new File("/home/ahmetaa/data/lm/smoothnlp-test/corpus-lowercase_1000000_2000000"));
    SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
    int i = 0;
    while (i < gramCount) {
        // Fail with a clear message instead of calling next() on an exhausted iterator.
        if (!li.hasNext()) {
            throw new IllegalStateException("Corpus exhausted after collecting " + i + " n-grams; " + gramCount + " are required.");
        }
        String line = li.next();
        String[] tokens = tokenizer.split(line);
        if (tokens.length < order) {
            continue;
        }
        // A line with n tokens contains n - order + 1 windows of size 'order'. The previous
        // bound (n - order - 1) skipped the last two windows of every line.
        for (int j = 0; j <= tokens.length - order; j++) {
            String[] words = new String[order];
            System.arraycopy(tokens, j, words, 0, order);
            int[] indexes = lm.getVocabulary().toIndexes(words);
            // Skip n-grams that contain a word unknown to the model vocabulary.
            if (!lm.getVocabulary().containsAll(indexes)) {
                continue;
            }
            ids[i] = indexes;
            if (order == 3) {
                trigrams[i] = lm.getVocabulary().encodeTrigram(indexes);
            }
            i++;
            if (i == gramCount) {
                break;
            }
        }
    }
    // Only the probability look-ups below are timed.
    sw.start();
    double tr = 0;
    for (int[] id : ids) {
        tr += lm.getProbability(id);
    }
    System.out.println(sw.elapsed(TimeUnit.MILLISECONDS));
    // Printing the sum keeps the look-up results observable.
    System.out.println("tr = " + tr);
}
Also used : Stopwatch(com.google.common.base.Stopwatch) LineIterator(zemberek.core.io.LineIterator) File(java.io.File) SpaceTabTokenizer(zemberek.core.SpaceTabTokenizer) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 4 with LineIterator

use of zemberek.core.io.LineIterator in project zemberek-nlp by ahmetaa.

the class AmbiguityStats method readAll (this example repeats Example 2 verbatim).

/**
 * Reads every line of the given file through a trimming UTF-8 reader, normalizes quotes and
 * hyphens with {@code TextUtil.normalizeQuotesHyphens}, tokenizes the result with
 * {@code lexer} and re-joins the tokens with single spaces.
 *
 * @param filename path of the file to read
 * @return one space-joined token string per input line, in file order
 * @throws IOException if the file cannot be read
 */
public List<String> readAll(String filename) throws IOException {
    List<String> result = new ArrayList<>();
    LineIterator lineIterator = SimpleTextReader.trimmingUTF8Reader(new File(filename)).getLineIterator();
    for (; lineIterator.hasNext(); ) {
        String normalized = TextUtil.normalizeQuotesHyphens(lineIterator.next());
        String joinedTokens = Joiner.on(" ").join(lexer.tokenizeToStrings(normalized));
        result.add(joinedTokens);
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) File(java.io.File) LineIterator(zemberek.core.io.LineIterator)

Example 5 with LineIterator

use of zemberek.core.io.LineIterator in project zemberek-nlp by ahmetaa.

the class DisambiguatorPreprocessor method processOflazerAnalyzerOutputYuret.

/**
 * Reads the analyzer output in {@code oflazerResult} (as UTF-8) line by line and writes a
 * re-formatted version to {@code out}, encoded as ISO-8859-9.
 *
 * <p>Behavior visible in this method:
 * <ul>
 *   <li>The output is wrapped in {@code <DOC>\t<DOC>} and {@code </DOC>\t</DOC>} lines.</li>
 *   <li>Every occurrence of {@code AorPart} in an input line is rewritten to
 *       {@code PresPart}.</li>
 *   <li>Each non-empty line is treated as one parse; the text before the first tab is the word.</li>
 *   <li>An empty line flushes the parses collected so far, followed by two empty lines.</li>
 *   <li>{@code <S>\t<S>} is written before the first parse of a sentence; a line whose word is
 *       {@code #} ends the sentence and writes {@code </S>\t</S>}.</li>
 *   <li>Words found in {@code punctuations} get a synthetic {@code +Punc} parse.</li>
 *   <li>Lines ending with {@code ?} are replaced with the result of
 *       {@code inferUnknownWordParse} (presumably these are words the analyzer could not
 *       parse; confirm against the analyzer's output format).</li>
 * </ul>
 *
 * @param oflazerResult analyzer output file to read
 * @param out           destination file
 * @throws IOException if reading or writing fails
 */
public void processOflazerAnalyzerOutputYuret(File oflazerResult, File out) throws IOException {
    SimpleTextWriter yuretFileWriter = SimpleTextWriter.keepOpenWriter(new FileOutputStream(out), "ISO-8859-9");
    // The writer is closed in a finally block so the output stream is released even when
    // reading or parse processing throws.
    try {
        yuretFileWriter.writeLine("<DOC>\t<DOC>");
        yuretFileWriter.writeLine();
        // NOTE(review): the line iterator is never closed explicitly here; confirm whether
        // LineIterator releases its underlying reader once exhausted.
        LineIterator li = new SimpleTextReader(oflazerResult, "UTF-8").getLineIterator();
        boolean sentenceStarted = false;
        List<String> parses = new ArrayList<>();
        while (li.hasNext()) {
            // Literal replacement; no regular expression is needed for this substitution.
            String line = li.next().trim().replace("AorPart", "PresPart");
            String word = Strings.subStringUntilFirst(line, "\t").trim();
            // Blank lines outside a sentence carry no information.
            if (line.length() == 0 && !sentenceStarted) {
                continue;
            }
            // A blank line inside a sentence terminates the current group of parses.
            if (line.length() == 0 && parses.size() > 0) {
                yuretFileWriter.writeLines(parses);
                yuretFileWriter.writeLine();
                yuretFileWriter.writeLine();
                parses = new ArrayList<>();
            }
            if (line.length() > 0) {
                if (parses.size() == 0) {
                    if (!sentenceStarted) {
                        yuretFileWriter.writeLine("<S>\t<S>");
                        yuretFileWriter.writeLine();
                    }
                    sentenceStarted = true;
                }
                if (punctuations.contains(word)) {
                    // The analyzer does not produce parses for punctuation, so one is
                    // synthesized here.
                    parses.add(word + "\t" + word + "\t+Punc");
                } else if (!line.endsWith("?")) {
                    parses.add(line);
                } else if (!word.equals("#")) {
                    String inferred = inferUnknownWordParse(word);
                    System.out.println("Bad word: [" + line + "] inferred to [" + inferred + "]");
                    parses.add(inferred);
                }
            }
            // '#' marks the end of a sentence; any pending parses are discarded.
            if (word.equals("#")) {
                sentenceStarted = false;
                yuretFileWriter.writeLine("</S>\t</S>\n");
                parses = new ArrayList<>();
            }
        }
        yuretFileWriter.writeLine("</DOC>\t</DOC>");
    } finally {
        yuretFileWriter.close();
    }
}
Also used : FileOutputStream(java.io.FileOutputStream) SimpleTextReader(zemberek.core.io.SimpleTextReader) ArrayList(java.util.ArrayList) LineIterator(zemberek.core.io.LineIterator) SimpleTextWriter(zemberek.core.io.SimpleTextWriter)

Aggregations

LineIterator (zemberek.core.io.LineIterator)5 File (java.io.File)3 ArrayList (java.util.ArrayList)3 SimpleTextReader (zemberek.core.io.SimpleTextReader)2 SimpleTextWriter (zemberek.core.io.SimpleTextWriter)2 Stopwatch (com.google.common.base.Stopwatch)1 FileOutputStream (java.io.FileOutputStream)1 Ignore (org.junit.Ignore)1 Test (org.junit.Test)1 SpaceTabTokenizer (zemberek.core.SpaceTabTokenizer)1