Search in sources :

Example 1 with LanguagePair

use of edu.umd.hooka.corpora.LanguagePair in project Cloud9 by lintool.

The example is the method main of the class CreateWordAlignmentCorpus.

/**
 * Command-line entry point that preprocesses a text file for word alignment.
 *
 * <p>Expects exactly three arguments: {@code <lang> <infile.txt> <outfile.txt>}.
 * Each line of the input file (read as UTF8) is split on runs of whitespace, the
 * tokens are passed through an {@link AlignmentWordPreprocessor} created for the
 * language pair {@code <lang>-en}, and the resulting tokens are written to the
 * output file (UTF8) separated by single spaces, one output line per input line.
 *
 * <p>With the wrong number of arguments a usage message is printed to stderr and
 * the JVM exits with status 1. An {@link IOException} is reported via its stack
 * trace; both files are closed whether or not processing succeeds.
 *
 * @param args {@code lang}, input file path, output file path
 */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.println("Usage: CreateWordAlignmentCorpus <lang> <infile.txt> <outfile.txt>");
        System.err.println("          (note: lang must be a two-letter ISO639 code)");
        System.exit(1);
    }
    // try-with-resources: the reader used to be left open, and the writer was only
    // closed when no exception occurred.
    try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "UTF8"))) {
        Language fl = Language.languageForISO639_1(args[0]);
        LanguagePair lp = LanguagePair.languageForISO639_1Pair(args[0] + "-en");
        AlignmentWordPreprocessor sawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, fl, null);
        // The output file is opened only once the preprocessor exists, so a failure
        // while setting up the preprocessor does not create or truncate the output file.
        try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[2]), "UTF8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                String[] words = sawp.preprocessWordsForAlignment(line.split("\\s+"));
                for (int i = 0; i < words.length; i++) {
                    if (i > 0) {
                        out.write(' ');
                    }
                    out.write(words[i]);
                }
                out.newLine();
            }
            // Closing the writer (implicitly, at the end of this block) flushes it.
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) LanguagePair(edu.umd.hooka.corpora.LanguagePair) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) BufferedWriter(java.io.BufferedWriter) Language(edu.umd.hooka.corpora.Language) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) OutputStreamWriter(java.io.OutputStreamWriter)

Example 2 with LanguagePair

use of edu.umd.hooka.corpora.LanguagePair in project Cloud9 by lintool.

The example is the method convertToXMLDocument of the class ParallelCorpusReader.

/**
 * Converts two line-aligned text files (and optionally a Pharaoh-format alignment
 * file) into a single {@code <pdoc>} XML document.
 *
 * <p>Line <i>i</i> of {@code ifile1} and line <i>i</i> of {@code ifile2} become one
 * {@link ParallelChunk} named {@code label + i} (1-based). When
 * {@code readAlignments} is set, line <i>i</i> of {@code afile1_2} is attached as a
 * {@link ReferenceAlignment}; a line whose alignment points cannot be applied is
 * reported on stderr and the sentence is written without an alignment. Conversion
 * stops at the end of the shorter of the two text files, with a warning if their
 * lengths differ.
 *
 * <p>Any exception, including the argument-consistency checks below, is caught and
 * reported via its stack trace rather than propagated to the caller. All files are
 * closed whether or not conversion succeeds.
 *
 * @param label          prefix for the generated chunk names
 * @param ifile1         path of the text file for language {@code le} (read as UTF8)
 * @param ifile2         path of the text file for language {@code lf} (read as UTF8)
 * @param afile1_2       path of the alignment file; must be non-empty iff
 *                       {@code readAlignments} is true
 * @param ofile          path of the XML file to write
 * @param oenc           character encoding name for the output file
 * @param le             ISO 639-1 code of the language in {@code ifile1}
 * @param lf             ISO 639-1 code of the language in {@code ifile2}
 * @param readAlignments whether alignments are read from {@code afile1_2}
 */
private static void convertToXMLDocument(String label, String ifile1, String ifile2, String afile1_2, String ofile, String oenc, String le, String lf, boolean readAlignments) {
    try {
        if (readAlignments) {
            if (afile1_2 == null || afile1_2.equals("")) {
                throw new RuntimeException("I'm supposed to read alignments, but no alignment file is set!");
            }
        } else if (afile1_2 != null && !afile1_2.equals("")) {
            throw new RuntimeException("I'm not set to read alignments, but an alignment file is set!");
        }
        // try-with-resources: previously every stream leaked if anything threw before
        // the explicit close() calls at the end. A null resource (no alignment file)
        // is permitted and simply skipped on close.
        try (BufferedReader r1 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile1), "UTF8"));
             BufferedReader r2 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile2), "UTF8"));
             BufferedReader r1_2 = readAlignments
                     ? new BufferedReader(new InputStreamReader(new FileInputStream(afile1_2), "UTF8"))
                     : null;
             OutputStreamWriter w1 = new OutputStreamWriter(new FileOutputStream(ofile), oenc);
             BufferedWriter w = new BufferedWriter(w1)) {
            Language de = Language.languageForISO639_1(lf);
            Language en = Language.languageForISO639_1(le);
            LanguagePair ende = null;
            if (readAlignments) {
                ende = LanguagePair.languageForISO639_1Pair(le + "-" + lf);
            }
            System.err.println("Reading " + en + " from: " + ifile1);
            System.err.println("Reading " + de + " from: " + ifile2);
            if (readAlignments) {
                System.err.println("Reading alignments (" + ende + ") from: " + afile1_2);
            }
            w.write("<?xml version=\"1.0\" encoding=\"" + w1.getEncoding() + "\"?>");
            w.newLine();
            // lastIndexOf returns -1 when there is no '/', so "+ 1" yields the whole
            // name. The old code reset the index to 0 in that case and then skipped
            // one character, dropping the first letter of a slash-less file name.
            String docName = ifile1.substring(ifile1.lastIndexOf('/') + 1);
            w.write("<pdoc name=\"" + docName + "\">");
            w.newLine();
            String eLine;
            int lc = 0;
            boolean alignmentsExhausted = false;
            while ((eLine = r1.readLine()) != null) {
                String fLine = r2.readLine();
                if (fLine == null) {
                    System.err.println("WARNING: " + ifile2 + " has fewer lines than " + ifile1);
                    break;
                }
                // Count only sentence pairs that are actually converted, so the final
                // "Converted N sentences" report is accurate when ifile2 is shorter.
                lc += 1;
                String a = null;
                if (readAlignments && !alignmentsExhausted) {
                    a = r1_2.readLine();
                    if (a == null) {
                        // Warn once instead of once per remaining sentence.
                        alignmentsExhausted = true;
                        System.err.println(afile1_2 + " has fewer lines than corpora files -- dropping alignments for remaining sentences");
                    }
                }
                Chunk ec = new Chunk(eLine);
                Chunk fc = new Chunk(fLine);
                String name = label + lc;
                ParallelChunk p = new ParallelChunk();
                p.setName(name);
                p.addChunk(de, fc);
                p.addChunk(en, ec);
                if (a != null) {
                    ReferenceAlignment ra = new ReferenceAlignment(ec.getLength(), fc.getLength());
                    try {
                        ra.addAlignmentPointsPharaoh(a);
                        p.addReferenceAlignment(ende, ra);
                    } catch (RuntimeException re) {
                        // A bad alignment line is reported and skipped; the sentence
                        // pair itself is still written.
                        System.err.println("Couldn't set alignment points for sentence # " + lc);
                        System.err.println(" " + en + ": len=" + ec.getLength() + " words=" + ec);
                        System.err.println(" " + de + ": len=" + fc.getLength() + " words=" + fc);
                        System.err.println(" " + ende + ": " + a);
                    }
                }
                w.write(p.toXML());
            }
            String t = r2.readLine();
            if (t != null) {
                System.err.println("WARNING: " + ifile2 + " has more lines than " + ifile1);
            }
            w.write("</pdoc>");
            System.out.println("Converted " + lc + " sentences");
            w.newLine();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) LanguagePair(edu.umd.hooka.corpora.LanguagePair) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) Chunk(edu.umd.hooka.corpora.Chunk) ReferenceAlignment(edu.umd.hooka.alignment.aer.ReferenceAlignment) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) SAXException(org.xml.sax.SAXException) BufferedWriter(java.io.BufferedWriter) Language(edu.umd.hooka.corpora.Language) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) OutputStreamWriter(java.io.OutputStreamWriter)

Aggregations

Language (edu.umd.hooka.corpora.Language)2 LanguagePair (edu.umd.hooka.corpora.LanguagePair)2 BufferedReader (java.io.BufferedReader)2 BufferedWriter (java.io.BufferedWriter)2 FileInputStream (java.io.FileInputStream)2 FileOutputStream (java.io.FileOutputStream)2 IOException (java.io.IOException)2 InputStreamReader (java.io.InputStreamReader)2 OutputStreamWriter (java.io.OutputStreamWriter)2 ReferenceAlignment (edu.umd.hooka.alignment.aer.ReferenceAlignment)1 Chunk (edu.umd.hooka.corpora.Chunk)1 ParallelChunk (edu.umd.hooka.corpora.ParallelChunk)1 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)1 SAXException (org.xml.sax.SAXException)1