Search in sources :

Example 1 with Language

use of edu.umd.hooka.corpora.Language in project Cloud9 by lintool.

Class CreateWordAlignmentCorpus, method main.

/**
 * Command-line entry point: reads a text file line by line, runs each line's
 * whitespace-separated tokens through an {@code AlignmentWordPreprocessor}, and
 * writes the resulting tokens (joined by single spaces) to the output file, one
 * output line per input line.
 *
 * <p>Both files are read and written as UTF-8. The preprocessor is created for the
 * language pair {@code <lang>-en}.
 *
 * @param args exactly three values: the language code, the input file path and the
 *             output file path; any other count prints a usage message and exits
 *             with status 1
 */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.println("Usage: CreateWordAlignmentCorpus <lang> <infile.txt> <outfile.txt>");
        System.err.println("          (note: lang must be a two-letter ISO639 code)");
        System.exit(1);
    }
    // try-with-resources guarantees both streams are closed (and the writer flushed)
    // even when reading, preprocessing or writing fails part-way through.
    try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "UTF8"))) {
        Language fl = Language.languageForISO639_1(args[0]);
        LanguagePair lp = LanguagePair.languageForISO639_1Pair(args[0] + "-en");
        AlignmentWordPreprocessor sawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, fl, null);
        // The output file is opened only after the language lookups have succeeded,
        // so a bad language code does not create or truncate the output file.
        try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[2]), "UTF8"))) {
            String l;
            while ((l = in.readLine()) != null) {
                String[] res = sawp.preprocessWordsForAlignment(l.split("\\s+"));
                boolean first = true;
                for (String r : res) {
                    // Separate tokens with a single space, without a leading one.
                    if (first) {
                        first = false;
                    } else {
                        out.write(' ');
                    }
                    out.write(r);
                }
                out.newLine();
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) LanguagePair(edu.umd.hooka.corpora.LanguagePair) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) BufferedWriter(java.io.BufferedWriter) Language(edu.umd.hooka.corpora.Language) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) OutputStreamWriter(java.io.OutputStreamWriter)

Example 2 with Language

use of edu.umd.hooka.corpora.Language in project Cloud9 by lintool.

Class ParallelCorpusReader, method convertToXMLDocument.

/**
 * Converts two line-parallel text files (and, optionally, a line-parallel alignment
 * file) into a single XML document wrapped in a {@code <pdoc>} element.
 *
 * <p>For every line pair a {@code ParallelChunk} named {@code label + lineNumber}
 * (1-based) is built and its {@code toXML()} output is written. When alignments are
 * read, each alignment line is handed to
 * {@code ReferenceAlignment.addAlignmentPointsPharaoh}; a line that cannot be
 * applied is reported on stderr and the sentence is written without an alignment.
 *
 * <p>Any exception, including the argument-consistency checks below, is caught and
 * its stack trace printed; nothing is propagated to the caller.
 *
 * @param label          prefix for each chunk name
 * @param ifile1         path of the text file for language {@code le} (read as UTF-8)
 * @param ifile2         path of the text file for language {@code lf} (read as UTF-8)
 * @param afile1_2       path of the alignment file; must be non-empty iff
 *                       {@code readAlignments} is true
 * @param ofile          path of the XML file to write
 * @param oenc           character encoding name for the output file
 * @param le             language code of {@code ifile1}
 * @param lf             language code of {@code ifile2}
 * @param readAlignments whether {@code afile1_2} should be read
 */
private static void convertToXMLDocument(String label, String ifile1, String ifile2, String afile1_2, String ofile, String oenc, String le, String lf, boolean readAlignments) {
    try {
        if (readAlignments) {
            if (afile1_2 == null || afile1_2.equals(""))
                throw new RuntimeException("I'm supposed to read alignments, but no alignment file is set!");
        } else if (afile1_2 != null && !afile1_2.equals(""))
            throw new RuntimeException("I'm not set to read alignments, but an alignment file is set!");
        // All streams are managed by try-with-resources so they are closed even when
        // conversion fails part-way. A null resource (no alignment file) is skipped.
        try (BufferedReader r1 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile1), "UTF8"));
             BufferedReader r2 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile2), "UTF8"));
             BufferedReader r1_2 = readAlignments
                     ? new BufferedReader(new InputStreamReader(new FileInputStream(afile1_2), "UTF8"))
                     : null;
             // Declared separately so the file handle is released even if oenc is
             // not a supported encoding and the OutputStreamWriter constructor throws.
             FileOutputStream fos = new FileOutputStream(ofile);
             OutputStreamWriter w1 = new OutputStreamWriter(fos, oenc);
             BufferedWriter w = new BufferedWriter(w1)) {
            Language de = Language.languageForISO639_1(lf);
            Language en = Language.languageForISO639_1(le);
            LanguagePair ende = null;
            if (readAlignments)
                ende = LanguagePair.languageForISO639_1Pair(le + "-" + lf);
            System.err.println("Reading " + en + " from: " + ifile1);
            System.err.println("Reading " + de + " from: " + ifile2);
            if (readAlignments)
                System.err.println("Reading alignments (" + ende + ") from: " + afile1_2);
            w.write("<?xml version=\"1.0\" encoding=\"" + w1.getEncoding() + "\"?>");
            w.newLine();
            // Document name is the last path component of ifile1. lastIndexOf returns
            // -1 when there is no '/', so "+ 1" yields 0 and the whole name is kept
            // (previously the first character was dropped in that case).
            String docName = ifile1.substring(ifile1.lastIndexOf('/') + 1);
            w.write("<pdoc name=\"" + docName + "\">");
            w.newLine();
            String eLine;
            int lc = 0;
            while ((eLine = r1.readLine()) != null) {
                String fLine = r2.readLine();
                if (fLine == null) {
                    System.err.println("WARNING: " + ifile2 + " has fewer lines than " + ifile1);
                    break;
                }
                // Count only sentence pairs that are actually converted, so the final
                // "Converted N sentences" report is not off by one for a short ifile2.
                lc += 1;
                String a = null;
                if (readAlignments) {
                    a = r1_2.readLine();
                    if (a == null)
                        System.err.println(afile1_2 + " has fewer lines than corpora files -- dropping alignments for remaining sentences");
                }
                Chunk ec = new Chunk(eLine);
                Chunk fc = new Chunk(fLine);
                String name = label + lc;
                ParallelChunk p = new ParallelChunk();
                p.setName(name);
                p.addChunk(de, fc);
                p.addChunk(en, ec);
                if (a != null) {
                    ReferenceAlignment ra = new ReferenceAlignment(ec.getLength(), fc.getLength());
                    try {
                        ra.addAlignmentPointsPharaoh(a);
                        p.addReferenceAlignment(ende, ra);
                    } catch (RuntimeException re) {
                        // Best effort: report the bad alignment and still emit the sentence.
                        System.err.println("Couldn't set alignment points for sentence # " + lc);
                        System.err.println(" " + en + ": len=" + ec.getLength() + " words=" + ec);
                        System.err.println(" " + de + ": len=" + fc.getLength() + " words=" + fc);
                        System.err.println(" " + ende + ": " + a);
                    }
                }
                w.write(p.toXML());
            }
            // Any line still available in ifile2 means it is longer than ifile1.
            String t = r2.readLine();
            if (t != null)
                System.err.println("WARNING: " + ifile2 + " has more lines than " + ifile1);
            w.write("</pdoc>");
            System.out.println("Converted " + lc + " sentences");
            w.newLine();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) LanguagePair(edu.umd.hooka.corpora.LanguagePair) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) Chunk(edu.umd.hooka.corpora.Chunk) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) ReferenceAlignment(edu.umd.hooka.alignment.aer.ReferenceAlignment) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) SAXException(org.xml.sax.SAXException) BufferedWriter(java.io.BufferedWriter) Language(edu.umd.hooka.corpora.Language) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) OutputStreamWriter(java.io.OutputStreamWriter)

Example 3 with Language

use of edu.umd.hooka.corpora.Language in project Cloud9 by lintool.

Class ParallelCorpusReader, method main.

/**
 * Ad-hoc driver with hard-coded local paths. Exactly one of the three sections is
 * switched on via a constant condition; as written, only the Korean/English
 * conversion with alignments runs.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    // Disabled: parse an existing XML corpus and write a filtered random sample of
    // its chunks to /tmp/bar.xml.
    if (false) {
        try {
            parseXMLDocument("/Users/redpony/bitexts/hansards.fr-en/hansards.fr-en.xml", new PChunkCallback() {

                // Fixed seed, so the sampled subset is reproducible.
                Random rng = new Random(1);

                BufferedWriter sampleOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("/tmp/bar.xml"), "UTF8"));

                public void handlePChunk(ParallelChunk p) {
                    Language french = Language.languageForISO639_1("fr");
                    Language english = Language.languageForISO639_1("en");
                    Chunk frChunk = p.getChunk(french);
                    if (frChunk == null) {
                        return;
                    }
                    Chunk enChunk = p.getChunk(english);
                    if (enChunk == null) {
                        return;
                    }
                    float enLen = enChunk.getLength();
                    float frLen = frChunk.getLength();
                    // Skip chunks where either side is longer than 40.
                    if (enLen > 40 || frLen > 40) {
                        return;
                    }
                    // Skip chunks whose English/French length ratio exceeds 1.3.
                    float ratio = enLen / frLen;
                    if (ratio > 1.3) {
                        return;
                    }
                    try {
                        // Keep the chunk only when the random draw is at most 0.15.
                        if (rng.nextDouble() <= 0.15) {
                            sampleOut.write(p.toXML());
                        }
                    } catch (Exception writeFailure) {
                        writeFailure.printStackTrace();
                    }
                }

                @Override
                public void finalize() {
                    try {
                        sampleOut.close();
                    } catch (Exception ignored) {
                    }
                }
            });
        } catch (Exception parseFailure) {
            parseFailure.printStackTrace();
        }
    }
    // Enabled: convert the Korean/English bitext together with its alignment file.
    if (true) {
        convertToXMLDocument("koen_jhu_", "/Users/redpony/bitexts/kkn-eng-alignments/kkn.utf8", "/Users/redpony/bitexts/kkn-eng-alignments/eng", "/Users/redpony/bitexts/kkn-eng-alignments/align", "/tmp/foo.xml", "utf8", "ko", "en", true);
    }
    // Disabled: convert the English/German corpus without alignments.
    if (false) {
        convertToXMLDocument("eu+nc_", "/Users/redpony/bitexts/corpus.en", "/Users/redpony/bitexts/corpus.de", "", "/tmp/foo.xml", "utf8", "en", "de", false);
    }
}
Also used : Random(java.util.Random) Language(edu.umd.hooka.corpora.Language) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) Chunk(edu.umd.hooka.corpora.Chunk) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) IOException(java.io.IOException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) SAXException(org.xml.sax.SAXException) BufferedWriter(java.io.BufferedWriter)

Aggregations

Language (edu.umd.hooka.corpora.Language)3 BufferedWriter (java.io.BufferedWriter)3 FileOutputStream (java.io.FileOutputStream)3 IOException (java.io.IOException)3 OutputStreamWriter (java.io.OutputStreamWriter)3 Chunk (edu.umd.hooka.corpora.Chunk)2 LanguagePair (edu.umd.hooka.corpora.LanguagePair)2 ParallelChunk (edu.umd.hooka.corpora.ParallelChunk)2 BufferedReader (java.io.BufferedReader)2 FileInputStream (java.io.FileInputStream)2 InputStreamReader (java.io.InputStreamReader)2 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)2 SAXException (org.xml.sax.SAXException)2 ReferenceAlignment (edu.umd.hooka.alignment.aer.ReferenceAlignment)1 Random (java.util.Random)1