Search in sources :

Example 1 with Chunk

use of edu.umd.hooka.corpora.Chunk in project Cloud9 by lintool.

From the class ParallelCorpusReader, method endElement.

/**
 * SAX callback invoked when an element closes. Dispatches on the (case-insensitive) tag name:
 * <ul>
 *   <li>{@code pchunk} - counts the parallel chunk and hands it to the registered callback;</li>
 *   <li>{@code s} - turns the buffered text into a {@link Chunk} for the current language,
 *       or logs a warning to stderr when the segment is empty;</li>
 *   <li>{@code wordalignment} - parses the buffered text as Pharaoh-format alignment points and
 *       attaches the resulting reference alignment to the current parallel chunk;</li>
 *   <li>{@code pdoc} - prints summary counters to stderr.</li>
 * </ul>
 * The text buffer, current chunk, language and counters are instance state set outside this
 * method (not visible here).
 *
 * @param uri       namespace URI of the element (unused)
 * @param localName local name of the element (unused)
 * @param qName     qualified tag name used for dispatch
 * @throws SAXException     if the tag is not one of the recognised names
 * @throws RuntimeException if a word alignment appears before the chunk data for either language
 */
public void endElement(String uri, String localName, String qName) throws SAXException {
    if (qName.equalsIgnoreCase("pchunk")) {
        pchunkCount++;
        cb_.handlePChunk(pchunk);
    } else if (qName.equalsIgnoreCase("s")) {
        String s = tempVal.toString().trim();
        // Clear the buffer on both paths so an empty segment cannot leave stale state behind.
        tempVal = null;
        if (s.length() == 0) {
            System.err.println(pchunk.getName() + ": Empty segment for lang=" + lang);
        } else {
            // Reuse the already-trimmed text instead of trimming the buffer a second time.
            pchunk.addChunk(lang, new Chunk(s));
            chunkCount++;
        }
    } else if (qName.equalsIgnoreCase("wordalignment")) {
        // The alignment matrix is sized from both chunks, so they must already be present.
        Chunk sc = pchunk.getChunk(langpair.getSource());
        if (sc == null)
            throw new RuntimeException("PChunk doesn't contain data for lang: " + langpair.getSource() + ".  Note: manual word alignment data must follow the chunk data.");
        Chunk tc = pchunk.getChunk(langpair.getTarget());
        if (tc == null)
            throw new RuntimeException("PChunk doesn't contain data for lang: " + langpair.getTarget() + ".  Note: manual word alignment data must follow the chunk data.");
        ReferenceAlignment r = new ReferenceAlignment(sc.getLength(), tc.getLength());
        r.addAlignmentPointsPharaoh(tempVal.toString().trim());
        pchunk.addReferenceAlignment(langpair, r);
        refAlignCount++;
        tempVal = null;
    } else if (qName.equalsIgnoreCase("pdoc")) {
        System.err.println("Finished parsing document " + docName);
        System.err.println("  pchunks: " + pchunkCount);
        System.err.println("  chunks: " + chunkCount);
        System.err.println("  ref alignments: " + refAlignCount);
    } else {
        throw new SAXException("Unknown tag: " + qName);
    }
}
Also used : Chunk(edu.umd.hooka.corpora.Chunk) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) ReferenceAlignment(edu.umd.hooka.alignment.aer.ReferenceAlignment) SAXException(org.xml.sax.SAXException)

Example 2 with Chunk

use of edu.umd.hooka.corpora.Chunk in project Cloud9 by lintool.

From the class ParallelCorpusReader, method convertToXMLDocument.

/**
 * Converts two line-aligned plain-text corpus files (and, optionally, a line-aligned
 * Pharaoh-format word-alignment file) into a single {@code <pdoc>} XML document.
 *
 * <p>Line <i>n</i> of {@code ifile1} and line <i>n</i> of {@code ifile2} become one
 * {@link ParallelChunk} named {@code label + n} (1-based). Conversion stops at the end of
 * {@code ifile1}, or earlier (with a warning) when {@code ifile2} runs out of lines. Input
 * files are read as UTF-8. Any exception is caught and its stack trace printed; nothing is
 * propagated to the caller.
 *
 * @param label          prefix for generated chunk names
 * @param ifile1         path of the corpus file in language {@code le}
 * @param ifile2         path of the corpus file in language {@code lf}
 * @param afile1_2       path of the alignment file; must be set iff {@code readAlignments}
 * @param ofile          path of the XML file to write
 * @param oenc           character encoding name for the output file
 * @param le             ISO 639-1 code of the language in {@code ifile1}
 * @param lf             ISO 639-1 code of the language in {@code ifile2}
 * @param readAlignments whether reference alignments are read from {@code afile1_2}
 */
private static void convertToXMLDocument(String label, String ifile1, String ifile2, String afile1_2, String ofile, String oenc, String le, String lf, boolean readAlignments) {
    try {
        boolean haveAlignmentFile = afile1_2 != null && !afile1_2.equals("");
        if (readAlignments && !haveAlignmentFile) {
            throw new RuntimeException("I'm supposed to read alignments, but no alignment file is set!");
        }
        if (!readAlignments && haveAlignmentFile) {
            throw new RuntimeException("I'm not set to read alignments, but an alignment file is set!");
        }
        // try-with-resources closes every stream even when conversion fails part-way through.
        // A null resource (no alignment file) is permitted and simply skipped on close.
        try (BufferedReader r1 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile1), "UTF8"));
             BufferedReader r2 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile2), "UTF8"));
             BufferedReader r1_2 = readAlignments ? new BufferedReader(new InputStreamReader(new FileInputStream(afile1_2), "UTF8")) : null;
             OutputStreamWriter w1 = new OutputStreamWriter(new FileOutputStream(ofile), oenc);
             BufferedWriter w = new BufferedWriter(w1)) {
            Language de = Language.languageForISO639_1(lf);
            Language en = Language.languageForISO639_1(le);
            LanguagePair ende = readAlignments ? LanguagePair.languageForISO639_1Pair(le + "-" + lf) : null;
            System.err.println("Reading " + en + " from: " + ifile1);
            System.err.println("Reading " + de + " from: " + ifile2);
            if (readAlignments) {
                System.err.println("Reading alignments (" + ende + ") from: " + afile1_2);
            }
            w.write("<?xml version=\"1.0\" encoding=\"" + w1.getEncoding() + "\"?>");
            w.newLine();
            // lastIndexOf returns -1 when there is no '/', so +1 yields 0 and the whole
            // file name is kept (previously its first character was dropped).
            String docName = ifile1.substring(ifile1.lastIndexOf('/') + 1);
            w.write("<pdoc name=\"" + docName + "\">");
            w.newLine();
            String e;
            int lc = 0;
            while ((e = r1.readLine()) != null) {
                String f = r2.readLine();
                if (f == null) {
                    System.err.println("WARNING: " + ifile2 + " has fewer lines than " + ifile1);
                    break;
                }
                // Count only sentence pairs that are actually converted.
                lc += 1;
                String a = null;
                if (readAlignments) {
                    a = r1_2.readLine();
                    if (a == null) {
                        System.err.println(afile1_2 + " has fewer lines than corpora files -- dropping alignments for remaining sentences");
                    }
                }
                Chunk ec = new Chunk(e);
                Chunk fc = new Chunk(f);
                ParallelChunk p = new ParallelChunk();
                p.setName(label + lc);
                p.addChunk(de, fc);
                p.addChunk(en, ec);
                if (a != null) {
                    ReferenceAlignment ra = new ReferenceAlignment(ec.getLength(), fc.getLength());
                    try {
                        ra.addAlignmentPointsPharaoh(a);
                        p.addReferenceAlignment(ende, ra);
                    } catch (RuntimeException re) {
                        // A bad alignment line drops only this sentence's alignment, not the sentence.
                        System.err.println("Couldn't set alignment points for sentence # " + lc);
                        System.err.println(" " + en + ": len=" + ec.getLength() + " words=" + ec);
                        System.err.println(" " + de + ": len=" + fc.getLength() + " words=" + fc);
                        System.err.println(" " + ende + ": " + a);
                    }
                }
                w.write(p.toXML());
            }
            if (r2.readLine() != null) {
                System.err.println("WARNING: " + ifile2 + " has more lines than " + ifile1);
            }
            w.write("</pdoc>");
            System.out.println("Converted " + lc + " sentences");
            w.newLine();
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) LanguagePair(edu.umd.hooka.corpora.LanguagePair) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) Chunk(edu.umd.hooka.corpora.Chunk) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) ReferenceAlignment(edu.umd.hooka.alignment.aer.ReferenceAlignment) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) SAXException(org.xml.sax.SAXException) BufferedWriter(java.io.BufferedWriter) Language(edu.umd.hooka.corpora.Language) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) OutputStreamWriter(java.io.OutputStreamWriter)

Example 3 with Chunk

use of edu.umd.hooka.corpora.Chunk in project Cloud9 by lintool.

From the class ParallelCorpusReader, method main.

/**
 * Ad-hoc driver with hard-coded developer paths; the {@code if (true)} / {@code if (false)}
 * switches select which task runs.
 * <ol>
 *   <li>(disabled) Parse a French-English XML corpus and write a random ~15% sample of the
 *       sentence pairs that pass the length filters to {@code /tmp/bar.xml};</li>
 *   <li>(enabled) convert a Korean-English corpus with alignments to {@code /tmp/foo.xml};</li>
 *   <li>(disabled) convert an English-German corpus without alignments to {@code /tmp/foo.xml}.</li>
 * </ol>
 *
 * @param args ignored
 */
public static void main(String[] args) {
    if (false) {
        // The writer is a scoped resource closed once parsing returns; it previously relied on
        // finalize(), which is not guaranteed to run and could lose buffered output.
        // NOTE(review): assumes parseXMLDocument invokes the callback synchronously - confirm.
        try (final BufferedWriter br = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("/tmp/bar.xml"), "UTF8"))) {
            parseXMLDocument("/Users/redpony/bitexts/hansards.fr-en/hansards.fr-en.xml", new PChunkCallback() {

                // Fixed seed keeps the sample reproducible between runs.
                Random r = new Random(1);

                public void handlePChunk(ParallelChunk p) {
                    Language fr = Language.languageForISO639_1("fr");
                    Language en = Language.languageForISO639_1("en");
                    Chunk f = p.getChunk(fr);
                    if (f == null)
                        return;
                    Chunk e = p.getChunk(en);
                    if (e == null)
                        return;
                    float elen = e.getLength();
                    float flen = f.getLength();
                    // Skip pairs where either side is longer than 40.
                    if (elen > 40)
                        return;
                    if (flen > 40)
                        return;
                    // Skip pairs whose English/French length ratio exceeds 1.3.
                    float ra = elen / flen;
                    if (ra > 1.3)
                        return;
                    try {
                        if (r.nextDouble() > 0.15)
                            return;
                        br.write(p.toXML());
                    } catch (Exception e1) {
                        e1.printStackTrace();
                    }
                }
            });
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    if (true) {
        convertToXMLDocument("koen_jhu_", "/Users/redpony/bitexts/kkn-eng-alignments/kkn.utf8", "/Users/redpony/bitexts/kkn-eng-alignments/eng", "/Users/redpony/bitexts/kkn-eng-alignments/align", "/tmp/foo.xml", "utf8", "ko", "en", true);
    }
    if (false) {
        convertToXMLDocument("eu+nc_", "/Users/redpony/bitexts/corpus.en", "/Users/redpony/bitexts/corpus.de", "", "/tmp/foo.xml", "utf8", "en", "de", false);
    }
}
Also used : Random(java.util.Random) Language(edu.umd.hooka.corpora.Language) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) Chunk(edu.umd.hooka.corpora.Chunk) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) IOException(java.io.IOException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) SAXException(org.xml.sax.SAXException) BufferedWriter(java.io.BufferedWriter)

Aggregations

Chunk (edu.umd.hooka.corpora.Chunk): 3, ParallelChunk (edu.umd.hooka.corpora.ParallelChunk): 3, SAXException (org.xml.sax.SAXException): 3, ReferenceAlignment (edu.umd.hooka.alignment.aer.ReferenceAlignment): 2, Language (edu.umd.hooka.corpora.Language): 2, BufferedWriter (java.io.BufferedWriter): 2, FileOutputStream (java.io.FileOutputStream): 2, IOException (java.io.IOException): 2, OutputStreamWriter (java.io.OutputStreamWriter): 2, ParserConfigurationException (javax.xml.parsers.ParserConfigurationException): 2, LanguagePair (edu.umd.hooka.corpora.LanguagePair): 1, BufferedReader (java.io.BufferedReader): 1, FileInputStream (java.io.FileInputStream): 1, InputStreamReader (java.io.InputStreamReader): 1, Random (java.util.Random): 1