Search in sources :

Example 1 with ParallelChunk

use of edu.umd.hooka.corpora.ParallelChunk in project Cloud9 by lintool.

the class ParallelCorpusReader method startElement.

//Event Handlers
/**
 * SAX start-tag callback: prepares parser state for the element being opened.
 *
 * <p>Tag names are matched case-insensitively against {@code qName}:
 * <ul>
 *   <li>{@code pchunk} - starts a new {@link ParallelChunk} named by the {@code name} attribute;</li>
 *   <li>{@code s} - resolves the {@code lang} attribute to a {@code Language} and replaces
 *       {@code tempVal} with an empty buffer;</li>
 *   <li>{@code wordalignment} - replaces {@code tempVal} with an empty buffer and resolves the
 *       {@code langpair} attribute to a {@code LanguagePair};</li>
 *   <li>{@code pdoc} - records the {@code name} attribute as the document name.</li>
 * </ul>
 *
 * @param uri        namespace URI supplied by the parser (unused here)
 * @param localName  local element name supplied by the parser (unused here)
 * @param qName      qualified element name; this is what the handler dispatches on
 * @param attributes attributes attached to the element
 * @throws SAXException if {@code qName} is not one of the tags listed above
 */
public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) throws SAXException {
    // Each recognised tag resets the state it owns and returns immediately.
    if (qName.equalsIgnoreCase("pchunk")) {
        pchunk = new ParallelChunk();
        final String chunkName = attributes.getValue("name");
        pchunk.setName(chunkName);
        return;
    }

    if (qName.equalsIgnoreCase("s")) {
        final String isoCode = attributes.getValue("lang");
        lang = Language.languageForISO639_1(isoCode);
        tempVal = new StringBuffer();
        return;
    }

    if (qName.equalsIgnoreCase("wordalignment")) {
        // The buffer is reset before the language pair is resolved, as in the original ordering.
        tempVal = new StringBuffer();
        final String isoPair = attributes.getValue("langpair");
        langpair = LanguagePair.languageForISO639_1Pair(isoPair);
        return;
    }

    if (qName.equalsIgnoreCase("pdoc")) {
        docName = attributes.getValue("name");
        return;
    }

    // Nothing matched: reject the document rather than silently skipping the element.
    throw new SAXException("Unknown tag: " + qName);
}
Also used : ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) SAXException(org.xml.sax.SAXException)

Example 2 with ParallelChunk

use of edu.umd.hooka.corpora.ParallelChunk in project Cloud9 by lintool.

the class ParallelCorpusReader method convertToXMLDocument.

/**
 * Converts two line-parallel UTF-8 text files (and, optionally, a line-parallel alignment file)
 * into a single {@code <pdoc>} XML document made of one {@link ParallelChunk} per line pair.
 *
 * <p>Line {@code i} of {@code ifile1} is stored under the language given by {@code le}, line
 * {@code i} of {@code ifile2} under the language given by {@code lf}. Each chunk is named
 * {@code label + i} (1-based). Conversion stops at the end of {@code ifile1}, or earlier if
 * {@code ifile2} runs out of lines; a warning is printed when the two files differ in length.
 *
 * <p>Any failure (including the argument checks below) is reported with a stack trace on
 * standard error and is not propagated to the caller. All streams are closed in every case.
 *
 * @param label          prefix for the generated chunk names
 * @param ifile1         path of the first text file; its last path component (after the final
 *                       {@code '/'}) becomes the {@code name} attribute of {@code <pdoc>}
 * @param ifile2         path of the second text file
 * @param afile1_2       path of the alignment file; must be set iff {@code readAlignments}
 * @param ofile          path of the XML file to write
 * @param oenc           character encoding used for the output file
 * @param le             ISO 639-1 code of the language of {@code ifile1}
 * @param lf             ISO 639-1 code of the language of {@code ifile2}
 * @param readAlignments whether alignment lines should be read from {@code afile1_2}
 */
private static void convertToXMLDocument(String label, String ifile1, String ifile2, String afile1_2, String ofile, String oenc, String le, String lf, boolean readAlignments) {
    BufferedReader r1 = null;
    BufferedReader r2 = null;
    BufferedReader r1_2 = null;
    FileOutputStream out = null;
    BufferedWriter w = null;
    try {
        boolean haveAlignmentFile = afile1_2 != null && !afile1_2.equals("");
        if (readAlignments && !haveAlignmentFile) {
            throw new RuntimeException("I'm supposed to read alignments, but no alignment file is set!");
        }
        if (!readAlignments && haveAlignmentFile) {
            throw new RuntimeException("I'm not set to read alignments, but an alignment file is set!");
        }

        r1 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile1), "UTF8"));
        r2 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile2), "UTF8"));
        if (readAlignments) {
            r1_2 = new BufferedReader(new InputStreamReader(new FileInputStream(afile1_2), "UTF8"));
        }
        // Keep a handle on the raw stream: if 'oenc' is unsupported the writer constructor
        // throws and the file descriptor would otherwise be leaked.
        out = new FileOutputStream(ofile);
        OutputStreamWriter w1 = new OutputStreamWriter(out, oenc);
        w = new BufferedWriter(w1);

        Language de = Language.languageForISO639_1(lf);
        Language en = Language.languageForISO639_1(le);
        LanguagePair ende = null;
        if (readAlignments) {
            ende = LanguagePair.languageForISO639_1Pair(le + "-" + lf);
        }
        System.err.println("Reading " + en + " from: " + ifile1);
        System.err.println("Reading " + de + " from: " + ifile2);
        if (readAlignments) {
            System.err.println("Reading alignments (" + ende + ") from: " + afile1_2);
        }

        w.write("<?xml version=\"1.0\" encoding=\"" + w1.getEncoding() + "\"?>");
        w.newLine();
        // lastIndexOf returns -1 when there is no '/', so "+ 1" yields 0 and the whole
        // file name is kept (the previous code dropped its first character in that case).
        String docName = ifile1.substring(ifile1.lastIndexOf('/') + 1);
        w.write("<pdoc name=\"" + docName + "\">");
        w.newLine();

        String e;
        int lc = 0;
        boolean alignmentsExhausted = false;
        while ((e = r1.readLine()) != null) {
            String f = r2.readLine();
            if (f == null) {
                System.err.println("WARNING: " + ifile2 + " has fewer lines than " + ifile1);
                break;
            }
            // Count only complete sentence pairs, so the final report matches what was written.
            lc += 1;

            String a = null;
            if (readAlignments && !alignmentsExhausted) {
                a = r1_2.readLine();
                if (a == null) {
                    // Warn once; every later sentence is written without an alignment.
                    alignmentsExhausted = true;
                    System.err.println(afile1_2 + " has fewer lines than corpora files -- dropping alignments for remaining sentences");
                }
            }

            Chunk ec = new Chunk(e);
            Chunk fc = new Chunk(f);
            String name = label + lc;
            ParallelChunk p = new ParallelChunk();
            p.setName(name);
            p.addChunk(de, fc);
            p.addChunk(en, ec);
            if (a != null) {
                ReferenceAlignment ra = new ReferenceAlignment(ec.getLength(), fc.getLength());
                try {
                    ra.addAlignmentPointsPharaoh(a);
                    p.addReferenceAlignment(ende, ra);
                } catch (RuntimeException re) {
                    // A bad alignment line only costs this sentence its alignment, not the run.
                    System.err.println("Couldn't set alignment points for sentence # " + lc);
                    System.err.println(" " + en + ": len=" + ec.getLength() + " words=" + ec);
                    System.err.println(" " + de + ": len=" + fc.getLength() + " words=" + fc);
                    System.err.println(" " + ende + ": " + a);
                }
            }
            w.write(p.toXML());
        }

        if (r2.readLine() != null) {
            System.err.println("WARNING: " + ifile2 + " has more lines than " + ifile1);
        }
        w.write("</pdoc>");
        w.newLine();
        // Close explicitly on the success path so a failed flush is reported by the catch below.
        w.close();
        System.out.println("Converted " + lc + " sentences");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Closing an already-closed stream is a no-op, so this is safe after the explicit close.
        closeQuietly(w);
        closeQuietly(out);
        closeQuietly(r1);
        closeQuietly(r2);
        closeQuietly(r1_2);
    }
}

/** Closes {@code c} if it is non-null, ignoring any I/O error raised while closing. */
private static void closeQuietly(java.io.Closeable c) {
    if (c == null) {
        return;
    }
    try {
        c.close();
    } catch (java.io.IOException ignored) {
        // Best-effort cleanup: the primary failure, if any, has already been reported.
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) LanguagePair(edu.umd.hooka.corpora.LanguagePair) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) Chunk(edu.umd.hooka.corpora.Chunk) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) ReferenceAlignment(edu.umd.hooka.alignment.aer.ReferenceAlignment) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) SAXException(org.xml.sax.SAXException) BufferedWriter(java.io.BufferedWriter) Language(edu.umd.hooka.corpora.Language) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) OutputStreamWriter(java.io.OutputStreamWriter)

Example 3 with ParallelChunk

use of edu.umd.hooka.corpora.ParallelChunk in project Cloud9 by lintool.

the class ParallelCorpusReader method main.

/**
 * Developer driver with hard-coded paths; the {@code if (true)} / {@code if (false)} guards
 * select which job runs. As written, only the alignment-reading conversion is enabled.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Disabled job: sample sentence pairs from an existing XML corpus into /tmp/bar.xml.
    if (false) {
        try {
            parseXMLDocument("/Users/redpony/bitexts/hansards.fr-en/hansards.fr-en.xml", new PChunkCallback() {

                // Fixed seed, so the same pairs are sampled on every run.
                Random sampler = new Random(1);

                BufferedWriter sampleOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("/tmp/bar.xml"), "UTF8"));

                public void handlePChunk(ParallelChunk p) {
                    Language french = Language.languageForISO639_1("fr");
                    Language english = Language.languageForISO639_1("en");

                    // Both sides must be present.
                    Chunk frChunk = p.getChunk(french);
                    if (frChunk == null) {
                        return;
                    }
                    Chunk enChunk = p.getChunk(english);
                    if (enChunk == null) {
                        return;
                    }

                    // Skip pairs where either side is longer than 40, or where the
                    // English/French length ratio exceeds 1.3.
                    float enLen = enChunk.getLength();
                    float frLen = frChunk.getLength();
                    if (enLen > 40 || frLen > 40) {
                        return;
                    }
                    if (enLen / frLen > 1.3) {
                        return;
                    }

                    try {
                        // Write the pair only when the random draw is at most 0.15.
                        if (sampler.nextDouble() <= 0.15) {
                            sampleOut.write(p.toXML());
                        }
                    } catch (Exception writeFailure) {
                        writeFailure.printStackTrace();
                    }
                }

                @Override
                public void finalize() {
                    try {
                        sampleOut.close();
                    } catch (Exception ignored) {
                        // Intentionally ignored: nothing useful can be done at this point.
                    }
                }
            });
        } catch (Exception parseFailure) {
            parseFailure.printStackTrace();
        }
    }

    // Enabled job: conversion with an alignment file.
    if (true) {
        convertToXMLDocument("koen_jhu_", "/Users/redpony/bitexts/kkn-eng-alignments/kkn.utf8", "/Users/redpony/bitexts/kkn-eng-alignments/eng", "/Users/redpony/bitexts/kkn-eng-alignments/align", "/tmp/foo.xml", "utf8", "ko", "en", true);
    }

    // Disabled job: conversion without alignments.
    if (false) {
        convertToXMLDocument("eu+nc_", "/Users/redpony/bitexts/corpus.en", "/Users/redpony/bitexts/corpus.de", "", "/tmp/foo.xml", "utf8", "en", "de", false);
    }
}
Also used : Random(java.util.Random) Language(edu.umd.hooka.corpora.Language) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) Chunk(edu.umd.hooka.corpora.Chunk) ParallelChunk(edu.umd.hooka.corpora.ParallelChunk) IOException(java.io.IOException) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) SAXException(org.xml.sax.SAXException) BufferedWriter(java.io.BufferedWriter)

Aggregations

ParallelChunk (edu.umd.hooka.corpora.ParallelChunk)3 SAXException (org.xml.sax.SAXException)3 Chunk (edu.umd.hooka.corpora.Chunk)2 Language (edu.umd.hooka.corpora.Language)2 BufferedWriter (java.io.BufferedWriter)2 FileOutputStream (java.io.FileOutputStream)2 IOException (java.io.IOException)2 OutputStreamWriter (java.io.OutputStreamWriter)2 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)2 ReferenceAlignment (edu.umd.hooka.alignment.aer.ReferenceAlignment)1 LanguagePair (edu.umd.hooka.corpora.LanguagePair)1 BufferedReader (java.io.BufferedReader)1 FileInputStream (java.io.FileInputStream)1 InputStreamReader (java.io.InputStreamReader)1 Random (java.util.Random)1