Search in sources:

Example 6 with Bible

use of biblemulticonverter.data.Bible in project BibleMultiConverter by schierlm.

The class SWORD, method doImport.

/**
 * Converts a JSword book into a {@link Bible}.
 *
 * <p>The method works in three phases:
 * <ol>
 * <li>Collect all verses. The verses delivered by the iterator of
 * {@code book.getGlobalKeyList()} are taken as the baseline; in addition, every
 * verse of the versification that the iterator did not deliver but for which
 * {@code book.contains(...)} is true is added (with a warning on standard
 * output).</li>
 * <li>Render each verse as SAX events into a DOM {@code <verse>} element, unwrap
 * the expected {@code <div><title/>...</div>} envelope and hand the result to
 * {@code OSISHelper.handleVerse}. Verse number 0 is stored as chapter prolog,
 * all other verse numbers as regular verses. Verses that end up empty are
 * removed again.</li>
 * <li>Remove trailing empty chapters, and remove books left without any
 * chapter.</li>
 * </ol>
 *
 * @param book the JSword book to read
 * @return the converted Bible
 * @throws IllegalStateException if a verse number other than 0 is reported for chapter 0
 * @throws RuntimeException if the rendered verse does not have the expected
 *         {@code div}/{@code title} structure
 * @throws Exception if creating the XML infrastructure or rendering a verse fails
 */
protected Bible doImport(Book book) throws Exception {
    OSISHelper helper = new OSISHelper();
    Bible result = new Bible(book.getName());
    TransformerHandler th = ((SAXTransformerFactory) SAXTransformerFactory.newInstance()).newTransformerHandler();
    Map<BookID, biblemulticonverter.data.Book> parsedBooks = new EnumMap<>(BookID.class);
    Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();

    // Phase 1: build the complete list of verses to import.
    List<Verse> allVerses = new ArrayList<>();
    // The verse expected to follow the one handled last; null before the first verse.
    Verse nextCandidate = null;
    for (Iterator<?> iter = book.getGlobalKeyList().iterator(); iter.hasNext(); ) {
        Verse v = (Verse) iter.next();
        // Walk forward from the expected verse up to the verse actually delivered
        // and pick up everything in between that the book claims to contain.
        while (nextCandidate != null && !nextCandidate.equals(v)) {
            if (book.contains(nextCandidate)) {
                System.out.println("WARNING: Verse (after) skipped by iterator: " + nextCandidate);
                allVerses.add(nextCandidate);
            }
            Verse following = nextCandidate.getVersification().add(nextCandidate, 1);
            // Same guard as in the tail loop below: if adding 1 yields the same
            // verse again, the scan cannot advance any further. Without this check
            // a delivered verse lying before nextCandidate would make this loop
            // spin forever.
            if (nextCandidate.equals(following)) {
                System.out.println("WARNING: Reached end of versification while looking for " + v);
                break;
            }
            nextCandidate = following;
        }
        // Walk backwards from the delivered verse until a verse is reached that is
        // already known; the second contains() check terminates the walk when
        // subtract() starts returning a verse that was already seen in this walk.
        Verse prevCandidate = v.getVersification().subtract(v, 1);
        List<Verse> versesSkippedBefore = new ArrayList<>();
        while (prevCandidate != null && !allVerses.contains(prevCandidate) && !versesSkippedBefore.contains(prevCandidate)) {
            // Insert at the front so the list ends up in ascending order.
            versesSkippedBefore.add(0, prevCandidate);
            prevCandidate = prevCandidate.getVersification().subtract(prevCandidate, 1);
        }
        for (Verse vv : versesSkippedBefore) {
            if (book.contains(vv)) {
                System.out.println("WARNING: Verse (before) skipped by iterator: " + vv);
                allVerses.add(vv);
            }
        }
        allVerses.add(v);
        nextCandidate = v.getVersification().add(v, 1);
    }
    // Pick up verses after the last one delivered by the iterator.
    while (nextCandidate != null) {
        if (book.contains(nextCandidate)) {
            System.out.println("WARNING: Verse (at end) skipped by iterator: " + nextCandidate);
            allVerses.add(nextCandidate);
        }
        Verse nextNextCandidate = nextCandidate.getVersification().add(nextCandidate, 1);
        // Stop once add() no longer advances.
        if (nextCandidate.equals(nextNextCandidate))
            break;
        nextCandidate = nextNextCandidate;
    }

    // Phase 2: render and convert every collected verse.
    for (Verse v : allVerses) {
        BookID bkid = biblemulticonverter.sword.BookMapping.MAPPING.get(v.getBook());
        biblemulticonverter.data.Book bk = parsedBooks.get(bkid);
        if (!parsedBooks.containsKey(bkid)) {
            // First verse of this book: create the book with one initial chapter.
            bk = new biblemulticonverter.data.Book(bkid.getOsisID().replace("x-Intr", "Intr"), bkid, bkid.getEnglishName(), bkid.getEnglishName());
            parsedBooks.put(bkid, bk);
            bk.getChapters().add(new Chapter());
            result.getBooks().add(bk);
        }
        int chapterNum = v.getChapter(), verseNum = v.getVerse();
        // Make sure the chapter list is long enough for this chapter number.
        while (bk.getChapters().size() < chapterNum) bk.getChapters().add(new Chapter());
        // Chapter 0 shares the slot of chapter 1.
        Chapter chapter = bk.getChapters().get(chapterNum == 0 ? 0 : chapterNum - 1);
        FormattedText verse;
        if (verseNum == 0) {
            // Verse 0 becomes (part of) the chapter prolog; an existing prolog is
            // copied into the new text first so the new content is appended to it.
            verse = new FormattedText();
            if (chapter.getProlog() != null) {
                chapter.getProlog().accept(verse.getAppendVisitor());
            }
            chapter.setProlog(verse);
        } else {
            if (chapterNum == 0)
                throw new IllegalStateException("Verse " + verseNum + " in chapter 0 is invalid");
            verse = new biblemulticonverter.data.Verse("" + verseNum);
            chapter.getVerses().add((biblemulticonverter.data.Verse) verse);
        }
        // Render the verse as SAX events into a fresh DOM element.
        Element root = doc.createElement("verse");
        th.setResult(new DOMResult(root));
        new BookData(book, v).getSAXEventProvider().provideSAXEvents(th);
        // Expected shape: exactly one <div> child whose first child is <title>.
        // Drop the title and move the remaining div children directly below root.
        if (root.getChildNodes().getLength() == 1 && root.getFirstChild() instanceof Element && root.getFirstChild().getNodeName().equals("div") && root.getFirstChild().getChildNodes().getLength() >= 1 && root.getFirstChild().getFirstChild().getNodeName().equals("title")) {
            Element div = (Element) root.getFirstChild();
            root.removeChild(div);
            div.removeChild(div.getFirstChild());
            while (div.getFirstChild() != null) {
                Node child = div.getFirstChild();
                div.removeChild(child);
                root.appendChild(child);
            }
        } else {
            throw new RuntimeException("Unexpected OSIS structure!");
        }
        helper.handleVerse(root, verse);
        // Discard verses/prologs for which the helper produced no content.
        if (verse.getElementTypes(1).length() == 0) {
            System.out.println("WARNING: Empty verse " + bk.getAbbr() + " " + chapterNum + ":" + verseNum);
            if (verse instanceof biblemulticonverter.data.Verse)
                chapter.getVerses().remove(verse);
            else
                chapter.setProlog(null);
        }
    }

    // Phase 3: strip trailing empty chapters and drop books without chapters.
    for (biblemulticonverter.data.Book bk : parsedBooks.values()) {
        while (!bk.getChapters().isEmpty()) {
            Chapter ch = bk.getChapters().get(bk.getChapters().size() - 1);
            if (ch.getProlog() == null && ch.getVerses().isEmpty()) {
                bk.getChapters().remove(ch);
            } else {
                break;
            }
        }
        if (bk.getChapters().isEmpty()) {
            result.getBooks().remove(bk);
        }
    }
    return result;
}
Also used : TransformerHandler(javax.xml.transform.sax.TransformerHandler) DOMResult(javax.xml.transform.dom.DOMResult) Bible(biblemulticonverter.data.Bible) Element(org.w3c.dom.Element) Node(org.w3c.dom.Node) ArrayList(java.util.ArrayList) Document(org.w3c.dom.Document) BookID(biblemulticonverter.data.BookID) Book(org.crosswire.jsword.book.Book) BookData(org.crosswire.jsword.book.BookData) EnumMap(java.util.EnumMap) SAXTransformerFactory(javax.xml.transform.sax.SAXTransformerFactory) Chapter(biblemulticonverter.data.Chapter) FormattedText(biblemulticonverter.data.FormattedText) Verse(org.crosswire.jsword.passage.Verse)

Example 7 with Bible

use of biblemulticonverter.data.Bible in project BibleMultiConverter by schierlm.

The class NeUeParser, method doImport.

/**
 * Imports the HTML files of the NeÜ translation from a directory into a {@link Bible}.
 *
 * <p>The importer is a strict line-oriented parser: every line must match one
 * of the explicitly handled shapes, otherwise an {@link IOException} carrying
 * the offending text is thrown. Processing order:
 * <ol>
 * <li>Main file ({@code NeUe.htm}, or {@code index.htm} if that does not exist):
 * the text revision ("Textstand"), the table of contents (validated against
 * {@code METADATA} and {@code JESUS_CHRONIK}) and the preface ("Vorwort"),
 * which becomes an introduction book.</li>
 * <li>One HTML file per entry of {@code METADATA}: book title, prolog,
 * headlines, chapters, verses and footnotes. Missing files are skipped with a
 * message on standard output.</li>
 * <li>{@code bibel.html}: overview table, stored in the appendix book.</li>
 * <li>A link to/image of the reconstruction drawing of Ezekiel's temple.</li>
 * <li>One HTML file per entry of {@code JESUS_CHRONIK}, appended to the
 * appendix book.</li>
 * </ol>
 *
 * <p>Side effect: if the table of contents lists no chronicle page at all, the
 * field {@code JESUS_CHRONIK} is replaced by an empty array.
 *
 * @param inputDirectory directory that contains the HTML files
 * @return the imported Bible
 * @throws IOException if a line of an input file does not have the expected structure
 * @throws Exception if reading or parsing fails for another reason
 */
@Override
public Bible doImport(File inputDirectory) throws Exception {
    Bible bible = new Bible("NeÜ bibel.heute (Neue evangelistische Übersetzung)");
    MetadataBook metadata = new MetadataBook();
    metadata.setValue(MetadataBookKey.description, "Neue evangelistische Übersetzung (NeÜ), eine Übertragung der Bibel ins heutige Deutsch.");
    metadata.setValue(MetadataBookKey.rights, "Copyright (c) Karl-Heinz Vanheiden, Ahornweg 3, 07926 Gefell. Sofern keine anderslautende schriftliche Genehmigung des Rechteinhabers vorliegt, darf dieses Werk zu privaten und gemeindlichen Zwecken verwendet, aber nicht verändert oder weitergegeben werden. " + "Eine Weitergabe auf körperlichen Datenträgern (Papier, CD, DVD, Stick o.ä.) bedarf zusätzlich einer Genehmigung der Christlichen Verlagsgesellschaft Dillenburg (http://cv-dillenburg.de/).");
    metadata.setValue(MetadataBookKey.source, "http://www.derbibelvertrauen.de/");
    metadata.setValue(MetadataBookKey.publisher, "Karl-Heinz Vanheiden");
    metadata.setValue(MetadataBookKey.language, "GER");
    bible.getBooks().add(metadata.getBook());
    // The main file has two possible names; fall back to the second one.
    String mainFile = "NeUe.htm";
    if (!new File(inputDirectory, mainFile).exists())
        mainFile = "index.htm";
    try (BufferedReader br = createReader(inputDirectory, mainFile)) {
        String line = br.readLine().trim();
        // Header: skip everything up to the table of contents, but pick up the
        // text revision ("Textstand: ...") on the way.
        while (!line.startsWith("<p class=\"u3\">")) {
            if (line.contains("Textstand: ")) {
                // 11 == "Textstand: ".length(); the value ends at the next tag.
                line = line.substring(line.indexOf("Textstand: ") + 11);
                line = line.substring(0, line.indexOf('<'));
                metadata.setValue(MetadataBookKey.version, line);
                // The date recorded is the date of the import run.
                metadata.setValue(MetadataBookKey.date, new SimpleDateFormat("yyyy-MM-dd").format(new Date()));
                // The revision is the version string reduced to its digits.
                metadata.setValue(MetadataBookKey.revision, line.replaceAll("[^0-9]+", ""));
                metadata.finished();
            }
            line = br.readLine().trim();
        }
        // Table of contents: every link must match the next expected entry of
        // METADATA (book pages) or JESUS_CHRONIK (chronicle pages), in order.
        Pattern tocPattern = Pattern.compile("<a href=\"([^\"]+)\">([^<>]+)</a>&nbsp;&nbsp;(?:</p>)?");
        int bookIndex = 0, jcIndex = 0;
        while (!line.startsWith("<a name=\"vorwort\">")) {
            if (line.equals("<br>")) {
                line = br.readLine().trim();
                if (line.startsWith("&raquo;&raquo;&nbsp;&nbsp;"))
                    line = line.substring("&raquo;&raquo;&nbsp;&nbsp;".length());
            }
            Matcher m = tocPattern.matcher(line);
            if (m.matches()) {
                String url = m.group(1);
                String shortName = replaceEntities(m.group(2));
                if (url.endsWith(".html#bb")) {
                    // 8 == ".html#bb".length()
                    String filename = url.substring(0, url.length() - 8);
                    BookMetadata bm = METADATA[bookIndex];
                    if (!bm.filename.equals(filename))
                        throw new IOException(filename + "/" + bm.filename);
                    // The link text is used as the short name of the book.
                    bm.shortname = shortName;
                    bookIndex++;
                } else if (url.startsWith("0")) {
                    if (!url.equals(JESUS_CHRONIK[jcIndex] + ".html"))
                        throw new IOException(url + "/" + JESUS_CHRONIK[jcIndex]);
                    jcIndex++;
                } else {
                    throw new IOException(url);
                }
            } else if (line.length() != 0 && !line.startsWith("<p class=\"u3\">") && !line.startsWith("///") && !line.equals("<p>&nbsp;</p>") && !line.equals("<p><a name=\"bb\">&nbsp;</a></p>")) {
                // Anything that is neither a link nor known filler is an error.
                throw new IOException(line);
            }
            line = br.readLine().trim();
        }
        if (bookIndex != METADATA.length)
            throw new IOException(bookIndex + " != " + METADATA.length);
        // No chronicle links at all: treat the chronicle as absent for the rest
        // of the import (note that this overwrites the field).
        if (jcIndex == 0)
            JESUS_CHRONIK = new String[0];
        if (jcIndex != JESUS_CHRONIK.length)
            throw new IOException(jcIndex + " != " + JESUS_CHRONIK.length);
        // Preface (Vorwort)
        Book vorwort = new Book("Vorwort", BookID.INTRODUCTION, "Vorwort", "Vorwort des Übersetzers");
        bible.getBooks().add(vorwort);
        Visitor<RuntimeException> vv = getPrologVisitor(vorwort);
        // True once a block was written that needs a paragraph break before the next block.
        boolean needParagraph = false;
        if (line.endsWith("</a><br>"))
            line = br.readLine().trim();
        while (!line.startsWith("<div align=\"right\">")) {
            // Named anchors carry no content.
            line = line.replaceAll("<a name=\"[a-z]+\"></a>", "");
            if (line.startsWith("<h2>")) {
                // The main headline is only verified against the long name of the book.
                if (!vorwort.getLongName().equals(replaceEntities(cutAffix(line, "<h2>", "</h2>"))))
                    throw new IOException(replaceEntities(cutAffix(line, "<h2>", "</h2>")));
            } else if (line.startsWith("<h4>")) {
                parseFormattedText(vv.visitHeadline(1), cutAffix(line, "<h4>", "</h4>"), null, null);
                needParagraph = false;
            } else if (line.startsWith("<h4 id=")) {
                // Headline variant with an id attribute and a "back to top" link in front.
                parseFormattedText(vv.visitHeadline(1), cutAffix(line.replaceFirst("<h4 id=\"[a-z]+\">(</a>)?", ""), "<a href=\"#vorwort\"> /^\\</a> ", "</h4>"), null, null);
                needParagraph = false;
            } else if (line.startsWith("<div class=\"fn\">")) {
                if (needParagraph)
                    vv.visitLineBreak(LineBreakKind.PARAGRAPH);
                needParagraph = true;
                parseFormattedText(vv.visitFormattingInstruction(FormattingInstructionKind.ITALIC), cutAffix(line, "<div class=\"fn\">", "</div>"), null, null);
            } else if (line.startsWith("<p>")) {
                if (needParagraph)
                    vv.visitLineBreak(LineBreakKind.PARAGRAPH);
                needParagraph = true;
                // A paragraph ending in a line break continues on the next line.
                if (line.endsWith("<br />"))
                    line += br.readLine().trim();
                parseFormattedText(vv, cutAffix(line, "<p>", "</p>"), null, null);
            } else if (line.equals("<ul>")) {
                // Lists are skipped completely.
                while (!line.equals("</ul>")) {
                    line = br.readLine();
                }
            } else {
                throw new IOException(line);
            }
            line = skipLines(br, "<p>&nbsp;</p>");
        }
        vorwort.getChapters().get(0).getProlog().finished();
    }
    // Bible books: one HTML file per METADATA entry.
    for (BookMetadata bm : METADATA) {
        if (!new File(inputDirectory, bm.filename + ".html").exists()) {
            System.out.println("*** Skipping " + bm.filename + " - file not found ***");
            continue;
        }
        try (BufferedReader br = createReader(inputDirectory, bm.filename + ".html")) {
            String line = br.readLine().trim();
            // Skip the page header and navigation up to the "bb" anchor.
            line = skipLines(br, "<html>", "<head>", "<title>", "<meta ", "<link ", "</head>", "<body>", "<div style=\"background-color: #DCC2A0;\">", "<table border=", "<tbody ", "<tr><td>", "<p class=\"u3\">", "<a href=\"", "\\\\\\", "<br>", "&raquo;&raquo;");
            if (!line.equals("<p><a name=\"bb\">&nbsp;</a></p>") && !line.equals("<p><a id=\"bb\">&nbsp;</a></p>"))
                throw new IOException(line);
            line = skipLines(br);
            if (line.equals("<p>&nbsp;</p>"))
                line = br.readLine().trim();
            // The <h1> line provides the long title of the book.
            Book bk = new Book(bm.abbr, bm.id, bm.shortname, replaceEntities(cutAffix(line, "<h1>", "</h1>")));
            bible.getBooks().add(bk);
            line = skipLines(br, "<p class=\"u3\">", "<a href=\"#", "</p>", "<p>&nbsp;</p>");
            // Prolog: a headline, at least one introduction block (class "e") and
            // a closing line (class "u1") rendered bold italic.
            FormattedText prolog = new FormattedText();
            prolog.getAppendVisitor().visitHeadline(1).visitText(replaceEntities(cutAffix(line, "<p class=\"u0\">", "</p>")));
            line = skipLines(br);
            boolean firstProlog = true;
            while (line.startsWith("<div class=\"e\">") && line.endsWith("</div>")) {
                if (firstProlog) {
                    firstProlog = false;
                } else {
                    prolog.getAppendVisitor().visitLineBreak(LineBreakKind.PARAGRAPH);
                }
                parseFormattedText(prolog.getAppendVisitor(), cutAffix(line, "<div class=\"e\">", "</div>"), bm, null);
                line = skipLines(br);
            }
            // No introduction block found at all.
            if (firstProlog)
                throw new IOException(line);
            prolog.getAppendVisitor().visitLineBreak(LineBreakKind.PARAGRAPH);
            parseFormattedText(prolog.getAppendVisitor().visitFormattingInstruction(FormattingInstructionKind.BOLD).visitFormattingInstruction(FormattingInstructionKind.ITALIC), cutAffix(line, "<p class=\"u1\">", "</p>"), bm, null);
            prolog.finished();
            line = skipLines(br);
            if (!line.startsWith("<h"))
                throw new IOException(line);
            // Level digit of the first headline; headline depths are relative to it.
            char minHeadline = line.charAt(2);
            // Headlines seen since the last verse; they are emitted at the start of the next verse.
            List<Headline> headlines = new ArrayList<>();
            boolean inParagraph = false;
            Chapter currentChapter = null;
            Verse currentVerse = null;
            // Footnote visitors whose text has not been seen yet, and the matching
            // "chapter,verse" labels; both lists are consumed in order by the
            // <div class="fn"> lines.
            List<Visitor<RuntimeException>> footnotes = new ArrayList<>();
            List<String> footnoteVerses = new ArrayList<>();
            // Main loop; the book text ends at the first <hr>.
            while (!line.equals("<hr>")) {
                if (line.startsWith("<p>&nbsp;</p>")) {
                    // 13 == "<p>&nbsp;</p>".length(); drop the empty paragraph and
                    // re-examine whatever follows it on the same line.
                    line = line.substring(13).trim();
                    if (line.length() == 0)
                        line = skipLines(br);
                    continue;
                }
                // Remainder of the physical line to be handled in the next iteration.
                String restLine = null;
                List<Visitor<RuntimeException>> newFootnotes = new ArrayList<>();
                // Strip id attributes from the leading tag.
                while (line.matches("<[a-z0-9]+ (class=\"[^\"]+\" )?id=\"[a-z0-9]+\"[> ].*")) line = line.replaceFirst(" id=\"[a-z0-9]+\"", "");
                // Treat poetry and introduction paragraphs like plain paragraphs
                // (both opening tags are 16 characters long).
                if (line.startsWith("<p class=\"poet\">") || line.startsWith("<p class=\"einl\">")) {
                    line = "<p>" + line.substring(16);
                }
                // Split after the first </p> if more content follows on the line.
                if (line.matches(".*</p>.+")) {
                    int pos = line.indexOf("</p>");
                    restLine = line.substring(pos + 4).trim();
                    line = line.substring(0, pos + 4);
                }
                if (!inParagraph && line.startsWith("<p>")) {
                    inParagraph = true;
                    line = line.substring(3).trim();
                    if (line.length() == 0) {
                        line = skipLines(br);
                        continue;
                    }
                }
                // Split in front of a verse marker that is not at the start of the line.
                if (line.indexOf("<span class=\"vers\">", 1) != -1) {
                    int pos = line.indexOf("<span class=\"vers\">", 1);
                    restLine = line.substring(pos) + (restLine == null ? "" : restLine);
                    line = line.substring(0, pos).trim();
                }
                // Same for a poetry paragraph that starts in the middle of the line.
                if (line.indexOf("<p class=\"poet\">", 1) != -1) {
                    int pos = line.indexOf("<p class=\"poet\">", 1);
                    restLine = line.substring(pos) + (restLine == null ? "" : restLine);
                    line = line.substring(0, pos).trim();
                }
                // Remove trailing non-breaking spaces (6 == "&nbsp;".length()).
                while (line.endsWith("&nbsp;")) line = line.substring(0, line.length() - 6);
                if (!inParagraph && (line.startsWith("<h2>") || line.startsWith("<h3>") || line.startsWith("<h4>"))) {
                    // Headline: remember it until the next verse starts.
                    Headline hl = new Headline(line.charAt(2) - minHeadline + 1);
                    String headline = cutAffix(line, line.substring(0, 4), "</" + line.substring(1, 4));
                    if (headline.contains("*"))
                        throw new IOException(headline);
                    hl.getAppendVisitor().visitText(replaceEntities(headline));
                    headlines.add(hl);
                } else if (inParagraph && line.startsWith("<span class=\"vers\">")) {
                    // Start of a new verse.
                    int pos = line.indexOf("</span>");
                    if (pos == -1)
                        throw new IOException(line);
                    // 19 == "<span class=\"vers\">".length()
                    String vs = line.substring(19, pos).trim();
                    if (vs.endsWith("&nbsp;")) {
                        vs = cutAffix(vs, "", "&nbsp;");
                    }
                    // Accepted verse labels: a number, optionally followed by ",number".
                    if (vs.matches("[0-9]+(,[0-9]+)?")) {
                        currentVerse = new Verse(vs);
                    } else {
                        throw new IOException(vs);
                    }
                    // 7 == "</span>".length()
                    line = line.substring(pos + 7);
                    if (line.endsWith("</p>")) {
                        inParagraph = false;
                        line = line.substring(0, line.length() - 4);
                    }
                    line = line.trim();
                    if (line.startsWith("&nbsp;")) {
                        line = line.substring(6);
                    }
                    // Pending headlines go in front of the verse text.
                    for (Headline h : headlines) {
                        h.accept(currentVerse.getAppendVisitor().visitHeadline(h.getDepth()));
                    }
                    headlines.clear();
                    parseFormattedText(currentVerse.getAppendVisitor(), line, bm, newFootnotes);
                    if (!inParagraph)
                        currentVerse.getAppendVisitor().visitLineBreak(LineBreakKind.PARAGRAPH);
                    currentChapter.getVerses().add(currentVerse);
                } else if (inParagraph && line.startsWith("<a href=\"#top\"><span class=\"kap\">")) {
                    // Start of a new chapter; chapter numbers must be consecutive.
                    int chap = Integer.parseInt(cutAffix(line, "<a href=\"#top\"><span class=\"kap\">", "</span></a>"));
                    currentChapter = new Chapter();
                    currentVerse = null;
                    bk.getChapters().add(currentChapter);
                    if (chap != bk.getChapters().size())
                        throw new IOException(chap + "/" + bk.getChapters().size());
                    // The book prolog is attached to the first chapter only.
                    if (prolog != null) {
                        currentChapter.setProlog(prolog);
                        prolog = null;
                    }
                } else if (!inParagraph && line.startsWith("<div class=\"fn\">")) {
                    // Footnote text: must belong to the oldest pending footnote and
                    // start with that footnote's "chapter,verse:" label.
                    String content = cutAffix(line, "<div class=\"fn\">", "</div>");
                    if (footnoteVerses.size() == 0)
                        throw new IOException(line);
                    String prefix = footnoteVerses.remove(0) + ":";
                    if (!content.startsWith(prefix)) {
                        throw new IOException(prefix + " / " + content);
                    }
                    parseFormattedText(footnotes.remove(0), content.substring(prefix.length()).trim(), bm, null);
                } else if (inParagraph && !line.isEmpty() && (!line.startsWith("<") && !line.startsWith("&nbsp;") || line.startsWith("<span class=\"u2\">"))) {
                    // Continuation of the current verse.
                    if (line.endsWith("</p>")) {
                        inParagraph = false;
                        line = line.substring(0, line.length() - 4);
                    }
                    line = line.trim();
                    parseFormattedText(currentVerse.getAppendVisitor(), line, bm, newFootnotes);
                    if (!inParagraph)
                        currentVerse.getAppendVisitor().visitLineBreak(LineBreakKind.PARAGRAPH);
                } else {
                    // Unknown line: print the following line as debugging aid, then fail.
                    System.err.println("Next line: " + br.readLine());
                    throw new IOException(line);
                }
                // Register the footnotes opened by this line together with the
                // label their text is expected to start with.
                if (!newFootnotes.isEmpty()) {
                    footnotes.addAll(newFootnotes);
                    for (int i = 0; i < newFootnotes.size(); i++) {
                        if (currentVerse.getNumber().contains(",")) {
                            // The verse label already contains a comma; use it as is.
                            footnoteVerses.add(currentVerse.getNumber());
                        } else {
                            footnoteVerses.add(bk.getChapters().size() + "," + currentVerse.getNumber());
                        }
                    }
                }
                if (restLine != null)
                    line = restLine;
                else
                    line = skipLines(br);
            }
            // Everything collected must have been consumed by the end of the book.
            if (!headlines.isEmpty())
                throw new IOException("" + headlines.size());
            if (!footnotes.isEmpty() || !footnoteVerses.isEmpty())
                throw new IOException(footnotes.size() + "/" + footnoteVerses.size());
            for (Chapter ch : bk.getChapters()) {
                for (Verse vv : ch.getVerses()) {
                    vv.trimWhitespace();
                    vv.finished();
                }
            }
        }
    }
    // Appendix (Anhang)
    Book anhang = new Book("Anhang", BookID.APPENDIX, "Anhang", "Anhang");
    bible.getBooks().add(anhang);
    Visitor<RuntimeException> vv = getPrologVisitor(anhang);
    vv.visitHeadline(1).visitText("Ausblick auf die ganze Bibel");
    try (BufferedReader br = createReader(inputDirectory, "bibel.html")) {
        String line = br.readLine().trim();
        // Skip everything in front of the "at" anchor.
        while (!line.startsWith("<a name=\"at\">")) {
            line = br.readLine().trim();
        }
        while (!line.equals("</body>")) {
            // Normalize: drop named anchors, whitespace between tags and leading line breaks in cells.
            line = line.replaceAll("<a name=\"[a-z]+\"></a>", "");
            line = line.replaceAll("> +<", "><");
            line = line.replace("<td valign=\"top\"><br /><br /><a href", "<td valign=\"top\"><a href");
            if (line.startsWith("<h2>")) {
                parseFormattedText(vv.visitHeadline(2), cutAffix(line, "<h2>", "</h2>"), null, null);
            } else if (line.startsWith("<a href=\"#top\"><h2>")) {
                parseFormattedText(vv.visitHeadline(2), cutAffix(line, "<a href=\"#top\"><h2>", "</h2></a>"), null, null);
            } else if (line.startsWith("<h3>")) {
                parseFormattedText(vv.visitHeadline(3), cutAffix(line, "<h3>", "</h3>"), null, null);
            } else if (line.startsWith("<a href=\"#top\"><h3>")) {
                parseFormattedText(vv.visitHeadline(3), cutAffix(line, "<a href=\"#top\"><h3>", "</h3></a>"), null, null);
            } else if (line.startsWith("<td valign=\"top\"><a href=\"")) {
                // Table row for one book. First cell: link to the book file;
                // parts[0] is the file name without extension, parts[1] the link text.
                String[] parts = cutAffix(line, "<td valign=\"top\"><a href=\"", "</a></td>").split(".html\">", 2);
                // Report a malformed link the same way as every other unexpected
                // line instead of failing later with an array index error.
                if (parts.length != 2)
                    throw new IOException(line);
                // Second cell: the same link again, followed by the title.
                line = br.readLine().trim().replaceAll("> +<", "><").replace("html#u", "html");
                if (line.contains("<td><br /><br /><a href")) {
                    vv.visitLineBreak(LineBreakKind.PARAGRAPH);
                    line = line.replace("<td><br /><br /><a href", "<td><a href");
                }
                String title = cutAffix(line, "<td><a href=\"" + parts[0] + ".html\">", "</a><br />");
                Visitor<RuntimeException> bold = vv.visitFormattingInstruction(FormattingInstructionKind.BOLD);
                // Find the book that belongs to the linked file.
                BookMetadata m = null;
                for (BookMetadata bm : METADATA) {
                    if (bm.filename.equals(parts[0])) {
                        m = bm;
                        break;
                    }
                }
                // A link to a file that is not a known book cannot be turned into
                // a cross reference; fail with the file name instead of a
                // NullPointerException.
                if (m == null)
                    throw new IOException("Unknown book file: " + parts[0]);
                // The link text becomes a cross reference to chapter 1, verse 1 of that book.
                bold.visitCrossReference(m.abbr, m.id, 1, "1", 1, "1").visitText(replaceEntities(parts[1].replace("-", "")));
                bold.visitText(" " + replaceEntities(title));
                vv.visitLineBreak(LineBreakKind.NEWLINE);
                // The description may span several lines; join them up to </td>.
                line = br.readLine().trim();
                while (!line.endsWith("</td>")) line += " " + br.readLine().trim();
                vv.visitText(replaceEntities(cutAffix(line, "", "</td>")));
                vv.visitLineBreak(LineBreakKind.PARAGRAPH);
                line = br.readLine().trim();
                if (!line.equals("</tr>"))
                    throw new IOException(line);
            } else {
                throw new IOException(line);
            }
            line = skipLines(br, "<table border=\"0\" width=\"350\">", "<colgroup>", "<p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p><p>&nbsp;</p>", "<p>&nbsp;</p>", "</div", "</td></tr>", "</tbody>", "</colgroup>", "<col ", "<tr>", "</table>");
        }
    }
    // Ezekiel's temple (Hesekiels Tempel): a link for the OFFLINE raw HTML mode,
    // an embedded image for the ONLINE mode.
    vv.visitHeadline(1).visitText("Hesekiels Tempel");
    Visitor<RuntimeException> vvv = vv.visitFormattingInstruction(FormattingInstructionKind.LINK);
    vvv.visitRawHTML(RawHTMLMode.OFFLINE, "<a href=\"http://www.alt.kh-vanheiden.de/NeUe/Bibeltexte/Hesekiels%20Tempel.gif\" target=\"_blank\">");
    vvv.visitFormattingInstruction(FormattingInstructionKind.BOLD).visitText("Rekonstruktionszeichnung");
    vvv.visitRawHTML(RawHTMLMode.OFFLINE, "</a>");
    vv.visitRawHTML(RawHTMLMode.ONLINE, "<br /><img src=\"http://www.alt.kh-vanheiden.de/NeUe/Bibeltexte/Hesekiels%20Tempel.gif\" width=\"640\" height=\"635\">");
    // Jesus chronicle (Jesus-Chronik)
    if (JESUS_CHRONIK.length > 0)
        vv.visitHeadline(1).visitText("Die Jesus-Chronik");
    for (String name : JESUS_CHRONIK) {
        if (!new File(inputDirectory, name + ".html").exists()) {
            System.out.println("*** Skipping " + name + " - file not found ***");
            continue;
        }
        try (BufferedReader br = createReader(inputDirectory, name + ".html")) {
            String line = skipLines(br, "<html>", "<head>", "<title> Die Jesus-Biografie</title>", "<link rel=\"stylesheet\" type=\"text/css\" href=\"styles.css\">", "</head>", "<body>");
            // Pending footnotes and the prefixes their texts are expected to start
            // with; filled by parseJesusChronikText, consumed by the "fn" blocks.
            List<Visitor<RuntimeException>> footnoteList = new ArrayList<>();
            List<String> footnotePrefixes = new ArrayList<>();
            while (!line.startsWith("</body>")) {
                line = line.replaceAll("<a name=\"[a-z]+\"></a>", "");
                if (line.startsWith("<h2>")) {
                    parseFormattedText(vv.visitHeadline(2), cutAffix(line, "<h2>", "</h2>"), null, null);
                } else if (line.startsWith("<div class=\"fn\">")) {
                    // Footnote block (possibly multi-line); one footnote per <br />.
                    while (!line.endsWith("</div>")) line += " " + br.readLine().trim();
                    String[] fns = cutAffix(line, "<div class=\"fn\">", "</div>").split("<br />");
                    for (String fn : fns) {
                        fn = fn.trim();
                        String pfx = footnotePrefixes.remove(0);
                        Visitor<RuntimeException> fnv = footnoteList.remove(0);
                        if (!fn.startsWith(pfx))
                            throw new IOException(pfx + " / " + fn);
                        parseFormattedText(fnv, cutAffix(fn, pfx, ""), null, null);
                    }
                } else if (line.startsWith("<p><div class=\"rot\">")) {
                    // Date line, rendered in italics without the DATE marker comments.
                    String text = cutAffix(line, "<p><div class=\"rot\">", "<!--/DATE--></div></p>").replace("<!--DATE-->", "");
                    parseFormattedText(vv.visitFormattingInstruction(FormattingInstructionKind.ITALIC), text, null, null);
                    vv.visitLineBreak(LineBreakKind.PARAGRAPH);
                } else if (line.startsWith("<p><b>") && line.contains("</b><br />")) {
                    // Bold title (6 == "<p><b>".length()) followed by an optional
                    // second part that is rendered in italics.
                    int pos = line.indexOf("</b><br />");
                    parseJesusChronikText(vv.visitHeadline(3), line.substring(6, pos), footnotePrefixes, footnoteList);
                    String xref = cutAffix(line.substring(pos), "</b><br />", "</p>");
                    if (!xref.isEmpty())
                        parseJesusChronikText(vv.visitFormattingInstruction(FormattingInstructionKind.ITALIC), xref, footnotePrefixes, footnoteList);
                    vv.visitLineBreak(LineBreakKind.PARAGRAPH);
                } else if (line.startsWith("<p>")) {
                    parseJesusChronikText(vv, cutAffix(line, "<p>", "</p>"), footnotePrefixes, footnoteList);
                    vv.visitLineBreak(LineBreakKind.PARAGRAPH);
                } else if (line.startsWith("&copy;")) {
                    // Copyright notice, possibly spanning several lines.
                    while (!line.endsWith("</div>")) line += " " + br.readLine().trim();
                    parseFormattedText(vv, cutAffix(line, "", "</div>"), null, null);
                    vv.visitLineBreak(LineBreakKind.PARAGRAPH);
                } else if (line.startsWith("<div class=\"e\">")) {
                    while (!line.endsWith("</div>")) line += " " + br.readLine().trim();
                    parseFormattedText(vv.visitFormattingInstruction(FormattingInstructionKind.ITALIC), cutAffix(line, "<div class=\"e\">", "</div>"), null, null);
                    vv.visitLineBreak(LineBreakKind.PARAGRAPH);
                } else {
                    throw new IOException(line);
                }
                line = skipLines(br);
            }
            // Every footnote reference must have been matched by a footnote text.
            if (!footnoteList.isEmpty() || !footnotePrefixes.isEmpty())
                throw new IOException(footnoteList.size() + " / " + footnotePrefixes.size());
        }
    }
    anhang.getChapters().get(0).getProlog().trimWhitespace();
    anhang.getChapters().get(0).getProlog().finished();
    return bible;
}
Also used : Visitor(biblemulticonverter.data.FormattedText.Visitor) Matcher(java.util.regex.Matcher) Bible(biblemulticonverter.data.Bible) ArrayList(java.util.ArrayList) MetadataBook(biblemulticonverter.data.MetadataBook) Book(biblemulticonverter.data.Book) Headline(biblemulticonverter.data.FormattedText.Headline) MetadataBook(biblemulticonverter.data.MetadataBook) Pattern(java.util.regex.Pattern) Chapter(biblemulticonverter.data.Chapter) IOException(java.io.IOException) FormattedText(biblemulticonverter.data.FormattedText) Date(java.util.Date) BufferedReader(java.io.BufferedReader) File(java.io.File) SimpleDateFormat(java.text.SimpleDateFormat) Verse(biblemulticonverter.data.Verse)

Example 8 with Bible

use of biblemulticonverter.data.Bible in project BibleMultiConverter by schierlm.

The class OSIS, method doImport.

/**
 * Imports an OSIS XML file into a {@link Bible}.
 *
 * <p>The file is validated against the {@code /osisCore.2.1.1.xsd} resource and then
 * parsed into a DOM tree. If the work header contains a description or a rights
 * statement, a metadata book is added first. Afterwards every {@code div} of type
 * {@code book} is turned into a {@link Book}. A book written in milestoned form
 * (start element with {@code sID}, separate end element with matching {@code eID})
 * is converted to a container first by moving all nodes between the two markers
 * into the start element and removing the end marker.
 *
 * @param inputFile the OSIS XML file to read
 * @return the imported Bible, named after the work title or "OSIS Bible" if the title is empty
 * @throws IllegalStateException if a milestoned book has no end marker, or if its start
 *         and end markers cannot be made siblings
 * @throws Exception if validation, XML parsing or XPath evaluation fails
 */
@Override
public Bible doImport(File inputFile) throws Exception {
    ValidateXML.validateFileBeforeParsing(
            SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
                    .newSchema(ObjectFactory.class.getResource("/osisCore.2.1.1.xsd")),
            inputFile);
    printedWarnings.clear();

    DocumentBuilder domBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
    XPath xp = javax.xml.xpath.XPathFactory.newInstance().newXPath();
    Document dom = domBuilder.parse(inputFile);

    String workTitle = xp.evaluate("/osis/osisText/header/work/title/text()", dom);
    Bible bible = new Bible(workTitle.isEmpty() ? "OSIS Bible" : workTitle);

    // Metadata is only recorded when at least a description or a rights statement exists.
    String workDescription = xp.evaluate("/osis/osisText/header/work/description/text()", dom);
    String workRights = xp.evaluate("/osis/osisText/header/work/rights/text()", dom);
    boolean hasMetadata = !(workDescription.isEmpty() && workRights.isEmpty());
    if (hasMetadata) {
        String workDate = xp.evaluate("/osis/osisText/header/work/date/text()", dom);
        String titlePageDescription = xp.evaluate("/osis/osisText/titlePage/description/text()", dom);
        MetadataBook metadata = new MetadataBook();
        // Free-text values get runs of whitespace collapsed to a single space; the date is stored as is.
        if (!workDescription.isEmpty()) {
            metadata.setValue(MetadataBookKey.description, workDescription.replaceAll("[\r\n\t ]+", " ").trim());
        }
        if (!workRights.isEmpty()) {
            metadata.setValue(MetadataBookKey.rights, workRights.replaceAll("[\r\n\t ]+", " ").trim());
        }
        if (!workDate.isEmpty()) {
            metadata.setValue(MetadataBookKey.date, workDate);
        }
        if (!titlePageDescription.isEmpty()) {
            metadata.setValue("description@titlePage", titlePageDescription.replaceAll("[\r\n\t ]+", " ").trim());
        }
        metadata.finished();
        bible.getBooks().add(metadata.getBook());
    }

    NodeList bookDivs = (NodeList) xp.evaluate("/osis/osisText//div[@type='book']", dom, XPathConstants.NODESET);
    for (int i = 0; i < bookDivs.getLength(); i++) {
        Element bookStart = (Element) bookDivs.item(i);
        String milestoneID = bookStart.getAttribute("sID");
        if (!milestoneID.isEmpty()) {
            // Milestoned book: locate the end marker that closes this start marker.
            Element bookEnd = (Element) xp.evaluate("//div[@eID='" + milestoneID + "']", dom, XPathConstants.NODE);
            if (bookEnd == null) {
                throw new IllegalStateException("No milestoned div found with eID " + milestoneID);
            }
            boolean alreadySiblings = bookEnd.getParentNode().isSameNode(bookStart.getParentNode());
            if (!alreadySiblings) {
                // Collect the end marker and all of its ancestors ...
                List<Node> endChain = new ArrayList<>();
                for (Node n = bookEnd; n != null; n = n.getParentNode()) {
                    endChain.add(n);
                }
                // ... then walk up from the start marker until one of them is reached.
                Node shared = bookStart;
                boolean matched = false;
                while (shared != null && !matched) {
                    for (Node link : endChain) {
                        if (shared.isSameNode(link)) {
                            matched = true;
                            break;
                        }
                    }
                    if (!matched) {
                        shared = shared.getParentNode();
                    }
                }
                if (shared == null) {
                    throw new IllegalStateException("Unable to find common parent of milestoned div start and end tag");
                }
                convertToMilestoned((Element) shared);
                if (!bookEnd.getParentNode().isSameNode(bookStart.getParentNode())) {
                    throw new IllegalStateException("Unable to normalize XML so that milestoned div start and end tags are siblings");
                }
            }
            // Adopt every sibling between the two markers as a child of the start marker.
            for (Node following = bookStart.getNextSibling();
                    following != null && !following.isSameNode(bookEnd);
                    following = bookStart.getNextSibling()) {
                bookStart.appendChild(following);
            }
            bookEnd.getParentNode().removeChild(bookEnd);
        }

        String osisID = bookStart.getAttribute("osisID");
        BookID id = BookID.fromOsisId(osisID);
        String bookTitle = id.getEnglishName();
        // The first non-text child may be a non-empty main title that overrides the English default name.
        Node firstChild = bookStart.getFirstChild();
        while (firstChild instanceof Text) {
            firstChild = firstChild.getNextSibling();
        }
        if (firstChild instanceof Element && firstChild.getNodeName().equals("title")) {
            Element mainTitle = (Element) firstChild;
            boolean isMain = mainTitle.getAttribute("type").equals("main");
            if (isMain && mainTitle.getChildNodes().getLength() > 0) {
                bookTitle = mainTitle.getTextContent();
            }
        }
        Book convertedBook = new Book(osisID, id, bookTitle, bookTitle);
        bible.getBooks().add(convertedBook);
        parseBook(osisID, bookStart, convertedBook);
    }
    return bible;
}
Also used : XPath(javax.xml.xpath.XPath) MetadataBook(biblemulticonverter.data.MetadataBook) Bible(biblemulticonverter.data.Bible) NodeList(org.w3c.dom.NodeList) Element(org.w3c.dom.Element) Node(org.w3c.dom.Node) ArrayList(java.util.ArrayList) Text(org.w3c.dom.Text) FormattedText(biblemulticonverter.data.FormattedText) Document(org.w3c.dom.Document) BookID(biblemulticonverter.data.BookID) DocumentBuilder(javax.xml.parsers.DocumentBuilder) MetadataBook(biblemulticonverter.data.MetadataBook) Book(biblemulticonverter.data.Book)

Example 9 with Bible

use of biblemulticonverter.data.Bible in project BibleMultiConverter by schierlm.

the class RoundtripHTML method doImport.

/**
 * Imports a Bible from a directory in the RoundtripHTML layout.
 *
 * <p>First {@code metadata.js} is read: its first line yields the Bible name, the second
 * line is skipped, and the remaining {@code key:value} lines are collected until a line
 * starting with <code>}</code> completes a book entry (abbreviation, OSIS id, short name,
 * long name and chapter count). Afterwards one HTML file per chapter is read and its
 * prolog, verses and footnotes sections are parsed.
 *
 * <p>A file that ends before an expected line is reported as an {@link IOException}
 * rather than failing with a {@link NullPointerException}.
 *
 * @param inputDir directory containing {@code metadata.js} and the chapter HTML files
 * @return the imported Bible
 * @throws IOException if a file ends prematurely or a line does not have the expected form
 * @throws Exception any other failure raised while reading or parsing
 */
@Override
public Bible doImport(File inputDir) throws Exception {
    Bible bible;
    // metadata
    try (BufferedReader br = createReader(inputDir, "metadata.js")) {
        String line = readRequiredLine(br);
        br.readLine();
        bible = new Bible(line.substring(13, line.length() - 2).replace("\\\"", "\"").replace("\\\\", "\\"));
        // NOTE(review): fieldMap is not cleared between book entries, so a key missing
        // from a later entry keeps the value of the previous entry - confirm this is intended.
        Map<String, Object> fieldMap = new HashMap<String, Object>();
        while ((line = br.readLine()) != null) {
            if (line.startsWith("}")) {
                // end of one book entry: build the book with as many empty chapters as announced
                Book bk = new Book((String) fieldMap.get("abbr"), BookID.fromOsisId((String) fieldMap.get("osis")), (String) fieldMap.get("short"), (String) fieldMap.get("long"));
                for (int i = 0; i < (Integer) fieldMap.get("chapters"); i++) {
                    bk.getChapters().add(new Chapter());
                }
                bible.getBooks().add(bk);
                continue;
            }
            int pos = line.indexOf(":");
            String key = line.substring(0, pos);
            String value = line.substring(pos + 1);
            if (value.endsWith(","))
                value = value.substring(0, value.length() - 1);
            // values are quoted strings, booleans or integers
            if (value.startsWith("\"") && value.endsWith("\"")) {
                fieldMap.put(key, value.substring(1, value.length() - 1).replace("\\\"", "\"").replace("\\\\", "\\"));
            } else if (value.equals("true") || value.equals("false")) {
                fieldMap.put(key, Boolean.parseBoolean(value));
            } else {
                fieldMap.put(key, Integer.parseInt(value));
            }
        }
    }
    // chapters
    for (Book bk : bible.getBooks()) {
        int cnumber = 0;
        for (Chapter ch : bk.getChapters()) {
            cnumber++;
            try (BufferedReader br = createReader(inputDir, getTypeDir(bk.getId()) + "/" + bk.getAbbr() + "_" + cnumber + ".html")) {
                String line;
                // visitors for footnotes referenced by prolog/verses; filled in once the footnotes section is reached
                List<FormattedText.Visitor<RuntimeException>> footnotes = new ArrayList<>();
                while ((line = br.readLine()) != null) {
                    if (line.equals("<div class=\"biblehtmlcontent prolog\">")) {
                        line = readRequiredLine(br);
                        FormattedText prolog = new FormattedText();
                        int end = parseLine(prolog.getAppendVisitor(), line, 0, footnotes);
                        ch.setProlog(prolog);
                        if (end != line.length())
                            throw new IOException(line.substring(end));
                        line = readRequiredLine(br);
                        if (!line.equals("</div>"))
                            throw new IOException(line);
                    } else if (line.equals("<div class=\"biblehtmlcontent verses\" id=\"verses\">")) {
                        while ((line = br.readLine()) != null) {
                            if (line.equals("</div>"))
                                break;
                            if (!line.startsWith("<div class=\"v\" id=\"v") || !line.endsWith("</div>"))
                                throw new IOException(line);
                            line = line.substring(20, line.length() - 6);
                            int pos = line.indexOf("\">");
                            Verse v = new Verse(line.substring(0, pos));
                            int end = parseLine(v.getAppendVisitor(), line, pos + 2, footnotes);
                            if (end != line.length())
                                throw new IOException(line.substring(end));
                            ch.getVerses().add(v);
                        }
                        // the loop is only left regularly (line == null) when the closing </div> is missing
                        if (line == null)
                            throw new IOException("Unexpected end of file");
                    } else if (line.equals("<div class=\"biblehtmlcontent footnotes\">")) {
                        for (int i = 0; i < footnotes.size(); i++) {
                            line = readRequiredLine(br);
                            String prefix = "<div class=\"fn\"><sup class=\"fnt\"><a name=\"fn" + (i + 1) + "\" href=\"#fnm" + (i + 1) + "\">" + (i + 1) + "</a></sup> ";
                            if (!line.startsWith(prefix) || !line.endsWith("</div>"))
                                throw new IOException(line);
                            line = line.substring(prefix.length(), line.length() - 6);
                            int end = parseLine(footnotes.get(i), line, 0, null);
                            if (end != line.length())
                                throw new IOException(line.substring(end));
                        }
                        line = readRequiredLine(br);
                        if (!line.equals("</div>"))
                            throw new IOException(line);
                    }
                }
                if (ch.getProlog() != null)
                    ch.getProlog().finished();
                for (Verse v : ch.getVerses()) v.finished();
            }
        }
    }
    return bible;
}

/**
 * Reads the next line from the reader, treating end of file as a format error.
 *
 * @param br reader to read from
 * @return the line read, never {@code null}
 * @throws IOException if the end of the stream has been reached or reading fails
 */
private static String readRequiredLine(BufferedReader br) throws IOException {
    String line = br.readLine();
    if (line == null)
        throw new IOException("Unexpected end of file");
    return line;
}
Also used : Visitor(biblemulticonverter.data.FormattedText.Visitor) HashMap(java.util.HashMap) Bible(biblemulticonverter.data.Bible) Chapter(biblemulticonverter.data.Chapter) ArrayList(java.util.ArrayList) FormattedText(biblemulticonverter.data.FormattedText) IOException(java.io.IOException) Book(biblemulticonverter.data.Book) BufferedReader(java.io.BufferedReader) Verse(biblemulticonverter.data.Verse)

Example 10 with Bible

use of biblemulticonverter.data.Bible in project BibleMultiConverter by schierlm.

the class RoundtripXML method parseBible.

/**
 * Converts an unmarshalled roundtrip XML document into the internal {@link Bible} model.
 *
 * <p>Books, chapters, optional chapter prologs and verses are copied in document order.
 * A trailing {@code -<digits>} suffix is stripped from the book id before it is resolved
 * via {@link BookID#fromOsisId}.
 *
 * @param sBible the JAXB root element holding the source Bible
 * @return the converted Bible
 * @throws Exception if converting prolog or verse content fails
 */
protected Bible parseBible(JAXBElement<BibleType> sBible) throws Exception {
    BibleType source = sBible.getValue();
    Bible converted = new Bible(source.getName());
    for (BibleType.Book srcBook : source.getBook()) {
        String abbr = srcBook.getAbbr();
        String osisId = srcBook.getId().replaceAll("-[0-9]+$", "");
        Book book = new Book(abbr, BookID.fromOsisId(osisId), srcBook.getShortName(), srcBook.getLongName());
        converted.getBooks().add(book);

        for (BibleType.Book.Chapter srcChapter : srcBook.getChapter()) {
            Chapter chapter = new Chapter();
            book.getChapters().add(chapter);

            // The prolog is optional; only chapters that have one get a FormattedText.
            if (srcChapter.getProlog() != null) {
                FormattedText prolog = new FormattedText();
                chapter.setProlog(prolog);
                parseContent(prolog.getAppendVisitor(), srcChapter.getProlog().getContent());
                prolog.finished();
            }

            for (BibleType.Book.Chapter.Verse srcVerse : srcChapter.getVerse()) {
                Verse verse = new Verse(srcVerse.getNumber());
                chapter.getVerses().add(verse);
                parseContent(verse.getAppendVisitor(), srcVerse.getContent());
                verse.finished();
            }
        }
    }
    return converted;
}
Also used : Book(biblemulticonverter.data.Book) Bible(biblemulticonverter.data.Bible) Chapter(biblemulticonverter.data.Chapter) BibleType(biblemulticonverter.schema.roundtripxml.BibleType) FormattedText(biblemulticonverter.data.FormattedText) Verse(biblemulticonverter.data.Verse)

Aggregations

Bible (biblemulticonverter.data.Bible): 20, Book (biblemulticonverter.data.Book): 18, Chapter (biblemulticonverter.data.Chapter): 18, FormattedText (biblemulticonverter.data.FormattedText): 15, Verse (biblemulticonverter.data.Verse): 15, BookID (biblemulticonverter.data.BookID): 12, MetadataBook (biblemulticonverter.data.MetadataBook): 10, IOException (java.io.IOException): 10, ArrayList (java.util.ArrayList): 9, Headline (biblemulticonverter.data.FormattedText.Headline): 6, VirtualVerse (biblemulticonverter.data.VirtualVerse): 6, File (java.io.File): 5, EnumMap (java.util.EnumMap): 5, HashMap (java.util.HashMap): 5, Matcher (java.util.regex.Matcher): 5, BufferedReader (java.io.BufferedReader): 4, Visitor (biblemulticonverter.data.FormattedText.Visitor): 3, HashSet (java.util.HashSet): 3, BIBLEBOOK (biblemulticonverter.schema.zef2005.BIBLEBOOK): 2, CAPTION (biblemulticonverter.schema.zef2005.CAPTION): 2