Search in sources :

Example 1 with TableCellStart

Use of biblemulticonverter.format.paratext.ParatextBook.TableCellStart in the project BibleMultiConverter by schierlm.

The method doImportBook of the class USFM.

/**
 * Parses one USFM file into a {@link ParatextBook}.
 *
 * <p>The whole file is read into memory, every run of control or separator
 * characters is collapsed to a single space, and an artificial end marker is
 * appended so that the scanner always finds a terminating backslash. The
 * content is then processed tag by tag: each iteration consumes one
 * backslash-introduced tag together with the text that follows it.
 *
 * @param inputFile file to import; only files whose name ends in
 *                  {@code .usfm} or {@code .sfm} (case-insensitive) are parsed
 * @param charset   charset used to decode the file; when an {@code \ide} tag
 *                  names a different charset and this one is UTF-8, the file
 *                  is read again with the declared charset
 * @return the parsed book, or {@code null} when the file is skipped (wrong
 *         extension, no leading {@code \id} tag, or unknown book ID)
 * @throws IOException if the file cannot be read, or if an {@code \ide} tag
 *                     contradicts a non-UTF-8 {@code charset}
 * @throws Exception   declared for compatibility with existing callers
 */
private ParatextBook doImportBook(File inputFile, Charset charset) throws Exception {
    KNOWN_CHARACTER_TAGS.addAll(AUTO_CLOSING_TAGS.keySet());
    // Locale.ROOT keeps case mapping independent of the default locale
    // (e.g. Turkish maps 'I' to a dotless 'i').
    String lowerName = inputFile.getName().toLowerCase(java.util.Locale.ROOT);
    if (!lowerName.endsWith(".usfm") && !lowerName.endsWith(".sfm"))
        return null;
    String data = new String(Files.readAllBytes(inputFile.toPath()), charset).replaceAll("[\\p{Cc}\\p{Z}]+", " ").trim() + "\\$EOF$";
    if (!data.startsWith("\\id ")) {
        System.out.println("WARNING: Skipping malformed file " + inputFile);
        return null;
    }
    // First tag after \id; the appended end marker guarantees that one exists.
    int startPos = data.indexOf("\\", 2);
    int finalPos = data.length() - "\\$EOF$".length();
    String[] idParts = data.substring(4, startPos).trim().split(" ", 2);
    ParatextID id = ParatextID.fromIdentifier(idParts[0].toUpperCase(java.util.Locale.ROOT));
    if (id == null) {
        System.out.println("WARNING: Skipping book with unknown ID: " + idParts[0]);
        return null;
    }
    ParatextBook result = new ParatextBook(id, idParts.length == 1 ? "" : idParts[1]);
    // Stack of currently open character-level containers; the innermost one is last.
    List<ParatextCharacterContentContainer> containerStack = new ArrayList<>();
    boolean ignoreAutoClosingTags = Boolean.getBoolean("biblemulticonverter.usfm.ignoreautoclosingtags");
    while (startPos < finalPos) {
        if (data.charAt(startPos) != '\\')
            throw new IllegalStateException("Expected tag start at offset " + startPos + " in " + inputFile);
        int pos = data.indexOf('\\', startPos + 1);
        // Everything between this backslash and the next one: tag name plus its text.
        String textPart = data.substring(startPos + 1, pos);
        startPos = pos;
        // The tag ends at the first space or '*'; a closing '*' stays part of
        // the tag, a separating space is trimmed away.
        pos = Math.min(textPart.length(), 1 + Math.min((textPart + " ").indexOf(' '), (textPart + "*").indexOf('*')));
        String tag = textPart.substring(0, pos).trim().toLowerCase(java.util.Locale.ROOT);
        textPart = textPart.substring(pos);
        if (textPart.endsWith(" ")) {
            // Keep the trailing space only if the following tag is a known
            // character tag; otherwise drop it.
            String nextTag = data.substring(startPos + 1, Math.min(data.length(), startPos + 10)) + " *\\";
            pos = Math.min(nextTag.indexOf('\\'), Math.min(nextTag.indexOf(' '), nextTag.indexOf('*')));
            if (!KNOWN_CHARACTER_TAGS.contains(nextTag.substring(0, pos))) {
                textPart = textPart.substring(0, textPart.length() - 1);
            }
        }
        // Tags that add to the innermost container need one to exist.
        if (containerStack.isEmpty() && (AUTO_CLOSING_TAGS.containsKey(tag) || tag.equals("v") || FOOTNOTE_XREF_TAGS.containsKey(tag))) {
            ParatextCharacterContent container = new ParatextCharacterContent();
            result.getContent().add(container);
            containerStack.add(container);
        }
        boolean closeCharacterAttributes = false;
        if (PARAGRAPH_TAGS.containsKey(tag)) {
            result.getContent().add(new ParagraphStart(PARAGRAPH_TAGS.get(tag)));
            closeCharacterAttributes = true;
        } else if (tag.endsWith("*")) {
            String rawTag = tag.substring(0, tag.length() - 1);
            // Pop auto-closing containers until one with the same tag (or a
            // container of another kind) is on top.
            while (!containerStack.isEmpty() && containerStack.get(containerStack.size() - 1) instanceof AutoClosingFormatting) {
                AutoClosingFormatting acc = (AutoClosingFormatting) containerStack.get(containerStack.size() - 1);
                if (acc.getUsedTag().equals(rawTag))
                    break;
                containerStack.remove(containerStack.size() - 1);
            }
            boolean found = false;
            boolean knownEndTag = true;
            if (AUTO_CLOSING_TAGS.containsKey(rawTag)) {
                if (!containerStack.isEmpty() && containerStack.get(containerStack.size() - 1) instanceof AutoClosingFormatting) {
                    AutoClosingFormatting acc = (AutoClosingFormatting) containerStack.get(containerStack.size() - 1);
                    found = acc.getUsedTag().equals(rawTag);
                }
            } else if (FOOTNOTE_XREF_TAGS.containsKey(rawTag)) {
                if (!containerStack.isEmpty() && containerStack.get(containerStack.size() - 1) instanceof FootnoteXref) {
                    FootnoteXref fx = (FootnoteXref) containerStack.get(containerStack.size() - 1);
                    found = fx.getKind().getTag().equals(rawTag);
                }
            } else {
                knownEndTag = false;
                System.out.println("WARNING: Skipping unknown end tag \\" + tag);
            }
            if (found) {
                containerStack.remove(containerStack.size() - 1);
            } else if (knownEndTag) {
                // An unknown end tag has already been reported above; do not warn twice.
                System.out.println("WARNING: Skipping mismatched end tag \\" + tag);
            }
        } else if (AUTO_CLOSING_TAGS.containsKey(tag)) {
            // A tag without a leading '+' closes all open auto-closing
            // containers, unless that is disabled by the system property.
            if (!tag.startsWith("+") && !ignoreAutoClosingTags) {
                while (!containerStack.isEmpty() && containerStack.get(containerStack.size() - 1) instanceof AutoClosingFormatting) {
                    containerStack.remove(containerStack.size() - 1);
                }
            }
            AutoClosingFormatting nextContainer = new AutoClosingFormatting(AUTO_CLOSING_TAGS.get(tag), tag.startsWith("+"));
            containerStack.get(containerStack.size() - 1).getContent().add(nextContainer);
            containerStack.add(nextContainer);
            // Attributes are parsed only when the kind declares default
            // attributes, the very next tag closes this one, and the text
            // contains a '|' separator.
            if (nextContainer.getKind().getDefaultAttributes() != null && data.startsWith("\\" + tag + "*", startPos) && textPart.contains("|")) {
                String[] defaultAttributes = nextContainer.getKind().getDefaultAttributes();
                String[] parts = textPart.split("\\|");
                for (int i = 1; i < parts.length; i++) {
                    if (parts[i].contains("=")) {
                        // One or more explicit key=value / key="value" pairs.
                        String attList = parts[i];
                        while (attList.contains("=")) {
                            pos = attList.indexOf('=');
                            String key = attList.substring(0, pos).trim();
                            attList = attList.substring(pos + 1).trim();
                            if (attList.startsWith("\"")) {
                                pos = attList.indexOf('"', 1);
                                if (pos == -1) {
                                    // No closing quote: take the remainder as the
                                    // value instead of failing on substring(1, -1).
                                    System.out.println("WARNING: Unterminated value for attribute " + key + " of \\" + tag);
                                    nextContainer.getAttributes().put(key, attList.substring(1));
                                    attList = "";
                                } else {
                                    nextContainer.getAttributes().put(key, attList.substring(1, pos));
                                    attList = attList.substring(pos + 1).trim();
                                }
                            } else {
                                nextContainer.getAttributes().put(key, attList);
                                attList = "";
                            }
                        }
                    } else if (i - 1 < defaultAttributes.length) {
                        // Positional value: mapped to the default attribute at the same index.
                        nextContainer.getAttributes().put(defaultAttributes[i - 1], parts[i]);
                    }
                }
                textPart = parts[0];
                if (textPart.endsWith(" ")) {
                    textPart = textPart.substring(0, textPart.length() - 1);
                }
            }
        } else if (tag.equals("v")) {
            // First word is the verse identifier, the rest is verse text.
            String[] parts = textPart.split(" ", 2);
            containerStack.get(containerStack.size() - 1).getContent().add(new VerseStart(parts[0]));
            textPart = parts.length == 1 ? "" : parts[1];
        } else if (tag.equals("c")) {
            // First word is the chapter number, the rest is kept as text.
            String[] parts = textPart.split(" ", 2);
            result.getContent().add(new ChapterStart(Integer.parseInt(parts[0])));
            closeCharacterAttributes = true;
            textPart = parts.length == 1 ? "" : parts[1];
        } else if (tag.matches("t[hc]r?[0-9]+")) {
            result.getContent().add(new TableCellStart(tag));
            closeCharacterAttributes = true;
        } else if (FOOTNOTE_XREF_TAGS.containsKey(tag)) {
            // First word is passed to the FootnoteXref, the rest is its text.
            String[] parts = textPart.split(" ", 2);
            FootnoteXref nextContainer = new FootnoteXref(FOOTNOTE_XREF_TAGS.get(tag), parts[0]);
            containerStack.get(containerStack.size() - 1).getContent().add(nextContainer);
            containerStack.add(nextContainer);
            textPart = parts.length == 1 ? "" : parts[1];
        } else if (tag.equals("id")) {
            System.out.println("WARNING: Skipping duplicate \\id tag");
            textPart = "";
        } else if (tag.equals("ide")) {
            Charset correctCharset;
            try {
                if (textPart.matches("[0-9]+ - .*")) {
                    // "<codepage> - <description>": 65001 is UTF-8, anything
                    // else is looked up as windows-<codepage>.
                    int codepage = Integer.parseInt(textPart.replaceAll(" - .*", ""));
                    correctCharset = codepage == 65001 ? StandardCharsets.UTF_8 : Charset.forName("windows-" + codepage);
                } else {
                    correctCharset = Charset.forName(textPart);
                }
            } catch (UnsupportedCharsetException | IllegalCharsetNameException ex) {
                System.out.println("WARNING: Unknown charset " + textPart + " specified, falling back to ISO-8859-1");
                correctCharset = StandardCharsets.ISO_8859_1;
            }
            if (!correctCharset.equals(charset)) {
                if (!charset.equals(StandardCharsets.UTF_8)) {
                    throw new IOException("Two charsets specified: " + charset + " and " + correctCharset);
                }
                // Start over, decoding the file with the charset it declares.
                return doImportBook(inputFile, correctCharset);
            }
            textPart = "";
        } else if (ATTRIBUTE_TAGS.contains(tag)) {
            result.getAttributes().put(tag, textPart);
            textPart = "";
        } else {
            System.out.println("WARNING: Skipping unknown tag \\" + tag);
        }
        if (closeCharacterAttributes) {
            containerStack.clear();
        }
        if (!textPart.isEmpty()) {
            if (containerStack.isEmpty()) {
                ParatextCharacterContent container = new ParatextCharacterContent();
                containerStack.add(container);
                result.getContent().add(container);
            }
            // " // " is replaced by a plain space and "~" by a no-break space.
            textPart = textPart.replace(" // ", " ").replace("~", "\u00A0");
            containerStack.get(containerStack.size() - 1).getContent().add(new ParatextCharacterContent.Text(textPart));
        }
    }
    return result;
}
Also used : AutoClosingFormatting(biblemulticonverter.format.paratext.ParatextCharacterContent.AutoClosingFormatting) TableCellStart(biblemulticonverter.format.paratext.ParatextBook.TableCellStart) ArrayList(java.util.ArrayList) Charset(java.nio.charset.Charset) IOException(java.io.IOException) FootnoteXref(biblemulticonverter.format.paratext.ParatextCharacterContent.FootnoteXref) ChapterStart(biblemulticonverter.format.paratext.ParatextBook.ChapterStart) IllegalCharsetNameException(java.nio.charset.IllegalCharsetNameException) VerseStart(biblemulticonverter.format.paratext.ParatextCharacterContent.VerseStart) ParatextCharacterContentContainer(biblemulticonverter.format.paratext.ParatextBook.ParatextCharacterContentContainer) ParatextID(biblemulticonverter.format.paratext.ParatextBook.ParatextID) UnsupportedCharsetException(java.nio.charset.UnsupportedCharsetException) ParagraphStart(biblemulticonverter.format.paratext.ParatextBook.ParagraphStart)

Aggregations

ChapterStart (biblemulticonverter.format.paratext.ParatextBook.ChapterStart)1 ParagraphStart (biblemulticonverter.format.paratext.ParatextBook.ParagraphStart)1 ParatextCharacterContentContainer (biblemulticonverter.format.paratext.ParatextBook.ParatextCharacterContentContainer)1 ParatextID (biblemulticonverter.format.paratext.ParatextBook.ParatextID)1 TableCellStart (biblemulticonverter.format.paratext.ParatextBook.TableCellStart)1 AutoClosingFormatting (biblemulticonverter.format.paratext.ParatextCharacterContent.AutoClosingFormatting)1 FootnoteXref (biblemulticonverter.format.paratext.ParatextCharacterContent.FootnoteXref)1 VerseStart (biblemulticonverter.format.paratext.ParatextCharacterContent.VerseStart)1 IOException (java.io.IOException)1 Charset (java.nio.charset.Charset)1 IllegalCharsetNameException (java.nio.charset.IllegalCharsetNameException)1 UnsupportedCharsetException (java.nio.charset.UnsupportedCharsetException)1 ArrayList (java.util.ArrayList)1